In [50]:
# dataset: bank costumers are leaving the bank. your job is to build a model to figure out,
# why, and who will potentially leave the bank (exited column). (the bank can target them with special offer)

In [57]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

# 1. preprocessing

In [58]:
df = pd.read_csv("./Churn_Modelling.csv")
print(df.shape)
df.head()

(10000, 14)


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [59]:
# remove unnecessary columns
# no missing data
X_independent = df.iloc[:, 3:-1]
y_dependent = df.iloc[:, -1]
print(X_independent.head())
print("")
print(y_dependent.head())

   CreditScore Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0          619    France  Female   42       2       0.00              1   
1          608     Spain  Female   41       1   83807.86              1   
2          502    France  Female   42       8  159660.80              3   
3          699    France  Female   39       1       0.00              2   
4          850     Spain  Female   43       2  125510.82              1   

   HasCrCard  IsActiveMember  EstimatedSalary  
0          1               1        101348.88  
1          0               1        112542.58  
2          1               0        113931.57  
3          0               0         93826.63  
4          1               1         79084.10  

0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64


In [60]:
X_train, X_test, y_train, y_test = train_test_split(X_independent, y_dependent, test_size=0.3, random_state=101)
print("X_train shape: ", X_train.shape)
print("X_train: \n", X_train)
print("")
print("y_test shape: ", y_test.shape)
print("y_test: \n", y_test)

X_train shape:  (7000, 10)
X_train: 
       CreditScore Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
803           511     Spain  Female   29       9       0.00              2   
1387          725    France    Male   66       4   86459.80              1   
921           609     Spain    Male   61       1       0.00              1   
5917          674    France  Female   31       1       0.00              1   
9610          455    France    Male   40       1       0.00              3   
...           ...       ...     ...  ...     ...        ...            ...   
599           484   Germany  Female   34       4  148249.54              1   
5695          787    France    Male   46       7  117685.31              2   
8006          716   Germany    Male   41       8  126145.54              2   
1361          578    France    Male   32       4       0.00              2   
1547          653     Spain  Female   30       2   88243.29              2   

      HasCrCard  IsActive

In [61]:
y_train = np.array(y_train).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

# TEST: we create 2 pipelines for different scalers. test which is better.
# we do not scale cathegorical data, choose only the real numerical.
ct_standard_scaled = ColumnTransformer(transformers=[("encoder", OneHotEncoder(), ["Geography", "Gender"]),
                                    ("scaler", StandardScaler(), ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary"])],
                       remainder="passthrough")

ct_minmax_scaled = ColumnTransformer(transformers=[("encoder", OneHotEncoder(), ["Geography", "Gender"]),
                                    ("scaler", MinMaxScaler(), ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary"])],
                       remainder="passthrough")

X_train_std_scaled = ct_standard_scaled.fit_transform(X_train)
X_test_std_scaled = ct_standard_scaled.transform(X_test)
X_train_minmax_scaled = ct_minmax_scaled.fit_transform(X_train)
X_test_minmax_scaled = ct_minmax_scaled.transform(X_test)


# this can be used to see, the transformed dataset as a dataframe with the names.
# why? -> transforming more columns -> chaotic to see what is what.
#X_new_df = pd.DataFrame(X_independent, columns=ct_geography.get_feature_names_out())
#print("new_df with labels: ", X_new_df)

print("first line: ", X_train_std_scaled[0])
print("X_train_std_scaled: ", X_train_std_scaled)
print("")
print("X_test_std_scaled: ", X_test_std_scaled)
print("type of x_train: ", type(X_train_std_scaled))
print("type of y_train: ", type(y_train))





first line:  [ 0.          0.          1.          1.          0.         -1.45057405
 -0.93156572  1.37687702 -1.23805017  0.80996205  0.70308176  0.
  1.        ]
X_train_std_scaled:  [[ 0.          0.          1.         ...  0.70308176  0.
   1.        ]
 [ 1.          0.          0.         ...  0.71692473  1.
   1.        ]
 [ 0.          0.          1.         ... -1.34379489  1.
   0.        ]
 ...
 [ 0.          1.          0.         ...  0.657622    1.
   1.        ]
 [ 1.          0.          0.         ...  0.72291911  1.
   1.        ]
 [ 0.          0.          1.         ... -0.0590053   1.
   1.        ]]

X_test_std_scaled:  [[ 0.          0.          1.         ...  0.40123544  1.
   1.        ]
 [ 1.          0.          0.         ... -0.64634549  1.
   1.        ]
 [ 1.          0.          0.         ...  1.23986288  0.
   1.        ]
 ...
 [ 1.          0.          0.         ... -0.00647153  1.
   1.        ]
 [ 0.          0.          1.         ... -0.8843058

# 2. building the network

In [62]:
X_train_std_scaled = torch.from_numpy(X_train_std_scaled).float()
X_test_std_scaled = torch.from_numpy(X_test_std_scaled).float()
y_train = torch.from_numpy(y_train).float()
y_test = torch.from_numpy(y_test)

X_train_minmax_scaled = torch.from_numpy(X_train_minmax_scaled).float()
X_test_minmax_scaled = torch.from_numpy(X_test_minmax_scaled).float()

In [63]:
models = []

In [64]:
# hyperparameters

scaler = "std"
if scaler == "std":
    scaled_set = X_train_std_scaled
elif scaler == "minmax":
    scaled_set == X_train_minmax_scaled

number_of_neurons = 8
learning_batch_size = 32
n_epochs = 200
alpha = 0.001


In [65]:
class Network(nn.Module):


    def __init__(self, input_size, seed=101):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fcl1 = nn.Linear(input_size, number_of_neurons)
        self.fcl2 = nn.Linear(number_of_neurons, number_of_neurons)
        self.fcl3 = nn.Linear(number_of_neurons, 1)


    def forward(self, data):
        signal = self.fcl1(data)
        signal = F.relu(signal)
        signal = self.fcl2(signal)
        signal = F.relu(signal)
        signal = self.fcl3(signal)
        return F.sigmoid(signal)


In [66]:
# create a torch Dataloader -> A DataLoader takes a dataset and provides an iterator over batches of data.
# It handles batching, shuffling, and other data-loading functionalities, making it easier to work with large datasets.
# the primary input dataset of the Dataloader is torch.utils.data.TensorDataset, BUT
# any iterable can be used (e.g: torch.tensor, tuple), but X and y (independent and dependent variables) must be separeted
# e.g: Dataloader(list(zip(X_train, y_train)))

from torch.utils.data import DataLoader


dataloader = DataLoader(list(zip(scaled_set, y_train)), batch_size=learning_batch_size, shuffle=False)

In [67]:
# rather "bridge" than "brain"
brain = Network(scaled_set.shape[1])
optimizer = Adam(brain.parameters(), lr=alpha)

# 3. train the model

In [68]:
for epoch in range(1, n_epochs + 1):
    for features, labels in dataloader:
        prediction = brain(features)
        #print("feature: ", features)
        #print("label: ", labels)
        #print("prediction: ", prediction)
        loss = F.binary_cross_entropy(prediction, labels)

        #backpropagate
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # visualization:
    # keep every X episode on the screen.
    if epoch % 10 == 0: # if the remnant is 0, that means we are in a X step.
        print("\r Epoch: {} \t Loss: {:.4f}".format(epoch, loss.data))


"""
2 way to save:
    1. save with state_dict() -> only the weights and biases of the layers. -> smaller, flexible (build different architecture on it) -> common way.
    2. save the entire model -> complete, all the aspects incl. architechture -> 
"""
save_name = "bank_business_checkpoint.pth"
torch.save(brain.state_dict(), save_name)
print("saved as: ", save_name)

 Epoch: 10 	 Loss: 0.1361
 Epoch: 20 	 Loss: 0.1253
 Epoch: 30 	 Loss: 0.1272
 Epoch: 40 	 Loss: 0.1274
 Epoch: 50 	 Loss: 0.1315
 Epoch: 60 	 Loss: 0.1358
 Epoch: 70 	 Loss: 0.1369
 Epoch: 80 	 Loss: 0.1352
 Epoch: 90 	 Loss: 0.1337
 Epoch: 100 	 Loss: 0.1313
 Epoch: 110 	 Loss: 0.1333
 Epoch: 120 	 Loss: 0.1336
 Epoch: 130 	 Loss: 0.1346
 Epoch: 140 	 Loss: 0.1377
 Epoch: 150 	 Loss: 0.1404
 Epoch: 160 	 Loss: 0.1451
 Epoch: 170 	 Loss: 0.1473
 Epoch: 180 	 Loss: 0.1466
 Epoch: 190 	 Loss: 0.1466
 Epoch: 200 	 Loss: 0.1466
saved as:  bank_business_checkpoint.pth


# 4. check model accuracy

In [69]:
if scaler == "std":
    test_X = X_test_std_scaled
elif scaler == "minmax":
    test_X = X_test_minmax_scaled

brain.eval()
with torch.no_grad():
    y_pred = brain(test_X)

# convert probabilities to 0 or 1
threshold = 0.5
y_pred_int = (y_pred >= threshold)

# boolean masking
compared = (y_pred_int == y_test)
print("compared shape: ", compared.shape, "\n")
print("compared: ", compared, "\n\n")

# accuracy
accuracy = torch.sum(compared) / len(y_test)
print(f"accuracy: {accuracy :.4f}")

# confusion matrix
matrix_acc = confusion_matrix(y_test, y_pred_int)
print("confusin matrix: \n", matrix_acc)

# save model test infos
models.append({
    "last_loss" : loss.data,
    "accuracy" : accuracy,
    "scaler" : scaler,
    "number_of_neurons" : number_of_neurons,
    "learning_batch_size" : learning_batch_size,
    "n_epochs" : n_epochs,
    "alpha" : alpha
})

compared shape:  torch.Size([3000, 1]) 

compared:  tensor([[ True],
        [ True],
        [False],
        ...,
        [ True],
        [False],
        [ True]]) 


accuracy: 0.8600
confusin matrix: 
 [[2258  120]
 [ 300  322]]


In [70]:
for elem in models:
    print(elem)

{'last_loss': tensor(0.1466), 'accuracy': tensor(0.8600), 'scaler': 'std', 'number_of_neurons': 8, 'learning_batch_size': 32, 'n_epochs': 200, 'alpha': 0.001}


In [88]:
#### predict from random data
# random_data = [620, "Spain", "Female", 21, 6, 94034, 5, 1, 1, 50000]
df_random = X_independent.drop(X_independent.index)

df_random.loc[len(df_random)] = [620, "Spain", "Female", 21, 6, 94034, 5, 1, 1, 50000]
print(df_random)

brain.eval()
with torch.no_grad():
    prediction = brain(torch.tensor(ct_standard_scaled.transform(df_random)).float())
print("prediction: ", str(prediction.data.numpy() * 100) + "%")

   CreditScore Geography  Gender  Age  Tenure  Balance  NumOfProducts  \
0          620     Spain  Female   21       6    94034              5   

   HasCrCard  IsActiveMember  EstimatedSalary  
0          1               1            50000  
prediction:  [[99.992455]]%
