# Preprocessing the data
1.  loading the data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
db = pd.read_csv("/kaggle/input/custchurn/customer_churn_large_dataset.csv")
db.head()

Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,Male,Los Angeles,17,73.36,236,0
1,2,Customer_2,62,Female,New York,1,48.76,172,0
2,3,Customer_3,24,Female,Los Angeles,5,85.47,460,0
3,4,Customer_4,36,Female,Miami,3,97.94,297,1
4,5,Customer_5,46,Female,Miami,19,58.14,266,0


Hot encoding the data is important.

We can represent male and female as 1, 0. we will use one hot encoding for location.

In [3]:
encoded = pd.get_dummies(db, columns = ["Gender"])

In [4]:
encoded.head()

Unnamed: 0,CustomerID,Name,Age,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn,Gender_Female,Gender_Male
0,1,Customer_1,63,Los Angeles,17,73.36,236,0,0,1
1,2,Customer_2,62,New York,1,48.76,172,0,1,0
2,3,Customer_3,24,Los Angeles,5,85.47,460,0,1,0
3,4,Customer_4,36,Miami,3,97.94,297,1,1,0
4,5,Customer_5,46,Miami,19,58.14,266,0,1,0


In [5]:
encoded["Y"] = encoded["Churn"]
encoded = encoded.drop(["Churn", "Name", "CustomerID", 'Gender_Female', "Location"], axis = 1)

In [6]:
encoded.describe()

Unnamed: 0,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Gender_Male,Y
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,44.02702,12.4901,65.053197,274.39365,0.49784,0.49779
std,15.280283,6.926461,20.230696,130.463063,0.499998,0.499998
min,18.0,1.0,30.0,50.0,0.0,0.0
25%,31.0,6.0,47.54,161.0,0.0,0.0
50%,44.0,12.0,65.01,274.0,0.0,0.0
75%,57.0,19.0,82.64,387.0,1.0,1.0
max,70.0,24.0,100.0,500.0,1.0,1.0


In [7]:
encoded["Y"].value_counts()

0    50221
1    49779
Name: Y, dtype: int64

Since the data set is evenly distributed, we do not need to do deal with over-sampling and under-sampling

Training a base-line model before feature engineering
using
* logistic regression
* Random Forests
* Neural Network - ANN

# Logistic Reg - Baseline

In [8]:
#spliting dataset into features and target
x = encoded.iloc[:, 0:-1]
y = encoded["Y"]

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler



In [10]:
scaler = StandardScaler()
xToBeScaled = x.iloc[:, 0:4]
x_scaled_t = scaler.fit_transform(xToBeScaled)
x_scaled = pd.concat((pd.DataFrame(x_scaled_t, columns = ["Age", "Subscription_Length_Months", "Monthly_Bill", "Total_Usage_GB"]), x.iloc[:, 4:]), 1)
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, train_size = 0.8)

  x_scaled = pd.concat((pd.DataFrame(x_scaled_t, columns = ["Age", "Subscription_Length_Months", "Monthly_Bill", "Total_Usage_GB"]), x.iloc[:, 4:]), 1)


In [11]:
logreg = LogisticRegression(class_weight = "balanced", solver = "newton-cholesky")
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)
print(pd.Series(y_pred).value_counts())

0    10058
1     9942
dtype: int64


In [12]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.50      0.50     10029
           1       0.50      0.50      0.50      9971

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.50      0.50      0.50     20000



In [13]:
logreg.score(x_test, y_test)

0.49815

In [14]:
x_scaled.head()

Unnamed: 0,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Gender_Male
0,1.24167,0.651115,0.410606,-0.294289,1
1,1.176226,-1.658879,-0.805374,-0.784852,0
2,-1.310651,-1.08138,1.009204,1.422681,0
3,-0.525321,-1.370129,1.625597,0.173279,0
4,0.12912,0.939864,-0.34172,-0.064338,0


# Random Forests

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
rfc1 = RandomForestClassifier(n_estimators = 200,criterion='log_loss', max_depth = 15)
rfc1.fit(x_train, y_train)
y_pred_rfc = rfc1.predict(x_test)
print(classification_report(y_test, y_pred_rfc))
rfc1.score(x_test, y_test)

In [17]:
rfc1.fit(x_train, y_train)

In [18]:
y_pred_rfc = rfc1.predict(x_test)

In [19]:
print(classification_report(y_test, y_pred_rfc))

              precision    recall  f1-score   support

           0       0.51      0.56      0.53     10029
           1       0.50      0.44      0.47      9971

    accuracy                           0.51     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.50      0.51      0.50     20000



In [20]:
rfc1.score(x_test, y_test)

0.50505

# ANN

In [21]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

> Creating a dataset class

In [22]:
class dataset(Dataset):
    def __init__(self,x,y):
        self.x = torch.tensor(x.values,dtype=torch.float32).cuda()
        self.y = torch.tensor(y,dtype=torch.float32).cuda()
        self.length = self.x.shape[0] 
    def __getitem__(self,idx):
        return self.x[idx],self.y[idx]
    def __len__(self):
        return self.length

In [23]:
trainset = dataset(x, y)
trainloader = DataLoader(trainset,batch_size=256,shuffle=False)

In [24]:
class Net(nn.Module):
    def __init__(self,input_shape):
        super(Net,self).__init__()
        self.fc1 = nn.Linear(input_shape,32)
        self.fc2 = nn.Linear(32,64)
        self.fc3 = nn.Linear(64,1)
    def forward(self,x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x

In [25]:
learning_rate = 0.05
epochs = 50

model = Net(input_shape=x.shape[1])
optimizer = torch.optim.SGD(model.parameters(),lr=learning_rate)
loss_fn = nn.BCEWithLogitsLoss()
model.cuda()

Net(
  (fc1): Linear(in_features=5, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
)

In [26]:
losses = []
accur = []
torch.device("cuda")
for i in range(epochs):
    for j,(x_t,y_t) in enumerate(trainloader):
    
    #calculate output
        output = model(x_t)

        #calculate loss
        loss = loss_fn(output,y_t.reshape(-1,1))
#         print(type(x_t))
        #accuracy
        predicted = model(torch.tensor(x_scaled.values,dtype=torch.float32).cuda()).cpu()
        
        acc = (predicted.reshape(-1).detach().numpy().round() == y).mean()
        #backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
        
    if i%5 == 0:
        losses.append(loss)
        accur.append(acc)
        print("epoch {}\tloss : {}\t accuracy : {}".format(i,loss,acc))


epoch 0	loss : 0.6933134198188782	 accuracy : 0.50112
epoch 5	loss : 0.6932387948036194	 accuracy : 0.50111
epoch 10	loss : 0.6932271122932434	 accuracy : 0.50114
epoch 15	loss : 0.6932238936424255	 accuracy : 0.50138
epoch 20	loss : 0.6932241320610046	 accuracy : 0.50129
epoch 25	loss : 0.6932260394096375	 accuracy : 0.50135
epoch 30	loss : 0.6932287812232971	 accuracy : 0.50127
epoch 35	loss : 0.6932317018508911	 accuracy : 0.50143
epoch 40	loss : 0.6932352185249329	 accuracy : 0.50182
epoch 45	loss : 0.693239152431488	 accuracy : 0.50169


# Boosting

In [27]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [28]:
model = XGBClassifier()
model.fit(x_train, y_train)
print(model)
y_pred_x = model.predict(x_test)
predictions = [round(value) for value in y_pred_x]

print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
0.502
              precision    recall  f1-score   support

           0       0.50      0.52      0.51     10029
           1       0.50      0.48      0.49      9971

    accuracy                           0.50     20000
   macro avg       0.50 

In [29]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [30]:
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
kfold = model_selection.KFold(n_splits=10)

In [31]:
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    kfold = model_selection.KFold(n_splits=10)
    cv_results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.501175 (0.004774)
LDA: 0.501150 (0.004828)
KNN: 0.503362 (0.004389)
CART: 0.498975 (0.003383)
NB: 0.501000 (0.004989)


In [32]:
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 30)
pd.set_option('display.width', 30)

display(encoded[encoded["Y"]==1].head(n=100))

Unnamed: 0,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Gender_Male,Y
3,36,3,97.94,297,0,1
5,67,15,82.65,456,1,1
7,67,1,97.70,396,0,1
8,20,10,42.45,150,0,1
9,53,12,64.49,383,0,1
...,...,...,...,...,...,...
182,26,2,92.47,397,1,1
183,70,6,42.26,100,0,1
184,54,3,56.44,463,0,1
185,52,17,75.12,318,0,1


In [33]:
display(encoded[encoded["Y"]==0].head(n=100))

Unnamed: 0,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Gender_Male,Y
0,63,17,73.36,236,1,0
1,62,1,48.76,172,0,0
2,24,5,85.47,460,0,0
4,46,19,58.14,266,0,0
6,30,3,73.79,269,0,0
...,...,...,...,...,...,...
206,29,2,89.42,236,0,0
207,50,19,56.30,114,1,0
209,21,2,66.90,209,0,0
210,65,19,93.59,311,1,0


In [34]:
encoded["cost"] = encoded["Total_Usage_GB"]/(encoded["Subscription_Length_Months"]*encoded["Monthly_Bill"])
encoded["cost_1"] = encoded["Total_Usage_GB"]/(encoded["Monthly_Bill"])
encoded["cost_2"] = encoded["Total_Usage_GB"]/(encoded["Subscription_Length_Months"])
encoded["money_spent"] = encoded["Subscription_Length_Months"]*encoded["Monthly_Bill"]
encoded["inverse"] = encoded["cost_1"] + encoded["cost_2"]

In [39]:
encoded.cov()

Unnamed: 0,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Gender_Male,Y,cost,cost_1,cost_2,money_spent,inverse
Age,233.487045,0.357911,0.343111,3.840942,0.006358,0.01191,-0.017072,0.080272,-1.426894,35.692691,-1.346622
Subscription_Length_Months,0.357911,47.975862,-0.741784,-1.990928,0.001109,0.008063,-4.570274,-0.006388,-267.050882,3113.105321,-267.05727
Monthly_Bill,0.343111,-0.741784,409.281055,8.410608,0.022648,-0.002137,-5.005717,-32.261882,7.058745,5043.091188,-25.203137
Total_Usage_GB,3.840942,-1.990928,8.410608,17020.610716,-0.090376,-0.185357,45.761908,292.090367,2670.08459,13.842849,2962.174957
Gender_Male,0.006358,0.001109,0.022648,-0.090376,0.249998,0.00053,0.001147,-0.00433,0.044297,0.361141,0.039968
Y,0.01191,0.008063,-0.002137,-0.185357,0.00053,0.249998,0.001147,-0.001046,0.014573,0.353721,0.013527
cost,-0.017072,-4.570274,-5.005717,45.761908,0.001147,0.001147,1.492807,1.324543,73.870001,-329.021404,75.194544
cost_1,0.080272,-0.006388,-32.261882,292.090367,-0.00433,-0.001046,1.324543,8.476533,45.377524,-401.448354,53.854057
cost_2,-1.426894,-267.050882,7.058745,2670.08459,0.044297,0.014573,73.870001,45.377524,4327.861403,-17331.946902,4373.238927
money_spent,35.692691,3113.105321,5043.091188,13.842849,0.361141,0.353721,-329.021404,-401.448354,-17331.946902,285005.22759,-17733.395256


In [40]:
from sklearn.feature_selection import *

In [45]:
f = f_regression(trains, encoded["Y"])
f1 = f_classif(trains, encoded["Y"])
mi = mutual_info_classif(trains,encoded["Y"])
chi = chi2(trains,encoded["Y"])

cols = [x for x in trains]

vals = pd.DataFrame({'cols':cols, 'f_score' : f[0], 'p_value': f[1], 'f_classif': f1[0], 'p_classif':f1[1], 'mi':mi,'chi':chi[0],'p_chi':chi[1]})

vals.head(n=10)

Unnamed: 0,cols,f_score,p_value,f_classif,p_classif,mi,chi,p_chi
0,Age,0.242999,0.622049,0.242999,0.622049,0.001076,1.2887,0.2562876
1,Subscription_Length_Months,0.542063,0.461581,0.542063,0.461581,0.000996,2.082135,0.1490316
2,Monthly_Bill,0.004465,0.946727,0.004465,0.946727,0.001956,0.02809,0.8668984
3,Total_Usage_GB,0.807423,0.368885,0.807423,0.368885,0.000509,50.084482,1.47267e-12
4,Gender_Male,0.449834,0.502415,0.449834,0.502415,0.007041,0.225892,0.6345865
5,cost,0.352377,0.552772,0.352377,0.552772,0.0,0.707682,0.4002145
6,cost_1,0.051598,0.820307,0.051598,0.820307,0.0,0.092783,0.760668
7,cost_2,0.019628,0.88858,0.019628,0.88858,0.000867,1.959647,0.1615511
8,money_spent,0.1756,0.675183,0.1756,0.675183,0.0,61.651569,4.099404e-15
9,inverse,0.016534,0.897688,0.016534,0.897688,0.000332,1.522909,0.2171798


In [44]:
trains

Unnamed: 0,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Gender_Male,cost,cost_1,cost_2,money_spent,inverse
0,63,17,73.36,236,1,0.189236,3.217012,13.882353,1247.12,17.099365
1,62,1,48.76,172,0,3.527482,3.527482,172.000000,48.76,175.527482
2,24,5,85.47,460,0,1.076401,5.382005,92.000000,427.35,97.382005
3,36,3,97.94,297,0,1.010823,3.032469,99.000000,293.82,102.032469
4,46,19,58.14,266,0,0.240798,4.575163,14.000000,1104.66,18.575163
...,...,...,...,...,...,...,...,...,...,...
99995,33,23,55.13,226,1,0.178235,4.099401,9.826087,1267.99,13.925488
99996,62,19,61.65,351,0,0.299654,5.693431,18.473684,1171.35,24.167115
99997,64,17,96.11,251,1,0.153623,2.611591,14.764706,1633.87,17.376297
99998,51,20,49.25,434,0,0.440609,8.812183,21.700000,985.00,30.512183


In [43]:
required_set = ["Age", "Gender_Male","Total_Usage_GB", "money_spent"]
trains = encoded.drop("Y",axis=1)
trains.head()

Unnamed: 0,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Gender_Male,cost,cost_1,cost_2,money_spent,inverse
0,63,17,73.36,236,1,0.189236,3.217012,13.882353,1247.12,17.099365
1,62,1,48.76,172,0,3.527482,3.527482,172.0,48.76,175.527482
2,24,5,85.47,460,0,1.076401,5.382005,92.0,427.35,97.382005
3,36,3,97.94,297,0,1.010823,3.032469,99.0,293.82,102.032469
4,46,19,58.14,266,0,0.240798,4.575163,14.0,1104.66,18.575163


In [46]:
x_train1,x_test1,y_train1,y_test1 = train_test_split(trains, y, train_size = 0.8)

In [47]:
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    kfold = model_selection.KFold(n_splits=10)
    cv_results = model_selection.cross_val_score(model, x_train1, y_train1, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.502487 (0.005187)
LDA: 0.499613 (0.005179)
KNN: 0.500050 (0.004784)
CART: 0.498675 (0.007483)
NB: 0.499688 (0.005932)


In [46]:
v1 = vals.drop(["f_classif", "p_classif"], axis=1)

In [47]:
v1

Unnamed: 0,cols,f_score,p_value,mi,chi,p_chi
0,Age,0.242999,0.622049,0.001076,1.2887,0.2562876
1,Subscription_Length_Months,0.542063,0.461581,0.000996,2.082135,0.1490316
2,Monthly_Bill,0.004465,0.946727,0.001956,0.02809,0.8668984
3,Total_Usage_GB,0.807423,0.368885,0.000509,50.084482,1.47267e-12
4,Gender_Male,0.449834,0.502415,0.007041,0.225892,0.6345865
5,cost,0.352377,0.552772,0.0,0.707682,0.4002145
6,cost_1,0.051598,0.820307,0.0,0.092783,0.760668
7,cost_2,0.019628,0.88858,0.000867,1.959647,0.1615511
8,money_spent,0.1756,0.675183,0.0,61.651569,4.099404e-15
9,inverse,0.016534,0.897688,0.000332,1.522909,0.2171798


In [51]:
v1.sort_values(by="chi")

Unnamed: 0,cols,f_score,p_value,mi,chi,p_chi
2,Monthly_Bill,0.004465,0.946727,0.001956,0.02809,0.8668984
6,cost_1,0.051598,0.820307,0.0,0.092783,0.760668
4,Gender_Male,0.449834,0.502415,0.007041,0.225892,0.6345865
5,cost,0.352377,0.552772,0.0,0.707682,0.4002145
0,Age,0.242999,0.622049,0.001076,1.2887,0.2562876
9,inverse,0.016534,0.897688,0.000332,1.522909,0.2171798
7,cost_2,0.019628,0.88858,0.000867,1.959647,0.1615511
1,Subscription_Length_Months,0.542063,0.461581,0.000996,2.082135,0.1490316
3,Total_Usage_GB,0.807423,0.368885,0.000509,50.084482,1.47267e-12
8,money_spent,0.1756,0.675183,0.0,61.651569,4.099404e-15


In [123]:
required = ["money_spent", "Total_Usage_GB", "cost_2", "Age", "Gender_Male"]
xt = trains[required]

In [124]:
xt

Unnamed: 0,money_spent,Total_Usage_GB,cost_2,Age,Gender_Male
0,1247.12,236,13.882353,63,1
1,48.76,172,172.000000,62,0
2,427.35,460,92.000000,24,0
3,293.82,297,99.000000,36,0
4,1104.66,266,14.000000,46,0
...,...,...,...,...,...
99995,1267.99,226,9.826087,33,1
99996,1171.35,351,18.473684,62,0
99997,1633.87,251,14.764706,64,1
99998,985.00,434,21.700000,51,0


In [127]:
x_train, x_test, y_train, y_test = train_test_split(xt, y, train_size = 0.8)
logreg = LogisticRegression(class_weight = "balanced")
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)
print(pd.Series(y_pred).value_counts())

1    12054
0     7946
dtype: int64


In [128]:
print(classification_report(y_test, y_pred))
logreg.score(x_test, y_test)

              precision    recall  f1-score   support

           0       0.51      0.40      0.45     10075
           1       0.50      0.60      0.55      9925

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.50      0.50      0.50     20000



0.50065

In [129]:
y_pred2 = [1 for x in range(len(y_test))]
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.40      0.45     10075
           1       0.50      0.60      0.55      9925

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.50      0.50      0.50     20000



In [130]:
pd.Series(y_pred).value_counts()

1    12054
0     7946
dtype: int64

# ANN improv

In [131]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [175]:
EPOCHS = 50
BATCH_SIZE = 64
LEARNING_RATE = 0.001

In [176]:
class TrainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)


## test data    
class TestData(Dataset):
    
    def __init__(self, X_data):
        self.X_data = X_data
        
    def __getitem__(self, index):
        return self.X_data[index]
        
    def __len__ (self):
        return len(self.X_data)
    

train_data = TrainData(torch.FloatTensor(x_train.values), torch.FloatTensor(y_train))
test_data = TestData(torch.FloatTensor(x_test.values))

In [177]:
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [189]:
class BinaryClassification(nn.Module):
    def __init__(self):
        super(BinaryClassification, self).__init__()
        # Number of input features is 12.
        self.layer_1 = nn.Linear(5, 64) 
        self.layer_2 = nn.Linear(64, 128)
        self.layer_3 = nn.Linear(128, 256)
        self.layer_4 = nn.Linear(256, 32)
        self.layer_out = nn.Linear(32,1)
         
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(32)
        self.batchnorm3 = nn.BatchNorm1d(128)
        self.batchnorm4 = nn.BatchNorm1d(256)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
#         x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm3(x)
        x = self.dropout(x)
        x = self.relu(self.layer_3(x))
        x = self.batchnorm4(x)
        x = self.relu(self.layer_4(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.relu(self.layer_out(x))
        
        return x
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [190]:
model = BinaryClassification()
model.to(device)
print(model)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

BinaryClassification(
  (layer_1): Linear(in_features=5, out_features=64, bias=True)
  (layer_2): Linear(in_features=64, out_features=128, bias=True)
  (layer_3): Linear(in_features=128, out_features=256, bias=True)
  (layer_4): Linear(in_features=256, out_features=32, bias=True)
  (layer_out): Linear(in_features=32, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
  (batchnorm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm4): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [191]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [197]:
model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

Epoch 001: | Loss: 0.69315 | Acc: 50.158


KeyboardInterrupt: 

In [165]:
!pip install skorch
from skorch import NeuralNetClassifier

Collecting skorch
  Downloading skorch-0.14.0-py3-none-any.whl (221 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m221.3/221.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: skorch
Successfully installed skorch-0.14.0


In [201]:
model2 = NeuralNetClassifier(
    module=BinaryClassification,
    max_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    criterion = nn.BCEWithLogitsLoss,
    optimizer = optim.SGD,
)

In [202]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [203]:
score = make_scorer(binary_acc, greater_is_better=True)

param_grid = {
    'batch_size': [10, 20, 40, 60, 80, 100],
    'max_epochs': [10, 50, 100]
}

grid = GridSearchCV(estimator=model2, param_grid=param_grid, n_jobs=-1, cv=3, scoring = score)

grid_result = grid.fit(torch.FloatTensor(x_train.values), torch.FloatTensor(y_train.values).unsqueeze(1))





print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


  epoch    train_loss    valid_acc    valid_loss      dur
-------  ------------  -----------  ------------  -------
      1        [36m0.6954[0m       [32m0.5018[0m        [35m0.6931[0m  14.7718
  epoch    train_loss    valid_acc    valid_loss      dur
-------  ------------  -----------  ------------  -------
      1        [36m0.6953[0m       [32m0.5018[0m        [35m0.6931[0m  14.7623
      2        [36m0.6932[0m       0.5018        0.6931  15.5235
      2        [36m0.6932[0m       0.5018        0.6931  15.5986
      3        [36m0.6932[0m       0.5018        0.6931  14.5241
      3        [36m0.6932[0m       [32m0.5026[0m        [35m0.6931[0m  14.5692
      4        [36m0.6931[0m       0.5018        0.6931  15.5213
      4        0.6932       0.5018        0.6931  15.6449
      5        [36m0.6931[0m       0.5018        0.6931  14.6645
      5        [36m0.6932[0m       0.5018        0.6931  14.6858
      6        0.6932       0.5018        0.6931  15.

Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/tmp/ipykernel_28/364576122.py", line 4, in binary_acc
AttributeError: 'bool' object has no attribute 'sum'

Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282

  epoch    train_loss    valid_acc    valid_loss      dur
-------  ------------  -----------  ------------  -------
      1        [36m0.6950[0m       [32m0.5029[0m        [35m0.6931[0m  14.4867
  epoch    train_loss    valid_acc    valid_loss      dur
-------  ------------  -----------  ------------  -------
      1        [36m0.6950[0m       [32m0.5018[0m        [35m0.6931[0m  14.5043
      2        [36m0.6932[0m       0.5018        0.6931  15.2868
      2        [36m0.6932[0m       0.5018        0.6931  15.2576
      3        [36m0.6931[0m       0.5018        0.6931  14.5062
      3        [36m0.6932[0m       0.5018        0.6931  14.4573
      4        [36m0.6931[0m       0.5018        0.6931  15.4266
      4        [36m0.6931[0m       0.5018        0.6931  15.3348
      5        0.6931       0.5018        0.6931  14.5632
      5        0.6931       0.5018        0.6931  14.6750
      6        0.6931       0.5018        0.6931  14.7182
      6        [36m0.

Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/tmp/ipykernel_28/364576122.py", line 4, in binary_acc
AttributeError: 'bool' object has no attribute 'sum'



     11        0.6931       0.5018        0.6931  15.2439
  epoch    train_loss    valid_acc    valid_loss      dur
-------  ------------  -----------  ------------  -------
      1        [36m0.6949[0m       [32m0.5018[0m        [35m0.6931[0m  14.4596
     12        0.6931       0.5018        0.6931  14.5619
      2        [36m0.6932[0m       0.5018        0.6931  14.4610
     13        0.6931       0.5018        0.6931  15.3756
      3        [36m0.6932[0m       0.5018        0.6931  15.4596
     14        0.6931       0.5018        0.6931  14.5640
      4        0.6932       0.5018        0.6931  14.5778
     15        0.6931       0.5018        0.6931  15.3907
      5        [36m0.6931[0m       0.5018        0.6931  15.3126
     16        0.6931       0.5018        0.6931  14.2693
      6        0.6931       0.5018        0.6931  14.3390
     17        0.6931       0.5018        0.6931  15.3023
      7        [36m0.6931[0m       0.5018        0.6931  15.5444
     18  

Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/tmp/ipykernel_28/364576122.py", line 4, in binary_acc
AttributeError: 'bool' object has no attribute 'sum'



     41        0.6931       0.5018        0.6931  15.4805
  epoch    train_loss    valid_acc    valid_loss      dur
-------  ------------  -----------  ------------  -------
      1        [36m0.6948[0m       [32m0.5018[0m        [35m0.6931[0m  15.7603
     42        0.6931       0.5018        0.6931  14.6328
      2        [36m0.6932[0m       0.5018        0.6931  14.5881
     43        0.6931       0.5018        0.6931  15.2939
      3        0.6932       0.5018        0.6931  15.7847
     44        0.6931       0.5018        0.6931  14.6065
      4        [36m0.6931[0m       0.5018        0.6931  14.5837
     45        0.6931       0.5018        0.6931  15.4227
      5        [36m0.6931[0m       0.5018        0.6931  15.6298
     46        0.6931       0.5018        0.6931  14.5953
      6        0.6931       0.5018        0.6931  14.4668
     47        0.6931       0.5018        0.6931  15.3771
      7        0.6931       0.5018        0.6931  15.6567
     48        0.6

Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/tmp/ipykernel_28/364576122.py", line 4, in binary_acc
AttributeError: 'bool' object has no attribute 'sum'



     11        0.6932       0.5018        0.6931  15.9436
  epoch    train_loss    valid_acc    valid_loss      dur
-------  ------------  -----------  ------------  -------
      1        [36m0.6951[0m       [32m0.5018[0m        [35m0.6931[0m  15.7926
     12        0.6932       0.5018        0.6931  14.4698
      2        [36m0.6932[0m       0.5018        0.6931  14.5040
     13        0.6932       0.5018        0.6931  16.1276
      3        0.6932       0.5018        0.6931  16.2369
     14        0.6932       0.5018        0.6931  15.0277
      4        0.6932       0.5018        0.6931  14.6949
     15        0.6932       0.5018        0.6931  15.7752
      5        [36m0.6932[0m       0.5018        0.6931  15.7188
     16        0.6931       0.5018        0.6931  14.7898
      6        0.6932       0.5018        0.6931  14.8312
     17        0.6931       0.5018        0.6931  15.8156
      7        [36m0.6931[0m       0.5018        0.6931  15.5608
     18        0.6

Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/tmp/ipykernel_28/364576122.py", line 4, in binary_acc
AttributeError: 'bool' object has no attribute 'sum'



     41        0.6931       0.5018        0.6931  14.5268
  epoch    train_loss    valid_acc    valid_loss      dur
-------  ------------  -----------  ------------  -------
      1        [36m0.6962[0m       [32m0.5018[0m        [35m0.6931[0m  15.5529
     42        0.6931       0.5018        0.6931  15.4423
      2        [36m0.6932[0m       0.5018        0.6931  14.6981
     43        0.6931       0.5018        0.6931  14.7159
      3        [36m0.6931[0m       0.5018        0.6931  14.8362
     44        0.6931       0.5018        0.6931  15.3487
      4        0.6932       0.5018        0.6931  15.0691
     45        0.6931       0.5018        0.6931  14.3319
      5        0.6932       0.5018        0.6931  14.4126
     46        0.6931       0.5018        0.6931  15.2138
      6        0.6932       0.5018        0.6931  15.0855
     47        0.6931       0.5018        0.6931  14.3637
      7        0.6932       0.5018        0.6931  14.4620
     48        0.6931      

Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/tmp/ipykernel_28/364576122.py", line 4, in binary_acc
AttributeError: 'bool' object has no attribute 'sum'



     60        0.6931       0.5018        0.6931  15.4046
  epoch    train_loss    valid_acc    valid_loss      dur
-------  ------------  -----------  ------------  -------
      1        [36m0.6951[0m       [32m0.5018[0m        [35m0.6931[0m  14.2513
     61        0.6931       0.5018        0.6931  14.3126
      2        [36m0.6932[0m       0.5018        0.6931  15.3335
     62        0.6931       0.5018        0.6931  15.3818
      3        [36m0.6931[0m       0.5008        0.6932  14.3175
     63        0.6931       0.5018        0.6931  14.3965
      4        0.6933       0.5018        0.6931  15.2754
     64        0.6931       0.5018        0.6931  15.4512
      5        0.6932       0.5018        0.6931  14.2886
     65        0.6931       0.5018        0.6931  14.3769
      6        0.6932       0.5018        0.6931  15.3229
     66        0.6931       0.5018        0.6931  15.3860
      7        0.6931       0.5018        0.6931  14.1683
     67        0.6931      

KeyboardInterrupt: 

In [205]:
db.head()

Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,Male,Los Angeles,17,73.36,236,0
1,2,Customer_2,62,Female,New York,1,48.76,172,0
2,3,Customer_3,24,Female,Los Angeles,5,85.47,460,0
3,4,Customer_4,36,Female,Miami,3,97.94,297,1
4,5,Customer_5,46,Female,Miami,19,58.14,266,0
