In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split


In [2]:
df = pd.read_csv("cleaned_dataset.csv")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13523 entries, 0 to 13522
Columns: 509 entries, ClientID to Churned
dtypes: float64(373), int64(136)
memory usage: 52.5 MB


In [3]:
features = df.drop(columns=['Churned'])
label = df["Churned"]
X_Train , X_Test , y_train , y_test = train_test_split(features, label, test_size=0.2, random_state=42)

In [4]:
KF = KFold(n_splits=5)
def Model_Train(Model, X, Y):
    Preds = []
    for i,j in KF.split(X):
        Train_X = X.iloc[i]
        Test_X = X.iloc[j]
        Train_Y = Y.iloc[i]
        
        Model.fit(Train_X, Train_Y)
        Pred_Y = Model.predict(Test_X)
        Preds.append(Pred_Y)
    print(classification_report(Y, np.concatenate(Preds)))

## Classification:

## KNN

In [34]:
from math import sqrt
Range = list(range(1, int(sqrt(len(X_Train))), 2))
Param_grid = dict(n_neighbors=Range)    
KNN = KNeighborsClassifier()    
Grid = GridSearchCV(KNN, Param_grid, cv=5, scoring='accuracy', verbose=3)
Grid.fit(X_Train, y_train)
best_k = Grid.best_params_['n_neighbors']
best_model = Grid.best_estimator_

y_pred = best_model.predict(X_Test)
print("Metrics: \n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 52 candidates, totalling 260 fits
[CV 1/5] END .....................n_neighbors=1;, score=0.716 total time=   0.4s
[CV 2/5] END .....................n_neighbors=1;, score=0.709 total time=   0.2s
[CV 3/5] END .....................n_neighbors=1;, score=0.704 total time=   0.2s
[CV 4/5] END .....................n_neighbors=1;, score=0.706 total time=   0.2s
[CV 5/5] END .....................n_neighbors=1;, score=0.738 total time=   0.2s
[CV 1/5] END .....................n_neighbors=3;, score=0.730 total time=   0.2s
[CV 2/5] END .....................n_neighbors=3;, score=0.719 total time=   0.2s
[CV 3/5] END .....................n_neighbors=3;, score=0.720 total time=   0.2s
[CV 4/5] END .....................n_neighbors=3;, score=0.708 total time=   0.2s
[CV 5/5] END .....................n_neighbors=3;, score=0.728 total time=   0.2s
[CV 1/5] END .....................n_neighbors=5;, score=0.724 total time=   0.2s
[CV 2/5] END .....................n_neighbors=5

## Naive Bayes

In [11]:
NB = GaussianNB()
Model_Train(NB, X_Train, y_train)

y_pred = NB.predict(X_Test)
print("Metrics: \n", classification_report(y_test, y_pred))

Metrics: 
               precision    recall  f1-score   support

           0       0.94      0.34      0.50      2050
           1       0.31      0.94      0.47       655

    accuracy                           0.49      2705
   macro avg       0.63      0.64      0.48      2705
weighted avg       0.79      0.49      0.49      2705



## CART

In [12]:
Cart = DecisionTreeClassifier()
Model_Train(Cart, X_Train, y_train)

y_pred = Cart.predict(X_Test)
print("Metrics: \n", classification_report(y_test, y_pred))

Metrics: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2050
           1       1.00      0.99      0.99       655

    accuracy                           1.00      2705
   macro avg       1.00      0.99      1.00      2705
weighted avg       1.00      1.00      1.00      2705



## Random Forest

In [13]:
RF = RandomForestClassifier(n_estimators=10, random_state=42)
Model_Train(RF, X_Train, y_train)

y_pred = RF.predict(X_Test)
print("Metrics: \n", classification_report(y_test, y_pred))

Metrics: 
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      2050
           1       1.00      0.89      0.94       655

    accuracy                           0.97      2705
   macro avg       0.98      0.94      0.96      2705
weighted avg       0.97      0.97      0.97      2705



## Logistic Regression

In [17]:
LogR = LogisticRegression(solver='liblinear', max_iter=2000)
Model_Train(LogR, X_Train, y_train)

y_pred = LogR.predict(X_Test)
print("Metrics: \n", classification_report(y_test, y_pred))

Metrics: 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      2050
           1       0.99      0.97      0.98       655

    accuracy                           0.99      2705
   macro avg       0.99      0.98      0.99      2705
weighted avg       0.99      0.99      0.99      2705



## SVM

In [9]:
SVM = LinearSVC(dual=False, max_iter= 10000)
Model_Train(SVM, X_Train, y_train)

y_pred = SVM.predict(X_Test)
print("Metrics: \n", classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      8134
           1       0.99      0.98      0.98      2684

    accuracy                           0.99     10818
   macro avg       0.99      0.99      0.99     10818
weighted avg       0.99      0.99      0.99     10818

Metrics: 
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      2050
           1       1.00      0.98      0.99       655

    accuracy                           0.99      2705
   macro avg       0.99      0.99      0.99      2705
weighted avg       0.99      0.99      0.99      2705



In [37]:
print(y_pred.tolist())

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

## Neural Network

In [18]:
early_stopping = EarlyStopping(monitor='loss', patience=5)

In [21]:
nn = Sequential()
nn.add(Dense(128, activation='relu'))
nn.add(Dropout(0.2)) 
nn.add(Dense(1, activation='sigmoid'))

nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [22]:
for i,j in KF.split(X_Train):
        Train_X = X_Train.iloc[i]
        Val_X = X_Train.iloc[j]
        Train_Y = y_train.iloc[i]
        
        nn.fit(Train_X, Train_Y, epochs=200, batch_size=32, callbacks=[early_stopping])

Epoch 1/200
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6588 - loss: 69.4746
Epoch 2/200
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7044 - loss: 16.5329
Epoch 3/200
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7158 - loss: 3.3990
Epoch 4/200
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7575 - loss: 1.3627
Epoch 5/200
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7615 - loss: 0.9226
Epoch 1/200
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7619 - loss: 0.7565
Epoch 2/200
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7591 - loss: 0.5067
Epoch 3/200
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7690 - loss: 0.4477
Epoch 4/200
[1m271/271[0m [

In [32]:
y_pred = nn.predict(X_Test)

predictions = []
for pred in y_pred:
    if pred > 0.5:
        predictions.append(1)
    else:
        predictions.append(0)

print("Metrics: \n", classification_report(y_test, predictions))

[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Metrics: 
               precision    recall  f1-score   support

           0       0.98      0.96      0.97      2050
           1       0.88      0.93      0.90       655

    accuracy                           0.95      2705
   macro avg       0.93      0.94      0.94      2705
weighted avg       0.95      0.95      0.95      2705

