In [224]:
# Importing the Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

In [54]:
# Importing the cleaned dataset
pd.set_option('display.max_columns', None)
churn_df = pd.read_csv("../data/cleaned-data/customer-churn-cleaned.csv")
churn_df.head()

Unnamed: 0,Senior Citizen,Partner,Dependents,Tenure,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn
0,No,Yes,No,1.0,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,No,No,No,34.0,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,No,No,No,2.0,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,No,No,No,45.0,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,No,No,No,2.0,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [43]:
# Checking for missing values if any (just to be sure)
churn_df.isnull().sum()

Senior Citizen       0
Partner              0
Dependents           0
Tenure               0
Phone Service        0
Multiple Lines       0
Internet Service     0
Online Security      0
Online Backup        0
Device Protection    0
Tech Support         0
Streaming TV         0
Streaming Movies     0
Contract             0
Paperless Billing    0
Payment Method       0
Monthly Charges      0
Total Charges        0
Churn                0
dtype: int64

In [44]:
# Checking for class imbalance
churn_df.Churn.value_counts()

No     5163
Yes    1869
Name: Churn, dtype: int64

In [58]:
# Converting our target variable into 0/1
lb = LabelBinarizer()
churn_df["Churn"] = lb.fit_transform(churn_df["Churn"])
churn_df.head(5)

Unnamed: 0,Senior Citizen,Partner,Dependents,Tenure,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn
0,No,Yes,No,1.0,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,No,No,No,34.0,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,No,No,No,2.0,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,No,No,No,45.0,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,No,No,No,2.0,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [56]:
# Making a list of categorical and numerical attributes
num_attributes = ["Tenure", "Monthly Charges", "Total Charges"]
cat_attributes = [column for column in churn_df.columns[:-1] if not column in num_attributes]
print(cat_attributes)

['Senior Citizen', 'Partner', 'Dependents', 'Phone Service', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing', 'Payment Method']


In [31]:
# # Performing SMOTE and random undersampling to solve class imbalance problem
# from imblearn.pipeline import Pipeline
# sampling_pipeline = Pipeline([
#     ("over", SMOTE()),
#     ("under", RandomUnderSampler())
# ])

# X, y = sampling_pipeline.fit_resample(X, y)

In [109]:
# Splitting the dataset in train and test
train_full_set, test_set = train_test_split(churn_df, test_size=0.15, random_state=42, stratify=churn_df["Churn"])
print("Training Samples: ", train_full_set.shape[0])
print("Testing Samples: ", test_set.shape[0])

Training Samples:  5977
Testing Samples:  1055


In [110]:
# Checking for class imbalance here
train_full_set["Churn"].value_counts()

0    4388
1    1589
Name: Churn, dtype: int64

In [113]:
X_train_full = train_full_set.drop("Churn", axis=1)
y_train_full = train_full_set["Churn"]

In [114]:
# Now Splitting the full training dataset again into train and validation
train_set, val_set = train_test_split(train_full_set, test_size=0.2, random_state=42, stratify=train_full_set["Churn"])
print("Train Samples: ", train_set.shape[0])
print("Val Samples: ", val_set.shape[0])

Train Samples:  4781
Val Samples:  1196


In [115]:
# Creating X and y values for each dataset
X_train = train_set.drop("Churn", axis=1)
X_val = val_set.drop("Churn", axis=1)
X_test = test_set.drop("Churn", axis=1)
y_train = train_set["Churn"]
y_val = val_set["Churn"]
y_test = test_set["Churn"]

In [116]:
# Building a columntransformer pipeline for data preprocessing
preprocessing_pipeline = ColumnTransformer([
    ("std_scaler", StandardScaler(), num_attributes),
    ("cat_one_hot", OneHotEncoder(), cat_attributes)
])

In [135]:
# create a K_fold split
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

In [137]:
# Let's first build a Simple Logistic regression Model
lr_pipeline = Pipeline([
    ("preprocess", preprocessing_pipeline),
    ("lr", LogisticRegression(solver="sag", max_iter=1000, class_weight="balanced") )
])

scores = cross_val_score(lr_pipeline, X_train_full, y_train_full, scoring="roc_auc", cv=cv)

In [72]:
# Creating a function to display scores
def display_scores(scores):
    print("ROC-AUC Scores: ", scores)
    print("ROC-AUC Mean: ", scores.mean())
    print("ROC-AUC SD: ", scores.std())

In [138]:
# Checking out the scores for our Logistic Regression Model
display_scores(scores)

ROC-AUC Scores:  [0.83033732 0.86706136 0.84505939 0.85061997 0.84199637 0.85330618
 0.83399414 0.85208353 0.85581026 0.84034157 0.84060938 0.83349092
 0.86840072 0.86151689 0.83264129]
ROC-AUC Mean:  0.8471512867294498
ROC-AUC SD:  0.012046086611621147


In [None]:
# Fitting the pipeline on the training data
lr_pipeline.fit(X_train_full, y_train_full)

In [233]:
# a function to print all metrics of results on test set
def print_metrics(prediction):
    print("Accuracy: ", round(accuracy_score(y_test, prediction), 2))
    print("ROC-AUC score: ", round(roc_auc_score(y_test, prediction), 2))
    print("F1-score: ", round(f1_score(y_test, prediction, average="weighted"), 2))
    print("Precision: ", round(precision_score(y_test, prediction, average="weighted"), 2))
    print("Recall: ", round(recall_score(y_test, prediction, average="weighted"), 2))
    print("---------------------------")
    print(confusion_matrix(y_test, prediction))
    print("---------------------------")
    print(classification_report(y_test, prediction))

In [234]:
# Checking the model performance on test data
y_pred_lr = lr_pipeline.predict(X_test)
print_metrics(y_pred_lr)

Accuracy:  0.72
ROC-AUC score:  0.75
F1-score:  0.74
Precision:  0.8
Recall:  0.72
---------------------------
[[537 238]
 [ 55 225]]
---------------------------
              precision    recall  f1-score   support

           0       0.91      0.69      0.79       775
           1       0.49      0.80      0.61       280

    accuracy                           0.72      1055
   macro avg       0.70      0.75      0.70      1055
weighted avg       0.80      0.72      0.74      1055



In [174]:
# Let's now build a Random Forest classifier Model
rf_pipeline = Pipeline([
    ("preprocess", preprocessing_pipeline),
    ("rf", RandomForestClassifier(n_estimators=50, class_weight="balanced") )
])

scores = cross_val_score(rf_pipeline, X_train, y_train, scoring="roc_auc", cv=cv)

In [175]:
# Checking out the scores for our Random Forest Model
display_scores(scores)

ROC-AUC Scores:  [0.83076085 0.81249019 0.82260751 0.81651132 0.81020201 0.80670633
 0.81777318 0.81957904 0.83523734 0.8146662  0.82575834 0.80975896
 0.80099323 0.81583832 0.83120219]
ROC-AUC Mean:  0.8180056664820702
ROC-AUC SD:  0.009352684299090712


In [None]:
# Fitting the pipeline on the training data
rf_pipeline.fit(X_train_full, y_train_full)

In [237]:
# Checking the model performance on test data
y_pred_rf = rf_pipeline.predict(X_test)
print_metrics(y_pred_rf)

Accuracy:  0.77
ROC-AUC score:  0.68
F1-score:  0.77
Precision:  0.76
Recall:  0.77
---------------------------
[[677  98]
 [141 139]]
---------------------------
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       775
           1       0.59      0.50      0.54       280

    accuracy                           0.77      1055
   macro avg       0.71      0.68      0.69      1055
weighted avg       0.76      0.77      0.77      1055



In [178]:
# Create a param_grid with values for tuning hyperparameters
rf_param_grid = [{'rf__criterion': ["gini", "entropy"], 'rf__n_estimators': [50, 100, 150, 200], 'rf__class_weight': ['balanced'],
               'rf__max_depth': [4, 6, 8, 10], 'rf__max_features': [8, 10, 12, 14]}]

In [None]:
# Creating a gridsearchCV for randomforestModel
rf_pipeline = Pipeline([
    ("preprocess", preprocessing_pipeline),
    ("rf", RandomForestClassifier() )
])

grid_search = GridSearchCV(rf_pipeline, rf_param_grid, cv=cv, scoring="roc_auc", return_train_score=True)
grid_search.fit(X_train_full, y_train_full)

In [180]:
# checking the model's best performing hyperparameters
grid_search.best_params_

{'rf__class_weight': 'balanced',
 'rf__criterion': 'entropy',
 'rf__max_depth': 6,
 'rf__max_features': 10,
 'rf__n_estimators': 200}

In [None]:
# checking the model's best score
grid_search.best_score_

In [187]:
grid_search.best_estimator_[1]

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=6, max_features=10, n_estimators=200)

In [188]:
# Getting the feature importance of each attribute
feature_importances = grid_search.best_estimator_[1].feature_importances_
feature_importances

array([0.1175142 , 0.05816469, 0.08750159, 0.00285114, 0.00340007,
       0.00230598, 0.00272811, 0.0036999 , 0.00317441, 0.00120643,
       0.00166701, 0.00310151, 0.00139365, 0.0039362 , 0.02342251,
       0.05984402, 0.00549814, 0.07499433, 0.00667228, 0.01016267,
       0.01179595, 0.00940408, 0.0032561 , 0.00208765, 0.00782318,
       0.00124573, 0.06095825, 0.00828856, 0.00843062, 0.00296039,
       0.0042215 , 0.00543378, 0.00393782, 0.00745299, 0.00722415,
       0.20349413, 0.02581848, 0.10348922, 0.0054319 , 0.00513718,
       0.00367574, 0.00295397, 0.02966854, 0.00257125])

In [238]:
# Selecting the model with best parameters
final_rf_model = grid_search.best_estimator_

# Checking the model performance on test data
y_pred_rf = final_rf_model.predict(X_test)
print_metrics(y_pred_rf)

Accuracy:  0.73
ROC-AUC score:  0.76
F1-score:  0.75
Precision:  0.8
Recall:  0.73
---------------------------
[[549 226]
 [ 55 225]]
---------------------------
              precision    recall  f1-score   support

           0       0.91      0.71      0.80       775
           1       0.50      0.80      0.62       280

    accuracy                           0.73      1055
   macro avg       0.70      0.76      0.71      1055
weighted avg       0.80      0.73      0.75      1055



In [211]:
# Let's first build a K Nearerst Neigbors classifier Model
knn_pipeline = Pipeline([
    ("preprocess", preprocessing_pipeline),
    ("knn", KNeighborsClassifier(n_neighbors=20))
])

scores = cross_val_score(knn_pipeline, X_train, y_train, scoring="roc_auc", cv=cv)

In [212]:
# Checking out the scores for our K Nearest Neigbors Model
display_scores(scores)

ROC-AUC Scores:  [0.84057036 0.82831673 0.81779    0.83448303 0.81659544 0.80803866
 0.82064461 0.82161765 0.85639455 0.8087018  0.83621027 0.82296644
 0.80869619 0.83693945 0.8305993 ]
ROC-AUC Mean:  0.8259042992510132
ROC-AUC SD:  0.013146831280841658


In [None]:
# Fitting the pipeline on the training data
knn_pipeline.fit(X_train_full, y_train_full)

In [241]:
# Checking the model performance on test data
y_pred_knn = knn_pipeline.predict(X_test)
print_metrics(y_pred_knn)

Accuracy:  0.75
ROC-AUC score:  0.68
F1-score:  0.75
Precision:  0.75
Recall:  0.75
---------------------------
[[646 129]
 [131 149]]
---------------------------
              precision    recall  f1-score   support

           0       0.83      0.83      0.83       775
           1       0.54      0.53      0.53       280

    accuracy                           0.75      1055
   macro avg       0.68      0.68      0.68      1055
weighted avg       0.75      0.75      0.75      1055



In [217]:
# Create a param_grid with values for tuning hyperparameters
knn_param_grid = [{'knn__n_neighbors': [20, 25, 30, 35, 40],
                    'knn__weights': ['uniform', 'distance'],
                    'knn__metric': ['euclidean', 'manhattan', 'minkowski']
                   }]

In [None]:
# Creating a gridsearchCV for randomforestModel
knn_pipeline = Pipeline([
    ("preprocess", preprocessing_pipeline),
    ("knn", KNeighborsClassifier() )
])

knn_grid_search = GridSearchCV(knn_pipeline, knn_param_grid, cv=cv, scoring="roc_auc", return_train_score=True)
knn_grid_search.fit(X_train_full, y_train_full)

In [219]:
# checking the model's best performing hyperparameters
knn_grid_search.best_params_

{'knn__metric': 'euclidean', 'knn__n_neighbors': 40, 'knn__weights': 'uniform'}

In [220]:
#  checking the model's best score
knn_grid_search.best_score_

0.8350876424172345

In [222]:
knn_grid_search.best_estimator_[1]

KNeighborsClassifier(metric='euclidean', n_neighbors=40)

In [242]:
# Selecting the model with best parameters
final_knn_model = knn_grid_search.best_estimator_

# Checking the model performance on test data
y_pred_knn = final_knn_model.predict(X_test)
print_metrics(y_pred_knn)

Accuracy:  0.79
ROC-AUC score:  0.73
F1-score:  0.79
Precision:  0.79
Recall:  0.79
---------------------------
[[667 108]
 [114 166]]
---------------------------
              precision    recall  f1-score   support

           0       0.85      0.86      0.86       775
           1       0.61      0.59      0.60       280

    accuracy                           0.79      1055
   macro avg       0.73      0.73      0.73      1055
weighted avg       0.79      0.79      0.79      1055



In [None]:
# Finally, Let's try out the XGboost Classifier
xgb_pipeline = Pipeline([
    ("preprocess", preprocessing_pipeline),
    ("xgb", XGBClassifier())
])

scores = cross_val_score(xgb_pipeline, X_train, y_train, scoring="roc_auc", cv=cv)

In [247]:
display_scores(scores)

ROC-AUC Scores:  [0.84060946 0.82576777 0.81568971 0.81729928 0.82398154 0.80761689
 0.83445499 0.81600938 0.83235189 0.82778675 0.84383833 0.81247616
 0.81795545 0.83272203 0.8225991 ]
ROC-AUC Mean:  0.8247439152433748
ROC-AUC SD:  0.010169388155857002


In [None]:
# Fitting the pipeline on the training data
xgb_pipeline.fit(X_train_full, y_train_full)

In [249]:
# Checking the model performance on test data
y_pred_xgb = xgb_pipeline.predict(X_test)
print_metrics(y_pred_xgb)

Accuracy:  0.79
ROC-AUC score:  0.7
F1-score:  0.78
Precision:  0.78
Recall:  0.79
---------------------------
[[686  89]
 [137 143]]
---------------------------
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       775
           1       0.62      0.51      0.56       280

    accuracy                           0.79      1055
   macro avg       0.72      0.70      0.71      1055
weighted avg       0.78      0.79      0.78      1055



In [252]:
# Creating a parameter list for XGboost Classifier
xgb_param_grid = [{'xgb__gamma': [0, 0.5, 1, 1.5, 2, 5],
              'xgb__learning_rate': [0.001, 0.01, 0.1, 0.15, 0.2],
              'xgb__max_depth': [5, 6, 7, 8],
              'xgb__n_estimators': [50, 100, 150, 200],
              'xgb__subsample': [0.5, 0.75, 0.9]
             }]

In [253]:
# Creating a gridsearchCV for Xgboost classifier
xgb_pipeline = Pipeline([
    ("preprocess", preprocessing_pipeline),
    ("xgb", XGBClassifier() )
])

xgb_grid_search = GridSearchCV(xgb_pipeline, xgb_param_grid, cv=cv, scoring="roc_auc", return_train_score=True)
xgb_grid_search.fit(X_train_full, y_train_full)

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [None]:
# checking the model's best performing hyperparameters
knn_grid_search.best_params_

In [None]:
# checking the model's best score
knn_grid_search.best_score_

In [None]:
knn_grid_search.best_estimator_[1]

In [None]:
# Selecting the model with best parameters
final_xgb_model = xgb_grid_search.best_estimator_

# Checking the model performance on test data
y_pred_xgb = final_xgb_model.predict(X_test)
print_metrics(y_pred_xgb)

In [None]:
# Saving all the ML models
import joblib

# Logistic Regression Model
joblib.dump(lr_pipeline, "../models/logistic_regression_churn.pkl")

# Random Forest Classifier Model
joblib.dump(final_rf_model, "../models/random_forest_churn.pkl")

# K Nearest Neigbors Classifier Model
joblib.dump(final_knn_model, "../models/k_nearest_neighbors_churn.pkl")

# K Nearest Neigbors Classifier Model
joblib.dump(final_xgb_model, "../models/xtreme_boosting_churn.pkl")