### Importing the necessary libraries

In [3]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

### Loading the processed dataset

In [4]:
df=pd.read_csv("tel_churn.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42.3,1840.75,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70.7,151.65,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [5]:
df=df.drop('Unnamed: 0',axis=1)

In [6]:
# Dividing the dataset into independent and dependent features

X=df.drop('Churn',axis=1)
X

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.50,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.30,1840.75,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70.70,151.65,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,0,1,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7028,0,103.20,7362.90,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7029,0,29.60,346.45,1,0,0,1,0,1,1,...,0,0,1,0,1,0,0,0,0,0
7030,1,74.40,306.60,0,1,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [7]:
y=df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

In [9]:
# Train test split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

### Decision Tree Classifier

In [11]:
model_dt =  DecisionTreeClassifier(criterion='gini', random_state = 100, max_depth = 6, min_samples_leaf = 8)

In [12]:
model_dt.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [13]:
y_pred = model_dt.predict(X_test)
y_pred

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [15]:
model_dt.score(X_test,y_test)

0.7846481876332623

In [16]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86      1039
           1       0.60      0.51      0.56       368

    accuracy                           0.78      1407
   macro avg       0.72      0.70      0.71      1407
weighted avg       0.78      0.78      0.78      1407



As we can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.

Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.

Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

In [17]:
sm = SMOTEENN()

In [21]:
# Fitting the resampled dataset
X_resampled, y_resampled = sm.fit_resample(X,y)

In [22]:
# Train test split
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [23]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [24]:
# Fitting the Decision Tree Classifier model and prediction

model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.936080740117746
              precision    recall  f1-score   support

           0       0.97      0.89      0.93       555
           1       0.91      0.97      0.94       634

    accuracy                           0.94      1189
   macro avg       0.94      0.93      0.94      1189
weighted avg       0.94      0.94      0.94      1189



In [25]:
print(metrics.confusion_matrix(yr_test, yr_predict))

[[495  60]
 [ 16 618]]


Now we can see quite better results, i.e. Accuracy: 93 %, and a very good recall, precision & f1 score for minority class.

Let's try with some other classifier.

### Random Forest Classifier

In [26]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [29]:
# Fitting the Random Forest Classifier Model

model_rf.fit(X_train,y_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [31]:
y_pred=model_rf.predict(X_test)

In [33]:
model_rf.score(X_test,y_test)

0.8024164889836531

In [34]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1039
           1       0.68      0.46      0.55       368

    accuracy                           0.80      1407
   macro avg       0.75      0.69      0.71      1407
weighted avg       0.79      0.80      0.79      1407



Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

In [37]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(X,y)

In [38]:
# Train test split
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)

In [39]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [40]:
model_rf_smote.fit(xr_train1,yr_train1)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [41]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [42]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

In [43]:
print(model_score_r1)
print(metrics.classification_report(yr_test1, yr_predict1))

0.9204737732656514
              precision    recall  f1-score   support

           0       0.94      0.87      0.91       521
           1       0.91      0.96      0.93       661

    accuracy                           0.92      1182
   macro avg       0.92      0.92      0.92      1182
weighted avg       0.92      0.92      0.92      1182



In [44]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))

[[455  66]
 [ 28 633]]


### Hyperparameter Optimization for Random Forest Classifier

In [45]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators=[int(x) for x in np.linspace(start=200,stop=2000,num=10)]

# Number of features to consider at every split
max_features=['auto','sqrt', 'log2']

# Maximum number of levels in tree
max_depth=[int(x) for x in np.linspace(10,1000,10)]

# Minimum number of samples required to split a node
min_samples_split=[2,5,10,14]

# Minimum number of samples required at each leaf node
min_samples_leaf=[1,2,4,6,8]

# Create the random grid
random_grid={'n_estimators':n_estimators,
            'max_features':max_features,
             'max_depth':max_depth,
             'min_samples_split':min_samples_split,
             'min_samples_leaf':min_samples_leaf,
             'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [47]:
rf=RandomForestClassifier()
rf_randomCV=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=25,cv=3,verbose=2,random_state=100,n_jobs=-1)

### fit the randomized model
rf_randomCV.fit(xr_train1,yr_train1)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=25,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [2, 5, 10, 14],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=100, verbose=2)

In [48]:
rf_randomCV.best_params_

{'n_estimators': 600,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 120,
 'criterion': 'entropy'}

In [49]:
best_random_grid=rf_randomCV.best_estimator_

In [51]:
from sklearn.metrics import accuracy_score
yr_predict1=best_random_grid.predict(xr_test1)
print(confusion_matrix(yr_test1,yr_predict1))
print('Accuracy score {}'.format(accuracy_score(yr_test1,yr_predict1)))
print('Classification report {}'.format(classification_report(yr_test1,yr_predict1)))

[[488  33]
 [ 29 632]]
Accuracy score 0.9475465313028765
Classification report               precision    recall  f1-score   support

           0       0.94      0.94      0.94       521
           1       0.95      0.96      0.95       661

    accuracy                           0.95      1182
   macro avg       0.95      0.95      0.95      1182
weighted avg       0.95      0.95      0.95      1182



### Hyperparameter Optimization for Decision Tree

In [58]:
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

# Setup the parameters and distributions to sample from: param_dist
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, 9),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

# Instantiate a Decision Tree classifier: tree
tree = DecisionTreeClassifier()

# Instantiate the RandomizedSearchCV object: tree_cv
tree_cv = RandomizedSearchCV(tree, param_dist, cv=5)

# Fit it to the data
tree_cv.fit(xr_train1,yr_train1)

# Print the tuned parameters and score
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))

Tuned Decision Tree Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 8, 'min_samples_leaf': 7}
Best score is 0.9147089947089947


In [62]:
from sklearn.metrics import accuracy_score
yr_predict1_tree=tree_cv.predict(xr_test1)
print(confusion_matrix(yr_test1,yr_predict1_tree))
print('Accuracy score {}'.format(accuracy_score(yr_test1,yr_predict1_tree)))
print('Classification report {}'.format(classification_report(yr_test1,yr_predict1_tree)))

[[467  54]
 [ 43 618]]
Accuracy score 0.9179357021996616
Classification report               precision    recall  f1-score   support

           0       0.92      0.90      0.91       521
           1       0.92      0.93      0.93       661

    accuracy                           0.92      1182
   macro avg       0.92      0.92      0.92      1182
weighted avg       0.92      0.92      0.92      1182



### XGBoost Classifier

In [64]:
import xgboost as xgb

In [66]:
classifier= xgb.XGBClassifier()
classifier.fit(xr_train1,yr_train1)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [67]:
prediction=classifier.predict(xr_test1)

In [68]:
from sklearn.metrics import accuracy_score
print(confusion_matrix(yr_test1,prediction))
print('Accuracy score {}'.format(accuracy_score(yr_test1,prediction)))
print('Classification report {}'.format(classification_report(yr_test1,prediction)))

[[496  25]
 [ 28 633]]
Accuracy score 0.955160744500846
Classification report               precision    recall  f1-score   support

           0       0.95      0.95      0.95       521
           1       0.96      0.96      0.96       661

    accuracy                           0.96      1182
   macro avg       0.95      0.95      0.95      1182
weighted avg       0.96      0.96      0.96      1182



### Hyperparameter Optimization for XGB Classifier

In [71]:
xgb.XGBClassifier()

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)

In [72]:
from sklearn.model_selection import RandomizedSearchCV

In [69]:
#Randomized Search CV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Various learning rate parameters
learning_rate = ['0.05','0.1', '0.2','0.3','0.5','0.6']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# max_depth.append(None)
#Subssample parameter values
subsample=[0.7,0.6,0.8]
# Minimum child weight parameters
min_child_weight=[3,4,5,6,7]

In [70]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'learning_rate': learning_rate,
               'max_depth': max_depth,
               'subsample': subsample,
               'min_child_weight': min_child_weight}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'learning_rate': ['0.05', '0.1', '0.2', '0.3', '0.5', '0.6'], 'max_depth': [5, 10, 15, 20, 25, 30], 'subsample': [0.7, 0.6, 0.8], 'min_child_weight': [3, 4, 5, 6, 7]}


In [74]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
classifier=xgb.XGBClassifier()

In [75]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations
xg_random = RandomizedSearchCV(estimator = classifier, param_distributions = random_grid, n_iter = 25, cv = 3, verbose=2, random_state=100, n_jobs = -1)

In [76]:
xg_random.fit(xr_train1,yr_train1)

Fitting 3 folds for each of 25 candidates, totalling 75 fits






RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
                                           subsample=None, tree_method=None,
                                           validate_parameters=None,
                                   

In [77]:
xg_random.best_params_

{'subsample': 0.7,
 'n_estimators': 1000,
 'min_child_weight': 5,
 'max_depth': 25,
 'learning_rate': '0.1'}

In [78]:
xg_random.best_score_

0.9564021164021165

In [79]:
predictions=xg_random.predict(xr_test1)

In [80]:
from sklearn.metrics import accuracy_score
yr_predict1_xg=xg_random.predict(xr_test1)
print(confusion_matrix(yr_test1,yr_predict1_xg))
print('Accuracy score {}'.format(accuracy_score(yr_test1,yr_predict1_xg)))
print('Classification report {}'.format(classification_report(yr_test1,yr_predict1_xg)))

[[498  23]
 [ 29 632]]
Accuracy score 0.9560067681895094
Classification report               precision    recall  f1-score   support

           0       0.94      0.96      0.95       521
           1       0.96      0.96      0.96       661

    accuracy                           0.96      1182
   macro avg       0.95      0.96      0.96      1182
weighted avg       0.96      0.96      0.96      1182



#### With XGB Classifier, also we are able to get quite good results, infact better than Decision Tree and Random Forest Classifier.

### Pickling the model

In [81]:
import pickle

In [82]:
filename = 'model.sav'

In [83]:
pickle.dump(xg_random, open(filename, 'wb'))

#### Testing

In [84]:
load_model = pickle.load(open(filename, 'rb'))

In [85]:
model_score_r1 = load_model.score(xr_test1, yr_test1)

In [86]:
model_score_r1

0.9560067681895094

#### Our final model i.e. XGB Classifier with SMOTEENN, is now ready and dumped in model.sav, which we will use and prepare API's so that we can access our model from UI.