## **Imports and Pre-reqs**

In [1]:
from google.colab import drive
drive.mount("/gdrive")
%cd /gdrive/My Drive/CIS_508/Colab Notebooks/Projects/3.Customer Churn

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/My Drive/CIS_508/Colab Notebooks/Projects/3.Customer Churn


In [2]:
! pwd

/gdrive/My Drive/CIS_508/Colab Notebooks/Projects/3.Customer Churn


In [3]:
import pandas as pd
import numpy as np
from scipy import stats

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import SCORERS, confusion_matrix, roc_auc_score

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')



In [4]:
sorted(SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'v_measure_score']

## **EDA and Data pre-processing Training Data**

In [5]:
data = pd.read_csv("TelcoCustomerChurn.csv")
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
X = data.drop(columns = ['Churn', 'customerID'])
y = data['Churn']
X.shape, y.shape

((7043, 19), (7043,))

In [7]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.2, random_state = 6)
Xtrain.shape, ytrain.shape, Xtest.shape, ytest.shape

((5634, 19), (5634,), (1409, 19), (1409,))

In [8]:
Xtrain.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'],
      dtype='object')

In [9]:
Xtrain.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
dtype: object

In [0]:
Xtrain['TotalCharges'] = pd.to_numeric(Xtrain['TotalCharges'], errors = 'coerce')

In [11]:
Xtrain.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,5634.0,5634.0,5634.0,5625.0
mean,0.161519,32.07579,64.671166,2258.736169
std,0.368042,24.492631,30.177218,2260.108079
min,0.0,0.0,18.25,18.8
25%,0.0,9.0,35.275,398.55
50%,0.0,28.0,70.35,1372.45
75%,0.0,55.0,90.05,3734.25
max,1.0,72.0,118.75,8684.8


In [12]:
Xtrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5634 entries, 1370 to 2761
Data columns (total 19 columns):
gender              5634 non-null object
SeniorCitizen       5634 non-null int64
Partner             5634 non-null object
Dependents          5634 non-null object
tenure              5634 non-null int64
PhoneService        5634 non-null object
MultipleLines       5634 non-null object
InternetService     5634 non-null object
OnlineSecurity      5634 non-null object
OnlineBackup        5634 non-null object
DeviceProtection    5634 non-null object
TechSupport         5634 non-null object
StreamingTV         5634 non-null object
StreamingMovies     5634 non-null object
Contract            5634 non-null object
PaperlessBilling    5634 non-null object
PaymentMethod       5634 non-null object
MonthlyCharges      5634 non-null float64
TotalCharges        5625 non-null float64
dtypes: float64(2), int64(2), object(15)
memory usage: 880.3+ KB


In [13]:
Xtrain.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        9
dtype: int64

In [0]:
Xtrain = Xtrain.fillna(value = 0)

In [15]:
Xtrain.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
dtype: int64

In [16]:
ytrain.head()

1370     No
5676     No
5800     No
1645    Yes
366      No
Name: Churn, dtype: object

## **SMOTE for imbalanced class problem**

In [17]:
ytrain.value_counts()

No     4120
Yes    1514
Name: Churn, dtype: int64

In [18]:
le = LabelEncoder()
temp = ytrain.copy()
ytrain = pd.DataFrame(le.fit_transform(temp), columns = ['Churn'], index = temp.index)
ytrain.head()

Unnamed: 0,Churn
1370,0
5676,0
5800,0
1645,1
366,0


In [19]:
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_cols = [x for x in Xtrain.columns if x not in numeric_cols]
numeric_cols, categorical_cols

(['tenure', 'MonthlyCharges', 'TotalCharges'],
 ['gender',
  'SeniorCitizen',
  'Partner',
  'Dependents',
  'PhoneService',
  'MultipleLines',
  'InternetService',
  'OnlineSecurity',
  'OnlineBackup',
  'DeviceProtection',
  'TechSupport',
  'StreamingTV',
  'StreamingMovies',
  'Contract',
  'PaperlessBilling',
  'PaymentMethod'])

In [0]:
cat_transformer = Pipeline(steps = [('ohe', OneHotEncoder(sparse = False, handle_unknown='ignore'))] )

pp = ColumnTransformer(
    [('categorical', cat_transformer, categorical_cols)], 
    remainder = 'passthrough' 
)

sm = SMOTE(ratio = 0.9, random_state = 6)

## **Model 1: Random Forest Classifier**

In [21]:
model1 = Pipeline(steps = 
                 [('pp', pp), 
                  ('sm', sm),
                  ('clf', RandomForestClassifier())]
)
model1

Pipeline(memory=None,
         steps=[('pp',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('categorical',
                                                  Pipeline(memory=None,
                                                           steps=[('ohe',
                                                                   OneHotEncoder(categories='auto',
                                                                                 drop=None,
                                                                                 dtype=<class 'numpy.float64'>,
                                                                                 handle_unknown='ignore',
                                                                                 sparse=False))],
                                             

In [22]:
model1.fit(Xtrain, ytrain)

Pipeline(memory=None,
         steps=[('pp',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('categorical',
                                                  Pipeline(memory=None,
                                                           steps=[('ohe',
                                                                   OneHotEncoder(categories='auto',
                                                                                 drop=None,
                                                                                 dtype=<class 'numpy.float64'>,
                                                                                 handle_unknown='ignore',
                                                                                 sparse=False))],
                                             

In [0]:
params1 = {
          'clf__n_estimators':[200],
          'clf__max_depth':[10, 20],
          'clf__min_samples_split':[10, 30],
          'clf__max_features':[25, 45],
          'clf__warm_start':[True]
}

In [24]:
clf1 = GridSearchCV(model1, param_grid = params1, cv = 5, scoring = 'roc_auc')
clf1.fit(Xtrain, ytrain)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pp',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('categorical',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('ohe',
                                                                                          OneHotEncoder(categories='auto',
                                                                                                        drop=None,
                                     

In [25]:
clf1.best_params_

{'clf__max_depth': 10,
 'clf__max_features': 25,
 'clf__min_samples_split': 30,
 'clf__n_estimators': 200,
 'clf__warm_start': True}

In [26]:
clf1.best_score_

0.8423248096204207

## **Model 2: XGB Classifier**

In [27]:
model2 = Pipeline(steps = 
                  [('pp', pp),
                   ('sm', sm),
                   ('estimator', XGBClassifier())
                   ])
model2

Pipeline(memory=None,
         steps=[('pp',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('categorical',
                                                  Pipeline(memory=None,
                                                           steps=[('ohe',
                                                                   OneHotEncoder(categories='auto',
                                                                                 drop=None,
                                                                                 dtype=<class 'numpy.float64'>,
                                                                                 handle_unknown='ignore',
                                                                                 sparse=False))],
                                             

In [28]:
model2.fit(Xtrain, ytrain)

Pipeline(memory=None,
         steps=[('pp',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('categorical',
                                                  Pipeline(memory=None,
                                                           steps=[('ohe',
                                                                   OneHotEncoder(categories='auto',
                                                                                 drop=None,
                                                                                 dtype=<class 'numpy.float64'>,
                                                                                 handle_unknown='ignore',
                                                                                 sparse=False))],
                                             

In [29]:
XGBClassifier().get_params().keys()

dict_keys(['base_score', 'booster', 'colsample_bylevel', 'colsample_bynode', 'colsample_bytree', 'gamma', 'learning_rate', 'max_delta_step', 'max_depth', 'min_child_weight', 'missing', 'n_estimators', 'n_jobs', 'nthread', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'scale_pos_weight', 'seed', 'silent', 'subsample', 'verbosity'])

In [0]:
params2 = {
    'sm__ratio':[0.5, 0.9],
    'estimator__n_estimators': stats.randint(150, 1000),
    'estimator__learning_rate': stats.uniform(0.01, 0.6),
    'estimator__subsample': stats.uniform(0.3, 0.9),
    'estimator__max_depth': [3, 4, 5, 6, 7, 8, 9],
    'estimator__colsample_bytree': stats.uniform(0.5, 0.9),
    'estimator__min_child_weight': [1, 2, 3, 4]
    }

In [31]:
clf2 = RandomizedSearchCV(model2, params2, n_iter = 60, scoring = 'roc_auc', n_jobs = -1, random_state = 6, verbose = True)
clf2.fit(Xtrain, ytrain)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   37.4s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 11.7min finished


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('pp',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='passthrough',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('categorical',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('ohe',
                                                                                                OneHotEncoder(categories='auto',
                                                                                   

In [32]:
clf2.best_score_

0.8476740423833421

In [33]:
clf2.best_params_

{'estimator__colsample_bytree': 0.8938686061461818,
 'estimator__learning_rate': 0.03749149235388897,
 'estimator__max_depth': 3,
 'estimator__min_child_weight': 1,
 'estimator__n_estimators': 236,
 'estimator__subsample': 0.9557401377053238,
 'sm__ratio': 0.9}

# **Model Selection**

## Test Data preprocessing

In [34]:
Xtest.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
dtype: int64

In [35]:
Xtest.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,1409.0,1409.0,1409.0
mean,0.164656,33.552165,65.123669
std,0.371001,24.798598,29.7468
min,0.0,0.0,18.4
25%,0.0,9.0,38.55
50%,0.0,31.0,70.3
75%,0.0,58.0,89.25
max,1.0,72.0,118.65


In [36]:
Xtest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1409 entries, 4539 to 58
Data columns (total 19 columns):
gender              1409 non-null object
SeniorCitizen       1409 non-null int64
Partner             1409 non-null object
Dependents          1409 non-null object
tenure              1409 non-null int64
PhoneService        1409 non-null object
MultipleLines       1409 non-null object
InternetService     1409 non-null object
OnlineSecurity      1409 non-null object
OnlineBackup        1409 non-null object
DeviceProtection    1409 non-null object
TechSupport         1409 non-null object
StreamingTV         1409 non-null object
StreamingMovies     1409 non-null object
Contract            1409 non-null object
PaperlessBilling    1409 non-null object
PaymentMethod       1409 non-null object
MonthlyCharges      1409 non-null float64
TotalCharges        1409 non-null object
dtypes: float64(1), int64(2), object(16)
memory usage: 220.2+ KB


In [0]:
Xtest['TotalCharges'] = pd.to_numeric(Xtest['TotalCharges'], errors = 'coerce')

In [38]:
Xtest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1409 entries, 4539 to 58
Data columns (total 19 columns):
gender              1409 non-null object
SeniorCitizen       1409 non-null int64
Partner             1409 non-null object
Dependents          1409 non-null object
tenure              1409 non-null int64
PhoneService        1409 non-null object
MultipleLines       1409 non-null object
InternetService     1409 non-null object
OnlineSecurity      1409 non-null object
OnlineBackup        1409 non-null object
DeviceProtection    1409 non-null object
TechSupport         1409 non-null object
StreamingTV         1409 non-null object
StreamingMovies     1409 non-null object
Contract            1409 non-null object
PaperlessBilling    1409 non-null object
PaymentMethod       1409 non-null object
MonthlyCharges      1409 non-null float64
TotalCharges        1407 non-null float64
dtypes: float64(2), int64(2), object(15)
memory usage: 220.2+ KB


In [39]:
Xtest.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        2
dtype: int64

In [0]:
Xtest = Xtest.fillna(value = 0)

In [41]:
Xtest.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
dtype: int64

In [42]:
temp = ytest.copy()
ytest = pd.DataFrame(le.transform(temp), columns = ['Target'], index = temp.index).copy()
ytest.shape

(1409, 1)

## Model Evaluation

In [43]:
clf1_y_pred = model1.predict(Xtest)
print("--------------Model 1: Test Metrics----------------")
print("Confusion matrix:\n", confusion_matrix(ytest, clf1_y_pred))
print("AUC:", roc_auc_score(ytest, clf1_y_pred))

--------------Model 1: Test Metrics----------------
Confusion matrix:
 [[948 106]
 [179 176]]
AUC: 0.6976026939626373


In [44]:
clf2_y_pred = clf2.predict(Xtest)
print("--------------Model 2: Test Metrics----------------")
print("Confusion matrix:\n", confusion_matrix(ytest, clf2_y_pred))
print("AUC:", roc_auc_score(ytest, clf2_y_pred))

--------------Model 2: Test Metrics----------------
Confusion matrix:
 [[931 123]
 [155 200]]
AUC: 0.7233409947350136
