In [1]:
import pandas as pd
import numpy as np

import itertools
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt#visualization
import seaborn as sns#visualization

import io

In [2]:
telcom = pd.read_excel("Telco Churn Data.xlsx")
telcom.head()

Unnamed: 0,Account Length,VMail Message,Day Mins,Eve Mins,Night Mins,Intl Mins,CustServ Calls,Churn,Int'l Plan,Day Calls,Day Charge,Eve Calls,Eve Charge,Night Calls,Night Charge,Intl Calls,Intl Charge
0,128,25,265.1,197.4,244.7,10.0,1,0,0,110,45.07,99,16.78,91,11.01,3,2.7
1,107,26,161.6,195.5,254.4,13.7,1,0,0,123,27.47,103,16.62,103,11.45,3,3.7
2,137,0,243.4,121.2,162.6,12.2,0,0,0,114,41.38,110,10.3,104,7.32,5,3.29
3,84,0,299.4,61.9,196.9,6.6,2,0,1,71,50.9,88,5.26,89,8.86,7,1.78
4,75,0,166.7,148.3,186.9,10.1,3,0,1,113,28.34,122,12.61,121,8.41,3,2.73


In [3]:
print ("Rows     : " ,telcom.shape[0])
print ("Columns  : " ,telcom.shape[1])
print ("\nMissing values : ", telcom.isnull().sum().values.sum())
print ("\nFeatures : \n" ,telcom.columns.tolist())
print ("\nData Types :  \n", telcom.dtypes)
print ("\nUnique values :  \n", telcom.nunique())


Rows     :  3333
Columns  :  17

Missing values :  0

Features : 
 ['Account Length', 'VMail Message', 'Day Mins', 'Eve Mins', 'Night Mins', 'Intl Mins', 'CustServ Calls', 'Churn', "Int'l Plan", 'Day Calls', 'Day Charge', 'Eve Calls', 'Eve Charge', 'Night Calls', 'Night Charge', 'Intl Calls', 'Intl Charge']

Data Types :  
 Account Length      int64
VMail Message       int64
Day Mins          float64
Eve Mins          float64
Night Mins        float64
Intl Mins         float64
CustServ Calls      int64
Churn               int64
Int'l Plan          int64
Day Calls           int64
Day Charge        float64
Eve Calls           int64
Eve Charge        float64
Night Calls         int64
Night Charge      float64
Intl Calls          int64
Intl Charge       float64
dtype: object

Unique values :  
 Account Length     212
VMail Message       46
Day Mins          1667
Eve Mins          1611
Night Mins        1591
Intl Mins          162
CustServ Calls      10
Churn                2
Int'l Plan    

# Modeling Preprocessing

In [4]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [5]:
target_col = ["Churn"]
cat_cols   = telcom.nunique()[telcom.nunique() <= 5].keys().tolist()
cat_cols   = [x for x in cat_cols if x not in target_col]
num_cols   = [x for x in telcom.columns if x not in cat_cols + target_col]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(telcom[cat_cols + num_cols], telcom['Churn'], 
                                                    test_size = 0.2, stratify = telcom['Churn'], 
                                                    random_state = 123)

X_train.reset_index(drop = True, inplace = True)
X_test.reset_index(drop = True, inplace = True)
y_train.reset_index(drop = True, inplace = True)
y_test.reset_index(drop = True, inplace = True)

# Modeling Fitting & Evaluation

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier


from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score,roc_curve,scorer
from sklearn.metrics import f1_score, recall_score, precision_score

In [8]:
logit = LogisticRegression()
svc = SVC()
ada = AdaBoostClassifier()
gb = GradientBoostingClassifier(n_estimators= 500)
gb_bg = GradientBoostingClassifier(n_estimators= 1000)
rf = RandomForestClassifier(n_estimators= 500)
nn_sm = MLPClassifier(hidden_layer_sizes=(50, 100, 80, 30))
nn_bg = MLPClassifier(hidden_layer_sizes=(50, 200, 100, 80, 50, 30, 15))  
voting = VotingClassifier(estimators = [('gb', gb), ('rf', rf)], voting = 'soft')

In [9]:
dict_model = {
    'logit' : logit, 
    'svc' : svc, 
    'ada' : ada,
    'gb' : gb, 
    'gb_bg' : gb_bg,
    'rf' : rf,
    'voting' : voting,
    'nn_sm' : nn_sm, 
    'nn_bg' : nn_bg,
}

In [10]:
#Fitting
for name, model in dict_model.items():
    print(name)
    model.fit(X_train, y_train)

logit
svc
ada
gb
gb_bg
rf
voting
nn_sm
nn_bg


In [11]:
dict_eval = {} 
dict_eval['model_name'] = []
dict_eval['train_acc'] = []
dict_eval['test_acc'] = []
dict_eval['train_precision'] = []
dict_eval['test_precision'] = []
dict_eval['train_recall'] = []
dict_eval['test_recall'] = []
dict_eval['train_roc'] = []
dict_eval['test_roc'] = []
dict_eval['train_TP'] = []
dict_eval['test_TP'] = []
dict_eval['train_TN'] = []
dict_eval['test_TN'] = []
dict_eval['train_FP'] = []
dict_eval['test_FP'] = []
dict_eval['train_FN'] = []
dict_eval['test_FN'] = []

for name, model in dict_model.items():
    dict_eval['model_name'].append(name)
    
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    #acc
    train_acc = accuracy_score(y_train, y_pred_train)
    dict_eval['train_acc'].append(train_acc)
    test_acc = accuracy_score(y_test, y_pred_test)
    dict_eval['test_acc'].append(test_acc)
    
    #precision
    train_precision = precision_score(y_train, y_pred_train)
    dict_eval['train_precision'].append(train_precision)
    test_precision = precision_score(y_test, y_pred_test)
    dict_eval['test_precision'].append(test_precision)
    
    #recall 
    train_recall = precision_score(y_train, y_pred_train)
    dict_eval['train_recall'].append(train_recall)
    test_recall = precision_score(y_test, y_pred_test)
    dict_eval['test_recall'].append(test_recall)
    
    #recall 
    train_roc = roc_auc_score(y_train, y_pred_train)
    dict_eval['train_roc'].append(train_roc)
    test_roc = roc_auc_score(y_test, y_pred_test)
    dict_eval['test_roc'].append(test_roc)
    
    #confusion metric
    cm_train = confusion_matrix(y_train, y_pred_train)
    cm_test = confusion_matrix(y_test, y_pred_test)
    
    dict_eval['train_TP'].append(cm_train[0,0])
    dict_eval['test_TP'].append(cm_test[0,0])
    dict_eval['train_TN'].append(cm_train[1,1])
    dict_eval['test_TN'].append(cm_test[1,1])
    dict_eval['train_FP'].append(cm_train[0,1])
    dict_eval['test_FP'].append(cm_test[0,1])
    dict_eval['train_FN'].append(cm_train[1,0])
    dict_eval['test_FN'].append(cm_test[1,0])
    

In [12]:
pd.DataFrame(dict_eval)

Unnamed: 0,model_name,train_acc,test_acc,train_precision,test_precision,train_recall,test_recall,train_roc,test_roc,train_TP,test_TP,train_TN,test_TN,train_FP,test_FP,train_FN,test_FN
0,logit,0.86159,0.86057,0.576577,0.566667,0.576577,0.566667,0.572595,0.576225,2233,557,64,17,47,13,322,80
1,svc,1.0,0.854573,1.0,0.0,1.0,0.0,1.0,0.5,2280,570,386,0,0,0,0,97
2,ada,0.886347,0.88006,0.681223,0.654545,0.681223,0.654545,0.686064,0.6689,2207,551,156,36,73,19,230,61
3,gb,0.997374,0.958021,1.0,0.905882,1.0,0.905882,0.990933,0.88989,2280,562,379,77,0,8,7,20
4,gb_bg,1.0,0.955022,1.0,0.903614,1.0,0.903614,1.0,0.87958,2280,562,386,75,0,8,0,22
5,rf,1.0,0.95952,1.0,0.9375,1.0,0.9375,1.0,0.882212,2280,565,386,75,0,5,0,22
6,voting,0.999625,0.95952,1.0,0.906977,1.0,0.906977,0.998705,0.895044,2280,562,385,78,0,8,1,19
7,nn_sm,0.942986,0.863568,0.858896,0.55,0.858896,0.55,0.852607,0.646419,2234,543,280,33,46,27,106,64
8,nn_bg,0.981995,0.856072,0.920398,0.506173,0.920398,0.506173,0.972257,0.676252,2248,530,370,41,32,40,16,56


In [13]:
for name, model in dict_model.items():
    print(name)
    y_pred_test = model.predict(X_test)
    print(confusion_matrix(y_test, y_pred_test))
    

logit
[[557  13]
 [ 80  17]]
svc
[[570   0]
 [ 97   0]]
ada
[[551  19]
 [ 61  36]]
gb
[[562   8]
 [ 20  77]]
gb_bg
[[562   8]
 [ 22  75]]
rf
[[565   5]
 [ 22  75]]
voting
[[562   8]
 [ 19  78]]
nn_sm
[[543  27]
 [ 64  33]]
nn_bg
[[530  40]
 [ 56  41]]
