### Data Preprocessing

In [15]:
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('data/Telco-Customer-Churn.csv')
df['TotalCharges'] = df['TotalCharges'].replace(" ", 0).astype('float32')

In [3]:
cat_features = df.drop(['customerID','TotalCharges', 'MonthlyCharges', 'SeniorCitizen', 'tenure', 'Churn'],axis=1).columns
cat_features

Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod'],
      dtype='object')

In [4]:
df[cat_features].head()

Unnamed: 0,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod
0,Female,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check
1,Male,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check
2,Male,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check
3,Male,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic)
4,Female,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check


In [5]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False)
ohe.fit(df[cat_features])

OneHotEncoder(sparse=False)

In [6]:
dff = ohe.transform(df[cat_features])
dff = pd.DataFrame(dff, columns=ohe.get_feature_names())
dff = pd.concat([dff, df[['SeniorCitizen', 'MonthlyCharges', 'TotalCharges', 'tenure']]], axis=1)

In [7]:
dff.head()

Unnamed: 0,x0_Female,x0_Male,x1_No,x1_Yes,x2_No,x2_Yes,x3_No,x3_Yes,x4_No,x4_No phone service,...,x11_Yes,x12_Month-to-month,x12_One year,x12_Two year,x13_No,x13_Yes,x14_Bank transfer (automatic),x14_Credit card (automatic),x14_Electronic check,x14_Mailed check
0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [9]:
bin_dict = {'No':0, 'Yes':1}
df.Churn = df.Churn.map(bin_dict)

### Modeling

In [10]:
from sklearn.model_selection import train_test_split

X = dff.values
y = df.Churn

X_train, X_test, y_train, y_test = train_test_split(X,y)

In [11]:
print(df.shape)
print("\n")
print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
#print("\n")
print("X_test: ", X_test.shape)
print("y_test: ", y_test.shape)

(7043, 21)


X_train:  (5282, 45)
y_train:  (5282,)
X_test:  (1761, 45)
y_test:  (1761,)


### SVM

In [12]:
# Fitting classifier to the Training set
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

classifier_svm_kernel = SVC()
steps = [
    ('scalar', StandardScaler()),
    ('model', SVC())
]
svm_kernel_pipe = Pipeline(steps)

In [13]:
%%time
parameters = { 'model__kernel' : ['poly'],
               'model__C' : [10],
               'model__gamma' : ['scale'],
               'model__random_state' : [42],
               'model__degree' : [1]
}
classifier_svm_kernel = GridSearchCV(svm_kernel_pipe, parameters, scoring='accuracy', verbose=10, cv=6, n_jobs=-1)
classifier_svm_kernel = classifier_svm_kernel.fit(X_train, y_train.ravel())

Fitting 6 folds for each of 1 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   6 | elapsed:    3.4s remaining:    6.9s
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:    3.4s remaining:    3.4s
[Parallel(n_jobs=-1)]: Done   4 out of   6 | elapsed:    3.4s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    4.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    4.6s finished


Wall time: 5.94 s


In [16]:
y_pred_svm_kernel_train = classifier_svm_kernel.predict(X_train)
accuracy_svm_kernel_train = accuracy_score(y_train, y_pred_svm_kernel_train)
print("Training set: ", accuracy_svm_kernel_train)

y_pred_svm_kernel_test = classifier_svm_kernel.predict(X_test)
accuracy_svm_kernel_test = accuracy_score(y_test, y_pred_svm_kernel_test)
print("Test set: ", accuracy_svm_kernel_test)

Training set:  0.8012116622491481
Test set:  0.7893242475865985


In [108]:
filename = 'data/svm_model.sav'
pickle.dump(classifier_svm_kernel, open(filename, 'wb'))

### LightGBM

In [32]:
import xgboost as xgb

steps = [
    ('scalar', StandardScaler()),
    ('model', xgb.XGBClassifier())
]
xgb_pipe = Pipeline(steps)

In [35]:
%%time
parameters = { 'model__min_child_weight': [10],
        'model__gamma': [5],
        'model__subsample': [0.6],
        'model__colsample_bytree': [0.6],
        'model__max_depth': [3]
}
classifier_xgb_kernel = GridSearchCV(xgb_pipe, parameters, scoring='accuracy', verbose=10, cv=6, n_jobs=-1)
classifier_xgb_kernel = classifier_xgb_kernel.fit(X_train, y_train.ravel())

Fitting 6 folds for each of 1 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   6 | elapsed:    0.6s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:    0.6s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done   4 out of   6 | elapsed:    0.7s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    1.2s finished


Wall time: 1.81 s


In [36]:
y_pred_xgb_train = classifier_xgb_kernel.predict(X_train)
accuracy_xgb_kernel_train = accuracy_score(y_train, y_pred_xgb_train)
print("Training set: ", accuracy_xgb_kernel_train)

y_pred_xgb_test = classifier_xgb_kernel.predict(X_test)
accuracy_xgb_test = accuracy_score(y_test, y_pred_xgb_test)
print("Test set: ", accuracy_xgb_test)

Training set:  0.8241196516471033
Test set:  0.8001135718341851


In [37]:
filename = 'data/xgb_model.sav'
pickle.dump(classifier_xgb_kernel, open(filename, 'wb'))

In [123]:
df[df['Churn'] == 1].iloc[0]

customerID              3668-QPYBK
gender                        Male
SeniorCitizen                    0
Partner                         No
Dependents                      No
tenure                           2
PhoneService                   Yes
MultipleLines                   No
InternetService                DSL
OnlineSecurity                 Yes
OnlineBackup                   Yes
DeviceProtection                No
TechSupport                     No
StreamingTV                     No
StreamingMovies                 No
Contract            Month-to-month
PaperlessBilling               Yes
PaymentMethod         Mailed check
MonthlyCharges               53.85
TotalCharges                108.15
Churn                            1
Name: 2, dtype: object

In [133]:
df.tail(2).iloc[0]

customerID              8361-LTMKD
gender                        Male
SeniorCitizen                    1
Partner                        Yes
Dependents                      No
tenure                           4
PhoneService                   Yes
MultipleLines                  Yes
InternetService        Fiber optic
OnlineSecurity                  No
OnlineBackup                    No
DeviceProtection                No
TechSupport                     No
StreamingTV                     No
StreamingMovies                 No
Contract            Month-to-month
PaperlessBilling               Yes
PaymentMethod         Mailed check
MonthlyCharges                74.4
TotalCharges                 306.6
Churn                            1
Name: 7041, dtype: object

In [145]:
import scipy
scipy.__version__

'1.5.0'

In [38]:
xgb.__version__

'1.2.1'