In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier, XGBRegressor
import seaborn as sns
sns.set_style("dark")
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x))

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer

#from warnings import filterwarnings
#filterwarnings('ignore')

data=None
def absolute_correlations(col, df=data):
    #absolute_values = np.abs(df[col])
    corrs = pd.DataFrame(df.select_dtypes(include=[np.number]).corrwith(df[col]), columns=['correlation'])
    corrs['absol'] = np.abs(corrs['correlation'])
    return corrs.sort_values('absol', ascending=False).drop('absol', axis=1).tail(len(corrs)-1)

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
data = pd.read_csv('telco_churn.csv')
del data['customerID']

data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
for i in data.columns:
    if 'Yes' in data[i].values:
        data[i] = np.where(data[i] == 'Yes', 1, 0)

data.dtypes

  from ipykernel import kernelapp as app


gender               object
SeniorCitizen         int64
Partner               int32
Dependents            int32
tenure                int64
PhoneService          int32
MultipleLines         int32
InternetService      object
OnlineSecurity        int32
OnlineBackup          int32
DeviceProtection      int32
TechSupport           int32
StreamingTV           int32
StreamingMovies       int32
Contract             object
PaperlessBilling      int32
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                 int32
dtype: object

In [4]:
data['TotalCharges'] = np.where(
    data['TotalCharges'] == ' ',
    np.nan,
    data['TotalCharges']
)

data['TotalCharges'].fillna(data['TotalCharges'].median(), inplace=True)
data['TotalCharges'] = data['TotalCharges'].astype(float)

In [5]:
internet = pd.get_dummies(data['InternetService'], prefix='internet')
del data['InternetService']
data = pd.concat([data, internet], axis=1)

contract = pd.get_dummies(data['Contract'], prefix='contract')
del data['Contract']
data = pd.concat([data, contract], axis=1)

payment = pd.get_dummies(data['PaymentMethod'], prefix='PaymentMethod')
del data['PaymentMethod']
data = pd.concat([data, payment], axis=1)

gender = pd.get_dummies(data['gender'], prefix='gender')
del data['gender']
data = pd.concat([data, gender], axis=1)

In [6]:
data.dtypes

SeniorCitizen                                int64
Partner                                      int32
Dependents                                   int32
tenure                                       int64
PhoneService                                 int32
MultipleLines                                int32
OnlineSecurity                               int32
OnlineBackup                                 int32
DeviceProtection                             int32
TechSupport                                  int32
StreamingTV                                  int32
StreamingMovies                              int32
PaperlessBilling                             int32
MonthlyCharges                             float64
TotalCharges                               float64
Churn                                        int32
internet_DSL                                 uint8
internet_Fiber optic                         uint8
internet_No                                  uint8
contract_Month-to-month        

In [7]:
x = data.drop('Churn', axis=1)
y = data['Churn']

In [8]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y,test_size=0.2)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

clf = LogisticRegression()
kf = StratifiedKFold(n_splits=4)

param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [10, 50, 100, 150], 
}

grid = GridSearchCV(clf, param_grid, cv=kf, scoring='roc_auc').fit(
    xtrain, ytrain)

clf = grid.best_estimator_
clf

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
cv = cross_val_score(clf, xtrain, ytrain, cv=kf, scoring='roc_auc')

cv.mean(), cv.std()

(0.8428408656181947, 0.005550164701376432)

In [13]:
from sklearn.neighbors import KNeighborsRegressor

clf = KNeighborsRegressor()
kf = StratifiedKFold(n_splits=4)

param_grid = {
    'n_neighbors': np.arange(9,33,2),
    'leaf_size': [10,20,30,40,50],
}

grid = GridSearchCV(clf, param_grid, cv=kf, scoring='roc_auc').fit(
    xtrain, ytrain)

clf = grid.best_estimator_
clf

KNeighborsRegressor(algorithm='auto', leaf_size=10, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=21, p=2,
          weights='uniform')

In [14]:
cv = cross_val_score(clf, xtrain, ytrain, cv=kf, scoring='roc_auc')

cv.mean(), cv.std()

(0.7845191275071202, 0.006397848577243204)

In [77]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
kf = StratifiedKFold(n_splits=4)

param_grid = {
    'max_depth': np.arange(2,13),
    'min_samples_leaf': [1,2,3,5,10],
    'min_samples_split': np.arange(8,18,2),
    'criterion': ['gini']
}

grid = GridSearchCV(clf, param_grid, cv=kf, scoring='roc_auc').fit(
    xtrain, ytrain)

clf = grid.best_estimator_
clf

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [79]:
cv = cross_val_score(clf, xtrain, ytrain, cv=kf, scoring='roc_auc')

cv.mean(), cv.std()

(0.8376654617729182, 0.015907469727101282)

In [82]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
kf = StratifiedKFold(n_splits=4)

param_grid = {
    'max_depth': np.arange(4,9),
    'min_samples_leaf': [1,2,3,5,10],
    'min_samples_split': np.arange(8,18,2),
    'criterion': ['gini'],
    'n_estimators': [10,50,100,200],
}

grid = GridSearchCV(clf, param_grid, cv=kf, scoring='roc_auc').fit(
    xtrain, ytrain)

clf = grid.best_estimator_
clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=16,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [83]:
cv = cross_val_score(clf, xtrain, ytrain, cv=kf, scoring='roc_auc')

cv.mean(), cv.std()

(0.851710236882321, 0.01464732063331421)

In [88]:
from xgboost import XGBClassifier

clf = XGBClassifier()
kf = StratifiedKFold(n_splits=4)

param_grid = {
    'max_depth': np.arange(4,9),
    'n_estimators': [10,50,100],
    'reg_lambda': [100, 200, 1000]
}

grid = GridSearchCV(clf, param_grid, cv=kf, scoring='roc_auc').fit(
    xtrain, ytrain)

clf = grid.best_estimator_
clf

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=1, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=200, scale_pos_weight=1, seed=0,
       silent=True, subsample=1)

In [89]:
cv = cross_val_score(clf, xtrain, ytrain, cv=kf, scoring='roc_auc')

cv.mean(), cv.std()

(0.8506868642021835, 0.015399334013732687)

In [90]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier()
kf = StratifiedKFold(n_splits=4)

param_grid = {
    'max_depth': np.arange(4,9),
    'min_samples_leaf': [1,2,3,5,10],
    'min_samples_split': np.arange(8,18,2),
    'criterion': ['gini'],
    'n_estimators': [10,50,100,200],
}

grid = GridSearchCV(clf, param_grid, cv=kf, scoring='roc_auc').fit(
    xtrain, ytrain)

clf = grid.best_estimator_
clf

KeyError: 'gini'