In [1]:

# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)



# Where to save the figures
PROJECT_ROOT_DIR = "."
images = "images"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, images, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

# Data Exploration

In [2]:
import pandas as pd

DATA_PATH = os.path.join("datasets", "churn")
#fn that reads csv from the defined location, so if csv updates it can take care of it
def load_data(data_path=DATA_PATH):
    csv_path = os.path.join(data_path, "customer_churn.csv")
    return pd.read_csv(csv_path,encoding="UTF-8")

In [3]:
churn_data=load_data()

In [4]:
churn_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
#7043 users
churn_data.shape

(7043, 21)

In [6]:
churn_data['Churn'].value_counts()
#imbalanced data

No     5174
Yes    1869
Name: Churn, dtype: int64

In [7]:
#if null is present
churn_data.isnull().sum().values.sum()


0

In [8]:
#customer_id should not affect the prediction
churn_data.drop(['customerID'], axis=1, inplace=True)

In [9]:
churn_data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [10]:
churn_data.info()
#TotalCharges column should be numbertype.. this raises little concern

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
gender              7043 non-null object
SeniorCitizen       7043 non-null int64
Partner             7043 non-null object
Dependents          7043 non-null object
tenure              7043 non-null int64
PhoneService        7043 non-null object
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null object
OnlineBackup        7043 non-null object
DeviceProtection    7043 non-null object
TechSupport         7043 non-null object
StreamingTV         7043 non-null object
StreamingMovies     7043 non-null object
Contract            7043 non-null object
PaperlessBilling    7043 non-null object
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null float64
TotalCharges        7043 non-null object
Churn               7043 non-null object
dtypes: float64(1), int64(2), object(17)
memory usage: 1.1+ MB


In [11]:
#no.of spaces in TotalCharges
len(churn_data[churn_data['TotalCharges']==' '])

11

# Feature Engineering


In [12]:
#Dropping the whitespaces data
churn_data=churn_data[churn_data['TotalCharges']!=' ']

In [13]:
churn_data['TotalCharges']=pd.to_numeric(churn_data['TotalCharges'])

In [14]:
#seperating numerical and object dtype for proper understanding
churn_data_num=churn_data.select_dtypes(include=[np.number])
churn_data_object=churn_data.select_dtypes(include=[np.object])

In [15]:
churn_data_num.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,0,1,29.85,29.85
1,0,34,56.95,1889.5
2,0,2,53.85,108.15
3,0,45,42.3,1840.75
4,0,2,70.7,151.65


In [16]:
churn_data_object

Unnamed: 0,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
0,Female,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,No
1,Male,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,No
2,Male,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,Yes
3,Male,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),No
4,Female,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,Yes
5,Female,No,No,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,Yes
6,Male,No,Yes,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),No
7,Female,No,No,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,No,Mailed check,No
8,Female,Yes,No,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,Yes
9,Male,No,Yes,Yes,No,DSL,Yes,Yes,No,No,No,No,One year,No,Bank transfer (automatic),No


In [17]:
churn_data_object['MultipleLines'].replace('No phone service','No', inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [18]:
churn_data_object['MultipleLines'].value_counts()

No     4065
Yes    2967
Name: MultipleLines, dtype: int64

In [19]:
churn_data=pd.merge(churn_data_object, churn_data_num,left_index=True, right_index=True, how='outer')

In [20]:
churn_data.head()

Unnamed: 0,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,Female,Yes,No,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,No,0,1,29.85,29.85
1,Male,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,No,0,34,56.95,1889.5
2,Male,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,Yes,0,2,53.85,108.15
3,Male,No,No,No,No,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),No,0,45,42.3,1840.75
4,Female,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,Yes,0,2,70.7,151.65


In [21]:
#dividing among feature and label
churn_data_feat=churn_data.drop('Churn',axis=1)
churn_data_labels=churn_data['Churn']
churn_data_num= churn_data_feat.select_dtypes(include=[np.number])
churn_data_cat=churn_data_feat.select_dtypes(include=[np.object])

In [22]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import CategoricalImputer
from sklearn.preprocessing import OneHotEncoder

#numerical pipeline
num_pipeline = Pipeline([
        
        ('std_scaler', StandardScaler()),
    ])
#categorical pipeline
cat_pipeline= Pipeline([
        
        
        ('1-hot', OneHotEncoder()),
    ])

In [23]:
from sklearn.compose import ColumnTransformer
num_attribs = list(churn_data_num)
cat_attribs = list(churn_data_cat)

#Merging both pipeline
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs)
    ])

#preparing features  for training
churn_data_prepared = full_pipeline.fit_transform(churn_data_feat)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [24]:
from sklearn.model_selection import train_test_split

#Splitting training and testing features and labels(using same random_state allows split at the same point)
train_set, test_set = train_test_split(churn_data_prepared, test_size=0.2, random_state=42)
train_labels,test_labels=train_test_split(churn_data_labels, test_size=0.2, random_state=42)

In [25]:
from imblearn.over_sampling import SMOTE
# smote for imbalanced data
smote = SMOTE(ratio='minority',random_state=42,kind='svm')
X_sm, y_sm = smote.fit_sample(train_set, train_labels)

# Creating various models

In [29]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(max_iter=5, tol=-np.infty, random_state=42)
sgd_clf.fit(X_sm, y_sm)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=42, shuffle=True, tol=-inf,
       validation_fraction=0.1, verbose=0, warm_start=False)

## SGDClassifier with accuracy of around 70 is not bad

In [30]:
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_sm, y_sm, cv=3, scoring="accuracy")

array([0.71822803, 0.66230937, 0.71039244])

In [48]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators=40, n_jobs=-1, random_state=42)


In [49]:
forest_clf.fit(X_sm, y_sm)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [70]:
prediction=forest_clf.predict((test_set))
actual_labels=list((test_labels))

In [31]:
#changing categorical var to binary var
Y_train=np.where(y_sm =='Yes', 1, 0)

## Random forest of 40 decision trees has F1 score of around 77%

In [55]:
from sklearn.metrics import f1_score

f1_score(test_labels, prediction, average="micro")

0.7718550106609808

In [76]:
for i in range(len(test_set)):
    if prediction[i]=='Yes':
        prediction[i]=1
    else:
        prediction[i]=0
    if actual_labels[i]=='Yes':
        actual_labels[i]=1
    else:
        actual_labels[i]=0

In [41]:
#trying to find proper hyperparameters
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [25, 35, 42], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [35, 140], 'max_features': [2, 3, 4]},
  ]

forest_clf = RandomForestClassifier(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_clf, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_sm, Y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{u'n_estimators': [25, 35, 42], u'max_features': [2, 4, 6, 8]}, {u'n_estimators': [35, 140], u'max_features': [2, 3, 4], u'bootstrap': [False]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=u'neg_mean_squared_error', verbose=0)

In [42]:
#best parameters
grid_search.best_params_

{u'bootstrap': False, u'max_features': 2, u'n_estimators': 140}

In [43]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=140, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [44]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

0.3795626385857405 {u'max_features': 2, u'n_estimators': 25}
0.375231970222418 {u'max_features': 2, u'n_estimators': 35}
0.37587669916903366 {u'max_features': 2, u'n_estimators': 42}
0.3779644730092272 {u'max_features': 4, u'n_estimators': 25}
0.3778042846261013 {u'max_features': 4, u'n_estimators': 35}
0.37410102091531233 {u'max_features': 4, u'n_estimators': 42}
0.3860459519248695 {u'max_features': 6, u'n_estimators': 25}
0.3813128843069675 {u'max_features': 6, u'n_estimators': 35}
0.3803592032364707 {u'max_features': 6, u'n_estimators': 42}
0.3882349704133001 {u'max_features': 8, u'n_estimators': 25}
0.3817888315082132 {u'max_features': 8, u'n_estimators': 35}
0.3786045487819098 {u'max_features': 8, u'n_estimators': 42}
0.37442449786370047 {u'max_features': 2, u'n_estimators': 35, u'bootstrap': False}
0.3700337004480975 {u'max_features': 2, u'n_estimators': 140, u'bootstrap': False}
0.375231970222418 {u'max_features': 3, u'n_estimators': 35, u'bootstrap': False}
0.3705241363166782 {

In [45]:
#calculating feature importance
feature_importances = grid_search.best_estimator_.feature_importances_
sorted(zip(feature_importances, churn_data.columns), reverse=True)

[(0.1184477754204422, u'PhoneService'),
 (0.10887709510583436, u'Partner'),
 (0.10531119890061903, u'Dependents'),
 (0.0340067478543729, u'tenure'),
 (0.02381506824746138, u'TotalCharges'),
 (0.019394013232273667, u'Churn'),
 (0.017155369608848742, u'OnlineSecurity'),
 (0.016988470149351034, u'InternetService'),
 (0.016706644676984628, u'MultipleLines'),
 (0.015797115974356457, u'OnlineBackup'),
 (0.014699077425883788, u'gender'),
 (0.014488571585528758, u'DeviceProtection'),
 (0.01407346427039836, u'TechSupport'),
 (0.012868090170418412, u'PaymentMethod'),
 (0.012591957633504243, u'Contract'),
 (0.012081920688939683, u'PaperlessBilling'),
 (0.007311353246423249, u'MonthlyCharges'),
 (0.004212477293503952, u'StreamingTV'),
 (0.004071143272323674, u'StreamingMovies'),
 (0.00383141051422615, u'SeniorCitizen')]

## Finding params for rand Forest by GridSearchCV gives F1 score around 77%

In [48]:
from sklearn.metrics import f1_score
f1_score(actual_labels, final_predictions, average="micro")

0.7690120824449184

In [49]:
#ensemble method -1
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression(solver="liblinear", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)
svm_clf = SVC(gamma="auto", random_state=42)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')

In [50]:
voting_clf.fit(X_sm, y_sm)

VotingClassifier(estimators=[(u'lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver=u'liblinear',
          tol=0.0001, verbose=0, warm_start=False)), (u'rf', Ra...f',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=None, voting=u'hard', weights=None)

## Ensemble method always gives better result then the individual learners

In [51]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_sm, y_sm)
    y_pred = clf.predict(test_set)
    print(clf.__class__.__name__, accuracy_score(test_labels, y_pred))

LogisticRegression 0.7412935323383084
RandomForestClassifier 0.751954513148543
SVC 0.720682302771855
VotingClassifier 0.738450604122246


In [52]:
#ensemble method -2
log_clf = LogisticRegression(solver="liblinear", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)
svm_clf = SVC(gamma="auto", probability=True, random_state=42)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
voting_clf.fit(X_sm, y_sm)

VotingClassifier(estimators=[(u'lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver=u'liblinear',
          tol=0.0001, verbose=0, warm_start=False)), (u'rf', Ra...bf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=None, voting=u'soft', weights=None)

In [53]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_sm, y_sm)
    y_pred = clf.predict(test_set)
    print(clf.__class__.__name__, accuracy_score(test_labels, y_pred))

LogisticRegression 0.7412935323383084
RandomForestClassifier 0.751954513148543
SVC 0.720682302771855
VotingClassifier 0.7448471926083866


In [57]:
#creating BaggingClassifier with bootstrap=False of 10000 decision trees
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=10000,
    max_samples=5000, bootstrap=False, n_jobs=-1, random_state=42)
bag_clf.fit(X_sm, y_sm)
y_pred = bag_clf.predict(test_set)

## Bagging Classifier with bootstrap=False(Pasting) and 10000 decision trees gives best F1 score till now 77.6%

In [59]:
f1_score(test_labels, y_pred, average="weighted")

0.7759327230585151

In [56]:
from sklearn.metrics import f1_score
#using 'micro' it gives global
f1_score(test_labels, y_pred, average="weighted")

0.7759327230585151

## will try boosting next

In [81]:
try:
    import xgboost
except ImportError as ex:
    print("Error: the xgboost library is not installed.")
    xgboost = None

In [None]:
if xgboost is not None:  
    xgb_reg = xgboost.XGBClassifier(random_state=42)
    xgb_reg.fit(X_sm, y_sm)
    y_pred = xgb_reg.predict(test_set)
    val_error = mean_squared_error(test_labels, y_pred)
    print("Validation MSE:", val_error)