# Predicting customer churn

In [4]:
import pandas as pd
import numpy as np
import os

In [12]:
df=pd.read_csv('churn_dataset_train.csv')

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3500 entries, 0 to 3499
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   state                          3500 non-null   object 
 1   account_length                 3500 non-null   int64  
 2   area_code                      3500 non-null   object 
 3   international_plan             3500 non-null   object 
 4   voice_mail_plan                3500 non-null   object 
 5   number_vmail_messages          3500 non-null   int64  
 6   total_day_minutes              3500 non-null   float64
 7   total_day_calls                3500 non-null   int64  
 8   total_day_charge               3216 non-null   float64
 9   total_eve_minutes              3500 non-null   float64
 10  total_eve_calls                3269 non-null   float64
 11  total_eve_charge               3500 non-null   float64
 12  total_night_minutes            3500 non-null   f

"total_day_charge" and  "total_eve_calls" have NA values ,we will impute them with 0 for this excercise,we can fgure better ways of imputing them later if they emerge as important variables

In [153]:
df=df.fillna(0)

In [None]:
"""
Encoding the binary variables for modeling purposes
"""
df['churn_coded']=np.where(df.churn=='yes',1,0)
df['int_plan_coded']=np.where(df.international_plan=='yes',1,0)
df['vm_plan_coded']=np.where(df.voice_mail_plan=='yes',1,0)
df['churn_coded'].value_counts()

In [147]:
t=df.corr()
col_list=t.columns
d= pd.DataFrame(columns=['column1','column2', 'correlation'])
i=0
for col in col_list[0: int(len(col_list)/2)]:
    for col1 in col_list[int(len(col_list)/2):]:
        if col!=col1:
            corr=round(t.iloc[t.index==col][col1].values[0] ,2)
            if(abs(corr)>0.3):
                d=d.append(dict(zip( ['column1','column2', 'correlation'],
                                 [col,col1, corr])) , True)
print(d)

                 column1             column2  correlation
0  number_vmail_messages       vm_plan_coded         0.95
1    total_night_minutes  total_night_charge         1.00


There are no unexpected correlations in the data, however among the expected ones-
1. There is significant correlation between 'number_vmail_messages' and 'vm_plan_coded' , which means if there is an active voicemail plan for customer he has non zero voicemail messages.
2. Also 'total_night_minutes' is positive highly corleated with 'total_night_charge' which is obvious as 
   charge= minutes*rate
   
Conclusion- We can eliminate one of the variables for modeling purposes as they convey same information/variance.  I elinimiate 'total_night_charge' as it is a dervied varibale,  and 'vm_plan_coded' as it is encoded variable.

In [148]:
df.churn.value_counts(1)

no     0.860857
yes    0.139143
Name: churn, dtype: float64

There is class imbalance , class 1,as in churn= yes is minorty class meaning 13% customers are likely to leave the company and nt continue with their plan, and 86% customers are retained over time.

In [151]:
"""
Shuffling the data before train test split
"""
df=df.sample(df.shape[0])

In [152]:
train_cols=['account_length', 
       'number_vmail_messages', 'total_day_minutes',
       'total_day_calls', 'total_day_charge', 'total_eve_minutes',
       'total_eve_calls', 'total_eve_charge', 'total_night_minutes',
       'total_night_calls','total_intl_minutes',
       'total_intl_calls', 'total_intl_charge',
       'number_customer_service_calls', 
       'int_plan_coded']

# GBM - boosting based method for churn prediction

In [194]:
from sklearn.ensemble import GradientBoostingClassifier
"""
Train test split 70/30
"""
idx= int(0.7*df.shape[0])
X_train, y_train= df[train_cols][0:idx] ,  df['churn_coded'][0:idx]
X_test, y_test= df[train_cols][idx:] ,  df['churn_coded'][idx:]
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
  max_depth=1, random_state=0)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8895238095238095

In [195]:
pred_prob=clf.predict_proba(X_test)
pred= np.where(pred_prob>0.5 , 1,0)
y_pred= [np.argmax(x) for x in pred]

In [160]:
"""
distribution of target variable in test data, to ensure that model doesn't make a blind guess
and predict everything as majority class.

"""
y_test.value_counts()

0    907
1    143
Name: churn_coded, dtype: int64

In [196]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[873,  34],
       [ 82,  61]])

In [197]:
f1_score(y_test, y_pred, average='macro')

0.7251532191824098

In [162]:
from sklearn.metrics import average_precision_score
average_precision = average_precision_score(y_test, y_pred)

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))


Average precision-recall score: 0.35


# Trying random forest model - bagging based model for churn prediction

In [187]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

clf = RandomForestClassifier(max_depth=5, random_state=0)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9085714285714286

In [188]:
pred_prob=clf.predict_proba(X_test)
pred= np.where(pred_prob>0.5 , 1,0)
y_pred= [np.argmax(x) for x in pred]
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[904,   3],
       [ 93,  50]])

In [189]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average='macro')

0.7298919567827131

In [186]:
t1=pd.DataFrame(columns=['column','feature_imp'])
t1.column, t1.feature_imp= train_cols, clf.feature_importances_
t1.sort_values(by='feature_imp',ascending=False)[0:10]

Unnamed: 0,column,feature_imp
2,total_day_minutes,0.231654
4,total_day_charge,0.16891
14,int_plan_coded,0.13448
13,number_customer_service_calls,0.115694
7,total_eve_charge,0.055175
5,total_eve_minutes,0.047867
10,total_intl_minutes,0.041981
11,total_intl_calls,0.041646
1,number_vmail_messages,0.040985
12,total_intl_charge,0.035378


# Predicting on test data

In [198]:
test= pd.read_csv('churn_dataset_test.csv')
orig_cols= test.columns

In [199]:
"""
performing the preprocessing steps on test data as well
"""
test= test.fillna(0)
"""
Encoding the binary variables for modeling purposes
"""

test['int_plan_coded']=np.where(test.international_plan=='yes',1,0)
test['vm_plan_coded']=np.where(test.voice_mail_plan=='yes',1,0)


In [200]:
"""
Making predictions
"""
pred_prob=clf.predict_proba(test[train_cols])
pred= np.where(pred_prob>0.5 , 1,0)
y_pred= [np.argmax(x) for x in pred]

In [203]:
"""
assigning the test labels to the test data and saving the data with original columns 
"""
test['churn']=y_pred
test['churn']=np.where(test.churn==1, 'yes','no')
test[orig_cols].to_csv('churn_dataset_test_filled.csv', index=False)

In [202]:
test['churn'].value_counts(1)

no     0.896
yes    0.104
Name: churn, dtype: float64