In [29]:
import pickle

import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mutual_info_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [30]:
# Load Customer-Churn.csv (path relative to the working directory) and display it.
df = pd.read_csv("Customer-Churn.csv")
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [31]:
# Number of missing values in each column.
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [32]:
# Encode the target as 0/1 (1 = 'Yes').
# The dtype guard makes the cell safe to re-run: comparing an already-encoded
# integer column to 'Yes' would silently turn every label into 0.
if not pd.api.types.is_integer_dtype(df["Churn"]):
    df["Churn"] = (df["Churn"] == 'Yes').astype(int)
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,0
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,0
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,0
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,1


In [33]:
# Number of rows whose target equals 1.
df["Churn"].eq(1).sum()

np.int64(1869)

In [34]:
# (rows, columns) of the frame.
df.shape


(7043, 21)

In [35]:
# Normalise the column labels: lower-case, spaces replaced by underscores.
df.columns = [label.lower().replace(' ', '_') for label in df.columns]
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [36]:
# Names of the columns whose dtype is object.
categorical = [name for name, dtype in df.dtypes.items() if dtype == "object"]
categorical

['customerid',
 'gender',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod',
 'totalcharges']

In [37]:
# Names of the columns whose dtype is anything other than object.
numerical = [name for name, dtype in df.dtypes.items() if dtype != "object"]
numerical

['seniorcitizen', 'tenure', 'monthlycharges', 'churn']

In [38]:
# Lower-case the values of every object-dtype column and replace spaces with
# underscores, mirroring the treatment of the column labels.
df[categorical] = df[categorical].apply(
    lambda column: column.str.lower().str.replace(' ', '_')
)

In [39]:
# Inspect the frame after the label and string-value normalisation.
df

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,0
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,0
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,1
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.30,1840.75,0
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.70,151.65,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-resvb,male,0,yes,yes,24,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,yes,mailed_check,84.80,1990.5,0
7039,2234-xaduh,female,0,yes,yes,72,yes,yes,fiber_optic,no,...,yes,no,yes,yes,one_year,yes,credit_card_(automatic),103.20,7362.9,0
7040,4801-jzazl,female,0,yes,yes,11,no,no_phone_service,dsl,yes,...,no,no,no,no,month-to-month,yes,electronic_check,29.60,346.45,0
7041,8361-ltmkd,male,1,yes,no,4,yes,yes,fiber_optic,no,...,no,no,no,no,month-to-month,yes,mailed_check,74.40,306.6,1


In [40]:
# Per-column dtypes.
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                 int64
dtype: object

In [41]:
# Disabled draft: totalcharges is converted to numeric further down, in the correlation section.
# df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce')
# df.totalcharges

In [42]:
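# Disabled draft: the active null check on totalcharges runs further down, after the numeric conversion.
# df.totalcharges.isnull().sum()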

In [43]:
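# Disabled draft: median imputation (median must be called, i.e. median()); the active cell further down fills with 0 instead.
# df.totalcharges = df.totalcharges.fillna(df.totalcharges.median())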

### Mutual information

In [44]:
# Feature groups used for modelling.  `seniorcitizen` is stored as int64 but is
# listed with the categorical features.
categorical_col = [
    "gender", "partner", "dependents",
    "phoneservice", "multiplelines", "internetservice",
    "onlinesecurity", "onlinebackup", "deviceprotection",
    "techsupport", "streamingtv", "streamingmovies",
    "contract", "paperlessbilling", "paymentmethod",
    "seniorcitizen",
]

numerical_col = ["tenure", "monthlycharges", "totalcharges"]

In [45]:
def mutual_info(c):
    """Return the mutual information between ``c`` and the churn target.

    Parameters
    ----------
    c : pandas.Series
        One column of ``df`` (supplied by ``DataFrame.apply``).

    Returns
    -------
    float
        ``mutual_info_score(c, df.churn)``; reads the notebook-level ``df``.
    """
    return mutual_info_score(c, df.churn)


# Score every categorical feature against churn; ascending order, so the
# features sharing the most information with the target come last.
df[categorical_col].apply(mutual_info).sort_values()

gender              0.000037
phoneservice        0.000072
multiplelines       0.000801
seniorcitizen       0.010577
partner             0.011454
dependents          0.014467
paperlessbilling    0.019194
streamingtv         0.031908
streamingmovies     0.032001
deviceprotection    0.043917
paymentmethod       0.044519
onlinebackup        0.046792
internetservice     0.055574
techsupport         0.063021
onlinesecurity      0.064677
contract            0.098453
dtype: float64

### Correlation coefficient

In [46]:
# Correlation between tenure and the churn flag.
df["tenure"].corr(df["churn"])

np.float64(-0.352228670113078)

In [47]:
# Dtypes of the numerical feature columns.
df.dtypes.loc[numerical_col]


tenure              int64
monthlycharges    float64
totalcharges       object
dtype: object

In [48]:
# Dtype of totalcharges on its own.
df["totalcharges"].dtype

dtype('O')

In [49]:
# Convert the numerical feature columns to numbers; values that cannot be
# parsed become NaN (errors='coerce').
df[numerical_col] = df[numerical_col].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)


In [50]:
# How many totalcharges values are missing after the conversion.
df["totalcharges"].isna().sum()

np.int64(11)

In [51]:
# Replace missing totalcharges with 0.
df["totalcharges"] = df["totalcharges"].fillna(0)

In [52]:
# Confirm that no missing totalcharges values remain.
df["totalcharges"].isna().sum()

np.int64(0)

In [53]:
# Correlation of each numerical feature with the churn flag.
df[numerical_col].corrwith(df["churn"])

tenure           -0.352229
monthlycharges    0.193356
totalcharges     -0.198324
dtype: float64

In [54]:
# Hold out 20% of the rows as the test set; the seed is fixed so the split is repeatable.
df_full_train, df_test = train_test_split(df, random_state=0, test_size=0.2)
len(df_full_train), len(df)

(5634, 7043)

In [55]:
# Split the non-test rows 75/25 into training and validation sets.
df_train, df_val = train_test_split(df_full_train, random_state=0, test_size=0.25)
len(df_train), len(df_test), len(df_val)

(4225, 1409, 1409)

In [56]:
# Give every split a fresh 0..n-1 index, discarding the shuffled original one.
df_full_train, df_test, df_train, df_val = (
    frame.reset_index(drop=True)
    for frame in (df_full_train, df_test, df_train, df_val)
)

In [57]:
# Feature frame and churn target for each split.
X_train, y_train = df_train[categorical_col + numerical_col], df_train["churn"]
X_val, y_val = df_val[categorical_col + numerical_col], df_val["churn"]
X_test, y_test = df_test[categorical_col + numerical_col], df_test["churn"]

### One-hot encoding

In [58]:
# Turn each row into a feature dict and vectorise it: the vectorizer is fitted
# on the training split and only applied to the validation split.
#
# The records are built from df_train / df_val rather than from X_train / X_val:
# this cell replaces X_train and X_val with NumPy arrays, so reading them back
# as DataFrames would make a second run of the cell fail.
dv = DictVectorizer(sparse=False)

dict_train = df_train[categorical_col + numerical_col].to_dict(orient='records')
X_train = dv.fit_transform(dict_train)

dict_val = df_val[categorical_col + numerical_col].to_dict(orient='records')
X_val = dv.transform(dict_val)


In [63]:
# Fit on the training split and score on the validation split.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_predict = lr.predict(X_val)
# Second probability column, i.e. the churn == 1 class.
y_pred_proba = lr.predict_proba(X_val)[:, 1]
roc_accuracy_val = roc_auc_score(y_true=y_val, y_score=y_pred_proba)

In [64]:
# ROC AUC on the validation split.
roc_accuracy_val

np.float64(0.8524891093266153)

In [65]:
# ROC AUC on the rows the model was fitted on, for comparison with validation.
y_prob_train = lr.predict_proba(X_train)[:, 1]
roc_accuracy_train = roc_auc_score(y_true=y_train, y_score=y_prob_train)

In [66]:
# ROC AUC on the training split.
roc_accuracy_train

np.float64(0.8523952268157342)

In [72]:
# Features and target of the combined train + validation rows.
X_full_train, y_full_train = (
    df_full_train[categorical_col + numerical_col],
    df_full_train["churn"],
)

In [73]:
# Refit the vectorizer on the combined train + validation rows and apply it to
# the test rows.
#
# The records are built from df_full_train / df_test rather than from
# X_full_train / X_test: this cell replaces both with NumPy arrays, so reading
# them back as DataFrames would make a second run of the cell fail.
dict_full_train = df_full_train[categorical_col + numerical_col].to_dict(orient='records')
X_full_train = dv.fit_transform(dict_full_train)

dict_test = df_test[categorical_col + numerical_col].to_dict(orient='records')
X_test = dv.transform(dict_test)

In [78]:
# Refit on the combined train + validation rows and score on the held-out test
# split.  StandardScaler is imported in the notebook's top import cell.
#
# The scaler is fitted on the training matrix only and merely applied to the
# test matrix, so no test-set statistics reach the model.
scaler = StandardScaler()
X_full_train_scaled = scaler.fit_transform(X_full_train)
X_test_scaled = scaler.transform(X_test)

lr = LogisticRegression(max_iter=1000)
lr.fit(X_full_train_scaled, y_full_train)

y_predict = lr.predict(X_test_scaled)
# Second probability column, i.e. the churn == 1 class.
y_pred_proba = lr.predict_proba(X_test_scaled)[:, 1]
roc_accuracy_test = roc_auc_score(y_test, y_pred_proba)

In [79]:
# ROC AUC on the held-out test split.
roc_accuracy_test

np.float64(0.8274051914964707)

In [86]:
# Refit the vectorizer, the scaler and the model on every row of df; these
# three fitted objects are the ones written to disk at the end of the notebook.
X_full = df[categorical_col + numerical_col]
y_full = df.churn

X_full_dict = X_full[categorical_col + numerical_col].to_dict(orient='records')
X_full = dv.fit_transform(X_full_dict)

scaler = StandardScaler()
X_full_scaled = scaler.fit_transform(X_full)

# Same iteration limit as the validation and test models, so the exported model
# is trained under the same solver settings as the ones that were evaluated.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_full_scaled, y_full)

y_predict = lr.predict(X_full_scaled)
y_pred_proba = lr.predict_proba(X_full_scaled)[:, 1]
# NOTE: in-sample score -- the model is scored on the rows it was fitted on, so
# this is a sanity check, not an estimate of performance on unseen data.
roc_accuracy_full = roc_auc_score(y_full, y_pred_proba)

In [87]:
# ROC AUC of the model fitted on all rows, computed on those same rows.
roc_accuracy_full

np.float64(0.8483719995210028)

In [88]:
import pickle

In [None]:
# Write the fitted model, vectorizer and scaler to disk as a single tuple.
with open("CustomerChurn.pkl", mode='wb') as model_file:
    pickle.dump((lr, dv, scaler), model_file)
