Data from https://www.kaggle.com/blastchar/telco-customer-churn

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [3]:
# Never truncate wide frames: show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

In [4]:
# Raw Telco churn export; the path is relative, so it is resolved against the
# working directory the notebook is run from.
file = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(filepath_or_buffer=file)

In [5]:
# Number of rows loaded from the CSV.
len(df)

7043

## Initial data preparation
* Normalise column names as usual
* Convert the churn column into an integer column: 1 where `churn == 'yes'`, 0 otherwise.
* Convert the total charges column to numeric. There are string values ("-") inside this column, so pass `errors='coerce'` to `pd.to_numeric` to turn them into NaN first, then `fillna(0)`.

In [6]:
# Peek at the first rows to see the raw column names and value formats.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [7]:
# Column dtypes as inferred by read_csv; used to spot columns that were read
# as text even though they hold numbers.
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [8]:
# TotalCharges is parsed as text: coerce entries that are not valid numbers
# to NaN, then replace those NaNs with 0, in a single chained expression.
df['TotalCharges'] = (
    pd.to_numeric(df['TotalCharges'], errors='coerce')
    .fillna(0)
)

In [9]:
# Normalise column labels: lower-case, spaces -> underscores.
# regex=False makes the literal (non-regex) replacement explicit.
df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)

# Pick the text columns by dtype. Depending on the pandas version/configuration,
# text columns may be inferred as the dedicated string dtype rather than
# `object`; comparing `df.dtypes == 'object'` would then select nothing, the
# values would keep their original case, and the later `churn == 'yes'` test
# would silently produce an all-zero target. Matching both dtypes avoids that.
string_columns = list(df.select_dtypes(include=['object', 'string']).columns)

# Apply the same normalisation to the values of every text column.
for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_', regex=False)

In [10]:
# Encode the target: 1 where the (already lower-cased) label is 'yes', else 0.
df['churn'] = df['churn'].eq('yes').astype(int)

### Split dataset into training, validation and test sets
* First split into training set and test set where test set = 20%.
* From the full training set, split further into training and validation sets using `test_size=0.33`, i.e. 33% of the full training set becomes the validation set.

In [11]:
# Hold out 20% of the rows as the test set; the seed is fixed so the split
# is the same on every run.
df_train_full, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

In [12]:
# Carve the validation set (33%) out of the full training set; fixed seed
# again for a repeatable split.
df_train, df_val = train_test_split(
    df_train_full,
    test_size=0.33,
    random_state=11,
)

In [13]:
# Row counts of the train / validation / test partitions.
len(df_train), len(df_val), len(df_test)

(3774, 1860, 1409)

In [14]:
# Extract the 0/1 target of each partition as a NumPy array.
y_train = df_train['churn'].to_numpy()
y_val = df_val['churn'].to_numpy()

In [15]:
# Remove the target column from both feature frames (in place) so it cannot
# be picked up as an input feature.
for frame in (df_train, df_val):
    del frame['churn']

## Exploratory data analysis

In [None]:
# Missing-value count per column (`isna` is the same check as `isnull`).
df_train_full.isna().sum()

In [None]:
# Class balance of the 0/1 target in the full training set.
df_train_full['churn'].value_counts()

In [None]:
# churn is 0/1, so its mean is the overall share of churned customers.
global_mean = df_train_full['churn'].mean()
round(global_mean, 3)

In [None]:
# Columns treated as categorical features. `seniorcitizen` is stored as an
# integer column but is listed here alongside the text-valued columns.
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

# Columns treated as numerical features.
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

In [None]:
# Number of distinct values in each categorical column.
df_train_full[categorical].nunique()

## Feature importance

In [None]:
# Churn rate among female customers.
is_female = df_train_full['gender'] == 'female'
female_mean = df_train_full.loc[is_female, 'churn'].mean()
print('gender == female:', round(female_mean, 3))

# Churn rate among male customers.
is_male = df_train_full['gender'] == 'male'
male_mean = df_train_full.loc[is_male, 'churn'].mean()
print('gender == male:  ', round(male_mean, 3))

In [None]:
# Risk ratio: churn rate of female customers relative to the overall rate
# (a value of 1.0 means "same as average").
female_mean / global_mean

In [None]:
# Risk ratio: churn rate of male customers relative to the overall rate
# (a value of 1.0 means "same as average").
male_mean / global_mean

In [None]:
# Churn rate among customers with a partner.
has_partner = df_train_full['partner'] == 'yes'
partner_yes = df_train_full.loc[has_partner, 'churn'].mean()
print('partner == yes:', round(partner_yes, 3))

# Churn rate among customers without a partner.
no_partner = df_train_full['partner'] == 'no'
partner_no = df_train_full.loc[no_partner, 'churn'].mean()
print('partner == no :', round(partner_no, 3))

In [None]:
# Risk ratio for customers with a partner, relative to the overall churn rate.
partner_yes / global_mean

In [None]:
# Risk ratio for customers without a partner, relative to the overall churn rate.
partner_no / global_mean

In [None]:
# Per-gender churn rate, with its absolute difference from and ratio to the
# overall churn rate.
df_group = (
    df_train_full.groupby('gender')['churn']
    .mean()
    .to_frame(name='mean')
    .assign(
        diff=lambda g: g['mean'] - global_mean,
        risk=lambda g: g['mean'] / global_mean,
    )
)
df_group

In [None]:
# Overall churn rate of the full training set, shown unrounded.
global_mean = df_train_full['churn'].mean()
global_mean

In [None]:
# One table per categorical column: churn rate of each value, its difference
# from the overall rate, and its ratio to the overall rate.
for col in categorical:
    df_group = (
        df_train_full.groupby(col)['churn']
        .mean()
        .to_frame(name='mean')
        .assign(
            diff=lambda g: g['mean'] - global_mean,
            risk=lambda g: g['mean'] / global_mean,
        )
    )
    display(df_group)

In [None]:
def calculate_mi(series):
    """Return the mutual information between ``series`` and the churn target.

    The target is read from the notebook-level ``df_train_full`` frame, so
    ``series`` is expected to be row-aligned with it (e.g. one of its columns,
    as when this function is passed to ``DataFrame.apply``).
    """
    target = df_train_full['churn']
    return mutual_info_score(series, target)

# Score every categorical column against churn, then rank from highest to
# lowest mutual information.
mi_scores = df_train_full[categorical].apply(calculate_mi)
df_mi = mi_scores.sort_values(ascending=False).to_frame(name='MI')

# Show the top and the bottom of the ranking.
display(df_mi.head(), df_mi.tail())

In [None]:
# Correlation of each numerical column with the 0/1 churn target.
churn_target = df_train_full['churn']
df_train_full[numerical].corrwith(churn_target).to_frame(name='correlation')

In [None]:
# Average of each numerical column, separately for churn = 0 and churn = 1.
df_train_full.groupby('churn')[numerical].mean()

## One-hot encoding

In [None]:
# One {column: value} dict per training row, restricted to the model features.
feature_columns = categorical + numerical
train_dict = df_train[feature_columns].to_dict(orient='records')

In [None]:
# Inspect the first training record.
train_dict[0]

In [None]:
# Fit the vectorizer on the training records only; sparse=False makes
# transform() return a dense NumPy array.
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [None]:
# Encode the training records into the numeric feature matrix.
X_train = dv.transform(train_dict)

In [None]:
# (number of training rows, number of encoded feature columns)
X_train.shape

In [None]:
# Names of the encoded feature columns.
# `DictVectorizer.get_feature_names()` was deprecated in scikit-learn 1.0 and
# removed in 1.2; `get_feature_names_out()` is the supported replacement.
# It returns an ndarray, so `.tolist()` keeps the result a plain list of str.
dv.get_feature_names_out().tolist()

## Training logistic regression

In [None]:
# Train the churn classifier on the encoded training data; random_state is
# fixed so repeated runs give the same model.
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

In [None]:
# Encode the validation rows with the vectorizer fitted on the training data
# (transform only — the vectorizer is not refitted here).
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [None]:
# Class-probability estimates for every validation row.
model.predict_proba(X_val)

In [None]:
# Keep only column 1 of the probability matrix: the probability of the
# positive class (churn = 1).
y_pred = model.predict_proba(X_val)[:, 1]

In [None]:
# Inspect the predicted churn probabilities.
y_pred

In [None]:
# Hard decision: flag a customer as churning when the predicted probability
# is strictly above 0.5.
churn = y_pred > 0.5

In [None]:
# Accuracy: share of validation rows where the thresholded prediction equals
# the 0/1 label.
(y_val == churn).mean()

## Model interpretation

In [None]:
# Intercept (bias) term of the fitted model.
model.intercept_[0]

In [None]:
# Map each encoded feature name to its fitted coefficient (rounded to 3 dp).
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` replaces it. `.tolist()` keeps the dict keys as
# plain Python strings.
feature_names = dv.get_feature_names_out().tolist()
dict(zip(feature_names, model.coef_[0].round(3)))

In [None]:
# A reduced feature set, used to train a smaller model whose weights are
# easier to inspect.
subset = ['contract', 'tenure', 'totalcharges']
train_dict_small = df_train[subset].to_dict(orient='records')

# Separate vectorizer, fitted only on the reduced training records.
dv_small = DictVectorizer(sparse=False)
dv_small.fit(train_dict_small)

X_small_train = dv_small.transform(train_dict_small)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` replaces it. `.tolist()` yields a plain list of str.
dv_small.get_feature_names_out().tolist()

In [None]:
# Same model configuration as the full model, trained on the reduced
# feature matrix.
model_small = LogisticRegression(solver='liblinear', random_state=1)
model_small.fit(X_small_train, y_train)

In [None]:
# Intercept (bias) term of the small model.
model_small.intercept_[0]

In [None]:
# Map each encoded feature of the small model to its coefficient (3 dp).
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` replaces it. `.tolist()` keeps the dict keys as
# plain Python strings.
small_feature_names = dv_small.get_feature_names_out().tolist()
dict(zip(small_feature_names, model_small.coef_[0].round(3)))

In [None]:
# Encode the validation rows for the small model with its own vectorizer
# (transform only, no refit).
val_dict_small = df_val[subset].to_dict(orient='records')
X_small_val = dv_small.transform(val_dict_small)

In [None]:
# Positive-class (churn = 1) probability from the small model for each
# validation row.
y_pred_small = model_small.predict_proba(X_small_val)[:, 1]

## Using the model

In [None]:
# Example customer record to score. It also carries a `customerid` entry,
# which is not one of the columns listed in `categorical` / `numerical`.
customer = dict(
    customerid='8879-zkjof',
    gender='female',
    seniorcitizen=0,
    partner='no',
    dependents='no',
    tenure=41,
    phoneservice='yes',
    multiplelines='no',
    internetservice='dsl',
    onlinesecurity='yes',
    onlinebackup='no',
    deviceprotection='yes',
    techsupport='yes',
    streamingtv='yes',
    streamingmovies='yes',
    contract='one_year',
    paperlessbilling='yes',
    paymentmethod='bank_transfer_(automatic)',
    monthlycharges=79.85,
    totalcharges=3320.75,
)

In [None]:
# Vectorise the single customer (wrapped in a one-element list) and read the
# positive-class (churn) probability for that one row.
X_test = dv.transform([customer])
model.predict_proba(X_test)[0, 1]

In [None]:
# Show the encoded feature vector of this customer as a flat list.
print(list(X_test[0]))

In [None]:
# Second example customer record to score (replaces the previous `customer`).
customer = dict(
    gender='female',
    seniorcitizen=1,
    partner='no',
    dependents='no',
    phoneservice='yes',
    multiplelines='yes',
    internetservice='fiber_optic',
    onlinesecurity='no',
    onlinebackup='no',
    deviceprotection='no',
    techsupport='no',
    streamingtv='yes',
    streamingmovies='no',
    contract='month-to-month',
    paperlessbilling='yes',
    paymentmethod='electronic_check',
    tenure=1,
    monthlycharges=85.7,
    totalcharges=85.7,
)

In [None]:
# Vectorise the current `customer` record and read its positive-class (churn)
# probability.
X_test = dv.transform([customer])
model.predict_proba(X_test)[0, 1]