In [1]:
# Download the Telco customer-churn CSV into the current working directory
# (shell escape; requires `wget` and network access).
!wget https://raw.githubusercontent.com/timfox456/ai-service/main/data/WA_Fn-UseC_-Telco-Customer-Churn.csv

--2022-11-01 15:07:06--  https://raw.githubusercontent.com/timfox456/ai-service/main/data/WA_Fn-UseC_-Telco-Customer-Churn.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 970457 (948K) [text/plain]
Saving to: ‘WA_Fn-UseC_-Telco-Customer-Churn.csv’


2022-11-01 15:07:06 (24.8 MB/s) - ‘WA_Fn-UseC_-Telco-Customer-Churn.csv’ saved [970457/970457]



In [3]:

import pandas as pd 
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt 
%matplotlib inline


# Load the downloaded CSV, report how many rows it has, and preview the
# first few records.
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
print(df.shape[0])
df.head()

7043


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Inspect the inferred column types; anything that should be numeric but
# shows up as `object` needs cleaning before modelling.
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [6]:
# Total Charges: empties
# TotalCharges is loaded as text, so coerce it to numbers; values that do
# not parse become NaN.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Capture the affected rows *before* the column is overwritten so their raw
# values can still be inspected.
missing_total_charges = df.loc[total_charges.isnull(), ['customerID', 'TotalCharges']]

# Replace the column with its numeric form, treating unparseable values as 0.
df['TotalCharges'] = total_charges.fillna(0)

# Last expression of the cell, so the notebook actually renders it.
missing_total_charges

In [12]:
# Column Names and Naming Conventions
# Normalise column names: lower-case, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Apply the same normalisation to the values of every string-typed column.
string_columns = list(df.dtypes[df.dtypes == 'object'].index)
for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

# Encode the target as 1 (churned) / 0 (stayed).
# Guarded so the cell is safe to re-run: once `churn` is numeric, comparing it
# with 'yes' would be False everywhere and silently zero out every label.
if not pd.api.types.is_numeric_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype(int)

In [16]:
# Split data for training, test, validation:
# hold out 20% as the test set, then split the remainder into train / validation.

from sklearn.model_selection import train_test_split

df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)

# pop() hands back the target column and removes it from the feature frame
# in a single step.
y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values


In [17]:
# Validate no missing values: count nulls per column in the full training
# set; every count is expected to be 0.
df_train_full.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [18]:
# Class balance of the target in the full training set
# (1 = churned, 0 = did not churn).
df_train_full.churn.value_counts()

0    4113
1    1521
Name: churn, dtype: int64

In [20]:
# Overall churn rate of the full training set. Because churn is 0/1, the mean
# is the fraction of churned customers; it is the baseline for per-group rates.
global_mean = df_train_full.churn.mean()
round(global_mean,3)

0.27

In [21]:
# Feature columns, grouped by how they are analysed and encoded.
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

In [22]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numerical
# features of the full training set.
df_train_full[numerical].describe()

Unnamed: 0,tenure,monthlycharges,totalcharges
count,5634.0,5634.0,5634.0
mean,32.277955,64.779127,2277.423953
std,24.555211,30.104993,2266.412636
min,0.0,18.25,0.0
25%,9.0,35.4,389.1375
50%,29.0,70.375,1391.0
75%,55.0,89.85,3787.5
max,72.0,118.65,8684.8


In [23]:
# Pairwise correlation between the numeric columns (including the 0/1 target).
# The frame also holds string columns; selecting the numeric ones explicitly
# keeps this working on pandas >= 2.0, where DataFrame.corr() no longer drops
# non-numeric columns by default and would raise instead.
df_train_full.select_dtypes(include='number').corr()

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges,churn
seniorcitizen,1.0,0.023443,0.225234,0.110459,0.141966
tenure,0.023443,1.0,0.251072,0.828268,-0.351885
monthlycharges,0.225234,0.251072,1.0,0.650913,0.196805
totalcharges,0.110459,0.828268,0.650913,1.0,-0.196353
churn,0.141966,-0.351885,0.196805,-0.196353,1.0


In [24]:
# Feature importance: Gender
# Churn rate within each gender group of the full training set.
female_mean = df_train_full.loc[df_train_full['gender'] == 'female', 'churn'].mean()
male_mean = df_train_full.loc[df_train_full['gender'] == 'male', 'churn'].mean()

print(female_mean, male_mean)


0.27682403433476394 0.2632135306553911


In [None]:
# Feature Importance: Partner
# Churn rate for customers with and without a partner (full training set).

partner_yes = df_train_full[df_train_full.partner == 'yes'].churn.mean()
partner_no = df_train_full[df_train_full.partner == 'no'].churn.mean()

# Show the two rates, mirroring the gender cell; without this the cell
# computes the values but renders nothing.
print(partner_yes, partner_no)

In [25]:
# Feature Importance: All Categorical Variables
# For every categorical feature, show the churn rate of each of its values
# next to its absolute difference from, and ratio to, the global churn rate.

from IPython.display import display

for col in categorical:
    df_group = (
        df_train_full.groupby(col)['churn']
        .mean()
        .to_frame(name='mean')
        .assign(
            diff=lambda grp: grp['mean'] - global_mean,
            rate=lambda grp: grp['mean'] / global_mean,
        )
    )
    display(df_group)

Unnamed: 0_level_0,mean,diff,rate
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


Unnamed: 0_level_0,mean,diff,rate
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.24227,-0.027698,0.897403
1,0.413377,0.143409,1.531208


Unnamed: 0_level_0,mean,diff,rate
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.329809,0.059841,1.221659
yes,0.205033,-0.064935,0.759472


Unnamed: 0_level_0,mean,diff,rate
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.31376,0.043792,1.162212
yes,0.165666,-0.104302,0.613651


Unnamed: 0_level_0,mean,diff,rate
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.241316,-0.028652,0.89387
yes,0.273049,0.003081,1.011412


Unnamed: 0_level_0,mean,diff,rate
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.257407,-0.012561,0.953474
no_phone_service,0.241316,-0.028652,0.89387
yes,0.290742,0.020773,1.076948


Unnamed: 0_level_0,mean,diff,rate
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.192347,-0.077621,0.712482
fiber_optic,0.425171,0.155203,1.574895
no,0.077805,-0.192163,0.288201


Unnamed: 0_level_0,mean,diff,rate
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.420921,0.150953,1.559152
no_internet_service,0.077805,-0.192163,0.288201
yes,0.153226,-0.116742,0.56757


Unnamed: 0_level_0,mean,diff,rate
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.404323,0.134355,1.497672
no_internet_service,0.077805,-0.192163,0.288201
yes,0.217232,-0.052736,0.80466


Unnamed: 0_level_0,mean,diff,rate
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.395875,0.125907,1.466379
no_internet_service,0.077805,-0.192163,0.288201
yes,0.230412,-0.039556,0.85348


Unnamed: 0_level_0,mean,diff,rate
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.418914,0.148946,1.551717
no_internet_service,0.077805,-0.192163,0.288201
yes,0.159926,-0.110042,0.59239


Unnamed: 0_level_0,mean,diff,rate
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.342832,0.072864,1.269897
no_internet_service,0.077805,-0.192163,0.288201
yes,0.302723,0.032755,1.121328


Unnamed: 0_level_0,mean,diff,rate
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.338906,0.068938,1.255358
no_internet_service,0.077805,-0.192163,0.288201
yes,0.307273,0.037305,1.138182


Unnamed: 0_level_0,mean,diff,rate
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.431701,0.161733,1.599082
one_year,0.120573,-0.149395,0.446621
two_year,0.028274,-0.241694,0.10473


Unnamed: 0_level_0,mean,diff,rate
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.172071,-0.097897,0.637375
yes,0.338151,0.068183,1.25256


Unnamed: 0_level_0,mean,diff,rate
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.168171,-0.101797,0.622928
credit_card_(automatic),0.164339,-0.10563,0.608733
electronic_check,0.45589,0.185922,1.688682
mailed_check,0.19387,-0.076098,0.718121


In [27]:
# Mutual Information
# Score every categorical feature against the churn target and rank them.
from sklearn.metrics import mutual_info_score


def calculate_mi(series):
    """Return the mutual information between `series` and `df_train_full.churn`."""
    return mutual_info_score(series, df_train_full.churn)


df_mi = (
    df_train_full[categorical]
    .apply(calculate_mi)
    .sort_values(ascending=False)
    .to_frame(name='MI')
)
df_mi

Unnamed: 0,MI
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923
deviceprotection,0.043453
paymentmethod,0.04321
streamingtv,0.031853
streamingmovies,0.031581
paperlessbilling,0.017589


In [None]:
# Correlation Coefficient
# Correlation of each numerical feature with the 0/1 churn target.
df_train_full[numerical].corrwith(df_train_full.churn)

In [29]:
# One Hot Encoding
# Turn each training row into a {feature: value} dict; DictVectorizer then
# one-hot encodes string values and passes numeric values through unchanged.

train_dict = df_train[categorical + numerical].to_dict(orient='records')

from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

print(X_train[0])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. tolist() keeps the plain-list
# display the cell had before.
dv.get_feature_names_out().tolist()

[0.0000e+00 0.0000e+00 1.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 1.0000e+00 0.0000e+00 1.0000e+00 1.0000e+00 0.0000e+00
 0.0000e+00 8.6100e+01 1.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00
 1.0000e+00 0.0000e+00 1.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
 1.0000e+00 0.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00
 1.0000e+00 7.1000e+01 6.0459e+03]




['contract=month-to-month',
 'contract=one_year',
 'contract=two_year',
 'dependents=no',
 'dependents=yes',
 'deviceprotection=no',
 'deviceprotection=no_internet_service',
 'deviceprotection=yes',
 'gender=female',
 'gender=male',
 'internetservice=dsl',
 'internetservice=fiber_optic',
 'internetservice=no',
 'monthlycharges',
 'multiplelines=no',
 'multiplelines=no_phone_service',
 'multiplelines=yes',
 'onlinebackup=no',
 'onlinebackup=no_internet_service',
 'onlinebackup=yes',
 'onlinesecurity=no',
 'onlinesecurity=no_internet_service',
 'onlinesecurity=yes',
 'paperlessbilling=no',
 'paperlessbilling=yes',
 'partner=no',
 'partner=yes',
 'paymentmethod=bank_transfer_(automatic)',
 'paymentmethod=credit_card_(automatic)',
 'paymentmethod=electronic_check',
 'paymentmethod=mailed_check',
 'phoneservice=no',
 'phoneservice=yes',
 'seniorcitizen',
 'streamingmovies=no',
 'streamingmovies=no_internet_service',
 'streamingmovies=yes',
 'streamingtv=no',
 'streamingtv=no_internet_servic

In [33]:
# Logistic Regression
# Fit a logistic-regression classifier on the one-hot encoded training matrix.
# random_state is fixed so repeated runs give the same model.

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear', random_state=1) 
model.fit(X_train, y_train)

LogisticRegression(random_state=1, solver='liblinear')

In [34]:
# One Hot Encoding Input
# Encode the validation rows with the vectorizer already fitted on the
# training data (transform only, no re-fit), then score them.
val_dict = df_val[categorical + numerical].to_dict(orient='records') 
X_val = dv.transform(val_dict)
# Keep column 1 of predict_proba: with 0/1 labels this is the predicted
# probability of churn (class 1).
y_pred = model.predict_proba(X_val)[:, 1]

In [35]:
# Accuracy
# Turn the churn probabilities into hard decisions with a 0.5 threshold and
# measure the fraction of validation customers classified correctly.

churn = y_pred >= 0.5
(y_val == churn).mean() # quality measure called ACCURACY: share of matching predictions

0.8016129032258065

In [36]:
# Model Interpretation: Weights
# Pair every encoded feature with its learned coefficient (rounded to 3 dp).
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
dict(zip(dv.get_feature_names_out().tolist(), model.coef_[0].round(3)))



{'contract=month-to-month': 0.563,
 'contract=one_year': -0.086,
 'contract=two_year': -0.599,
 'dependents=no': -0.03,
 'dependents=yes': -0.092,
 'deviceprotection=no': 0.1,
 'deviceprotection=no_internet_service': -0.116,
 'deviceprotection=yes': -0.106,
 'gender=female': -0.027,
 'gender=male': -0.095,
 'internetservice=dsl': -0.323,
 'internetservice=fiber_optic': 0.317,
 'internetservice=no': -0.116,
 'monthlycharges': 0.001,
 'multiplelines=no': -0.168,
 'multiplelines=no_phone_service': 0.127,
 'multiplelines=yes': -0.081,
 'onlinebackup=no': 0.136,
 'onlinebackup=no_internet_service': -0.116,
 'onlinebackup=yes': -0.142,
 'onlinesecurity=no': 0.258,
 'onlinesecurity=no_internet_service': -0.116,
 'onlinesecurity=yes': -0.264,
 'paperlessbilling=no': -0.213,
 'paperlessbilling=yes': 0.091,
 'partner=no': -0.048,
 'partner=yes': -0.074,
 'paymentmethod=bank_transfer_(automatic)': -0.027,
 'paymentmethod=credit_card_(automatic)': -0.136,
 'paymentmethod=electronic_check': 0.175,


In [41]:
# Train a Small Subset
# Fit a second, easier-to-read model that uses only three features.

small_subset = ['contract', 'tenure', 'totalcharges']
train_dict_small = df_train[small_subset].to_dict(orient='records')
dv_small = DictVectorizer(sparse=False)
X_small_train = dv_small.fit_transform(train_dict_small)

model_small = LogisticRegression(solver='liblinear', random_state=1)
model_small.fit(X_small_train, y_train)

# Check the bias. Printed explicitly: a bare expression in the middle of a
# cell is evaluated but never rendered.
print(model_small.intercept_[0])

# Check the other weights. get_feature_names() was removed in scikit-learn
# 1.2; get_feature_names_out() is its replacement.
dict(zip(dv_small.get_feature_names_out().tolist(), model_small.coef_[0].round(3)))



{'contract=month-to-month': 0.91,
 'contract=one_year': -0.144,
 'contract=two_year': -1.404,
 'tenure': -0.097,
 'totalcharges': 0.001}

In [44]:
# Weights of the small model, keyed by encoded feature name.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
dict(zip(dv_small.get_feature_names_out().tolist(), model_small.coef_[0].round(3)))

{'contract=month-to-month': 0.91,
 'contract=one_year': -0.144,
 'contract=two_year': -1.404,
 'tenure': -0.097,
 'totalcharges': 0.001}

In [47]:
# Use Churn Model:
# Score a single customer described as a feature dict, encoded with the same
# vectorizer that was fitted on the training data.
customer = {
    'customerid': '8879-zkjof',
    'gender': 'female',
    'seniorcitizen': 0,
    'partner': 'no',
    'dependents': 'no',
    'tenure': 41,
    'phoneservice': 'yes',
    'multiplelines': 'no',
    'internetservice': 'dsl',
    'onlinesecurity': 'yes',
    'onlinebackup': 'no',
    'deviceprotection': 'yes',
    'techsupport': 'yes',
    'streamingtv': 'yes',
    'streamingmovies': 'yes',
    'contract': 'one_year',
    'paperlessbilling': 'yes',
    'paymentmethod': 'bank_transfer_(automatic)',
    'monthlycharges': 79.85,
    'totalcharges': 3320.75,
}
X_test = dv.transform([customer])
model.predict_proba(X_test)

array([[0.92667889, 0.07332111]])

In [49]:
# Another Customer
# Score a second customer and keep only the churn probability (column 1).

customer = {
    'gender': 'female',
    'seniorcitizen': 1,
    'partner': 'no',
    'dependents': 'no',
    'phoneservice': 'yes',
    'multiplelines': 'yes',
    'internetservice': 'fiber_optic',
    'onlinesecurity': 'no',
    'onlinebackup': 'no',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'yes',
    'streamingmovies': 'no',
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'electronic_check',
    'tenure': 1,
    'monthlycharges': 85.7,
    'totalcharges': 85.7,
}

X_test = dv.transform([customer])
model.predict_proba(X_test)[0, 1]


0.8321656556545182