In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [5]:
df.columns = df.columns.str.lower().str.replace(' ', "_")
categorical_columns = list(df.dtypes[df.dtypes == object].index)

In [6]:
df.head().T


Unnamed: 0,0,1,2,3,4
customerid,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
seniorcitizen,0,0,0,0,0
partner,Yes,No,No,No,No
dependents,No,No,No,No,No
tenure,1,34,2,45,2
phoneservice,No,Yes,Yes,No,Yes
multiplelines,No phone service,No,No,No phone service,No
internetservice,DSL,DSL,DSL,DSL,Fiber optic
onlinesecurity,No,Yes,Yes,Yes,No


In [7]:
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce').fillna(0)
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                object
dtype: object

In [8]:
df.churn = (df.churn=='Yes').astype(int)

In [9]:
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.2, random_state=1)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_full_train = df_full_train.reset_index(drop=True)


In [10]:
df_full_train.churn.value_counts()

churn
0    4113
1    1521
Name: count, dtype: int64

In [11]:
y_train, y_val, y_test = df_train.churn, df_val.churn, df_test.churn
del df_train["churn"]
del df_val["churn"]
del df_test["churn"]

In [12]:
df_full_train = df_full_train.reset_index(drop=True)

In [13]:
numerical = ['tenure', 'monthlycharges', 'totalcharges']
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]
df_full_train[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [14]:
from IPython.display import display

In [15]:
global_churn = df_full_train.churn.mean()
for c in categorical:
    print(c) 
    df_group = df_full_train.groupby(c).churn.agg(['mean', 'count'])
    df_group['diff'] = df_group['mean'] - global_churn
    df_group['risk'] = df_group['mean'] / global_churn
    display(df_group)
    print()
    print()

gender


Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,0.276824,2796,0.006856,1.025396
Male,0.263214,2838,-0.006755,0.97498




seniorcitizen


Unnamed: 0_level_0,mean,count,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.24227,4722,-0.027698,0.897403
1,0.413377,912,0.143409,1.531208




partner


Unnamed: 0_level_0,mean,count,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.329809,2932,0.059841,1.221659
Yes,0.205033,2702,-0.064935,0.759472




dependents


Unnamed: 0_level_0,mean,count,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.31376,3968,0.043792,1.162212
Yes,0.165666,1666,-0.104302,0.613651




phoneservice


Unnamed: 0_level_0,mean,count,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.241316,547,-0.028652,0.89387
Yes,0.273049,5087,0.003081,1.011412




multiplelines


Unnamed: 0_level_0,mean,count,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.257407,2700,-0.012561,0.953474
No phone service,0.241316,547,-0.028652,0.89387
Yes,0.290742,2387,0.020773,1.076948




internetservice


Unnamed: 0_level_0,mean,count,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DSL,0.192347,1934,-0.077621,0.712482
Fiber optic,0.425171,2479,0.155203,1.574895
No,0.077805,1221,-0.192163,0.288201




onlinesecurity


Unnamed: 0_level_0,mean,count,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.420921,2801,0.150953,1.559152
No internet service,0.077805,1221,-0.192163,0.288201
Yes,0.153226,1612,-0.116742,0.56757




onlinebackup


Unnamed: 0_level_0,mean,count,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.404323,2498,0.134355,1.497672
No internet service,0.077805,1221,-0.192163,0.288201
Yes,0.217232,1915,-0.052736,0.80466




deviceprotection


Unnamed: 0_level_0,mean,count,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.395875,2473,0.125907,1.466379
No internet service,0.077805,1221,-0.192163,0.288201
Yes,0.230412,1940,-0.039556,0.85348




techsupport


Unnamed: 0_level_0,mean,count,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.418914,2781,0.148946,1.551717
No internet service,0.077805,1221,-0.192163,0.288201
Yes,0.159926,1632,-0.110042,0.59239




streamingtv


Unnamed: 0_level_0,mean,count,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.342832,2246,0.072864,1.269897
No internet service,0.077805,1221,-0.192163,0.288201
Yes,0.302723,2167,0.032755,1.121328




streamingmovies


Unnamed: 0_level_0,mean,count,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.338906,2213,0.068938,1.255358
No internet service,0.077805,1221,-0.192163,0.288201
Yes,0.307273,2200,0.037305,1.138182




contract


Unnamed: 0_level_0,mean,count,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Month-to-month,0.431701,3104,0.161733,1.599082
One year,0.120573,1186,-0.149395,0.446621
Two year,0.028274,1344,-0.241694,0.10473




paperlessbilling


Unnamed: 0_level_0,mean,count,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.172071,2313,-0.097897,0.637375
Yes,0.338151,3321,0.068183,1.25256




paymentmethod


Unnamed: 0_level_0,mean,count,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bank transfer (automatic),0.168171,1219,-0.101797,0.622928
Credit card (automatic),0.164339,1217,-0.10563,0.608733
Electronic check,0.45589,1893,0.185922,1.688682
Mailed check,0.19387,1305,-0.076098,0.718121






In [16]:
from sklearn.metrics import mutual_info_score

In [17]:
mutual_info_score(df_full_train.churn, df_full_train.gender)

np.float64(0.0001174846211139946)

In [18]:
mi = df_full_train[categorical].apply(lambda x: mutual_info_score(x, df_full_train.churn))

In [19]:
mi.sort_values(ascending=False)

contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
seniorcitizen       0.009410
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64

In [20]:
df_full_train[numerical].corrwith(df_full_train.churn)

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

In [21]:
from sklearn.feature_extraction import DictVectorizer

In [22]:
dv = DictVectorizer(sparse=False)
train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)


In [23]:
display(train_dict[:5])

[{'gender': 'Male',
  'seniorcitizen': 0,
  'partner': 'No',
  'dependents': 'No',
  'phoneservice': 'Yes',
  'multiplelines': 'No',
  'internetservice': 'No',
  'onlinesecurity': 'No internet service',
  'onlinebackup': 'No internet service',
  'deviceprotection': 'No internet service',
  'techsupport': 'No internet service',
  'streamingtv': 'No internet service',
  'streamingmovies': 'No internet service',
  'contract': 'One year',
  'paperlessbilling': 'No',
  'paymentmethod': 'Mailed check',
  'tenure': 30,
  'monthlycharges': 19.7,
  'totalcharges': 625.05},
 {'gender': 'Female',
  'seniorcitizen': 0,
  'partner': 'Yes',
  'dependents': 'Yes',
  'phoneservice': 'Yes',
  'multiplelines': 'Yes',
  'internetservice': 'Fiber optic',
  'onlinesecurity': 'No',
  'onlinebackup': 'No',
  'deviceprotection': 'No',
  'techsupport': 'No',
  'streamingtv': 'Yes',
  'streamingmovies': 'No',
  'contract': 'Month-to-month',
  'paperlessbilling': 'Yes',
  'paymentmethod': 'Electronic check',
  '

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
model = LogisticRegression(solver = 'lbfgs')


In [29]:
model.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
model.intercept_[0]

np.float64(-0.11799289543738488)

In [32]:
model.coef_[0].round(3)

array([ 0.583, -0.173, -0.526, -0.025, -0.091,  0.067, -0.106, -0.077,
       -0.046, -0.07 , -0.353,  0.343, -0.106,  0.002, -0.244,  0.154,
       -0.026,  0.058, -0.106, -0.068,  0.243, -0.106, -0.253, -0.223,
        0.107, -0.105, -0.011, -0.116, -0.033,  0.085, -0.051,  0.154,
       -0.27 ,  0.194, -0.094, -0.106,  0.084, -0.049, -0.106,  0.039,
        0.215, -0.106, -0.225, -0.07 ,  0.   ])

In [33]:
y_pred = model.predict_proba(X_val)[:, 1]

In [34]:
churn_decision = y_pred > 0.5

In [35]:
(y_val == churn_decision).mean()

np.float64(0.80301685891748)

In [36]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = churn_decision.astype(int)
df_pred['actual'] = y_val

In [37]:
df_pred['correct'] = df_pred.prediction == df_pred.actual

In [38]:
df_pred.correct.mean()

np.float64(0.80301685891748)

In [39]:
a = [1,2,3,4]
b = 'abcd'
d = dict(zip(a,b))

In [42]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'contract=Month-to-month': np.float64(0.583),
 'contract=One year': np.float64(-0.173),
 'contract=Two year': np.float64(-0.526),
 'dependents=No': np.float64(-0.025),
 'dependents=Yes': np.float64(-0.091),
 'deviceprotection=No': np.float64(0.067),
 'deviceprotection=No internet service': np.float64(-0.106),
 'deviceprotection=Yes': np.float64(-0.077),
 'gender=Female': np.float64(-0.046),
 'gender=Male': np.float64(-0.07),
 'internetservice=DSL': np.float64(-0.353),
 'internetservice=Fiber optic': np.float64(0.343),
 'internetservice=No': np.float64(-0.106),
 'monthlycharges': np.float64(0.002),
 'multiplelines=No': np.float64(-0.244),
 'multiplelines=No phone service': np.float64(0.154),
 'multiplelines=Yes': np.float64(-0.026),
 'onlinebackup=No': np.float64(0.058),
 'onlinebackup=No internet service': np.float64(-0.106),
 'onlinebackup=Yes': np.float64(-0.068),
 'onlinesecurity=No': np.float64(0.243),
 'onlinesecurity=No internet service': np.float64(-0.106),
 'onlinesecurity=Yes

In [46]:
small = ['contract', 'tenure', 'monthlycharges']
df_train[small].iloc[:10].to_dict(orient = 'records')

[{'contract': 'One year', 'tenure': 30, 'monthlycharges': 19.7},
 {'contract': 'Month-to-month', 'tenure': 23, 'monthlycharges': 83.75},
 {'contract': 'Month-to-month', 'tenure': 14, 'monthlycharges': 95.8},
 {'contract': 'Two year', 'tenure': 56, 'monthlycharges': 19.7},
 {'contract': 'Two year', 'tenure': 63, 'monthlycharges': 98.0},
 {'contract': 'Month-to-month', 'tenure': 33, 'monthlycharges': 80.6},
 {'contract': 'Month-to-month', 'tenure': 2, 'monthlycharges': 19.25},
 {'contract': 'Month-to-month', 'tenure': 8, 'monthlycharges': 30.45},
 {'contract': 'Two year', 'tenure': 66, 'monthlycharges': 66.1},
 {'contract': 'One year', 'tenure': 42, 'monthlycharges': 54.75}]

In [47]:
dicts_train_small = df_train[small].to_dict(orient='records')
dv_small = DictVectorizer(sparse=False)
X_train_small = dv_small.fit_transform(dicts_train_small)
dv_small.get_feature_names_out()

array(['contract=Month-to-month', 'contract=One year',
       'contract=Two year', 'monthlycharges', 'tenure'], dtype=object)

In [48]:
model_small = LogisticRegression(solver='lbfgs')
model_small.fit(X_train_small, y_train)

In [51]:
dict(zip(dv_small.get_feature_names_out(), model_small.coef_[0].round(3)))


{'contract=Month-to-month': np.float64(1.011),
 'contract=One year': np.float64(-0.025),
 'contract=Two year': np.float64(-0.986),
 'monthlycharges': np.float64(0.028),
 'tenure': np.float64(-0.036)}