In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from imblearn.over_sampling import SMOTE

In [68]:
# Load the customer-churn table; the path is resolved relative to the working directory.
data = pd.read_csv(filepath_or_buffer='customer_churn.csv')

In [69]:
# Preview the first five rows, transposed so each column name becomes a row.
data.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [70]:
# Dropping the irrelevant columns

In [71]:
# Restrict the frame to the columns used in the rest of the notebook.
selected_columns = ['tenure', 'SeniorCitizen', 'MonthlyCharges', 'Churn']
data = data[selected_columns]

In [72]:
# Train-test-split

In [73]:
# Separate the target column (Churn) from the remaining predictor columns.
X = data.drop(columns=['Churn'])
y = data['Churn']

In [74]:
# No test_size is given, so the library-default proportions apply;
# the fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [75]:
# Standardizing the data (StandardScaler: zero mean, unit variance per feature)

In [76]:
# Fit the scaler on the training split only, then apply the same
# transformation to both splits so the test data never influences the fit.
# Open question: how should we decide which scaling method to use?
transformer = StandardScaler()
transformer.fit(X_train)

# transform() returns a plain array, so the column names are re-attached here.
train_scaled_values = transformer.transform(X_train)
test_scaled_values = transformer.transform(X_test)
X_train_scaled = pd.DataFrame(data=train_scaled_values, columns=X.columns)
X_test_scaled = pd.DataFrame(data=test_scaled_values, columns=X.columns)

X_train_scaled.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,-0.340191,-0.435476,-0.514314
1,0.88321,2.296336,0.01784
2,-1.196572,-0.435476,-0.819594
3,1.16867,-0.435476,-1.483535
4,-0.829552,-0.435476,0.658427


In [77]:
# Discard the original row labels so both target series are indexed 0..n-1,
# the same default index the scaled feature frames carry.
y_train, y_test = (
    target.reset_index(drop=True) for target in (y_train, y_test)
)

In [78]:
# Building logistic regression model

In [79]:
# Baseline model: logistic regression fitted on the scaled training features.
LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train_scaled, y_train)

# Class predictions for the test split; reused by the metric cells below.
pred = LR.predict(X_test_scaled)

# Mean accuracy on the test split. This is deliberately the LAST expression of
# the cell: a notebook only displays the final expression's value, so a score
# computed mid-cell is silently discarded.
LR.score(X_test_scaled, y_test)

In [80]:
# Report precision, recall and F1 with 'Yes' (churned) as the positive class.
for metric_label, metric_fn in (
    ("precision: ", precision_score),
    ("recall: ", recall_score),
    ("f1: ", f1_score),
):
    print(metric_label, metric_fn(y_test, pred, pos_label='Yes'))

precision:  0.6115942028985507
recall:  0.4557235421166307
f1:  0.5222772277227723


In [81]:
# Rows are the true classes and columns are the predicted classes; with no
# explicit `labels` argument the classes appear in sorted label order.
# (confusion_matrix is imported in the import cell at the top of the notebook.)
confusion_matrix(y_test, pred)

array([[1164,  134],
       [ 252,  211]], dtype=int64)

In [82]:
# This model misses many of the cases where the churn result is 'Yes' (252 of the 463 actual churners in the test set are predicted as 'No'). Therefore this model is not reliable.

In [83]:
# The LR score is nearly 0.8, which means the accuracy of the model is nearly 80%. However, the model is not reliable.
# The accuracy is misleadingly high because most of the churn results are 'No' and the model predicts that majority class well.

In [84]:
# Using SMOTE (oversampling the training data) to improve the model

In [85]:
# Resample the scaled TRAINING data only; the test split is left untouched
# so the evaluation still reflects the original class balance.
sm = SMOTE(k_neighbors=3, random_state=100)
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(
    X_train_scaled,
    y_train,
)

In [86]:
# Refit the same logistic-regression configuration on the SMOTE-resampled
# training data, then predict on the unchanged scaled test split.
# Note: this rebinds LR and pred, replacing the baseline model's objects.
LR = LogisticRegression(solver='lbfgs', random_state=0).fit(
    X_train_SMOTE, y_train_SMOTE
)
pred = LR.predict(X_test_scaled)

# Same three metrics as for the baseline, with 'Yes' as the positive class.
for metric_label, metric_fn in (
    ("precision: ", precision_score),
    ("recall: ", recall_score),
    ("f1: ", f1_score),
):
    print(metric_label, metric_fn(y_test, pred, pos_label='Yes'))

precision:  0.4776978417266187
recall:  0.7170626349892009
f1:  0.5734024179620035


In [None]:
# After applying SMOTE, the precision of the model decreases but the recall increases.
# In my opinion this model is more reliable than before, because our actual purpose is to identify the customers whose churn result is 'Yes'.
# The higher recall means the model focuses more on the positive class and misses fewer cases where the churn result is 'Yes'.
# Although the accuracy will decrease, the trade-off is worth it.