In [1]:
import pandas as pd

In [2]:
# Load the customer churn dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('files_for_lab/customer_churn.csv')

In [3]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# We will try to predict variable Churn using a logistic regression
# on variables tenure, SeniorCitizen,MonthlyCharges

In [5]:
# X/y split: three numeric predictors and the churn label.
feature_columns = ['tenure', 'SeniorCitizen', 'MonthlyCharges']

X = df[feature_columns]  # Features
y = df['Churn']          # Target

In [6]:
# Sanity check: (number of rows, number of selected feature columns).
X.shape

(7043, 3)

In [7]:
# Peek at the target values; they are the strings 'Yes' / 'No', not 0/1.
y.head()

0     No
1     No
2    Yes
3     No
4    Yes
Name: Churn, dtype: object

In [8]:
# Peek at the raw (unscaled) feature values.
X.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,1,0,29.85
1,34,0,56.95
2,2,0,53.85
3,45,0,42.3
4,2,0,70.7


In [9]:
# Split into train and test sets *before* scaling, so that the scaler fitted
# later only ever sees the training rows. random_state=0 is passed so the
# same split is produced on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# All selected features are numerical, so I will not split the data into categorical and numerical parts

In [12]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same transformation
# to both splits, so no test-set statistics are used in the preprocessing.
transformer = StandardScaler().fit(X_train)

# By default transform() returns a plain array, so the frames are rebuilt by hand.
# The original row labels are kept (index=...) so the scaled features stay aligned
# with y_train / y_test in any later index-based pandas operation (concat, boolean
# masks, joins). Without it the frames get a fresh 0..n-1 index that no longer
# matches the targets.
X_train_normalized = pd.DataFrame(
    transformer.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_normalized = pd.DataFrame(
    transformer.transform(X_test), columns=X_test.columns, index=X_test.index
)
X_train_normalized.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,-0.340191,-0.435476,-0.514314
1,0.88321,2.296336,0.01784
2,-1.196572,-0.435476,-0.819594
3,1.16867,-0.435476,-1.483535
4,-0.829552,-0.435476,0.658427


In [13]:
# Peek at the scaled test features (transformed with the scaler fitted on the training set).
X_test_normalized.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,-0.544091,-0.435476,-0.217375
1,1.12789,-0.435476,1.731076
2,-0.788772,-0.435476,0.241378
3,-1.278132,-0.435476,-1.476862
4,0.92399,-0.435476,0.434889


In [18]:
# With categorical features, the scaled numeric frame would normally be concatenated
# with the encoded categorical frame at this point. There are none here, so the
# *_transformed names simply point at the scaled frames (aliases, not copies),
# which keeps the usual naming convention for the modelling cells.
X_train_transformed, X_test_transformed = X_train_normalized, X_test_normalized

In [16]:
# Confirm that the alias shows the same scaled training features.
X_train_transformed.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,-0.340191,-0.435476,-0.514314
1,0.88321,2.296336,0.01784
2,-1.196572,-0.435476,-0.819594
3,1.16867,-0.435476,-1.483535
4,-0.829552,-0.435476,0.658427


In [17]:
from sklearn.linear_model import LogisticRegression

# Baseline model: logistic regression fitted on the scaled training features.
# The target is passed as the raw 'Yes'/'No' strings.
LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train_transformed, y_train)

In [19]:
from sklearn.linear_model import LogisticRegression

# LR was already created and fitted on the training data in the previous cell, so it
# is not re-created and re-fitted here; only the evaluation step is needed.
# score() reports the mean accuracy on the held-out test set.
LR.score(X_test_transformed, y_test)

0.7808063600227144

In [22]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Class predictions of the fitted model on the scaled test set.
pred = LR.predict(X_test_transformed)

# 'Yes' is treated as the positive class for every metric.
for metric_label, metric_fn in (
    ("precision: ", precision_score),
    ("recall: ", recall_score),
    ("f1: ", f1_score),
):
    print(metric_label, metric_fn(y_test, pred, pos_label='Yes'))

precision:  0.6115942028985507
recall:  0.4557235421166307
f1:  0.5222772277227723


In [23]:
from sklearn.metrics import confusion_matrix
# Rows are the true labels and columns the predicted labels, both in the order 'No', 'Yes'.
confusion_matrix(y_test,pred)

array([[1164,  134],
       [ 252,  211]])

In [24]:
# This model seems to be quite bad: the confusion matrix and the recall show that it is
# especially poor at detecting churners, with 252 of the 463 actual 'Yes' cases predicted as 'No'

In [26]:
# Class balance of the training labels. Predicting 'No' for every customer would already give
# about 73% accuracy (3876 / 5282), so the model's ~78% accuracy is only a small gain over that
# baseline. This confirms that the model is not good at all.
y_train.value_counts()

No     3876
Yes    1406
Name: Churn, dtype: int64

SMOTE

In [27]:
#!pip install imblearn
from imblearn.over_sampling import SMOTE

In [30]:
from sklearn.linear_model import LogisticRegression

# Oversample the training data only; the test set is not resampled.
sm = SMOTE(random_state=100, k_neighbors=3)
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train_transformed, y_train)

# Refit the same logistic regression on the resampled training data and
# evaluate it on the untouched, scaled test set.
LR = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train_SMOTE, y_train_SMOTE)
pred = LR.predict(X_test_transformed)

# 'Yes' is treated as the positive class for every metric.
for metric_label, metric_fn in (
    ("precision: ", precision_score),
    ("recall: ", recall_score),
    ("f1: ", f1_score),
):
    print(metric_label, metric_fn(y_test, pred, pos_label='Yes'))

precision:  0.4776978417266187
recall:  0.7170626349892009
f1:  0.5734024179620035


In [31]:
from sklearn.metrics import confusion_matrix
# Rows are the true labels and columns the predicted labels, both in the order 'No', 'Yes'.
confusion_matrix(y_test,pred)

array([[935, 363],
       [131, 332]])

In [32]:
# From the confusion matrix, we can see that the recall has improved dramatically
# after "smoting" the data (0.46 -> 0.72), although precision has dropped (0.61 -> 0.48)