In [239]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Loading Data

In [240]:
df = pd.read_csv("customer_churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


# 1. Explore and pre-process the dataset.

## Using One Hot Encoding

In [241]:
df.drop('customerID', axis=1, inplace=True)
df = pd.get_dummies(df, columns=['SeniorCitizen','gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod'])
df['Churn'] = df['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)

## Replace Missing Value of TotalCharges Column by mean

In [242]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(df['TotalCharges'].mean(), inplace=True)

## Min-Max Scaler using in pipe.

# 2. Use cross-validation, and build a model to predict churn rate. It  can be any kind of model, or it could be a model that you  haven't seen in the class.

- Build a decision tree model and check accuracy with cross validation 5 folds

In [243]:
from sklearn.model_selection import cross_val_score,KFold,cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline


X = df.drop('Churn', axis=1)
y = df['Churn']

for i in range(1,20):

    pipe = Pipeline([('scaler', MinMaxScaler()), ('knn', KNeighborsClassifier(n_neighbors=i, metric = 'minkowski', p = 2))])

    # clf = KNeighborsClassifier(n_neighbors=i, metric = 'minkowski', p = 2)
    k_folds = KFold(n_splits = 5)

    scores = cross_val_score(pipe, X, y, scoring='accuracy', cv = k_folds)
    print("k =",i)
    print(f"accuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))
    print()

k = 1
accuracy: 0.71461 (+/- 0.02667)

k = 2
accuracy: 0.74784 (+/- 0.01375)

k = 3
accuracy: 0.74783 (+/- 0.01995)

k = 4
accuracy: 0.76757 (+/- 0.01578)

k = 5
accuracy: 0.76473 (+/- 0.01698)

k = 6
accuracy: 0.77268 (+/- 0.01307)

k = 7
accuracy: 0.76729 (+/- 0.01901)

k = 8
accuracy: 0.77510 (+/- 0.01617)

k = 9
accuracy: 0.77382 (+/- 0.01849)

k = 10
accuracy: 0.77637 (+/- 0.01452)

k = 11
accuracy: 0.77140 (+/- 0.01148)

k = 12
accuracy: 0.77510 (+/- 0.01495)

k = 13
accuracy: 0.77311 (+/- 0.01813)

k = 14
accuracy: 0.77808 (+/- 0.02112)

k = 15
accuracy: 0.77921 (+/- 0.01805)

k = 16
accuracy: 0.78489 (+/- 0.02069)

k = 17
accuracy: 0.78219 (+/- 0.02012)

k = 18
accuracy: 0.78660 (+/- 0.01787)

k = 19
accuracy: 0.78376 (+/- 0.01854)



In [244]:
pipe = Pipeline([('scaler', MinMaxScaler()), ('knn', KNeighborsClassifier(n_neighbors=6, metric = 'minkowski', p = 2))])
k_folds = KFold(n_splits = 5)

scores = cross_val_score(pipe, X, y, scoring='accuracy', cv = k_folds)
print("k =",6)
print(f"accuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))

k = 6
accuracy: 0.77268 (+/- 0.01307)


# 3. Once the model is built,

## Show the confusion-matrix

In [245]:
from sklearn.metrics import classification_report
y_pred = cross_val_predict(pipe, X, y, cv=k_folds)
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y, y_pred))

[[4637  537]
 [1064  805]]


## Analyze precision, recall and f1-score of the two classes.

In [246]:
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.90      0.85      5174
           1       0.60      0.43      0.50      1869

    accuracy                           0.77      7043
   macro avg       0.71      0.66      0.68      7043
weighted avg       0.76      0.77      0.76      7043



# 4. Improve f1-score of the minority class using

In [254]:
print(df['Churn'].value_counts()) # 0 is No, 1 is Yes

Churn
0    5174
1    1869
Name: count, dtype: int64


## Sampling fixing size
- make a number of class 0 (No) equal to number of class 1 (Yes)

In [255]:
df_sapling_fixing_size = df[df['Churn'] == 0].sample(n=len(df[df['Churn'] == 1]), random_state=42)
df_sapling_fixing_size = pd.concat([df_sapling_fixing_size, df[df['Churn'] == 1]])
print(df_sapling_fixing_size['Churn'].value_counts())

Churn
0    1869
1    1869
Name: count, dtype: int64


## Over-sampling by duplication the minority class

In [256]:
df_train_majority = df[df['Churn'] == 0]
df_train_minority = df[df['Churn'] == 1]
df_train_minority_upsampled = df_train_minority.sample(n=len(df_train_majority), replace=True, random_state=42)
df_train_upsampled = pd.concat([df_train_majority, df_train_minority_upsampled])
print(df_train_upsampled['Churn'].value_counts())

Churn
0    5174
1    5174
Name: count, dtype: int64


## Over-sampling by SMOTE

In [257]:
from imblearn.over_sampling import SMOTE

X_train = df.drop('Churn', axis=1)
y_train = df['Churn']

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

smote_df = pd.concat([X_train_smote, y_train_smote], axis=1)
print(smote_df['Churn'].value_counts())

Churn
0    5174
1    5174
Name: count, dtype: int64


## Compare performance from the three techniques

### Sampling fixing size

In [258]:
x = df_sapling_fixing_size.drop('Churn', axis=1)
y = df_sapling_fixing_size['Churn']

pipe = Pipeline([('scaler', MinMaxScaler()), ('knn', KNeighborsClassifier(n_neighbors=6, metric = 'minkowski', p = 2))])
k_folds = KFold(n_splits = 5)

scores = cross_val_score(pipe, x, y, scoring='accuracy', cv = k_folds)
print("k =",6)
print(f"accuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))
y_pred = cross_val_predict(pipe, x, y, cv=k_folds)
print(confusion_matrix(y, y_pred))
print(classification_report(y, y_pred))

k = 6
accuracy: 0.63614 (+/- 0.12306)
[[1214  655]
 [ 705 1164]]
              precision    recall  f1-score   support

           0       0.63      0.65      0.64      1869
           1       0.64      0.62      0.63      1869

    accuracy                           0.64      3738
   macro avg       0.64      0.64      0.64      3738
weighted avg       0.64      0.64      0.64      3738



### Over-sampling by duplication the minority class

In [259]:
x = df_train_upsampled.drop('Churn', axis=1)
y = df_train_upsampled['Churn']

pipe = Pipeline([('scaler', MinMaxScaler()), ('knn', KNeighborsClassifier(n_neighbors=6, metric = 'minkowski', p = 2))])
k_folds = KFold(n_splits = 5)

scores = cross_val_score(pipe, x, y, scoring='accuracy', cv = k_folds)
print("k =",6)
print(f"accuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))
y_pred = cross_val_predict(pipe, x, y, cv=k_folds)
print(confusion_matrix(y, y_pred))
print(classification_report(y, y_pred))

k = 6
accuracy: 0.67481 (+/- 0.10449)
[[3376 1798]
 [1567 3607]]
              precision    recall  f1-score   support

           0       0.68      0.65      0.67      5174
           1       0.67      0.70      0.68      5174

    accuracy                           0.67     10348
   macro avg       0.68      0.67      0.67     10348
weighted avg       0.68      0.67      0.67     10348



### Over-sampling by SMOTE

In [260]:
x = smote_df.drop('Churn', axis=1)
y = smote_df['Churn']

pipe = Pipeline([('scaler', MinMaxScaler()), ('knn', KNeighborsClassifier(n_neighbors=6, metric = 'minkowski', p = 2))])
k_folds = KFold(n_splits = 5)

scores = cross_val_score(pipe, x, y, scoring='accuracy', cv = k_folds)
print("k =",6)
print(f"accuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))
y_pred = cross_val_predict(pipe, x, y, cv=k_folds)
print(confusion_matrix(y, y_pred))
print(classification_report(y, y_pred))

k = 6
accuracy: 0.83167 (+/- 0.13752)
[[4498  676]
 [1066 4108]]
              precision    recall  f1-score   support

           0       0.81      0.87      0.84      5174
           1       0.86      0.79      0.83      5174

    accuracy                           0.83     10348
   macro avg       0.83      0.83      0.83     10348
weighted avg       0.83      0.83      0.83     10348

