# Model Building (Customer Churn)

## Importing required Libraries

In [54]:
## we'll add all required libraries here
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, classification_report, confusion_matrix, f1_score, precision_score
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

## Loading Dataset

In [5]:
df = pd.read_csv('https://raw.githubusercontent.com/ujoshidev/Telco-Customer-Churn/main/Dataset/telco_churn_dummies.csv',index_col=[0])
df.head()

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,0,1,0,0,1,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.5,0,0,1,1,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,1,0,1,1,0,1,0,0,1,1,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0
3,0,42.3,1840.75,0,0,1,1,0,1,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0
4,0,70.7,151.65,1,1,0,1,0,1,0,0,1,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0


## Modeling

In [42]:
df.groupby('Churn')['Churn'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,5163.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1869.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [9]:
X=df.drop('Churn',axis=1)
y=df['Churn']
print(X.shape,'   ',y.shape)

(7032, 50)     (7032,)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape,'  ',y_train.shape,'  ',X_test.shape,'  ',y_test.shape)

(5625, 50)    (5625,)    (1407, 50)    (1407,)


### 1. Decision Tree Classifier

In [12]:
model_dt= DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=6, min_samples_leaf=8)
model_dt.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [13]:
y_pred=model_dt.predict(X_test)
y_pred

array([0, 0, 0, ..., 1, 0, 0])

In [14]:
model_dt.score(X_test,y_test)

0.7725657427149965

In [15]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.81      0.91      0.86      1039
           1       0.60      0.38      0.47       368

    accuracy                           0.77      1407
   macro avg       0.70      0.65      0.66      1407
weighted avg       0.75      0.77      0.75      1407



As it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.

We need to check recall, precision & f1 score for the minority class (Churn).

It's quite evident that the precision, recall & f1 score is too low for Churned customers.

We are using upsampling to cater this imbalance problem.

*Using Smote-ENN: Smote (Oversampler) combined with ENN (Undersampler)*

In [38]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(X,y)

In [39]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [40]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [41]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.941929974380871
              precision    recall  f1-score   support

           0       0.95      0.91      0.93       502
           1       0.93      0.97      0.95       669

    accuracy                           0.94      1171
   macro avg       0.94      0.94      0.94      1171
weighted avg       0.94      0.94      0.94      1171



Now we can see quite better results (accuracy) and a very good recall, precision & f1-score for minority class.

Let's try with some other classifier.

### 2. Random Forest Classifier

In [48]:
model_rf = RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)

In [49]:
print(model_rf.score(X_test,y_test))
print(classification_report(y_test, y_pred, labels=[0,1]))

0.7818052594171997
              precision    recall  f1-score   support

           0       0.82      0.90      0.86      1039
           1       0.61      0.45      0.52       368

    accuracy                           0.78      1407
   macro avg       0.72      0.67      0.69      1407
weighted avg       0.77      0.78      0.77      1407



In [52]:
X_resampled1, y_resampled1 = sm.fit_resample(X,y)
xr_train1, xr_test1, yr_train1, yr_test1 = train_test_split(X_resampled1, y_resampled1, test_size=0.2)


model_rf_smote = RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)
model_rf_smote.fit(xr_train1,yr_train1)
yr_predict1 = model_rf_smote.predict(xr_test1)

In [53]:
print(model_rf_smote.score(xr_test1, yr_test1))
print(metrics.classification_report(yr_test1, yr_predict1))

0.9296264118158123
              precision    recall  f1-score   support

           0       0.96      0.89      0.92       540
           1       0.91      0.96      0.94       611

    accuracy                           0.93      1151
   macro avg       0.93      0.93      0.93      1151
weighted avg       0.93      0.93      0.93      1151



With Random Forest Classifier, also we are able to get better than Decision Tree.

## Performing PCA

In [56]:
pca = PCA(0.9)
xr_train_pca = pca.fit_transform(xr_train1)
xr_test_pca = pca.transform(xr_test1)
explained_variance = pca.explained_variance_ratio_
print(explained_variance)

[0.99984698]


In [57]:
model = RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)
model.fit(xr_train_pca, yr_train1)
yr_predict_pca = model.predict(xr_test_pca)

In [58]:
print(model.score(xr_test_pca, yr_test1))
print(metrics.classification_report(yr_test1, yr_predict_pca))

0.7098175499565595
              precision    recall  f1-score   support

           0       0.77      0.55      0.64       540
           1       0.68      0.85      0.76       611

    accuracy                           0.71      1151
   macro avg       0.72      0.70      0.70      1151
weighted avg       0.72      0.71      0.70      1151



With PCA, we couldn't see any better results, hence let's finalise the model which was created by RF Classifier, and save the model.

## Saving Final Model

In [59]:
import pickle

filename = 'finalmodel.sav'
pickle.dump(model_rf_smote, open(filename, 'wb'))
load_model = pickle.load(open(filename, 'rb'))

print(load_model.score(xr_test1, yr_test1))

0.9296264118158123
