# Building Prediction Model

In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

In [2]:
df = pd.read_csv('tel_churn.csv')
df.head(3)

Unnamed: 0.1,Unnamed: 0,Monthly Charge,Total Charges,Churn Label,Gender_Female,Gender_Male,Senior Citizen_No,Senior Citizen_Yes,Married_No,Married_Yes,...,Paperless Billing_Yes,Payment Method_Bank Withdrawal,Payment Method_Credit Card,Payment Method_Mailed Check,Tenure_group_1 - 12,Tenure_group_13 - 12,Tenure_group_25 - 12,Tenure_group_37 - 12,Tenure_group_49 - 12,Tenure_group_61 - 12
0,0,39.65,39.65,1,False,True,False,True,True,False,...,True,True,False,False,True,False,False,False,False,False
1,1,80.65,633.3,1,True,False,False,True,False,True,...,True,False,True,False,True,False,False,False,False,False
2,2,95.45,1752.55,1,False,True,False,True,True,False,...,True,True,False,False,False,True,False,False,False,False


In [3]:
df = df.drop('Unnamed: 0', axis=1)

In [4]:
df.head()

Unnamed: 0,Monthly Charge,Total Charges,Churn Label,Gender_Female,Gender_Male,Senior Citizen_No,Senior Citizen_Yes,Married_No,Married_Yes,Dependents_No,...,Paperless Billing_Yes,Payment Method_Bank Withdrawal,Payment Method_Credit Card,Payment Method_Mailed Check,Tenure_group_1 - 12,Tenure_group_13 - 12,Tenure_group_25 - 12,Tenure_group_37 - 12,Tenure_group_49 - 12,Tenure_group_61 - 12
0,39.65,39.65,1,False,True,False,True,True,False,True,...,True,True,False,False,True,False,False,False,False,False
1,80.65,633.3,1,True,False,False,True,False,True,False,...,True,False,True,False,True,False,False,False,False,False
2,95.45,1752.55,1,False,True,False,True,True,False,False,...,True,True,False,False,False,True,False,False,False,False
3,98.5,2514.5,1,True,False,False,True,False,True,False,...,True,True,False,False,False,False,True,False,False,False
4,76.5,2868.15,1,True,False,False,True,False,True,False,...,True,True,False,False,False,False,False,True,False,False


In [5]:
x = df.drop('Churn Label', axis=1)
x

Unnamed: 0,Monthly Charge,Total Charges,Gender_Female,Gender_Male,Senior Citizen_No,Senior Citizen_Yes,Married_No,Married_Yes,Dependents_No,Dependents_Yes,...,Paperless Billing_Yes,Payment Method_Bank Withdrawal,Payment Method_Credit Card,Payment Method_Mailed Check,Tenure_group_1 - 12,Tenure_group_13 - 12,Tenure_group_25 - 12,Tenure_group_37 - 12,Tenure_group_49 - 12,Tenure_group_61 - 12
0,39.65,39.65,False,True,False,True,True,False,True,False,...,True,True,False,False,True,False,False,False,False,False
1,80.65,633.30,True,False,False,True,False,True,False,True,...,True,False,True,False,True,False,False,False,False,False
2,95.45,1752.55,False,True,False,True,True,False,False,True,...,True,True,False,False,False,True,False,False,False,False
3,98.50,2514.50,True,False,False,True,False,True,False,True,...,True,True,False,False,False,False,True,False,False,False
4,76.50,2868.15,True,False,False,True,False,True,False,True,...,True,True,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,21.15,1419.40,True,False,True,False,True,False,True,False,...,True,True,False,False,False,False,False,False,False,True
7039,84.80,1990.50,False,True,True,False,False,True,False,True,...,True,False,False,True,False,True,False,False,False,False
7040,103.20,7362.90,True,False,True,False,False,True,False,True,...,True,False,True,False,False,False,False,False,False,True
7041,29.60,346.45,True,False,True,False,False,True,False,True,...,True,True,False,False,True,False,False,False,False,False


In [6]:
y = df['Churn Label']
y

0       1
1       1
2       1
3       1
4       1
       ..
7038    0
7039    0
7040    0
7041    0
7042    0
Name: Churn Label, Length: 7043, dtype: int64

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

## Decision Tree classifier

In [8]:
model_dt = DecisionTreeClassifier(criterion='gini', max_depth=6, min_samples_leaf=8, random_state=100)

In [9]:
model_dt.fit(x_train, y_train)

In [10]:
y_pred = model_dt.predict(x_test)

In [11]:
y_pred

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [12]:
model_dt.score(x_test,y_pred)*100

100.0

In [13]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.82      0.93      0.88      1009
           1       0.75      0.49      0.60       400

    accuracy                           0.81      1409
   macro avg       0.79      0.71      0.74      1409
weighted avg       0.80      0.81      0.80      1409



In [14]:
print(confusion_matrix(y_test, y_pred))

[[943  66]
 [202 198]]


### Smart Analysis (smoteenn)
### Up sampling and down sampling
### It use to resample using Smote and cleaning using ENN

In [15]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [16]:
xr_train, xr_test, yr_train, yr_test = train_test_split(X_resampled, y_resampled, test_size=0.2)

In [17]:
model_dt_smote = DecisionTreeClassifier(criterion='gini', max_depth=6, min_samples_leaf=8)
model_dt_smote.fit(xr_train, yr_train)

In [18]:
y_pred_smote = model_dt_smote.predict(xr_test)

In [19]:
print(classification_report(yr_test, y_pred_smote, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.95      0.89      0.92       530
           1       0.92      0.96      0.94       664

    accuracy                           0.93      1194
   macro avg       0.93      0.93      0.93      1194
weighted avg       0.93      0.93      0.93      1194



In [20]:
print(confusion_matrix(yr_test, y_pred_smote))

[[473  57]
 [ 27 637]]


# Random Forest Classifier

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
model_rf = RandomForestClassifier(criterion='gini', max_depth=6, min_samples_leaf=8, n_estimators=100)
model_rf.fit(x_train, y_train)
y_pred_rf = model_rf.predict(x_test)

In [23]:
print(classification_report(y_test, y_pred_rf, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.84      0.92      0.88      1009
           1       0.73      0.56      0.64       400

    accuracy                           0.82      1409
   macro avg       0.79      0.74      0.76      1409
weighted avg       0.81      0.82      0.81      1409



In [24]:
sm = SMOTEENN()
Xr_resampled, yr_resampled = sm.fit_resample(x,y)
xr_train, xr_test, yr_train, yr_test = train_test_split(Xr_resampled, yr_resampled, test_size=0.2)
model_smote_rf = DecisionTreeClassifier(criterion='gini', max_depth=6, min_samples_leaf=8)
model_smote_rf.fit(xr_train, yr_train)
y_pred_smote_rf = model_smote_rf.predict(xr_test)

In [25]:
print(classification_report(yr_test, y_pred_smote_rf, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93       549
           1       0.94      0.94      0.94       651

    accuracy                           0.93      1200
   macro avg       0.93      0.93      0.93      1200
weighted avg       0.93      0.93      0.93      1200



In [26]:
print(confusion_matrix(yr_test, y_pred_smote_rf))

[[509  40]
 [ 40 611]]


## How to save the code?? 
## Use pickle to save the code and re use it when needed.

## Now run the model and see the score directly

In [27]:
import pickle
filename = 'model.pkl'
pickle.dump(model_smote_rf, open(filename, 'wb'))

In [29]:
load_model = pickle.load(open(filename, 'rb'))

In [30]:
load_model.score(xr_test, yr_test)

0.9333333333333333