# Importing libraries

In [3]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN


# Reading csv

In [4]:
df = pd.read_csv("tel_churn.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1-12,tenure_group_13-24,tenure_group_25-36,tenure_group_37-48,tenure_group_49-60,tenure_group_61-72
0,0,0,29,29,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56,1889,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53,108,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42,1840,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70,151,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [5]:
df =df.drop('Unnamed: 0', axis = 1)

In [6]:
x=df.drop('Churn', axis =1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1-12,tenure_group_13-24,tenure_group_25-36,tenure_group_37-48,tenure_group_49-60,tenure_group_61-72
0,0,29,29,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56,1889,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53,108,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42,1840,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70,151,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84,1990,0,1,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7028,0,103,7362,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7029,0,29,346,1,0,0,1,0,1,1,...,0,0,1,0,1,0,0,0,0,0
7030,1,74,306,0,1,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [7]:
y=df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

# Train Test Split

In [8]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

### Decision Tree Classifier

In [9]:
model_dt = DecisionTreeClassifier(criterion ='gini' ,random_state = 100, max_depth=6, min_samples_leaf =8)

In [10]:
model_dt.fit(x_train,y_train)

In [11]:
y_pred=model_dt.predict(x_test)
y_pred

array([0, 1, 1, ..., 0, 0, 0], dtype=int64)

In [12]:
model_dt.score(x_test,y_test)

0.7796730632551528

In [13]:
print(classification_report(y_test,y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86      1029
           1       0.62      0.47      0.53       378

    accuracy                           0.78      1407
   macro avg       0.72      0.68      0.69      1407
weighted avg       0.77      0.78      0.77      1407



#### As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.
#### Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.
#### Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

In [14]:
sm =SMOTEENN()
x_resampled,y_resampled = sm.fit_resample(x,y)

In [15]:
xr_train,xr_test,yr_train ,yr_test = train_test_split(x_resampled,y_resampled,test_size=0.2)

In [16]:
model_dt_smote=DecisionTreeClassifier(criterion='gini',random_state=100,max_depth=6, min_samples_leaf=8)

In [17]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict=model_dt_smote.predict(xr_test)
model_score_r= model_dt_smote.score(xr_test,yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test,yr_predict))

0.9509043927648578
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       548
           1       0.94      0.96      0.95       613

    accuracy                           0.95      1161
   macro avg       0.95      0.95      0.95      1161
weighted avg       0.95      0.95      0.95      1161



In [18]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict=model_dt_smote.predict(xr_test)
model_score_r =model_dt_smote.score(xr_test,yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test,yr_predict))

0.9509043927648578
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       548
           1       0.94      0.96      0.95       613

    accuracy                           0.95      1161
   macro avg       0.95      0.95      0.95      1161
weighted avg       0.95      0.95      0.95      1161



In [19]:
print(metrics.confusion_matrix(yr_test,yr_predict))

[[513  35]
 [ 22 591]]


### Now we can see quite better results,i.e Accuracy: 92% and a very good recall,precision & f1 score for minority class.
### Lets try with some other classifier.

## Random Forest CLassifier

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
model_rf=RandomForestClassifier(n_estimators = 100 , criterion ='gini', random_state=100,max_depth=6, min_samples_leaf=8)

In [22]:
model_rf.fit(x_train,y_train)

In [23]:
y_pred=model_rf.predict(x_test)

In [24]:
model_rf.score(x_test,y_test)

0.7931769722814499

In [25]:
print(classification_report(y_test,y_pred,labels=[0,1]))

              precision    recall  f1-score   support

           0       0.81      0.93      0.87      1029
           1       0.69      0.43      0.53       378

    accuracy                           0.79      1407
   macro avg       0.75      0.68      0.70      1407
weighted avg       0.78      0.79      0.78      1407



In [26]:
sm=SMOTEENN()
x_resampled1,y_resampled1=sm.fit_resample(x,y)

In [27]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(x_resampled1,y_resampled1,test_size=0.2)

In [28]:
model_rf_smote=RandomForestClassifier(max_depth=6,min_samples_leaf=8,random_state=100)

In [29]:
model_rf_smote.fit(xr_train1,yr_train1)

In [30]:
yr_predict1 =model_rf_smote.predict(xr_test1)

In [31]:
model_score_r1=model_rf_smote.score(xr_test1,yr_test1)

In [32]:
print(model_score_r1)
print(metrics.classification_report(yr_test1,yr_predict1))

0.9285099052540913
              precision    recall  f1-score   support

           0       0.96      0.89      0.92       552
           1       0.91      0.96      0.93       609

    accuracy                           0.93      1161
   macro avg       0.93      0.93      0.93      1161
weighted avg       0.93      0.93      0.93      1161



### with Random Forest Classifier, also we are able to get quite good result , infact better than Decision Tree.

# Pickling the model

In [33]:
import pickle

In [34]:
# Save model
file_name = 'model.sav'
with open(file_name, 'wb') as f:
    pickle.dump(model_rf_smote, f)

# Load model
with open(file_name, 'rb') as f:
    loaded_model = pickle.load(f)

In [35]:
model_score_rf = loaded_model.score(xr_test1,yr_test1)

In [36]:
model_score_r1

0.9285099052540913

In [37]:
import joblib

In [38]:
joblib.dump(model_rf,'churn_rf_model.pkl')

['churn_rf_model.pkl']