### Importing required modules

In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore")

### Reading CSV File

In [2]:
df=pd.read_csv('cleaned_telecom_churn_data.csv',index_col=0)


In [3]:
df.head()

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,0,1,0,0,1,1,0,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.5,0,0,1,1,0,1,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,1,0,1,1,0,1,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.3,1840.75,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
4,0,70.7,151.65,1,1,0,1,0,1,0,...,0,0,1,0,1,0,0,0,0,0


In [4]:
x=df.drop('Churn',axis=1)
y=df['Churn']

### Splitting data fot training and testing.

In [5]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

###  Expermenting different Classification ML models

 #### 1.) LOGISTICS REGRESSION

In [6]:
model_lr=LogisticRegression()

In [7]:
model_lr.fit(x_train,y_train)

LogisticRegression()

In [8]:
y_pred=model_lr.predict(x_test)
y_pred

array([0, 0, 1, ..., 0, 1, 0], dtype=int64)

In [9]:
model_lr.score(x_test,y_test)

0.8073916133617626

In [10]:
print(classification_report(y_test, y_pred, ))

              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1026
           1       0.70      0.51      0.59       381

    accuracy                           0.81      1407
   macro avg       0.77      0.71      0.73      1407
weighted avg       0.80      0.81      0.80      1407



In [11]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [12]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [13]:
model_lr_resampled=LogisticRegression()

In [14]:
model_lr_resampled.fit(xr_train,yr_train)

LogisticRegression()

In [15]:
y_pred=model_lr.predict(xr_test)
y_pred

array([0, 0, 0, ..., 0, 1, 1], dtype=int64)

In [16]:
model_lr.score(xr_test,yr_test)

0.8266438941076003

Could not get a good accuracy using logistic regression even after solving imbalance data problem.

#### 2.) DECISION TREE CLASSIFIER

In [17]:
model_dt=DecisionTreeClassifier(criterion ="gini" ,random_state = 42,max_depth=6, min_samples_leaf=8)

In [18]:
model_dt.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=42)

In [19]:
y_pred=model_dt.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [20]:
model_dt.score(x_test,y_test)

0.7818052594171997

In [21]:
print(classification_report(y_test, y_pred, ))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86      1026
           1       0.63      0.48      0.54       381

    accuracy                           0.78      1407
   macro avg       0.72      0.69      0.70      1407
weighted avg       0.77      0.78      0.77      1407



As we can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.

Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.

Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

# 

In [22]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

 **SMOTE-ENN** method combines the SMOTE <u>ability to generate synthetic examples for minority class</u> and <u>ENN ability to delete some observations from both classes that are identified as having different class between the observation’s class and its K-nearest neighbor majority class</u>.

In [23]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [24]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 42,max_depth=6, min_samples_leaf=8)

In [25]:

model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.9213483146067416
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       510
           1       0.94      0.92      0.93       647

    accuracy                           0.92      1157
   macro avg       0.92      0.92      0.92      1157
weighted avg       0.92      0.92      0.92      1157



In [26]:
print(metrics.confusion_matrix(yr_test, yr_predict))

[[473  37]
 [ 54 593]]


Now we can see quite better results, and a very good recall, precision & f1 score for minority class.

accuracy obtained through decision forest:93% 

#### 3.) RANDOM FOREST CLASSIFIER

In [27]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 42,max_depth=6, min_samples_leaf=8)
model_rf.fit(x_train,y_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=42)

In [28]:
model_rf.fit(x_train,y_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=42)

In [29]:
y_pred=model_rf.predict(x_test)

In [30]:
model_rf.score(x_test,y_test)

0.7995735607675906

In [31]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.82      0.92      0.87      1026
           1       0.69      0.46      0.56       381

    accuracy                           0.80      1407
   macro avg       0.76      0.69      0.71      1407
weighted avg       0.79      0.80      0.79      1407



In [32]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x,y)

In [33]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)

In [34]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 42,max_depth=6, min_samples_leaf=8)
model_rf_smote.fit(xr_train1,yr_train1)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=42)

In [35]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [36]:
 model_rf_smote.score(xr_test1, yr_test1)

0.9363327674023769

In [37]:
print(metrics.classification_report(yr_test1, yr_predict1))

              precision    recall  f1-score   support

           0       0.95      0.90      0.93       538
           1       0.92      0.96      0.94       640

    accuracy                           0.94      1178
   macro avg       0.94      0.93      0.94      1178
weighted avg       0.94      0.94      0.94      1178



In [38]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))

[[486  52]
 [ 23 617]]


With RF Classifier, we get results better than Decision Tree.

accuracy obtained through random forest:94% 

==============================================================

### <u>**We are finalising the model obtained using Random Forest Classifier.**</u>

## Saving the model 

In [39]:
import pickle

In [40]:
filename='model.pkl'

In [41]:
pickle.dump(model_rf_smote, open(filename, 'wb'))

In [42]:
load_model = pickle.load(open(filename, 'rb'))

In [43]:
model_score_r1 = load_model.score(xr_test1, yr_test1)

In [44]:
model_score_r1

0.9363327674023769

Our final model i.e. RF Classifier with SMOTEENN, is now ready and dumped in model.pkl, which we will use and prepare API's so that we can access our model from UI.