# Importing Libraries

In [7]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Reading csv

In [2]:
df=pd.read_csv("final_df.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn,Gender_Female,Gender_Male,Location_Chicago,Location_Houston,Location_Los Angeles,Location_Miami,Location_New York
0,0,63,17,73.36,236,0,0,1,0,0,1,0,0
1,1,62,1,48.76,172,0,1,0,0,0,0,0,1
2,2,24,5,85.47,460,0,1,0,0,0,1,0,0
3,3,36,3,97.94,297,1,1,0,0,0,0,1,0
4,4,46,19,58.14,266,0,1,0,0,0,0,1,0


In [3]:
df=df.drop('Unnamed: 0',axis=1)

In [4]:
x=df.drop('Churn',axis=1)
x

Unnamed: 0,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Gender_Female,Gender_Male,Location_Chicago,Location_Houston,Location_Los Angeles,Location_Miami,Location_New York
0,63,17,73.36,236,0,1,0,0,1,0,0
1,62,1,48.76,172,1,0,0,0,0,0,1
2,24,5,85.47,460,1,0,0,0,1,0,0
3,36,3,97.94,297,1,0,0,0,0,1,0
4,46,19,58.14,266,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
99995,33,23,55.13,226,0,1,0,1,0,0,0
99996,62,19,61.65,351,1,0,0,0,0,0,1
99997,64,17,96.11,251,0,1,1,0,0,0,0
99998,51,20,49.25,434,1,0,0,0,0,0,1


In [5]:
y=df['Churn']
y

0        0
1        0
2        0
3        1
4        0
        ..
99995    1
99996    0
99997    1
99998    1
99999    1
Name: Churn, Length: 100000, dtype: int64

**Train Test Split**

In [6]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

# Modeling

**Logistic Regression**

In [8]:
import warnings
# Create an instance of the LogisticRegression model
model1 = LogisticRegression()

# Train the model on the training data
model1.fit(x_train, y_train)

In [9]:
y_pred1=model1.predict(x_test)
y_pred1

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [23]:
model1.score(x_test,y_test)

0.5041

In [24]:
print(classification_report(y_test, y_pred1, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.51      0.56      0.53     10165
           1       0.50      0.45      0.47      9835

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.50      0.50      0.50     20000



Since the accuracy with Logistic Regression is very less as we can get this accuracy while giving output as 1 in all cases becuase our dataset contains 50% churners.

**RandomForest**

In [25]:
# Create an instance of the RandomForestClassifier model
model2 = RandomForestClassifier()

# Train the model on the training data
model2.fit(x_train, y_train)

In [26]:
y_pred2=model1.predict(x_test)
y_pred2

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [27]:
print(classification_report(y_test, y_pred2, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.51      0.56      0.53     10165
           1       0.50      0.45      0.47      9835

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.50      0.50      0.50     20000



In case of random forest also we can see there is very much less precision, recall and f1-score.

Now to improve the accuracy of model let's try to upsample the dataset.

In [28]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [29]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [30]:
# Create an instance of the RandomForestClassifier model
model_smote = RandomForestClassifier()


In [31]:
model_smote.fit(xr_train,yr_train)
yr_predict = model_smote.predict(xr_test)
model_score_r = model_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.587265055619486
              precision    recall  f1-score   support

           0       0.58      0.59      0.59      1283
           1       0.60      0.58      0.59      1324

    accuracy                           0.59      2607
   macro avg       0.59      0.59      0.59      2607
weighted avg       0.59      0.59      0.59      2607



Something better.

Let's try SMOTE process once again.

In [35]:
sm = SMOTEENN()
X_resampled2, y_resampled2 = sm.fit_resample(X_resampled, y_resampled)

In [36]:
xr2_train,xr2_test,yr2_train,yr2_test=train_test_split(X_resampled2, y_resampled2,test_size=0.2)

In [37]:
# Create an instance of the RandomForestClassifier model
model_smote2 = RandomForestClassifier()

In [38]:
model_smote2.fit(xr2_train,yr2_train)
yr2_predict = model_smote.predict(xr2_test)
model_score_r2 = model_smote.score(xr2_test, yr2_test)
print(model_score_r2)
print(metrics.classification_report(yr2_test, yr2_predict))

0.931497175141243
              precision    recall  f1-score   support

           0       0.92      0.94      0.93       660
           1       0.95      0.92      0.94       756

    accuracy                           0.93      1416
   macro avg       0.93      0.93      0.93      1416
weighted avg       0.93      0.93      0.93      1416



Best accuracy, Final model. Yayy.

# Saving the model

In [39]:
import pickle

In [40]:
filename = 'model.sav'

In [41]:
pickle.dump(model_smote2, open(filename, 'wb'))

Our final model i.e. RF Classifier with SMOTEENN, is now ready and dumped in model.sav, which we will use and prepare API's so that we can access our model from UI.