In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

In [2]:
data=pd.read_excel('data_churn.xlsx')
data.head()

Unnamed: 0.1,Unnamed: 0,customer_id,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn,country_France,country_Germany,country_Spain,gender_Female,gender_Male
0,0,15634602,619,42,2,0.0,1,1,1,101348.88,1,1,0,0,1,0
1,1,15647311,608,41,1,83807.86,1,0,1,112542.58,0,0,0,1,1,0
2,2,15619304,502,42,8,159660.8,3,1,0,113931.57,1,1,0,0,1,0
3,3,15701354,699,39,1,0.0,2,0,0,93826.63,0,1,0,0,1,0
4,4,15737888,850,43,2,125510.82,1,1,1,79084.1,0,0,0,1,1,0


In [7]:
data=data.drop('Unnamed: 0',axis=1)

In [8]:
# x and y variables
x=data.drop('churn',axis=1)
x

Unnamed: 0,customer_id,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,country_France,country_Germany,country_Spain,gender_Female,gender_Male
0,15634602,619,42,2,0.00,1,1,1,101348.88,1,0,0,1,0
1,15647311,608,41,1,83807.86,1,0,1,112542.58,0,0,1,1,0
2,15619304,502,42,8,159660.80,3,1,0,113931.57,1,0,0,1,0
3,15701354,699,39,1,0.00,2,0,0,93826.63,1,0,0,1,0
4,15737888,850,43,2,125510.82,1,1,1,79084.10,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,15606229,771,39,5,0.00,2,1,0,96270.64,1,0,0,0,1
9996,15569892,516,35,10,57369.61,1,1,1,101699.77,1,0,0,0,1
9997,15584532,709,36,7,0.00,1,0,1,42085.58,1,0,0,1,0
9998,15682355,772,42,3,75075.31,2,1,0,92888.52,0,1,0,0,1


In [9]:
y=data['churn']
y

0       1
1       0
2       1
3       0
4       0
       ..
9995    0
9996    0
9997    1
9998    1
9999    0
Name: churn, Length: 10000, dtype: int64

In [22]:
y.shape

(10000,)

# Train Test Split

In [10]:
x_train, x_test, y_train, y_test=train_test_split(x,y,train_size=0.8)

In [11]:
model=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [12]:
model.fit(x_train,y_train)

In [13]:
y_pred=model.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [30]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.87      0.97      0.91      1589
           1       0.76      0.42      0.54       411

    accuracy                           0.85      2000
   macro avg       0.81      0.69      0.73      2000
weighted avg       0.84      0.85      0.84      2000



In [31]:
# data is imbalanced. oversampling using smoteen

In [36]:
sm = SMOTEENN()
x_new, y_new = sm.fit_resample(x,y)

In [39]:
print(x_new.shape)
print(y_new.shape)

(7687, 14)
(7687,)


In [41]:
xn_train, xn_test, yn_train, yn_test=train_test_split(x_new,y_new,test_size=0.2)

In [42]:
model_new=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [43]:
model_new.fit(xn_train,yn_train)

In [44]:
yn_pred=model_new.predict(xn_test)

In [56]:
model_new.score(xn_test,yn_test)

0.834850455136541

In [46]:
print(classification_report(yn_test,yn_pred,labels=[0,1]))

              precision    recall  f1-score   support

           0       0.84      0.76      0.80       656
           1       0.83      0.89      0.86       882

    accuracy                           0.83      1538
   macro avg       0.84      0.83      0.83      1538
weighted avg       0.83      0.83      0.83      1538



In [48]:
print(metrics.confusion_matrix(yn_test, yn_pred))

[[500 156]
 [ 98 784]]


In [49]:
#using another classifier to see if it performs any better

In [50]:
from sklearn.ensemble import RandomForestClassifier

In [51]:
model_1=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [52]:
model_1.fit(x_train,y_train)

In [53]:
y1_pred=model_1.predict(x_test)

In [54]:
model_1.score(x_test,y_test)

0.847

In [58]:
print(classification_report(y_test,y1_pred,labels=[0,1]))

              precision    recall  f1-score   support

           0       0.85      0.99      0.91      1589
           1       0.87      0.30      0.45       411

    accuracy                           0.85      2000
   macro avg       0.86      0.64      0.68      2000
weighted avg       0.85      0.85      0.82      2000



In [57]:
#as it is imbalanced, let's perform oversampling

In [60]:
osm1=SMOTEENN()
x_n1,y_n1=osm1.fit_resample(x,y)

In [62]:
x1_train,x1_test,y1_train,y1_test=train_test_split(x_n1,y_n1,test_size=0.2)

In [63]:
model_n1=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [64]:
model_n1.fit(x1_train,y1_train)

In [65]:
yn1_pred=model_n1.predict(x1_test)
yn1_pred

array([0, 1, 0, ..., 0, 1, 0], dtype=int64)

In [66]:
model_n1.score(x1_test,y1_test)

0.8706338939197931

In [67]:
print(classification_report(y1_test,yn1_pred))

              precision    recall  f1-score   support

           0       0.86      0.83      0.84       645
           1       0.88      0.90      0.89       901

    accuracy                           0.87      1546
   macro avg       0.87      0.86      0.87      1546
weighted avg       0.87      0.87      0.87      1546



In [68]:
print(metrics.confusion_matrix(y1_test,yn1_pred))

[[534 111]
 [ 89 812]]


In [69]:
# RF classifier is performing better

# Pickling the model

In [70]:
import pickle

In [71]:
file='model.sav'

In [73]:
pickle.dump(model_n1, open(file,'wb'))

In [74]:
load=pickle.load(open(file,'rb'))

In [75]:
print(load.score(x1_test,y1_test))

0.8706338939197931
