In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the churn workbook, using its first column as the row index, and preview the first rows.
df = pd.read_excel("Churn_Modelling.xlsx",index_col=0)
df.head()

Unnamed: 0_level_0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1 to 10000
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       10000 non-null  int64  
 1   Surname          10000 non-null  object 
 2   CreditScore      10000 non-null  int64  
 3   Geography        10000 non-null  object 
 4   Gender           10000 non-null  object 
 5   Age              10000 non-null  int64  
 6   Tenure           10000 non-null  int64  
 7   Balance          10000 non-null  float64
 8   NumOfProducts    10000 non-null  int64  
 9   HasCrCard        10000 non-null  int64  
 10  IsActiveMember   10000 non-null  int64  
 11  EstimatedSalary  10000 non-null  float64
 12  Exited           10000 non-null  int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 1.1+ MB


In [4]:
# Number of missing values in each column.
df.isna().sum()

CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [5]:
# Row count per value of the target column, to check class balance.
df["Exited"].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [6]:
# Remove CustomerId, Surname and Geography; they are not used as features below.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: a second execution no longer raises KeyError.
df = df.drop(columns=["CustomerId","Surname","Geography"], errors="ignore")
df

Unnamed: 0_level_0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,619,Female,42,2,0.00,1,1,1,101348.88,1
2,608,Female,41,1,83807.86,1,0,1,112542.58,0
3,502,Female,42,8,159660.80,3,1,0,113931.57,1
4,699,Female,39,1,0.00,2,0,0,93826.63,0
5,850,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...
9996,771,Male,39,5,0.00,2,1,0,96270.64,0
9997,516,Male,35,10,57369.61,1,1,1,101699.77,0
9998,709,Female,36,7,0.00,1,0,1,42085.58,1
9999,772,Male,42,3,75075.31,2,1,0,92888.52,1


In [7]:
# Feature matrix: every column except the target. Selecting by name (rather
# than by position) stays correct even if the column order changes, and
# mirrors how the target is selected with df["Exited"].
x = df.drop(columns="Exited")
x

Unnamed: 0_level_0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,619,Female,42,2,0.00,1,1,1,101348.88
2,608,Female,41,1,83807.86,1,0,1,112542.58
3,502,Female,42,8,159660.80,3,1,0,113931.57
4,699,Female,39,1,0.00,2,0,0,93826.63
5,850,Female,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...
9996,771,Male,39,5,0.00,2,1,0,96270.64
9997,516,Male,35,10,57369.61,1,1,1,101699.77
9998,709,Female,36,7,0.00,1,0,1,42085.58
9999,772,Male,42,3,75075.31,2,1,0,92888.52


In [8]:
# Target vector: the Exited column.
y=df["Exited"]
y

RowNumber
1        1
2        0
3        1
4        0
5        0
        ..
9996     0
9997     0
9998     1
9999     1
10000    0
Name: Exited, Length: 10000, dtype: int64

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

# Gender is ordinal-encoded; every other feature column is standardised.
CT = ColumnTransformer(transformers = [ ("trans1",OrdinalEncoder(),["Gender"]),
                                        ("trans2",StandardScaler(),['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
                                                                    'HasCrCard', 'IsActiveMember', 'EstimatedSalary'])], remainder='passthrough')

# Fit the encoder/scaler on the training rows only, so that test-set statistics
# do not leak into preprocessing. The row positions are reproduced with the
# same test_size/random_state as the train_test_split(x, y, ...) call that
# follows, so they select exactly the rows that end up in xtrain.
# Keep the two sets of split parameters in sync if either one is changed.
train_rows = train_test_split(np.arange(len(x)), test_size=0.20, random_state=1)[0]
CT.fit(x.iloc[train_rows])
x = CT.transform(x)

In [10]:
# Hold out 20% of the rows as the test set; random_state=1 fixes the shuffle.
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.20,random_state=1)

In [11]:
def mymodel(model):
    """Fit ``model`` on the training split and print its test-set report.

    Parameters
    ----------
    model : object exposing ``fit`` and ``predict``
        Fitted in place on the notebook-level ``xtrain`` / ``ytrain``.

    Prints the ``classification_report`` of ``ytest`` against the
    predictions made for ``xtest``. Returns nothing.
    """
    model.fit(xtrain, ytrain)
    report = classification_report(ytest, model.predict(xtest))
    print(report)

In [12]:
# Baseline estimators, all with library-default settings.
lreg=LogisticRegression()
knn=KNeighborsClassifier()
svm=SVC()
# NOTE(review): no random_state is passed, so the tree's results may differ between runs — confirm whether a seed is wanted.
dt=DecisionTreeClassifier()

In [13]:
# Baseline: logistic regression with default settings.
mymodel(lreg)

              precision    recall  f1-score   support

           0       0.82      0.97      0.89      1585
           1       0.58      0.17      0.26       415

    accuracy                           0.80      2000
   macro avg       0.70      0.57      0.57      2000
weighted avg       0.77      0.80      0.76      2000



In [14]:
# (solver, penalty) pairs to compare for LogisticRegression.
# NOTE(review): newer scikit-learn releases expect penalty=None instead of the
# string 'none' — confirm against the installed version before re-running.
params = [ ['lbfgs','l2'],['lbfgs','none'],
           ['liblinear','l1'],['liblinear','l2'],
           ['newton-cg','l2'],['newton-cg','none'],
           ['sag','l2'],['sag','none'],
           ['saga','l1'],['saga','l2'],['saga','none']  ]

# ['saga','elasticnet'] is left out of the sweep. The numeric features are
# already standardised by the ColumnTransformer applied to x, so scaling is not
# the blocker; presumably it is omitted because that penalty also needs
# l1_ratio — TODO confirm.

# Hold-out accuracy of each combination, in the same order as `params`.
all_combinations = []

# LogisticRegression and accuracy_score come from the notebook's import cell;
# they are no longer re-imported on every iteration.
for i in params:
    solver, penalty = i

    model = LogisticRegression(solver=solver, penalty=penalty)
    model.fit(xtrain,ytrain)

    ypred = model.predict(xtest)
    acc = accuracy_score(ytest,ypred)

    print(f"{i} ---> {acc} ")

    all_combinations.append(acc)

['lbfgs', 'l2'] ---> 0.8025 
['lbfgs', 'none'] ---> 0.802 
['liblinear', 'l1'] ---> 0.802 
['liblinear', 'l2'] ---> 0.8025 
['newton-cg', 'l2'] ---> 0.8025 
['newton-cg', 'none'] ---> 0.802 
['sag', 'l2'] ---> 0.8025 
['sag', 'none'] ---> 0.802 
['saga', 'l1'] ---> 0.802 
['saga', 'l2'] ---> 0.8025 
['saga', 'none'] ---> 0.802 


In [15]:
# Best hyperparameters for Logistic Regression: the first (solver, penalty)
# pair in `params` that reaches the highest hold-out accuracy.
best_lr_acc = max(all_combinations)
best_lr_params = params[all_combinations.index(best_lr_acc)]
print(f"{best_lr_params} ---> {best_lr_acc}")

['lbfgs', 'l2'] ---> 0.8025


In [16]:
# Baseline: k-nearest neighbours with default settings.
mymodel(knn)

              precision    recall  f1-score   support

           0       0.86      0.96      0.90      1585
           1       0.70      0.40      0.50       415

    accuracy                           0.84      2000
   macro avg       0.78      0.68      0.70      2000
weighted avg       0.82      0.84      0.82      2000



In [17]:
# Hold-out accuracy for n_neighbors = 1..29; position j holds the score for k = j + 1.
ac_list=[]

for k in range(1,30):
    knn=KNeighborsClassifier(n_neighbors=k)
    ypred=knn.fit(xtrain,ytrain).predict(xtest)
    k_acc=accuracy_score(ytest,ypred)
    ac_list.append(k_acc)
    print(k,"---->",k_acc)

1 ----> 0.7935
2 ----> 0.8325
3 ----> 0.8275
4 ----> 0.84
5 ----> 0.839
6 ----> 0.8445
7 ----> 0.8485
8 ----> 0.839
9 ----> 0.8415
10 ----> 0.8415
11 ----> 0.8435
12 ----> 0.842
13 ----> 0.844
14 ----> 0.842
15 ----> 0.844
16 ----> 0.8405
17 ----> 0.844
18 ----> 0.8415
19 ----> 0.844
20 ----> 0.844
21 ----> 0.8475
22 ----> 0.8455
23 ----> 0.8445
24 ----> 0.844
25 ----> 0.844
26 ----> 0.842
27 ----> 0.8415
28 ----> 0.84
29 ----> 0.842


In [18]:
# n_neighbors with the highest accuracy; +1 because the sweep started at k=1 while list positions start at 0.
ac_list.index(max(ac_list))+1

7

In [19]:
# Re-fit KNN with the best k found by the sweep and print its full report.
# k is derived from ac_list instead of being hard-coded, and the shared
# mymodel helper replaces the duplicated fit / predict / report lines.
best_k = ac_list.index(max(ac_list)) + 1
knn = KNeighborsClassifier(n_neighbors=best_k)
mymodel(knn)

              precision    recall  f1-score   support

           0       0.85      0.98      0.91      1585
           1       0.79      0.36      0.50       415

    accuracy                           0.85      2000
   macro avg       0.82      0.67      0.70      2000
weighted avg       0.84      0.85      0.83      2000



In [20]:
# Best hyperparameter for KNN: the smallest n_neighbors that reaches the
# highest hold-out accuracy, printed together with that accuracy.
best_knn_acc = max(ac_list)
best_knn_k = ac_list.index(best_knn_acc)+1
print(f"{best_knn_k} ---> {best_knn_acc}")

7 ---> 0.8485


In [21]:
# Baseline: support vector classifier with default settings.
mymodel(svm)

              precision    recall  f1-score   support

           0       0.86      0.98      0.91      1585
           1       0.84      0.37      0.51       415

    accuracy                           0.85      2000
   macro avg       0.85      0.67      0.71      2000
weighted avg       0.85      0.85      0.83      2000



# Decision Tree

In [22]:
# Baseline: decision tree with default settings (no depth or leaf-size limit given).
mymodel(dt)

              precision    recall  f1-score   support

           0       0.86      0.86      0.86      1585
           1       0.47      0.47      0.47       415

    accuracy                           0.78      2000
   macro avg       0.67      0.66      0.67      2000
weighted avg       0.78      0.78      0.78      2000



# Hyperparameter tuning

In [23]:
# First try at limiting tree depth: max_depth=5.
dt1=DecisionTreeClassifier(max_depth=5)
mymodel(dt1)

              precision    recall  f1-score   support

           0       0.86      0.98      0.91      1585
           1       0.81      0.38      0.52       415

    accuracy                           0.85      2000
   macro avg       0.83      0.68      0.72      2000
weighted avg       0.85      0.85      0.83      2000



In [24]:
# Sweep max_depth over 1..49 (default split criterion) and print the hold-out accuracy for each depth.
for depth in range(1,50):
    dt2=DecisionTreeClassifier(max_depth=depth).fit(xtrain,ytrain)
    ypred=dt2.predict(xtest)
    depth_acc=accuracy_score(ytest,ypred)
    print(f" {depth} = {depth_acc}")

 1 = 0.7925
 2 = 0.828
 3 = 0.838
 4 = 0.846
 5 = 0.853
 6 = 0.8535
 7 = 0.845
 8 = 0.836
 9 = 0.8375
 10 = 0.827
 11 = 0.819
 12 = 0.813
 13 = 0.8055
 14 = 0.8005
 15 = 0.798
 16 = 0.7905
 17 = 0.791
 18 = 0.794
 19 = 0.78
 20 = 0.7915
 21 = 0.787
 22 = 0.7805
 23 = 0.7885
 24 = 0.7775
 25 = 0.7815
 26 = 0.7815
 27 = 0.7805
 28 = 0.779
 29 = 0.7835
 30 = 0.784
 31 = 0.778
 32 = 0.7835
 33 = 0.785
 34 = 0.7745
 35 = 0.7805
 36 = 0.7835
 37 = 0.7835
 38 = 0.778
 39 = 0.778
 40 = 0.7805
 41 = 0.779
 42 = 0.784
 43 = 0.7775
 44 = 0.7775
 45 = 0.7805
 46 = 0.7815
 47 = 0.7865
 48 = 0.7845
 49 = 0.782


# max_depth

In [25]:
# Best max_depth from the sweep above is 6 (default criterion); show its full report.
dt3=DecisionTreeClassifier(max_depth=6)
mymodel(dt3)

              precision    recall  f1-score   support

           0       0.86      0.97      0.91      1585
           1       0.77      0.41      0.54       415

    accuracy                           0.85      2000
   macro avg       0.82      0.69      0.73      2000
weighted avg       0.84      0.85      0.84      2000



# min_samples_leaf

In [26]:
# First try at limiting leaf size: min_samples_leaf=10.
dt4=DecisionTreeClassifier(min_samples_leaf=10)
mymodel(dt4)

              precision    recall  f1-score   support

           0       0.86      0.93      0.89      1585
           1       0.61      0.43      0.51       415

    accuracy                           0.82      2000
   macro avg       0.74      0.68      0.70      2000
weighted avg       0.81      0.82      0.81      2000



In [27]:
# Sweep min_samples_leaf over 1..74 (default split criterion) and print the hold-out accuracy for each value.
for leaf in range(1,75):
    dt5=DecisionTreeClassifier(min_samples_leaf=leaf).fit(xtrain,ytrain)
    ypred=dt5.predict(xtest)
    leaf_acc=accuracy_score(ytest,ypred)
    print(f"{leaf} = {leaf_acc}")

1 = 0.776
2 = 0.7935
3 = 0.7965
4 = 0.8085
5 = 0.8085
6 = 0.816
7 = 0.8145
8 = 0.822
9 = 0.824
10 = 0.8265
11 = 0.8295
12 = 0.832
13 = 0.833
14 = 0.8385
15 = 0.84
16 = 0.84
17 = 0.8405
18 = 0.845
19 = 0.8425
20 = 0.844
21 = 0.8505
22 = 0.8515
23 = 0.849
24 = 0.8495
25 = 0.848
26 = 0.8495
27 = 0.8505
28 = 0.851
29 = 0.847
30 = 0.848
31 = 0.848
32 = 0.848
33 = 0.848
34 = 0.8535
35 = 0.8535
36 = 0.8535
37 = 0.8535
38 = 0.8515
39 = 0.8515
40 = 0.8445
41 = 0.8445
42 = 0.844
43 = 0.847
44 = 0.847
45 = 0.847
46 = 0.847
47 = 0.847
48 = 0.8435
49 = 0.8435
50 = 0.8435
51 = 0.8435
52 = 0.8425
53 = 0.8405
54 = 0.8425
55 = 0.8345
56 = 0.8345
57 = 0.8345
58 = 0.832
59 = 0.8325
60 = 0.8325
61 = 0.838
62 = 0.838
63 = 0.838
64 = 0.838
65 = 0.838
66 = 0.838
67 = 0.838
68 = 0.837
69 = 0.837
70 = 0.837
71 = 0.837
72 = 0.837
73 = 0.837
74 = 0.837


In [28]:
# min_samples_leaf=34: the smallest value that reached the top accuracy in the sweep above.
dt6=DecisionTreeClassifier(min_samples_leaf=34)
mymodel(dt6)

              precision    recall  f1-score   support

           0       0.87      0.96      0.91      1585
           1       0.74      0.45      0.56       415

    accuracy                           0.85      2000
   macro avg       0.81      0.70      0.74      2000
weighted avg       0.84      0.85      0.84      2000



In [29]:
# Combine the two individually tuned limits: max_depth=6 and min_samples_leaf=34.
dt7=DecisionTreeClassifier(max_depth=6,min_samples_leaf=34)
mymodel(dt7)

              precision    recall  f1-score   support

           0       0.86      0.97      0.91      1585
           1       0.78      0.41      0.54       415

    accuracy                           0.85      2000
   macro avg       0.82      0.69      0.73      2000
weighted avg       0.85      0.85      0.84      2000



# gini

In [30]:
# Gini criterion stated explicitly, with min_samples_leaf=34.
# NOTE(review): gini is the documented scikit-learn default, so this should match dt6 — confirm the repeat is intended.
dt8=DecisionTreeClassifier(criterion="gini",min_samples_leaf=34)
mymodel(dt8)

              precision    recall  f1-score   support

           0       0.87      0.96      0.91      1585
           1       0.74      0.45      0.56       415

    accuracy                           0.85      2000
   macro avg       0.81      0.70      0.74      2000
weighted avg       0.84      0.85      0.84      2000



In [31]:
# Gini criterion stated explicitly, with max_depth=6.
dt9=DecisionTreeClassifier(criterion="gini",max_depth=6)
mymodel(dt9)

              precision    recall  f1-score   support

           0       0.86      0.97      0.91      1585
           1       0.77      0.41      0.54       415

    accuracy                           0.85      2000
   macro avg       0.82      0.69      0.73      2000
weighted avg       0.84      0.85      0.84      2000



In [32]:
# Same depth limit (6) with the entropy criterion, for comparison with gini.
dt10=DecisionTreeClassifier(criterion="entropy",max_depth=6)
mymodel(dt10)

              precision    recall  f1-score   support

           0       0.86      0.97      0.91      1585
           1       0.81      0.40      0.54       415

    accuracy                           0.86      2000
   macro avg       0.83      0.69      0.73      2000
weighted avg       0.85      0.86      0.84      2000



In [33]:
# Sweep max_depth over 1..49 with the entropy criterion and print the hold-out accuracy for each depth.
for depth in range(1,50):
    dt11=DecisionTreeClassifier(criterion="entropy",max_depth=depth).fit(xtrain,ytrain)
    ypred=dt11.predict(xtest)
    depth_acc=accuracy_score(ytest,ypred)
    print(f"{depth}= {depth_acc}")

1= 0.7925
2= 0.82
3= 0.838
4= 0.8465
5= 0.8555
6= 0.856
7= 0.85
8= 0.849
9= 0.8455
10= 0.8365
11= 0.8335
12= 0.8205
13= 0.8215
14= 0.8105
15= 0.8045
16= 0.807
17= 0.8025
18= 0.8
19= 0.792
20= 0.796
21= 0.7945
22= 0.796
23= 0.7915
24= 0.791
25= 0.7935
26= 0.7925
27= 0.7955
28= 0.797
29= 0.794
30= 0.787
31= 0.792
32= 0.794
33= 0.7915
34= 0.7925
35= 0.792
36= 0.7915
37= 0.791
38= 0.7865
39= 0.7915
40= 0.7935
41= 0.7975
42= 0.7885
43= 0.789
44= 0.7885
45= 0.792
46= 0.792
47= 0.7985
48= 0.7875
49= 0.7985


In [34]:
# Entropy tree with max_depth=5.
# NOTE(review): the recorded sweep in the previous cell ranks max_depth=6 marginally above 5 — confirm 5 is the intended "best" value.
dt12=DecisionTreeClassifier(criterion="entropy",max_depth=5)
mymodel(dt12)

              precision    recall  f1-score   support

           0       0.86      0.97      0.91      1585
           1       0.81      0.40      0.53       415

    accuracy                           0.86      2000
   macro avg       0.83      0.69      0.72      2000
weighted avg       0.85      0.86      0.84      2000



In [35]:
# First try at limiting leaf size with the entropy criterion: min_samples_leaf=11.
dt13=DecisionTreeClassifier(criterion="entropy",min_samples_leaf=11)
mymodel(dt13)

              precision    recall  f1-score   support

           0       0.87      0.93      0.90      1585
           1       0.64      0.45      0.52       415

    accuracy                           0.83      2000
   macro avg       0.75      0.69      0.71      2000
weighted avg       0.82      0.83      0.82      2000



In [36]:
# Sweep min_samples_leaf over 1..49 with the entropy criterion and print the hold-out accuracy for each value.
for leaf in range(1,50):
    dt14=DecisionTreeClassifier(criterion="entropy",min_samples_leaf=leaf).fit(xtrain,ytrain)
    ypred=dt14.predict(xtest)
    leaf_acc=accuracy_score(ytest,ypred)
    print(f"{leaf}= {leaf_acc}")

1= 0.792
2= 0.813
3= 0.791
4= 0.801
5= 0.8055
6= 0.8085
7= 0.816
8= 0.819
9= 0.8255
10= 0.8295
11= 0.8335
12= 0.831
13= 0.832
14= 0.84
15= 0.8375
16= 0.8355
17= 0.832
18= 0.833
19= 0.832
20= 0.8345
21= 0.8365
22= 0.8415
23= 0.8425
24= 0.8455
25= 0.8495
26= 0.8495
27= 0.8505
28= 0.8535
29= 0.8535
30= 0.8535
31= 0.853
32= 0.853
33= 0.8535
34= 0.8535
35= 0.8535
36= 0.8535
37= 0.854
38= 0.854
39= 0.853
40= 0.853
41= 0.8525
42= 0.8525
43= 0.8525
44= 0.8575
45= 0.8575
46= 0.8575
47= 0.8565
48= 0.852
49= 0.852


In [37]:
# Best min_samples_leaf when criterion=entropy is 44 (smallest value with the top accuracy in the sweep above).
dt15=DecisionTreeClassifier(criterion="entropy",min_samples_leaf=44)
mymodel(dt15)

              precision    recall  f1-score   support

           0       0.87      0.96      0.91      1585
           1       0.76      0.45      0.57       415

    accuracy                           0.86      2000
   macro avg       0.82      0.71      0.74      2000
weighted avg       0.85      0.86      0.84      2000



# Final models

In [38]:
# Final tree with the default (gini) criterion: max_depth=5, min_samples_leaf=44.
# NOTE(review): 5 and 44 match the values used for the entropy trees; the default-criterion
# sweeps earlier in the notebook pointed to max_depth=6 and min_samples_leaf=34 — confirm which pair is intended.
dt16=DecisionTreeClassifier(max_depth=5,min_samples_leaf=44)
mymodel(dt16)

              precision    recall  f1-score   support

           0       0.85      0.98      0.91      1585
           1       0.81      0.35      0.49       415

    accuracy                           0.85      2000
   macro avg       0.83      0.66      0.70      2000
weighted avg       0.84      0.85      0.82      2000



In [39]:
# Final tree with the entropy criterion: min_samples_leaf=44 and no depth limit (same settings as dt15).
dt17=DecisionTreeClassifier(criterion="entropy",min_samples_leaf=44)
mymodel(dt17)

              precision    recall  f1-score   support

           0       0.87      0.96      0.91      1585
           1       0.76      0.45      0.57       415

    accuracy                           0.86      2000
   macro avg       0.82      0.71      0.74      2000
weighted avg       0.85      0.86      0.84      2000

