In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV

In [2]:
df = pd.read_csv('Churn_Modelling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
dt = df.copy

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [5]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder, StandardScaler

In [6]:
def Lable(x):
    df[x] = LabelEncoder().fit_transform(df[x])

In [7]:
df['Gender'].nunique()

2

In [8]:
Lable('Gender')
Lable('Geography')

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  int32  
 5   Gender           10000 non-null  int32  
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int32(2), int64(9), object(1)
memory usage: 1015.8+ KB


### Train Test split

In [10]:
x = df.drop(['Exited','Surname','CustomerId'],axis=1)
y = df['Exited']

In [11]:
x.head()

Unnamed: 0,RowNumber,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,1,619,0,0,42,2,0.0,1,1,1,101348.88
1,2,608,2,0,41,1,83807.86,1,0,1,112542.58
2,3,502,0,0,42,8,159660.8,3,1,0,113931.57
3,4,699,0,0,39,1,0.0,2,0,0,93826.63
4,5,850,2,0,43,2,125510.82,1,1,1,79084.1


In [12]:
y[:5]

0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64

In [13]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=1,stratify=y)

### 1.Random Forest:

In [14]:
rf_cls = RandomForestClassifier(random_state=1)
rf_cls.fit(x_train,y_train)

RandomForestClassifier(random_state=1)

In [15]:
rf_cls.score(x_train,y_train)

1.0

In [16]:
rf_cls.score(x_test,y_test)

0.864

### Hyperparameter

In [17]:
rf_cls = RandomForestClassifier(random_state=1)

hyp = {'n_estimators':np.arange(10,20),
       'criterion':['gini','entropy'],
       'max_depth':np.arange(5,10),
       'min_samples_split':np.arange(2,10),
       'min_samples_leaf':np.arange(1,10)}

Rscv_model = RandomizedSearchCV(rf_cls,hyp,cv=5)
Rscv_model.fit(x_train,y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=1),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': array([5, 6, 7, 8, 9]),
                                        'min_samples_leaf': array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
                                        'min_samples_split': array([2, 3, 4, 5, 6, 7, 8, 9]),
                                        'n_estimators': array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19])})

In [18]:
Rscv_model.best_params_

{'n_estimators': 17,
 'min_samples_split': 8,
 'min_samples_leaf': 5,
 'max_depth': 9,
 'criterion': 'gini'}

In [19]:
rf_cls = RandomForestClassifier(n_estimators=19,
                                criterion='entropy',
                                max_depth=9,
                                min_samples_leaf=9,
                                min_samples_split=2)
rf_cls.fit(x_train,y_train)

RandomForestClassifier(criterion='entropy', max_depth=9, min_samples_leaf=9,
                       n_estimators=19)

In [20]:
rf_cls.score(x_train,y_train)

0.871375

In [21]:
rf_cls.score(x_test,y_test)

0.8595

In [23]:
# rf_cls = RandomForestClassifier(random_state=1)

# hyp = {'n_estimators':np.arange(15,20),
#        'criterion':['gini','entropy'],
#        'max_depth':np.arange(5,10),
#        'min_samples_split':np.arange(5,10),
#        'min_samples_leaf':np.arange(1,8)}

# Gscv_model = GridSearchCV(rf_cls,hyp,cv=5)
# Gscv_model.fit(x_train,y_train)

In [36]:
# Gscv_model.best_params_

In [25]:
rf_cls = RandomForestClassifier(n_estimators=19,
                                criterion='entropy',
                                max_depth=9,
                                min_samples_leaf=3,
                                min_samples_split=8)
rf_cls.fit(x_train,y_train)

RandomForestClassifier(criterion='entropy', max_depth=9, min_samples_leaf=3,
                       min_samples_split=8, n_estimators=19)

In [26]:
rf_cls.score(x_train,y_train)

0.876625

In [27]:
rf_cls.score(x_test,y_test)

0.859

In [28]:
y_pred_test = rf_cls.predict(x_test)
y_pred_train = rf_cls.predict(x_train)

In [29]:
confusion_matrix(y_test,y_pred_test)

array([[1559,   34],
       [ 248,  159]], dtype=int64)

In [30]:
clas1 = classification_report(y_test,y_pred_test)
print(clas1)

              precision    recall  f1-score   support

           0       0.86      0.98      0.92      1593
           1       0.82      0.39      0.53       407

    accuracy                           0.86      2000
   macro avg       0.84      0.68      0.72      2000
weighted avg       0.85      0.86      0.84      2000



In [31]:
confusion_matrix(y_train,y_pred_train)

array([[6284,   86],
       [ 901,  729]], dtype=int64)

In [45]:
x_test[10:20]

Unnamed: 0,RowNumber,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
8527,8528,694,2,1,39,3,0.0,1,1,1,95625.03
2655,2656,850,1,1,53,2,94078.97,2,1,0,36980.54
3495,3496,591,1,1,46,4,129269.27,1,1,0,163504.33
8648,8649,809,2,0,48,2,0.0,1,1,0,160976.85
9322,9323,767,0,0,35,9,0.0,2,1,0,39511.61
8492,8493,494,0,1,28,9,114731.76,2,0,1,79479.74
6566,6567,525,1,0,30,0,157989.21,2,1,1,100687.67
7758,7759,711,0,1,41,3,0.0,2,1,1,193747.57
2274,2275,580,0,0,65,9,106804.26,3,1,0,107890.69
6354,6355,660,2,0,42,5,0.0,3,1,1,189016.24


In [44]:
rf_cls.predict(x_test[10:20])

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1], dtype=int64)

In [46]:
x.columns

Index(['RowNumber', 'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure',
       'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember',
       'EstimatedSalary'],
      dtype='object')

In [49]:
def prediction(RowNumber, CreditScore, Geography, Gender, Age, Tenure,
       Balance, NumOfProducts, HasCrCard, IsActiveMember, EstimatedSalary):
    
    pred = rf_cls.predict([[RowNumber, CreditScore, Geography, Gender, Age, Tenure,
       Balance, NumOfProducts, HasCrCard, IsActiveMember, EstimatedSalary]])
    return pred[0]

In [50]:
prediction(8528, 694, 2, 1, 39, 3, 0.00, 1, 1, 1, 95625.03)

0

In [51]:
import pickle,os

In [52]:
folder_path = 'model'

if not os.path.exists(folder_path):
    os.mkdir(folder_path)
    
pickle.dump(rf_cls, open(f"{folder_path}/rf_pickle.pkl", "wb"))

In [None]:
2656	850	1	1	53	2	94078.97	2	1	0	36980.54

### 2.Knn Model:

In [32]:
std_scale = StandardScaler()

x_scale1 = std_scale.fit_transform(x_train)
x_scale2 = std_scale.fit_transform(x_test)

In [33]:
df_train = pd.DataFrame(x_scale1,columns=x.columns)
df_train

Unnamed: 0,RowNumber,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,-0.251653,-0.556600,1.511623,-1.095269,3.516577,1.380964,0.925750,-0.914333,0.649397,0.967285,-1.505216
1,1.446838,-0.370472,-0.903110,0.913017,0.956282,-1.376312,-1.219862,0.797901,0.649397,-1.033822,1.614655
2,-1.486920,-2.118010,1.511623,0.913017,2.189016,-1.376312,-0.194773,-0.914333,0.649397,0.967285,-0.405078
3,-0.028889,-1.094304,-0.903110,-1.095269,0.102850,0.691645,-1.219862,-0.914333,-1.539890,0.967285,1.067959
4,0.501651,0.994469,-0.903110,0.913017,-0.181627,-1.031652,-1.219862,0.797901,0.649397,-1.033822,0.756245
...,...,...,...,...,...,...,...,...,...,...,...
7995,0.037732,-1.818136,-0.903110,-1.095269,-0.845407,1.380964,-1.219862,0.797901,-1.539890,-1.033822,-0.638851
7996,1.518664,1.935451,-0.903110,-1.095269,-0.940233,1.380964,-1.219862,0.797901,0.649397,0.967285,-1.009989
7997,1.596735,-0.494557,1.511623,0.913017,-0.466104,1.036305,0.353001,-0.914333,0.649397,-1.033822,-1.050503
7998,-1.571931,2.069877,-0.903110,0.913017,0.102850,1.380964,-1.219862,0.797901,-1.539890,0.967285,0.338571


In [34]:
df_test = pd.DataFrame(x_scale2,columns=x.columns)
df_test

Unnamed: 0,RowNumber,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,-0.208793,-2.494105,1.528986,0.910029,-0.385107,-1.083480,-1.250284,0.848433,0.632898,0.982159,-0.693331
1,-0.453917,-1.228033,1.528986,0.910029,-0.385107,1.019684,0.676315,-0.900913,0.632898,0.982159,0.591288
2,1.559059,0.058795,-0.897013,-1.098866,-0.580221,0.669157,0.205954,-0.900913,0.632898,-1.018165,0.608574
3,1.613111,-1.010102,0.315986,-1.098866,0.395350,-0.382425,0.788048,-0.900913,0.632898,0.982159,-0.454164
4,1.206522,1.044670,0.315986,0.910029,2.541608,-1.434008,0.954749,-0.900913,0.632898,0.982159,0.349967
...,...,...,...,...,...,...,...,...,...,...,...
1995,0.934889,-1.061991,1.528986,0.910029,3.614736,-1.083480,0.828850,0.848433,0.632898,0.982159,1.386932
1996,-1.178272,-0.294045,-0.897013,0.910029,-0.482664,0.318629,-1.250284,0.848433,-1.580033,-1.018165,1.655083
1997,-1.307031,-0.190269,0.315986,0.910029,-0.189993,-0.031898,0.587243,-0.900913,0.632898,-1.018165,-0.096557
1998,-1.387247,-0.771417,-0.897013,0.910029,-0.385107,-1.434008,-1.250284,0.848433,0.632898,-1.018165,-1.666956


In [35]:
knn_model = KNeighborsClassifier()

hyp2 = {'n_neighbors': np.arange(2,15),
                 'p': [1,2]}

Gscv_model1 = GridSearchCV(knn_model,hyp2,cv=5)
Gscv_model1.fit(df_train,y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
                         'p': [1, 2]})

In [37]:
Gscv_model1.best_estimator_

KNeighborsClassifier(n_neighbors=13, p=1)

In [38]:
Gscv_model1.best_score_

0.8311249999999999

In [39]:
knn_model = KNeighborsClassifier(n_neighbors=13,p=1)
knn_model.fit(df_train,y_train)

KNeighborsClassifier(n_neighbors=13, p=1)

In [40]:
knn_model.score(df_train,y_train)

0.852625

In [41]:
knn_model.score(df_test,y_test)

0.836