In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
dataset = pd.read_csv('Churn_Modelling.csv')
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
dataset = dataset.drop(['RowNumber','CustomerId','Surname'], axis=1)
dataset.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
# preprocessing part 1 - check missing value
dataset.isnull().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [6]:
# preprocessing part 2 - Encoding 
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


In [7]:
dataset['Geography'] = dataset['Geography'].astype('category')
dataset['Geography'] = dataset['Geography'].cat.codes
dataset = pd.get_dummies(dataset, columns=['Geography'])

In [8]:
dataset['Gender'] = dataset['Gender'].astype('category')
dataset['Gender'] = dataset['Gender'].cat.codes
dataset = pd.get_dummies(dataset, columns=['Gender'])

In [9]:
dataset = dataset.drop(['Geography_0','Gender_0'], axis=1)

In [10]:
dataset.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_1,Geography_2,Gender_1
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,0


In [11]:
# part 3 - outlier / feature scaling 
# to check imbalance dataset
dataset['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [12]:
x = dataset.drop(['Exited'],axis=1)
y = dataset['Exited']

In [13]:
x.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_1,Geography_2,Gender_1
0,619,42,2,0.0,1,1,1,101348.88,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,1,0


In [14]:
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64

In [15]:
len(dataset)

10000

In [16]:
import imblearn

In [17]:
from imblearn.over_sampling import RandomOverSampler
over = RandomOverSampler()
x_over, y_over = over.fit_resample(x,y)

In [18]:
print(x_over.shape)
print(y_over.shape)

(15926, 11)
(15926,)


In [None]:
# preprocessing part completed
# spliting the data into training and test for prediction the model

In [19]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_over, y_over, train_size=0.75,
                                                   random_state=555)

# XGBoost Model

In [20]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.5.2-py3-none-win_amd64.whl (106.6 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.5.2


In [21]:
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(x_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [22]:
y_pred_train = classifier.predict(x_train)
y_pred_test = classifier.predict(x_test)

# Evaluate the model

In [23]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [24]:
print(accuracy_score(y_train, y_pred_train))
print('\n')
print(accuracy_score(y_test, y_pred_test))

0.9588914936369726


0.8877448518332496


In [25]:
print(classification_report(y_train, y_pred_train))
print('\n')
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.96      0.95      0.96      5904
           1       0.95      0.97      0.96      6040

    accuracy                           0.96     11944
   macro avg       0.96      0.96      0.96     11944
weighted avg       0.96      0.96      0.96     11944



              precision    recall  f1-score   support

           0       0.92      0.86      0.89      2059
           1       0.86      0.92      0.89      1923

    accuracy                           0.89      3982
   macro avg       0.89      0.89      0.89      3982
weighted avg       0.89      0.89      0.89      3982



In [26]:
print(confusion_matrix(y_train, y_pred_train))
print('\n')
print(confusion_matrix(y_test, y_pred_test))

[[5623  281]
 [ 210 5830]]


[[1762  297]
 [ 150 1773]]


In [27]:
# K-Fold is not required as we got good result by XGBoost
# Training accuracy = 95%
# Test Accuracy = 89.5%

from sklearn.model_selection import cross_val_score
accuracy = cross_val_score(classifier, X=x_test, y=y_test, cv=20)
print(accuracy)















































































[0.83       0.84       0.79899497 0.81407035 0.8040201  0.84924623
 0.88442211 0.83417085 0.83417085 0.83417085 0.8241206  0.78894472
 0.85929648 0.81407035 0.86432161 0.83919598 0.86432161 0.89949749
 0.85427136 0.88944724]


# XGBoost
## Training accuracy = 95%
## Test Accuracy = 89.5%

# Gradient Boosting Algorithm

In [28]:
from sklearn.ensemble import GradientBoostingClassifier
gboosting = GradientBoostingClassifier()
gboosting.fit(x_train, y_train)

GradientBoostingClassifier()

In [29]:
y_pred_gd_train = gboosting.predict(x_train)
y_pred_gd_test = gboosting.predict(x_test)

In [30]:
print(accuracy_score(y_train, y_pred_gd_train))
print('\n')
print(accuracy_score(y_test, y_pred_gd_test))

0.8094440723375753


0.7928176795580111


In [31]:
from sklearn.model_selection import cross_val_score
accuracy = cross_val_score(gboosting, X=x_test, y=y_test, cv=15)
print(accuracy)

[0.80075188 0.7443609  0.78195489 0.81203008 0.79323308 0.78571429
 0.7481203  0.76603774 0.76226415 0.83396226 0.78867925 0.79245283
 0.79622642 0.81132075 0.80754717]


# Adaboost 

In [32]:
from sklearn.ensemble import AdaBoostClassifier
adaboost = AdaBoostClassifier()
adaboost.fit(x_train, y_train)

AdaBoostClassifier()

In [33]:
y_pred_ada_train = adaboost.predict(x_train)
y_pred_ada_test = adaboost.predict(x_test)

In [34]:
print(accuracy_score(y_train, y_pred_ada_train))
print('\n')
print(accuracy_score(y_test, y_pred_ada_test))

0.7838245144005358


0.7659467604218986


In [35]:
from sklearn.model_selection import cross_val_score
accuracy = cross_val_score(adaboost, X=x_test, y=y_test, cv=15)
print(accuracy)

[0.7481203  0.73684211 0.76691729 0.78947368 0.77443609 0.76691729
 0.71428571 0.73207547 0.74339623 0.79622642 0.78490566 0.74339623
 0.78490566 0.78490566 0.76603774]


# XGBoost
### Training accuracy = 95%
### Test Accuracy = 89.5%

# Gradient Boosting
### Training accuracy = 80%
### Test Accuracy = 79%

# Adaboost
### Training accuracy = 77.8%
### Test Accuracy = 76.4%

## conclusion : XGBoost is the best algorithm in boosting techniques
## note : 95% times you will perform random forest for high variance and xgboost for high bias
### in case you don't have clarity then pls use either RF or XGBoost