<h3>Import Libraries</h3>

In [91]:
import pandas as pd
from sklearn import metrics
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import lightgbm as lgb
from imblearn.combine import SMOTEENN
import plotly.express as px

In [59]:
# read csv file
df = pd.read_csv('bank_churn.csv')
df.head()

Unnamed: 0,CreditScore,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,target,Complain,Satisfaction Score,...,Age_group_81-90,Age_group_91-100,CreditScore_group_350-450,CreditScore_group_451-550,CreditScore_group_551-650,CreditScore_group_651-850,EstimatedSalary_group_0-50000,EstimatedSalary_group_51000-100000,EstimatedSalary_group_100001-150000,EstimatedSalary_group_150001-200000
0,619,2,0.0,1,1,1,101348.88,1,1,2,...,0,0,0,0,1,0,0,0,1,0
1,608,1,83807.86,1,0,1,112542.58,0,1,3,...,0,0,0,0,1,0,0,0,1,0
2,502,8,159660.8,3,1,0,113931.57,1,1,3,...,0,0,0,1,0,0,0,0,1,0
3,699,1,0.0,2,0,0,93826.63,0,0,5,...,0,0,0,0,0,1,0,1,0,0
4,850,2,125510.82,1,1,1,79084.1,0,0,5,...,0,0,0,0,0,1,0,1,0,0


In [60]:
df.columns

Index(['CreditScore', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'target', 'Complain',
       'Satisfaction Score', 'Point Earned', 'Geography_France',
       'Geography_Germany', 'Geography_Spain', 'Gender_Female', 'Gender_Male',
       'Card Type_DIAMOND', 'Card Type_GOLD', 'Card Type_PLATINUM',
       'Card Type_SILVER', 'Age_group_18-30', 'Age_group_31-40',
       'Age_group_41-50', 'Age_group_51-60', 'Age_group_61-70',
       'Age_group_71-80', 'Age_group_81-90', 'Age_group_91-100',
       'CreditScore_group_350-450', 'CreditScore_group_451-550',
       'CreditScore_group_551-650', 'CreditScore_group_651-850',
       'EstimatedSalary_group_0-50000', 'EstimatedSalary_group_51000-100000',
       'EstimatedSalary_group_100001-150000',
       'EstimatedSalary_group_150001-200000'],
      dtype='object')

<h4>creating x and y variables</h4>

In [113]:
X = df.drop(columns=['target','Complain'],axis=1)
Y = df['target']

In [128]:
df.shape

(10000, 36)

### Train Test split


In [114]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size = 0.2)

### LGBMClassifier


In [115]:
lgb_model = lgb.LGBMClassifier(criterion='gini',random_state=100,max_depth=6,min_samples_leaf=8)
lgb_model.fit(x_train,y_train)
y_pred = lgb_model.predict(x_test)
y_pred



array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [116]:
print(classification_report(y_test,y_pred,labels=[0,1]))  

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1604
           1       0.75      0.45      0.56       396

    accuracy                           0.86      2000
   macro avg       0.82      0.71      0.74      2000
weighted avg       0.85      0.86      0.85      2000



In [117]:
print(confusion_matrix(y_test,y_pred))

[[1546   58]
 [ 218  178]]


####  ```We cannot use this model as the dataset is highly imbalanced so we will use: ``` 
###  SMOTE-ENN
#### this method combines the SMOTE ability to generate synthetic examples for minority class and ENN ability to delete some observations from both classes that are identified as having different class between the observation’s class and its K-nearest neighbor majority class.

In [118]:
sm = SMOTEENN()
x_resample,y_resampled = sm.fit_resample(X,Y)
xs_train,xs_test,ys_train,ys_test = train_test_split(x_resample,y_resampled,test_size = 0.2)
lgb_model_smote = lgb.LGBMClassifier()
lgb_model_smote.fit(xs_train,ys_train)
y_pred_smote_lgb = lgb_model_smote.predict(xs_test)


### score of decision tree classifier using SMOTE

In [119]:
print(classification_report(ys_test,y_pred_smote_lgb,labels=[0,1]))

              precision    recall  f1-score   support

           0       0.86      0.94      0.90       610
           1       0.95      0.88      0.92       798

    accuracy                           0.91      1408
   macro avg       0.91      0.91      0.91      1408
weighted avg       0.91      0.91      0.91      1408



#### ``` The accuracy of the model changes with normal tran test split if you re-run the sell , therefore we will use cross-validation to find average```

### STRATIFIES K-FOLD CROSS VALIDATION { 10-fold }


In [124]:
from statistics import mean, stdev
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(lgb_model_smote,x_resample,y_resampled,cv=skf)

In [125]:
print('List of possible accuracy:\n', scores)
print('\nMaximum Accuracy That can be obtained from this model is:',
      max(scores)*100, '%')
print('\nMinimum Accuracy That can be obtained from this model is:',
      min(scores)*100, '%')
print('\nOverall Accuracy:',
      mean(scores)*100, '%')
print('\nStandard Deviation is:', stdev(scores))

List of possible accuracy:
 [0.91619318 0.93039773 0.90767045 0.921875   0.90625    0.92329545
 0.93323864 0.92471591 0.90753912 0.90042674]

Maximum Accuracy That can be obtained from this model is: 93.32386363636364 %

Minimum Accuracy That can be obtained from this model is: 90.04267425320057 %

Overall Accuracy: 91.71602224233804 %

Standard Deviation is: 0.011221528638467333


##### ``` As we can observe that our overall Accuracy of the model is 91.716%```

### save the pickle


In [126]:
import pickle
filename = 'lgb_model_smote.sav'
pickle.dump(lgb_model_smote,open(filename,'wb'))