In [27]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PowerTransformer, StandardScaler, minmax_scale
import pandas as pd
import numpy as np;
import scipy;
## plotting libraries
from matplotlib import pyplot as plt
import seaborn as sns
## stats Libraries
from scipy import stats
import statsmodels.api as sm
## Sklearn libraries
from sklearn import model_selection
from sklearn import metrics as metrics
from sklearn import preprocessing
from sklearn import linear_model as lm
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

In [51]:
# load the data as dataframe
data = pd.read_csv("data/data_cleaned.csv")

In [52]:
# Drop Columns
data = data.drop('dmy_Spain', axis=1)
data = data.drop('HasCrCard', axis=1)
data = data.drop('dmy_Male', axis=1)
data = data.drop('dmy_Germany', axis=1)
data = data.drop('Tenure', axis=1)
data = data.drop('IsActiveMember', axis=1)

In [53]:
data

Unnamed: 0,CreditScore,Age,Balance,NumOfProducts,EstimatedSalary,Exited
0,619,42,0.00,1,101348.88,1
1,608,41,83807.86,1,112542.58,0
2,502,42,159660.80,3,113931.57,1
3,699,39,0.00,2,93826.63,0
4,850,43,125510.82,1,79084.10,0
...,...,...,...,...,...,...
9995,771,39,0.00,2,96270.64,0
9996,516,35,57369.61,1,101699.77,0
9997,709,36,0.00,1,42085.58,1
9998,772,42,75075.31,2,92888.52,1


In [54]:
# X/y split
y = data['Exited']
X = data.drop(['Exited'], axis = 1)

In [55]:
# train / test split
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=.20, random_state=42)

In [56]:
# upsampling with smote
smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X_train, y_train)
y_sm.value_counts()

0    6356
1    6356
Name: Exited, dtype: int64

In [57]:
# define hyperparameters to test, define the model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [ 100, 150, 200],
    'min_samples_split': [2, 3, ],
    'min_samples_leaf' : [1, 2, ],
    'max_features': ['sqrt'],
    'max_depth':[14,12],
    'bootstrap':[True] 
    #'max_samples' : ['None', 0.5],

    }
clf = RandomForestClassifier(random_state=100)

In [58]:
# define the gridsearchCV in a variable
grid_search = GridSearchCV(clf, param_grid,return_train_score=True,n_jobs=-1, scoring="recall")

In [59]:
# fit the in param_grid defined models
grid_search.fit(X_sm,y_sm)

GridSearchCV(estimator=RandomForestClassifier(random_state=100), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [14, 12],
                         'max_features': ['sqrt'], 'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 3],
                         'n_estimators': [100, 150, 200]},
             return_train_score=True, scoring='recall')

In [60]:
# get the best parameters 
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 14,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 150}

In [61]:
# redefine the model with the best parameters
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(random_state=100, max_features='sqrt', 
                             min_samples_leaf=1, min_samples_split=2, n_estimators=15, max_depth=14, bootstrap=True)
cross_val_scores_train = cross_val_score(clf, X_sm, y_sm, cv=5)
print(np.mean(cross_val_scores_train))


0.7935028137591258


In [62]:
# fit the model
clf.fit(X_sm, y_sm)

RandomForestClassifier(max_depth=14, max_features='sqrt', n_estimators=15,
                       random_state=100)

In [63]:
len(X_train.columns)

5

In [64]:
# safe the feature names in a list
feature_names = X_train.columns
feature_names = list(feature_names)

In [65]:
# create a dataframe that lists the importance of features
df = pd.DataFrame(list(zip(feature_names, clf.feature_importances_)))
df.columns = ['columns_name', 'score_feature_importance']
df.sort_values(by=['score_feature_importance'], ascending = False)

Unnamed: 0,columns_name,score_feature_importance
1,Age,0.343995
3,NumOfProducts,0.17896
4,EstimatedSalary,0.174026
2,Balance,0.1608
0,CreditScore,0.142219


In [66]:
# cross validation of train and test set
cross_val_scores_test = cross_val_score(clf, X_test, y_test, cv=10)
print(np.mean(cross_val_scores_train))
print(np.mean(cross_val_scores_test))

0.7935028137591258
0.8295


In [67]:
cross_val_scores_test

array([0.835, 0.83 , 0.84 , 0.815, 0.83 , 0.85 , 0.845, 0.855, 0.79 ,
       0.805])

In [68]:
from sklearn.metrics import recall_score

In [69]:
# safe the prediction in a variable
y_pred= clf.predict(X_test)

In [70]:
y_pred

array([0, 0, 0, ..., 1, 0, 1], dtype=int64)

In [71]:
# calculate the recall score
recall_score(y_test, y_pred)

0.6997455470737913