In [29]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score,\
f1_score, confusion_matrix, ConfusionMatrixDisplay

from sklearn.ensemble import RandomForestClassifier

import pickle

In [30]:
# Read the raw churn dataset from disk and preview its first five rows.
df_original = pd.read_csv(filepath_or_buffer="../datasets/Churn_Modelling.csv")
df_original.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [31]:
# Discard the RowNumber, CustomerId, Surname and Gender columns.
# drop() returns a new frame, so df_original itself is left unchanged.
churn_df = df_original.drop(
    columns=['RowNumber', 'CustomerId', 'Surname', 'Gender']
)
churn_df.head(n=5)

Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,41,1,83807.86,1,0,1,112542.58,0
2,502,France,42,8,159660.8,3,1,0,113931.57,1
3,699,France,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,43,2,125510.82,1,1,1,79084.1,0


In [32]:
# Dummy-encode the categorical column(s). With drop_first=True the first
# level of each encoded column is omitted (Geography yields only
# Geography_Germany and Geography_Spain).
churn_df2 = pd.get_dummies(data=churn_df, drop_first=True)
churn_df2.head(n=5)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain
0,619,42,2,0.0,1,1,1,101348.88,1,False,False
1,608,41,1,83807.86,1,0,1,112542.58,0,False,True
2,502,42,8,159660.8,3,1,0,113931.57,1,False,False
3,699,39,1,0.0,2,0,0,93826.63,0,False,False
4,850,43,2,125510.82,1,1,1,79084.1,0,False,True


In [33]:
# Target: the "Exited" column. Predictors: every other column.
# drop() already returns a new DataFrame, so churn_df2 is not modified.
X = churn_df2.drop(columns="Exited")
y = churn_df2["Exited"]

# Hold out 25% of the rows as the test set, stratified on the target so both
# partitions keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [34]:
%%time

rf = RandomForestClassifier(random_state=0)

cv_params = {'max_depth': [2,3,4,5, None],
             'min_samples_leaf': [1,2,3],
             'min_samples_split': [2,3,4],
             'max_features': [2,3,4],
             'n_estimators': [75, 100, 125, 150]
             }

scoring = ['accuracy', 'precision', 'recall', 'f1']

rf_cv = GridSearchCV(rf, cv_params, scoring=scoring, cv=5, refit='f1')

rf_cv.fit(X_train, y_train)

CPU times: user 19min 45s, sys: 13.7 s, total: 19min 58s
Wall time: 20min 1s


0,1,2
,estimator,RandomForestC...andom_state=0)
,param_grid,"{'max_depth': [2, 3, ...], 'max_features': [2, 3, ...], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 3, ...], ...}"
,scoring,"['accuracy', 'precision', ...]"
,n_jobs,
,refit,'f1'
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,150
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,4
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [35]:
# Relative directory prefix used when pickling and reloading the fitted
# search objects; the trailing slash is required because file names are
# appended with plain string concatenation.
path = '../models/'

In [36]:
# Persist the fitted grid search so the long-running fit need not be repeated.
with open(path + 'rf_cv_model.pickle', mode='wb') as model_file:
    pickle.dump(rf_cv, model_file)

In [37]:
# Restore the pickled grid search from disk.
# NOTE: pickle.load can execute arbitrary code, so only unpickle files that
# this notebook (or another trusted source) produced.
with open(path + 'rf_cv_model.pickle', mode='rb') as model_file:
    rf_cv = pickle.load(model_file)

In [38]:
# Hyperparameter combination selected by the search (the refit metric is 'f1').
rf_cv.best_params_

{'max_depth': None,
 'max_features': 4,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 150}

In [39]:
# Mean cross-validated score of the selected candidate on the refit metric ('f1').
rf_cv.best_score_

np.float64(0.5833023473561427)

In [40]:
def make_results(model_name, model_object, metric='f1'):
    """
    Summarize the scores of the best candidate found by a fitted search object.

    Parameters
    ----------
    model_name : str
        Label written to the 'Model' column of the returned table.
    model_object : object
        Fitted search object exposing a ``cv_results_`` mapping that contains
        the keys 'mean_test_f1', 'mean_test_recall', 'mean_test_precision'
        and 'mean_test_accuracy' (e.g. a GridSearchCV fitted with those four
        scorers).
    metric : str, default 'f1'
        Scorer name used to pick the best candidate: the row with the highest
        ``mean_test_<metric>`` is reported. The default reproduces the
        original behavior of selecting on F1.

    Returns
    -------
    pandas.DataFrame
        One-row table with the columns Model, F1, Recall, Precision and
        Accuracy, holding the mean test scores of the selected candidate.

    Raises
    ------
    KeyError
        If ``cv_results_`` lacks ``mean_test_<metric>`` or any of the four
        reported score columns.
    """
    cv_results = pd.DataFrame(model_object.cv_results_)

    # idxmax() returns an index *label*, so the row must be selected with the
    # label-based .loc. The positional .iloc only gives the same row when the
    # index happens to be the default 0..n-1 range.
    best_label = cv_results[f'mean_test_{metric}'].idxmax()
    best_estimator_results = cv_results.loc[best_label]

    table = pd.DataFrame({'Model': [model_name],
                          'F1': [best_estimator_results['mean_test_f1']],
                          'Recall': [best_estimator_results['mean_test_recall']],
                          'Precision': [best_estimator_results['mean_test_precision']],
                          'Accuracy': [best_estimator_results['mean_test_accuracy']]
                         }
                        )

    return table

In [41]:
# Tabulate the mean test scores of the best candidate from the 5-fold search.
rf_cv_results = make_results(model_name='Random Forest CV', model_object=rf_cv)
rf_cv_results

Unnamed: 0,Model,F1,Recall,Precision,Accuracy
0,Random Forest CV,0.583302,0.47514,0.758639,0.862133


In [42]:
# Carve a validation set (20% of the training rows, stratified on the target)
# out of the training data, so the held-out test set stays untouched.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=10,
)

In [43]:
# One entry per row of X_train, in order: 0 marks rows that belong to the
# validation set (X_val), -1 marks every other row.
split_index = [
    -1 if row_label not in X_val.index else 0
    for row_label in X_train.index
]

In [44]:
from sklearn.model_selection import PredefinedSplit

In [45]:
# Grid search scored on a single predefined validation fold instead of k-fold CV.
rf = RandomForestClassifier(random_state=0)

# 5 * 3 * 3 * 3 * 4 = 540 hyperparameter candidates.
cv_params = {'max_depth': [2,3,4,5, None],
             'min_samples_leaf': [1,2,3],
             'min_samples_split': [2,3,4],
             'max_features': [2,3,4],
             'n_estimators': [75, 100, 125, 150]
             }

# All four metrics are recorded in cv_results_; 'f1' picks the winner (refit).
scoring = ['accuracy', 'precision', 'recall', 'f1']

# split_index assigns each X_train row either to the single validation fold (0)
# or to "never used for validation" (-1).
custom_split = PredefinedSplit(split_index)

# n_jobs=-1 spreads the independent fits over all available CPU cores.
# The forest's random_state is fixed, so the selected model and its scores
# do not depend on the degree of parallelism.
rf_val = GridSearchCV(rf, cv_params, scoring=scoring, cv=custom_split, refit='f1', n_jobs=-1)

In [46]:
# Run the search with the predefined train/validation split (custom_split);
# refit='f1' means the best-F1 candidate is refit afterwards.
rf_val.fit(X_train, y_train)

0,1,2
,estimator,RandomForestC...andom_state=0)
,param_grid,"{'max_depth': [2, 3, ...], 'max_features': [2, 3, ...], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 3, ...], ...}"
,scoring,"['accuracy', 'precision', ...]"
,n_jobs,
,refit,'f1'
,cv,"PredefinedSpl...hape=(7500,)))"
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,150
,criterion,'gini'
,max_depth,
,min_samples_split,3
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,4
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
# Persist the validation-split grid search so it does not have to be refit.
with open(path + 'rf_val_model.pickle', mode='wb') as model_file:
    pickle.dump(rf_val, model_file)

In [None]:
# Restore the pickled validation-split grid search from disk.
# NOTE: pickle.load can execute arbitrary code, so only unpickle files that
# this notebook (or another trusted source) produced.
with open(path + 'rf_val_model.pickle', mode='rb') as model_file:
    rf_val = pickle.load(model_file)

In [47]:
# Tabulate the validation-fold scores of the best candidate.
rf_val_results = make_results(
    model_name='Random Forest Validated',
    model_object=rf_val,
)

In [48]:
# Show the one-row score table built from the validation-split search.
rf_val_results

Unnamed: 0,Model,F1,Recall,Precision,Accuracy
0,Random Forest Validated,0.579592,0.464052,0.771739,0.862667
