## Model 3: Random Forest

In [1]:
import json
import time
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import roc_curve, auc, RocCurveDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Full prepared dataset; every repeated train/test split below is drawn from it.
# Earlier drafts also referenced other parquet files under ../data/ (init,
# before_split, prep_train, prep_val, prep_test, prep_tt_train, prep_tt_test);
# none of them are loaded by this notebook.
df_all = pd.read_parquet("../data/prep_all.parquet")

# Figure export settings. Presumably `save_figures` toggles writing plots under
# `figure_path`; neither name is used in the cells visible here.
figure_path = "../figures/models/"
save_figures = False

In [2]:
# Preview the first five rows as a quick sanity check of the loaded frame.
df_all.head(n=5)

Unnamed: 0,Year_Y2015,Year_Y2016,Country_England,Country_Wales,Supermarket_Asda,Supermarket_Tesco Extra,Supermarket_Tesco Metro,Supermarket_Waitrose,Time_Evening,Time_Morning,...,YearCountryAge_Y2016EnglandAge_g2,YearCountryAge_Y2016EnglandAge_g3,YearCountryAge_Y2016WalesAge_g1,YearCountryAge_Y2016WalesAge_g2,YearCountryAge_Y2016WalesAge_g3,YearCountryAge_nan,ObsSize,FemaleN,MaleN,y
0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,-0.471136,-1.156159,0.871226,1
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,-0.471136,-1.156159,0.871226,1
2,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,-0.471136,0.583858,-0.993069,1
3,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.811174,2.323876,-0.993069,1
4,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.471136,0.583858,-0.993069,1


In [3]:
# Split the prepared frame into a feature matrix and a target vector.
# Features are selected by *name* (everything except "y") rather than by
# position, so the target cannot leak into X_all if the column order of the
# parquet file ever changes.
assert list(df_all.columns).count("y") == 1, "expected exactly one target column 'y'"
X_all = df_all.drop(columns="y").to_numpy()
y_all = df_all["y"].to_numpy()

### Random Forest, Grid Search

In [4]:
# Base estimator handed to the grid search. The seed is fixed so the forest is
# reproducible, and "balanced" class weights are used for the target classes.
random_state = 132
RFC_grid = RandomForestClassifier(
    class_weight="balanced",
    n_estimators=100,
    # Single-threaded forest; parallelism is configured on the GridSearchCV
    # wrapper instead (presumably to avoid nested parallelism).
    n_jobs=1,
    random_state=random_state,
)
RFC_grid

RandomForestClassifier(class_weight='balanced', n_jobs=1, random_state=132)

In [5]:
# Hyper-parameter grid for the forest: 9 x 6 x 6 = 324 candidate settings.
params_RFC = {
    "max_features": [0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, None],
    "max_depth": [3, 4, 5, 6, 8, None],
    "min_samples_split": list(range(2, 8)),
}

# One seed per repeated train/test split.
random_states_split = [123, 456, 789, 741, 852, 963, 159, 753, 951, 357]

# 5-fold grid search scored on balanced accuracy. With refit=True the best
# candidate is re-trained on all data passed to .fit().
gridCV_rfc = GridSearchCV(
    RFC_grid,
    params_RFC,
    scoring="balanced_accuracy",
    cv=5,
    refit=True,
    n_jobs=-2,
    verbose=4,
)
gridCV_rfc

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(class_weight='balanced', n_jobs=1,
                                              random_state=132),
             n_jobs=-2,
             param_grid={'max_depth': [3, 4, 5, 6, 8, None],
                         'max_features': [0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6,
                                          0.65, None],
                         'min_samples_split': [2, 3, 4, 5, 6, 7]},
             scoring='balanced_accuracy', verbose=4)

In [6]:
%%time

BalAccuracy_rfc = []

for each_rs in random_states_split:
    start_time = time.time()
    X_other_temp, X_test_temp, y_other_temp, y_test_temp = \
                        train_test_split(X_all, y_all, 
                                         test_size = 0.1, 
                                         random_state = each_rs, 
                                         stratify = y_all)
    
    gridCV_rfc.fit(X_other_temp, y_other_temp)
    
    BalAccuracy_rfc.append(gridCV_rfc.best_score_)
    
    print(gridCV_rfc.best_params_, flush = True)
    
    end_time = time.time()
    print(f"Iteration {each_rs}: {end_time - start_time:.3f} seconds", flush = True)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
{'max_depth': 4, 'max_features': 0.6, 'min_samples_split': 2}
Iteration 123: 97.916 seconds
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
{'max_depth': 5, 'max_features': 0.6, 'min_samples_split': 2}
Iteration 456: 95.713 seconds
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
{'max_depth': 8, 'max_features': 0.3, 'min_samples_split': 7}
Iteration 789: 95.770 seconds
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
{'max_depth': 4, 'max_features': 0.6, 'min_samples_split': 2}
Iteration 741: 94.420 seconds
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
{'max_depth': 5, 'max_features': 0.65, 'min_samples_split': 3}
Iteration 852: 101.125 seconds
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
{'max_depth': 6, 'max_features': 0.3, 'min_samples_split': 2}
Iteration 963: 105.694 seconds
Fitting 5 folds for each of 324 candidates, totalling 1620 fits

In [7]:
# NOTE: gridCV_rfc is re-fitted on every pass of the split loop, so this shows
# the best parameters found for the *last* split only, not a consensus across
# all splits.
gridCV_rfc.best_params_

{'max_depth': 8, 'max_features': 0.35, 'min_samples_split': 4}

In [8]:
# Summarise the per-split scores as "mean (std)"; std is the population
# standard deviation (NumPy's default, ddof=0).
scores_arr = np.asarray(BalAccuracy_rfc)
print(f"{scores_arr.mean():.4f} ({scores_arr.std():.4f})")

0.6898 (0.0027)


In [9]:
# Persist the per-split balanced-accuracy scores as JSON.
# `json` and `Path` are imported in the notebook's top import cell.
# Cast to built-in floats so serialisation does not depend on the NumPy scalar
# type held in the list.
rfc_scores = {"rfc": [float(score) for score in BalAccuracy_rfc]}

results_path = Path("../results/rfc_scores.json")
# Opening a file for writing fails if its directory is missing, so create it
# first; this is a no-op when the directory already exists.
results_path.parent.mkdir(parents=True, exist_ok=True)

with results_path.open("w") as outfile:
    json.dump(rfc_scores, outfile)
    
# with open("logistic.json", "r") as readfile:
#     dict_data = json.load(readfile)