# Import Statements

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Load Data

In [2]:
file = open('../../processed_data.pkl', 'rb')
data = pickle.load(file)
file.close()

In [3]:
train_x, train_y, test_x, test_y = data['train_x'], data['train_y'], data['test_x'], data['test_y']

# Random Forest

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [5]:
random_forest_model = RandomForestClassifier(random_state=0, n_jobs=-1)

In [6]:
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 5, 10, 20, 40, 50],
    'min_samples_leaf': [5, 10, 20],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [7]:
# Perform Grid Search on paramaters specified by params

grid_search = GridSearchCV(estimator=random_forest_model, param_grid=params, cv=5, n_jobs=-1, verbose=1, scoring = 'accuracy')

In [8]:
grid_search.fit(train_x, train_y)

Fitting 5 folds for each of 126 candidates, totalling 630 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1, random_state=0),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 5, 10, 20, 40, 50],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [5, 10, 20]},
             scoring='accuracy', verbose=1)

In [9]:
# Find best model

grid_search.best_estimator_

RandomForestClassifier(criterion='entropy', max_depth=40, min_samples_leaf=5,
                       n_jobs=-1, random_state=0)

In [10]:
best_random_forest_model = RandomForestClassifier(criterion='entropy', max_depth=40, min_samples_leaf=5, n_jobs=-1, random_state=0)

In [11]:
best_random_forest_model.fit(train_x, train_y)
best_random_forest_model_predictions = best_random_forest_model.predict(test_x)

In [12]:
print('Train Accuracy: {} %'.format(100*best_random_forest_model.score(train_x, train_y)))
print('Test Accuracy: {} %'.format(100*best_random_forest_model.score(test_x, test_y)))

Train Accuracy: 98.8896653466884 %
Test Accuracy: 95.96273291925466 %
