# Hyperparameter Tuning the Random Forest

1. Perform a randomized search to narrow down the range for each hyperparameter 
2. Perform a grid search with narrower ranges 

Source: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

# Setup

In [1]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# Load the data

In [14]:
df = pd.read_csv(os.getcwd()+"/data/day.csv")
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


# Feature selection

In [3]:
y = df["cnt"]

# choose only importat predictors (see feature importance plot)
X = df.loc[:,['instant', 'weekday', 'workingday', 'temp', 'hum', 'windspeed']]

# add dummy-encoding of categorical variables
X = pd.concat([X,pd.get_dummies(df.mnth, prefix="mnth")],axis=1) # one-hot-encoded month
X = pd.concat([X,pd.get_dummies(df.weathersit, prefix="weathersit")],axis=1) # one-hot-encoded weathersit
X = pd.concat([X,pd.get_dummies(df.season, prefix="season")],axis=1) # one-hot-encoded season

# Train-Test-Split

In [4]:
RANDOM_SPLIT = True

if RANDOM_SPLIT:

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
else:

    # don't split random but take the first 18 month as train data and the last 6 month as test data
    test_idx = df[(df["yr"]==1) & (df["mnth"]>5)].index

    X_test=X.iloc[test_idx,]
    y_test=y.iloc[test_idx,]

    X_train=X.drop(test_idx)
    y_train=y.drop(test_idx)

# Hyperparameter Tuning the Random Forest

## Random Search Cross Validation

In [5]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state = 42)

from pprint import pprint

# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 'warn',
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


### Random Hyperparameter Grid

In [6]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


### Random Search Training

In [7]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose = 2, random_state = 42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   31.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  4.4min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [8]:
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

## Evaluate Random Search

In [9]:
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import r2_score

def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    
    print("R² = {:.4}".format(r2_score(test_labels, predictions)))
    
    mse = mean_squared_error(test_labels, predictions)
    print("MSE = {0:,.0f}".format(mse))
    print("RMSE = {0:,.0f}".format(np.sqrt(mse)))
    
    return mse

base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
base_model.fit(X_train, y_train)
best_random = rf_random.best_estimator_

In [10]:
print("Base Model Performance:")
base_mse = evaluate(base_model, X_test, y_test)

print("\nBest Model Performance (Random Search):")
random_mse = evaluate(best_random, X_test, y_test)

print('\nMSE improvement compared to base of {:0.2f}%.'.format( 100 * (random_mse - base_mse) / base_mse))

Base Model Performance:
R² = 0.8639
MSE = 524,713
RMSE = 724

Best Model Performance (Random Search):
R² = 0.8944
MSE = 406,799
RMSE = 638

MSE improvement compared to base of -22.47%.


## Grid Search with Cross Validation

In [11]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [False],
    'max_depth': [125, 150, None],
    'max_features': ["sqrt"],
    'min_samples_leaf': [1, 2],
    'min_samples_split': [2, 3],
    'n_estimators': [300, 400, 500]
}

# Create a based model
rf = RandomForestRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:   33.8s finished


{'bootstrap': False,
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 300}

## Evaluate Grid Search

In [12]:
best_grid = grid_search.best_estimator_
print("Best Model Performance (Grid Search):")
grid_mse = evaluate(best_grid, X_test, y_test)

Best Model Performance (Grid Search):
R² = 0.899
MSE = 389,199
RMSE = 624


In [13]:
print('MSE improvement compared to base of {:0.2f}%.'.format( 100 * (grid_mse - base_mse) / base_mse))
print('MSE improvement compared to random search of {:0.2f}%.'.format( 100 * (grid_mse - random_mse) / random_mse))

MSE improvement compared to base of -25.83%.
MSE improvement compared to random search of -4.33%.
