# Random Forest Regression

## Import packages

In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    root_mean_squared_error,
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    make_scorer,
)
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
# Folder names are relative paths, so they resolve against the notebook's
# working directory.
MODELS_FOLDER = "models"  # fitted estimators are pickled into this folder
CLEAN_DATA_FOLDER = "clean_data"  # train.csv / test.csv are read from here

## Load the dataframe

In [3]:
# Read the two CSV splits from the clean-data folder.
train_df = pd.read_csv(os.path.join(CLEAN_DATA_FOLDER, "train.csv"))
test_df = pd.read_csv(os.path.join(CLEAN_DATA_FOLDER, "test.csv"))

# Only the test split is re-ordered; the original row labels are kept
# (no reset_index), so test_X / test_y below stay aligned with test_df.
test_df = test_df.sort_values(
    by=["Day", "Line", "Service", "Direction Number", "Sequence"]
)

## Split into X and y

In [4]:
# "On" is the regression target. "Off" is excluded from the features as well.
# errors="ignore" keeps the original behaviour of silently tolerating a frame
# that lacks one of the two columns.
train_y = train_df["On"]
train_X = train_df.drop(columns=["On", "Off"], errors="ignore")

test_y = test_df["On"]
test_X = test_df.drop(columns=["On", "Off"], errors="ignore")

## Train the Random Forest Regressor Model

In [5]:
# Baseline forest with hand-picked hyperparameters. fit() returns the
# estimator itself, so construction and training are chained.
rf = RandomForestRegressor(
    criterion="poisson",
    n_estimators=70,
    max_depth=10,
    min_samples_split=14,
    min_samples_leaf=7,
    random_state=42,  # fixed seed so the fitted forest is reproducible
    n_jobs=-1,  # use every available core
    verbose=1,  # print joblib progress lines while fitting and predicting
).fit(X=train_X, y=train_y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:   51.1s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:  2.0min finished


In [6]:
# Predict on the train split first, then the test split, and convert each
# prediction to an integer by flooring.
# NOTE(review): floor always rounds down, so every prediction is shifted low by
# up to one unit compared with rounding to nearest -- confirm this is intended.
train_y_pred, test_y_pred = (
    np.floor(rf.predict(features)).astype(int) for features in (train_X, test_X)
)

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.9s
[Parallel(n_jobs=10)]: Done  70 out of  70 | elapsed:    1.8s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.2s
[Parallel(n_jobs=10)]: Done  70 out of  70 | elapsed:    0.4s finished


## Report Train and Test results

In [10]:
# Score the floored train-split predictions; one "<caption> <value>" line per metric.
for caption, metric in (
    ("train rmse:", root_mean_squared_error),
    ("train mae:", mean_absolute_error),
    ("train r2 score:", r2_score),
):
    print(caption, metric(train_y, train_y_pred))

train rmse: 30.808197380716646
train mae: 11.126226269148548
train r2 score: 0.6146830921831571


In [11]:
# Score the floored test-split predictions; one "<caption> <value>" line per metric.
for caption, metric in (
    ("test rmse:", root_mean_squared_error),
    ("test mae:", mean_absolute_error),
    ("test r2 score:", r2_score),
):
    print(caption, metric(test_y, test_y_pred))

test rmse: 28.12373943507795
test mae: 10.738565845992186
test r2 score: 0.6609586390387721


## Export Model

In [12]:
# Persist the baseline forest to MODELS_FOLDER/base_random_forest.pkl.
# The folder is created on demand so the export cannot fail with
# FileNotFoundError after the model has already been trained.
os.makedirs(MODELS_FOLDER, exist_ok=True)
# The context manager closes (and therefore flushes) the file handle even if
# pickling raises; the bare open() used previously never closed it.
with open(os.path.join(MODELS_FOLDER, "base_random_forest.pkl"), "wb") as model_file:
    pickle.dump(rf, model_file)

In [13]:
# Unbind the baseline forest; it was pickled in the previous cell.
# Presumably done to release its memory before the grid search -- confirm.
# After this cell runs, earlier cells that use `rf` cannot be re-run without
# refitting first.
del rf

## Hyperparameter Tuning with GridSearchCV

| n_estimators | max_depth | max_features | criterion |
| --- | --- | --- | --- |
| 50 | 20 | 1.0 | squared_error, poisson |
| 50 | 100 | sqrt | squared_error, poisson |
| 10 | 70 | 1.0 | squared_error, poisson |
| 10 | 100 | sqrt | squared_error, poisson |

### Declare base model and parameters

In [14]:
# Template estimator for the grid search: these settings are shared by every
# candidate, and the grid fills in the remaining hyperparameters.
base_rf = RandomForestRegressor(
    min_samples_split=14,
    min_samples_leaf=7,
    random_state=42,
    n_jobs=-1,
)
# Four hand-picked (n_estimators, max_depth, max_features) settings. Each one
# becomes its own sub-grid that tries both split criteria, which gives
# 4 x 2 = 8 candidates in total.
param_grid = [
    {
        "criterion": ["squared_error", "poisson"],
        "n_estimators": [n_trees],
        "max_depth": [depth],
        "max_features": [features],
    }
    for n_trees, depth, features in (
        (50, 20, 1.0),
        (50, 100, "sqrt"),
        (10, 70, 1.0),
        (10, 100, "sqrt"),
    )
]

### Declare the scorer and grid search

In [15]:
# GridSearchCV keeps the candidate with the highest score, so MSE is wrapped
# with greater_is_better=False: the scorer then reports the negated MSE, which
# is why the test scores in cv_results_ are negative.
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# 3-fold cross-validation over every candidate in param_grid.
grid_search = GridSearchCV(
    estimator=base_rf,
    param_grid=param_grid,
    scoring=scorer,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

### Train the models

In [16]:
# Run the full cross-validated search on the training split. Kept as the
# cell's last expression so the fitted search object is displayed.
grid_search.fit(X=train_X, y=train_y)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END criterion=squared_error, max_depth=100, max_features=sqrt, n_estimators=50; total time= 8.1min
[CV] END criterion=squared_error, max_depth=100, max_features=sqrt, n_estimators=50; total time= 8.1min
[CV] END criterion=squared_error, max_depth=100, max_features=sqrt, n_estimators=50; total time= 8.1min
[CV] END criterion=poisson, max_depth=100, max_features=sqrt, n_estimators=50; total time= 9.0min
[CV] END criterion=squared_error, max_depth=70, max_features=1.0, n_estimators=10; total time= 4.4min
[CV] END criterion=squared_error, max_depth=70, max_features=1.0, n_estimators=10; total time= 4.6min
[CV] END criterion=squared_error, max_depth=20, max_features=1.0, n_estimators=50; total time=16.2min
[CV] END criterion=squared_error, max_depth=20, max_features=1.0, n_estimators=50; total time=16.3min
[CV] END criterion=squared_error, max_depth=20, max_features=1.0, n_estimators=50; total time=16.3min
[CV] END criterion=s

In [17]:
# Tabulate the search results: one row per candidate with fit/score timings,
# per-fold test scores (negated MSE) and the resulting rank. As the cell's last
# expression, the frame is rendered as the cell output.
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,952.439005,3.939207,21.22672,1.319012,squared_error,20,1.0,50,"{'criterion': 'squared_error', 'max_depth': 20...",-646.296259,-632.324947,-616.958666,-631.859957,11.981534,3
1,983.470781,4.749416,26.966395,0.295409,poisson,20,1.0,50,"{'criterion': 'poisson', 'max_depth': 20, 'max...",-650.13212,-634.113301,-617.242829,-633.829417,13.428498,4
2,396.622631,1.605044,88.440941,1.966319,squared_error,100,sqrt,50,"{'criterion': 'squared_error', 'max_depth': 10...",-646.277362,-626.215375,-608.161506,-626.884748,15.56793,2
3,461.949658,13.443393,72.725101,17.017472,poisson,100,sqrt,50,"{'criterion': 'poisson', 'max_depth': 100, 'ma...",-643.378585,-624.761256,-608.955946,-625.698596,14.068605,1
4,238.388481,11.150457,21.470436,3.479873,squared_error,70,1.0,10,"{'criterion': 'squared_error', 'max_depth': 70...",-656.653354,-647.026251,-628.121018,-643.933541,11.851785,8
5,154.969249,40.149604,6.993307,7.433801,poisson,70,1.0,10,"{'criterion': 'poisson', 'max_depth': 70, 'max...",-657.416945,-645.035929,-624.18459,-642.212488,13.713162,7
6,70.715432,2.086524,11.87817,1.848672,squared_error,100,sqrt,10,"{'criterion': 'squared_error', 'max_depth': 10...",-659.802431,-641.834128,-622.700009,-641.445523,15.149493,6
7,67.959576,0.846589,6.977731,0.478771,poisson,100,sqrt,10,"{'criterion': 'poisson', 'max_depth': 100, 'ma...",-657.645542,-640.975928,-620.729546,-639.783672,15.094453,5


In [18]:
# Show the hyperparameter combination of the top-ranked candidate.
print(grid_search.best_params_)

{'criterion': 'poisson', 'max_depth': 100, 'max_features': 'sqrt', 'n_estimators': 50}


### Extract the best model

In [19]:
# best_estimator_ is available because GridSearchCV was created with its
# default refit behaviour: the winning configuration is refit on the full
# training data passed to fit().
best_rf = grid_search.best_estimator_

In [20]:
# Re-predict both splits with the tuned forest (train first, then test),
# flooring to integers exactly as was done for the baseline model. This
# overwrites the baseline predictions held in the same two names.
train_y_pred, test_y_pred = (
    np.floor(best_rf.predict(features)).astype(int) for features in (train_X, test_X)
)

### Plot the Tree

In [21]:
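# NOTE: this cell is disabled. Re-enabling it requires `import matplotlib.pyplot as plt`
# and `from sklearn.tree import plot_tree`, which the import cell does not provide.
# _, ax = plt.subplots(3, 1, figsize=(32, 48))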
# for i in range(3):
#     _ = plot_tree(
#         best_rf.estimators_[i],
#         max_depth=4,
#         feature_names=train_X.columns,
#         filled=True,
#         proportion=True,
#         rounded=True,
#         precision=2,
#         fontsize=9,
#         ax=ax[i],
#     )

### Feature Importance

In [22]:
# NOTE: this cell is disabled. Re-enabling it requires `import matplotlib.pyplot as plt`
# and `import seaborn as sns`, which the import cell does not provide.
# feat_imp = pd.DataFrame(
#     {
#         "Feature": [x for x in best_rf.feature_names_in_],
#         "Importance": [x for x in best_rf.feature_importances_],
#     }
# )
# _, ax = plt.subplots(1, 1, figsize=(16, 9))
# _ = sns.barplot(feat_imp, x="Feature", y="Importance")
# _ = plt.title("Feature Importance for Best Random Forest Regressor")

### Visualize the Predictions

In [23]:
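# NOTE: this cell is disabled. Re-enabling it requires `import matplotlib.pyplot as plt`
# and `import seaborn as sns`, which the import cell does not provide.
# NOTE(review): test_y is a Series carrying test_df's row labels, so passing
# index=test_df["Day"] would realign it against the Day values and likely yield
# NaNs -- consider test_y.to_numpy() before re-enabling.
# line_fit = pd.DataFrame({"True": test_y, "Predicted": test_y_pred}, index=test_df["Day"])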
# _, ax = plt.subplots(1, 1, figsize=(16,9))
# _ = sns.lineplot(line_fit, legend=True, ax=ax)

### Report Train and Test results

In [24]:
# Score the tuned model's floored train-split predictions; one
# "<caption> <value>" line per metric.
for caption, metric in (
    ("train rmse:", root_mean_squared_error),
    ("train mae:", mean_absolute_error),
    ("train r2 score:", r2_score),
):
    print(caption, metric(train_y, train_y_pred))

train rmse: 21.91425577774032
train mae: 5.819477091373891
train r2 score: 0.8050429507781465


In [25]:
# Score the tuned model's floored test-split predictions; one
# "<caption> <value>" line per metric.
for caption, metric in (
    ("test rmse:", root_mean_squared_error),
    ("test mae:", mean_absolute_error),
    ("test r2 score:", r2_score),
):
    print(caption, metric(test_y, test_y_pred))

test rmse: 25.305387944095497
test mae: 7.157881301310457
test r2 score: 0.7255061928119706


### Export Model

In [26]:
# Persist the tuned forest to MODELS_FOLDER/tuned_random_forest.pkl.
# The folder is created on demand so the export cannot fail with
# FileNotFoundError after the long grid search has already completed.
os.makedirs(MODELS_FOLDER, exist_ok=True)
# The context manager closes (and therefore flushes) the file handle even if
# pickling raises; the bare open() used previously never closed it.
with open(os.path.join(MODELS_FOLDER, "tuned_random_forest.pkl"), "wb") as model_file:
    pickle.dump(best_rf, model_file)