Hyper-parameter search for a random forest model using grid search (scikit-learn's `GridSearchCV`)

In [1]:
from data_preprocess import data_preprocess, get_training_data, get_input_data, prepare_submission
import pandas as pd
import warnings
import sys

if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [2]:
# Load the preprocessed data set.
# NOTE(review): one_hot_location=True presumably produces the "A", "B", "C"
# location columns listed below -- confirm in data_preprocess.
data = data_preprocess(one_hot_location=True)

# Columns fed to the model. The order matters: feature importances are later
# matched back to these names by position.
features = [
    "direct_rad:W",
    "direct_rad_1h:J",
    "clear_sky_rad:W",
    "clear_sky_energy_1h:J",
    "diffuse_rad:W",
    "sun_elevation:d",
    "is_day:idx",
    "is_in_shadow:idx",
    "diffuse_rad_1h:J",
    "t_1000hPa:K",
    "relative_humidity_1000hPa:p",
    "air_density_2m:kgm3",
    "A",
    "B",
    "C",
]

# Training inputs (restricted to `features`) and the matching targets.
X_train, targets = get_training_data(data, features)

In [3]:
# Fit a single random forest with default hyper-parameters so that its
# feature importances can be inspected in the following cells.

from sklearn.ensemble import RandomForestRegressor

# Fixed seed: a random forest draws bootstrap samples and feature subsets at
# random, so without it the reported importances change on every run.
RANDOM_STATE = 42

X = X_train.values
y = targets

forest_reg = RandomForestRegressor(random_state=RANDOM_STATE)
forest_reg.fit(X, y)

In [4]:
# Pair every feature name with the importance the fitted forest assigned to it
# (matched by position), then split the pairs into the two table columns.
importance_pairs = list(zip(features, forest_reg.feature_importances_))

names_list = [feature_name for feature_name, _ in importance_pairs]
scores_list = [importance for _, importance in importance_pairs]

feature_importance = pd.DataFrame(
    {
        "feature": names_list,
        "relative_importance": scores_list,
    }
)

In [5]:
# Display the features ranked from most to least important.
feature_importance.sort_values(by="relative_importance", ascending=False)

Unnamed: 0,feature,relative_importance
0,direct_rad:W,0.4073669
12,A,0.3704958
4,diffuse_rad:W,0.0716371
1,direct_rad_1h:J,0.02244493
9,t_1000hPa:K,0.02075875
2,clear_sky_rad:W,0.02040643
5,sun_elevation:d,0.01928038
10,relative_humidity_1000hPa:p,0.01723909
3,clear_sky_energy_1h:J,0.01705353
11,air_density_2m:kgm3,0.01647087


## Hyper-parameter search

In [20]:
# Grid search over random forest hyper-parameters, scored by negative mean
# squared error with 5-fold cross-validation.
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Fixed seed so that the cross-validation scores, and therefore the selected
# parameters, are reproducible across runs.
RANDOM_STATE = 42

X = X_train.values
y = targets

# Two sub-grids with the same tree counts and max_features: the first uses the
# estimator's default bootstrap setting, the second disables bootstrapping.
param_grid = [
    {"n_estimators": [70, 80, 90], "max_features": [3]},
    {"bootstrap": [False], "n_estimators": [70, 80, 90], "max_features": [3]},
]

# Deliberately not named `forest_reg`: that name holds the forest fitted for
# the feature-importance cells, and rebinding it to an unfitted estimator
# would break those cells if they were re-run after this one.
search_estimator = RandomForestRegressor(random_state=RANDOM_STATE)

grid_search = GridSearchCV(
    search_estimator,
    param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    return_train_score=True,
)

grid_search.fit(X, y)

In [21]:
# Keep the winning estimator from the finished search for prediction ...
best_model = grid_search.best_estimator_

# ... and report the hyper-parameter combination that produced it.
print(grid_search.best_params_)

{'max_features': 3, 'n_estimators': 80}


In [22]:
# Run the best model from the grid search on the test inputs.

X_test = get_input_data()

# Select the same feature columns, in the same order, as used for training.
test_matrix = X_test[features].values
predictions = best_model.predict(test_matrix)

In [23]:
# Write the submission to file.
from pathlib import Path

submission_path = Path("submissions/random_forest_tuned_params.csv")
# to_csv does not create missing directories, so make sure the target folder
# exists before writing (no-op when it is already there).
submission_path.parent.mkdir(parents=True, exist_ok=True)

submission = prepare_submission(X_test, predictions)
submission.to_csv(submission_path, index=False)