In [14]:
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid

In [18]:
# Seed passed as random_state to every RandomForestRegressor in this notebook
# so that repeated runs build the same forests.
SEED = 7

# Directory that holds the train/val/test parquet files. Kept in a single
# constant so the notebook can be pointed at another copy of the data by
# editing one line instead of three.
DATA_DIR = '/Users/Sheffin/Downloads/760Data'

# Load the train, validation and test datasets.
train_df = pd.read_parquet(f'{DATA_DIR}/train.parquet', engine='fastparquet')
val_df = pd.read_parquet(f'{DATA_DIR}/val.parquet', engine='fastparquet')
test_df = pd.read_parquet(f'{DATA_DIR}/test.parquet', engine='fastparquet')

In [8]:
# Separate the feature matrix from the target for each split: 'r_useful' is
# the column being predicted, and 'r_id' is removed from the features
# together with it.
X_train, y_train = train_df.drop(columns=['r_id', 'r_useful']), train_df['r_useful']
X_val, y_val = val_df.drop(columns=['r_id', 'r_useful']), val_df['r_useful']
X_test, y_test = test_df.drop(columns=['r_id', 'r_useful']), test_df['r_useful']

In [16]:
def experiment(parameters):
    """Fit one random forest per hyperparameter combination and print its scores.

    Every combination produced by ``ParameterGrid(parameters)`` is used to
    configure a fresh ``RandomForestRegressor(random_state=SEED)``, which is
    fitted on the training split and scored on the validation split. RMSE and
    MAE are printed for each model; nothing is selected or returned, so the
    best combination has to be read off the printed scores.

    The function reads the module-level names ``SEED``, ``X_train``,
    ``y_train``, ``X_val`` and ``y_val``; those cells must have been run first.

    Parameters
    ----------
    parameters : mapping (or list of mappings)
        Hyperparameter name -> list of candidate values, in the form accepted
        by ``sklearn.model_selection.ParameterGrid``.

    Returns
    -------
    None
    """
    grid = ParameterGrid(parameters)

    # A distinct loop name keeps the ``parameters`` argument from being
    # overwritten by the individual combinations.
    for i, params in enumerate(grid):
        print(f"\n{i+1}th model:")
        print("\n parameters used in model", params)
        randomforest = RandomForestRegressor(random_state=SEED)
        randomforest.set_params(**params)
        randomforest.fit(X_train, y_train)

        y_pred = randomforest.predict(X_val)
        # RMSE is taken as sqrt(MSE) rather than via ``squared=False``: that
        # keyword is deprecated/removed in newer scikit-learn releases, while
        # this form works on every version. For a single target column the
        # two are the same number.
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        mae = mean_absolute_error(y_val, y_pred)
        print(f"{i+1}th model scores -> RMSE:", rmse, "MAE:", mae)
        

In [20]:
# Hyperparameters of RandomForestRegressor referenced in this search:
#   n_estimators     = number of trees in the forest
#   max_features     = max number of features considered for splitting a node
#   max_depth        = max number of levels in each decision tree
#   min_samples_leaf = min number of data points allowed in a leaf node
#   bootstrap        = method for sampling data points (with or without replacement)
#   ccp_alpha        = cost complexity pruning (not part of the grid below)
#
# Only n_estimators takes several values here; every other entry is a
# single-element list, so the grid expands to four models.
#
# max_features is written as 1.0 (use all features) instead of 'auto': for a
# regressor 'auto' meant "all features", and newer scikit-learn releases no
# longer accept the string.
hyperparams = {
    'n_estimators': [20, 50, 70, 100],
    'max_features': [1.0],
    'max_depth': [None],
    'bootstrap': [True],
    'min_samples_leaf': [1],
}

experiment(hyperparams)

1th model:

 parameters used in model {'bootstrap': True, 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 20}
1th model scores -> RMSE: 12.86403129049243 MAE: 0.23599776167293995
2th model:

 parameters used in model {'bootstrap': True, 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 50}
2th model scores -> RMSE: 12.367259409572226 MAE: 0.22844275233126396
3th model:

 parameters used in model {'bootstrap': True, 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 70}
3th model scores -> RMSE: 12.397391783330594 MAE: 0.22749873440560037
4th model:

 parameters used in model {'bootstrap': True, 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 100}
4th model scores -> RMSE: 12.436233056160098 MAE: 0.22621447777595993


In [29]:
def _print_scores(split_name, y_true, y_hat):
    """Print RMSE and MAE for one data split and return them as ``(rmse, mae)``.

    ``split_name`` is only used in the printed label (e.g. "train").
    """
    # sqrt(MSE) instead of ``squared=False``: the keyword is deprecated/removed
    # in newer scikit-learn releases; for a single target column the result is
    # the same number.
    rmse = np.sqrt(mean_squared_error(y_true, y_hat))
    mae = mean_absolute_error(y_true, y_hat)
    print(f"\n model {split_name} score -> RMSE:", rmse, "MAE:", mae)
    return rmse, mae


# Final model. max_features=1.0 (all features) is the version-independent
# spelling of what 'auto' meant for a regressor; n_jobs=-1 builds the trees
# on all available cores.
randomforest = RandomForestRegressor(random_state=SEED, bootstrap=True, max_depth=None,
                                     max_features=1.0, min_samples_leaf=1,
                                     n_estimators=100, n_jobs=-1)
# Fit the model on the train set.
randomforest.fit(X_train, y_train)

# Score the fitted model on each split. The prediction arrays keep their own
# names so they stay available after this cell; rmse/mae end up holding the
# test-set values, as the last split evaluated.
y_pred = randomforest.predict(X_train)
rmse, mae = _print_scores("train", y_train, y_pred)

y_pred_val = randomforest.predict(X_val)
rmse, mae = _print_scores("val", y_val, y_pred_val)

y_pred_test = randomforest.predict(X_test)
rmse, mae = _print_scores("test", y_test, y_pred_test)


 model train score -> RMSE: 3.091398881803236 MAE: 0.07756503772926586

 model val score -> RMSE: 12.278492543419416 MAE: 0.22593024166907016

 model test score -> RMSE: 6.04689950763364 MAE: 0.19814207477258228
