In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import time


def train_random_forest_model(df, target_col, param_grid):
    start_time = time.time()
    # Split the data into training, validation, and testing sets
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)  # 0.1765 = 15/85

    # Create a random forest regressor object
    rf = RandomForestRegressor()

    # Perform grid search to find the best hyperparameters
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and corresponding mean cross-validated score on validation set
    print(f'Best hyperparameters: {grid_search.best_params_}')
    #print(f'Best cross-validated score on validation set: {grid_search.best_score_}')

    # Use the best hyperparameters to create a random forest model
    best_rf = RandomForestRegressor(**grid_search.best_params_)
    best_rf.fit(X_trainval, y_trainval)

    # Use the trained model to predict the target variable on the validation set
    y_pred_val = best_rf.predict(X_val)

    # Compute the RMSE on the validation set
    val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

    # Use the trained model to predict the target variable on the test set
    y_pred_test = best_rf.predict(X_test)

    # Compute the RMSE on the test set
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    return best_rf, grid_search.best_params_, val_rmse, test_rmse, elapsed_time


In [2]:
# Parameters for the function
df = pd.read_csv('mvoies_processed_noTitle.csv')
target_col = "Rating"
param_grid = {
    'n_estimators': [25, 50],
    'max_depth': [1, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}
# Call the function
# model, best_params, val_rmse, test_rmse, time
model_rf = train_random_forest_model(df, target_col, param_grid)

Best hyperparameters: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}


In [4]:
param_grid = {
    'n_estimators': [25, 50, 100],
    'max_depth': [1, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}
# Call the function
# model, best_params, val_rmse, test_rmse, time
model_rf_2 = train_random_forest_model(df, target_col, param_grid)

Best hyperparameters: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}


In [13]:
# Define some sample data
data = {'Best Params': [model_rf[1], model_rf_2[1]],
        '': [model_rf[1], model_rf_2[1]],
        'val RMSE': [model_rf[2], model_rf_2[2]],
        'test RMSE': [model_rf[3], model_rf_2[3]],
        'time': [model_rf[4], model_rf_2[4]],
        'Comments': ["Standard settings", "changed n_est"]}

# Create a pandas DataFrame from the data dictionary
df = pd.DataFrame(data)

# Print the DataFrame
df.head()

Unnamed: 0,Best Params,Unnamed: 2,val RMSE,test RMSE,time,Comments
0,"{'max_depth': 5, 'min_samples_leaf': 2, 'min_s...","{'max_depth': 5, 'min_samples_leaf': 2, 'min_s...",0.602688,0.75615,13.533924,Standard settings
1,"{'max_depth': 5, 'min_samples_leaf': 2, 'min_s...","{'max_depth': 5, 'min_samples_leaf': 2, 'min_s...",0.597853,0.755815,31.293541,changed n_est
