In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import time
from sklearn.linear_model import LinearRegression
import joblib



def train_random_forest_model(df, target_col, param_grid):
    start_time = time.time()
    # Split the data into training, validation, and testing sets
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)  # 0.1765 = 15/85

    
    # Print the shape of the datasets
   # print("Shape of X: ", X.shape)
    #print("Shape of X_train: ", X_train.shape)
    #print("Shape of X_test: ", X_test.shape)
    #print("Shape of X_val: ", X_val.shape)
    
    
    # Create a random forest regressor object
    rf = RandomForestRegressor()

    # Perform grid search to find the best hyperparameters
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and corresponding mean cross-validated score on validation set
    #print(f'Best hyperparameters: {grid_search.best_params_}')
    #print(f'Best cross-validated score on validation set: {grid_search.best_score_}')

    # Use the best hyperparameters to create a random forest model
    best_rf = RandomForestRegressor(**grid_search.best_params_)
    best_rf.fit(X_trainval, y_trainval)

    # Use the trained model to predict the target variable on the validation set
    y_pred_val = best_rf.predict(X_val)

    # Compute the RMSE on the validation set
    val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

    # Use the trained model to predict the target variable on the test set
    y_pred_test = best_rf.predict(X_test)

    # Compute the RMSE on the test set
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    return best_rf, grid_search.best_params_, val_rmse, test_rmse, elapsed_time

In [2]:
def train_linear_regression_model(df, target_col, param_grid):
    start_time = time.time()
    # Split the data into training, validation, and testing sets
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)  # 0.1765 = 15/85

    # Print the shape of the datasets
    #print("Shape of X: ", X.shape)
    #print("Shape of X_train: ", X_train.shape)
    #print("Shape of X_test: ", X_test.shape)
   ## print("Shape of X_val: ", X_val.shape)

    # Create a linear regression object
    lr = LinearRegression()

    # Perform grid search to find the best hyperparameters
    grid_search = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and corresponding mean cross-validated score on validation set
    #print(f'Best hyperparameters: {grid_search.best_params_}')

    # Use the best hyperparameters to create a linear regression model
    best_lr = LinearRegression(**grid_search.best_params_)
    best_lr.fit(X_trainval, y_trainval)

    # Use the trained model to predict the target variable on the validation set
    y_pred_val = best_lr.predict(X_val)

    # Compute the RMSE on the validation set
    val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

    # Use the trained model to predict the target variable on the test set
    y_pred_test = best_lr.predict(X_test)

    # Compute the RMSE on the test set
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

    end_time = time.time()
    elapsed_time = end_time - start_time

    return best_lr, grid_search.best_params_, val_rmse, test_rmse, elapsed_time

In [3]:
#Load dataset and set target_col
df = pd.read_csv('./data/mvoies_processed_noTitle.csv')
target_col = "Rating"

#### RF 

In [4]:
# Parameters for RF
rf_param_grid = {
    'n_estimators': [25, 50],
    'max_depth': [1, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}
# model, best_params, val_rmse, test_rmse, time
model_rf = train_random_forest_model(df, target_col, rf_param_grid)

#### Lin Reg

In [5]:
# Parameters for the function
lin_param_grid = {
    'fit_intercept': [True, False],
    'normalize': [True, False]
}
# Call the function
# model, best_params, val_rmse, test_rmse, time
model_lin = train_linear_regression_model(df, target_col, lin_param_grid)

#### Overview of models with tuned hyper params

In [6]:
models_overview = { 'Model': ['RF', 'Lin'],
                    'Val_RMSE': [model_rf[2], model_lin[2]],
                    'Test_RMSE': [model_rf[3], model_lin[3]],
                    'Best_params': [model_rf[1], model_lin[1]],}
df = pd.DataFrame(models_overview)
df.head()

Unnamed: 0,Model,Val_RMSE,Test_RMSE,Best_params
0,RF,0.602429,0.760805,"{'max_depth': 5, 'min_samples_leaf': 1, 'min_s..."
1,Lin,0.648123,0.753572,"{'fit_intercept': True, 'normalize': False}"


#### Save the models with joblib

In [7]:
filename = './models/model_rf.joblib'
joblib.dump(model_rf[0], filename)

filename = './models/model_lin.joblib'
joblib.dump(model_lin[0], filename)

['./models/model_lin.joblib']