In [19]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import time
from sklearn.linear_model import LinearRegression
import joblib
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline


In [20]:
def train_random_forest_model(df, target_col, param_grid):
    start_time = time.time()
    # Split the data into training, validation, and testing sets
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)  # 0.1765 = 15/85

    
    # Print the shape of the datasets
   # print("Shape of X: ", X.shape)
    #print("Shape of X_train: ", X_train.shape)
    #print("Shape of X_test: ", X_test.shape)
    #print("Shape of X_val: ", X_val.shape)
    
    
    # Create a random forest regressor object
    rf = RandomForestRegressor()

    # Perform grid search to find the best hyperparameters
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and corresponding mean cross-validated score on validation set
    #print(f'Best hyperparameters: {grid_search.best_params_}')
    #print(f'Best cross-validated score on validation set: {grid_search.best_score_}')

    # Use the best hyperparameters to create a random forest model
    best_rf = RandomForestRegressor(**grid_search.best_params_)
    best_rf.fit(X_trainval, y_trainval)

    # Use the trained model to predict the target variable on the validation set
    y_pred_val = best_rf.predict(X_val)

    # Compute the RMSE on the validation set
    val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

    # Use the trained model to predict the target variable on the test set
    y_pred_test = best_rf.predict(X_test)

    # Compute the RMSE on the test set
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    return best_rf, grid_search.best_params_, val_rmse, test_rmse, elapsed_time

In [21]:
def train_linear_regression_model(df, target_col, param_grid):
    start_time = time.time()
    # Split the data into training, validation, and testing sets
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)  # 0.1765 = 15/85

    # Print the shape of the datasets
    #print("Shape of X: ", X.shape)
    #print("Shape of X_train: ", X_train.shape)
    #print("Shape of X_test: ", X_test.shape)
   ## print("Shape of X_val: ", X_val.shape)

    # Create a linear regression object
    lr = LinearRegression()

    # Perform grid search to find the best hyperparameters
    grid_search = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and corresponding mean cross-validated score on validation set
    #print(f'Best hyperparameters: {grid_search.best_params_}')

    # Use the best hyperparameters to create a linear regression model
    best_lr = LinearRegression(**grid_search.best_params_)
    best_lr.fit(X_trainval, y_trainval)

    # Use the trained model to predict the target variable on the validation set
    y_pred_val = best_lr.predict(X_val)

    # Compute the RMSE on the validation set
    val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

    # Use the trained model to predict the target variable on the test set
    y_pred_test = best_lr.predict(X_test)

    # Compute the RMSE on the test set
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

    end_time = time.time()
    elapsed_time = end_time - start_time

    return best_lr, grid_search.best_params_, val_rmse, test_rmse, elapsed_time

In [22]:
def train_svr_model(df, target_col, param_grid):
    start_time = time.time()
    # Split the data into training, validation, and testing sets
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)  # 0.1765 = 15/85

    
    # Print the shape of the datasets
   # print("Shape of X: ", X.shape)
    #print("Shape of X_train: ", X_train.shape)
    #print("Shape of X_test: ", X_test.shape)
    #print("Shape of X_val: ", X_val.shape)
    
    
    # Create an SVR object
    svr = SVR()

    # Perform grid search to find the best hyperparameters
    grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and corresponding mean cross-validated score on validation set
    #print(f'Best hyperparameters: {grid_search.best_params_}')
    #print(f'Best cross-validated score on validation set: {grid_search.best_score_}')

    # Use the best hyperparameters to create an SVR model
    best_svr = SVR(**grid_search.best_params_)
    best_svr.fit(X_trainval, y_trainval)

    # Use the trained model to predict the target variable on the validation set
    y_pred_val = best_svr.predict(X_val)

    # Compute the RMSE on the validation set
    val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

    # Use the trained model to predict the target variable on the test set
    y_pred_test = best_svr.predict(X_test)

    # Compute the RMSE on the test set
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    return best_svr, grid_search.best_params_, val_rmse, test_rmse, elapsed_time

In [23]:
def train_dt_model(df, target_col, param_grid):
    start_time = time.time()
    # Split the data into training, validation, and testing sets
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)  # 0.1765 = 15/85

    # Create a DecisionTreeRegressor object
    dt = DecisionTreeRegressor()

    # Perform grid search to find the best hyperparameters
    grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and corresponding mean cross-validated score on validation set
    #print(f'Best hyperparameters: {grid_search.best_params_}')
    #print(f'Best cross-validated score on validation set: {grid_search.best_score_}')

    # Use the best hyperparameters to create a DecisionTreeRegressor model
    best_dt = DecisionTreeRegressor(**grid_search.best_params_)
    best_dt.fit(X_trainval, y_trainval)

    # Use the trained model to predict the target variable on the validation set
    y_pred_val = best_dt.predict(X_val)

    # Compute the RMSE on the validation set
    val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

    # Use the trained model to predict the target variable on the test set
    y_pred_test = best_dt.predict(X_test)

    # Compute the RMSE on the test set
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    return best_dt, grid_search.best_params_, val_rmse, test_rmse, elapsed_time

In [24]:
def train_knn_model(df, target_col, param_grid):
    start_time = time.time()
    # Split the data into training, validation, and testing sets
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)  # 0.1765 = 15/85

    # Create a KNN regressor object
    knn = KNeighborsRegressor()

    # Perform grid search to find the best hyperparameters
    grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and corresponding mean cross-validated score on validation set
    #print(f'Best hyperparameters: {grid_search.best_params_}')
    #print(f'Best cross-validated score on validation set: {grid_search.best_score_}')

    # Use the best hyperparameters to create a KNNRegressor model
    best_knn = KNeighborsRegressor(**grid_search.best_params_)
    best_knn.fit(X_trainval, y_trainval)

    # Use the trained model to predict the target variable on the validation set
    y_pred_val = best_knn.predict(X_val)

    # Compute the RMSE on the validation set
    val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

    # Use the trained model to predict the target variable on the test set
    y_pred_test = best_knn.predict(X_test)

    # Compute the RMSE on the test set
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    return best_knn, grid_search.best_params_, val_rmse, test_rmse, elapsed_time


In [25]:
def train_lasso_model(df, target_col, param_grid):
    start_time = time.time()
    # Split the data into training, validation, and testing sets
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)  # 0.1765 = 15/85

    # Create a Lasso object
    lasso = Lasso()

    # Perform grid search to find the best hyperparameters
    grid_search = GridSearchCV(estimator=lasso, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and corresponding mean cross-validated score on validation set
    #print(f'Best hyperparameters: {grid_search.best_params_}')
    #print(f'Best cross-validated score on validation set: {grid_search.best_score_}')

    # Use the best hyperparameters to create a Lasso model
    best_lasso = Lasso(**grid_search.best_params_)
    best_lasso.fit(X_trainval, y_trainval)

    # Use the trained model to predict the target variable on the validation set
    y_pred_val = best_lasso.predict(X_val)

    # Compute the RMSE on the validation set
    val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

    # Use the trained model to predict the target variable on the test set
    y_pred_test = best_lasso.predict(X_test)

    # Compute the RMSE on the test set
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    return best_lasso, grid_search.best_params_, val_rmse, test_rmse, elapsed_time


In [26]:
def train_pol_model(df, target_col, param_grid):
    start_time = time.time()
    # Split the data into training, validation, and testing sets
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)  # 0.1765 = 15/85

    # Create a Pipeline object that includes PolynomialFeatures and LinearRegression
    pol_regression = Pipeline([('poly', PolynomialFeatures()),
                               ('linear', LinearRegression())])

    # Set up the GridSearchCV object with the given param_grid and the pol_regression pipeline
    grid_search = GridSearchCV(pol_regression, param_grid, cv=5, return_train_score=True)

    # Fit the model on the training data
    grid_search.fit(X_trainval, y_trainval)

    # Use the best estimator from the grid search to predict the target variable on the validation set
    y_pred_val = grid_search.best_estimator_.predict(X_val)

    # Compute the RMSE on the validation set using the best estimator
    val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

    # Use the best estimator from the grid search to predict the target variable on the test set
    y_pred_test = grid_search.best_estimator_.predict(X_test)

    # Compute the RMSE on the test set using the best estimator
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    return grid_search.best_estimator_, grid_search.best_params_, val_rmse, test_rmse, elapsed_time


### Load dataset and set target_col

In [27]:
df = pd.read_csv('./data/mvoies_processed_noTitle.csv')
target_col = "Rating"

#### RF 

In [28]:
# Parameters for RF
rf_param_grid = {
    'n_estimators': [25, 50],
    'max_depth': [1, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}
# model, best_params, val_rmse, test_rmse, time
model_rf = train_random_forest_model(df, target_col, rf_param_grid)

#### Lin Reg

In [29]:
# Parameters for the function
lin_param_grid = {
    'fit_intercept': [True, False],
    'normalize': [True, False]
}
# Call the function
# model, best_params, val_rmse, test_rmse, time
model_lin = train_linear_regression_model(df, target_col, lin_param_grid)

#### SVR

In [30]:
# Define the parameter grid
svr_param_grid = {
    'C': [1.0],
    'kernel': ['rbf'],
    'gamma': ['scale'],
    'epsilon': [0.1]
}

# Train the SVR model
model_svr = train_svr_model(df, target_col, svr_param_grid)

#### Descision Tree

In [31]:
# Define the hyperparameter grid for DecisionTreeRegressor
dt_param_grid = {
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5],
}

# Call the train_dt_model function with default hyperparameters
model_dt = train_dt_model(df, target_col, dt_param_grid)

#### KNN

In [32]:
# Load your data into a pandas DataFrame called 'df'
# Define your target column name as a string called 'target_col'

# Define the hyperparameter grid
param_grid = {
    'n_neighbors': [5],
    'weights': ['uniform'],
    'p': [2],
}

# Train the KNN model with default hyperparameters
model_knn = train_knn_model(df, target_col, param_grid)

#### Lasso

In [33]:
# Load your data into a pandas DataFrame called 'df'
# Define your target column name as a string called 'target_col'

# Define the hyperparameter grid
param_grid = {
    'alpha': [1.0],
    'fit_intercept': [True],
    'normalize': [False],
    'precompute': [False],
    'max_iter': [1000],
    'tol': [0.0001],
    'warm_start': [False],
    'positive': [False],
    'random_state': [None],
    'selection': ['cyclic']
}

# Train the Lasso model with default hyperparameters
model_lasso = train_lasso_model(df, target_col, param_grid)

#### Polynomal

In [34]:
param_grid = {
    'poly__degree': [2],
    'poly__include_bias': [True],
    'poly__interaction_only': [False],
    'linear__fit_intercept': [True],
    'linear__normalize': [False]
}
# Train the model with default hyperparameters
model_pol = train_pol_model(df, target_col, param_grid)

#### Overview of models with tuned hyper params

In [35]:
models_overview = { 'Model': ['RF', 'Lin', 'SVR', 'D T', 'KNN', 'Lasso', 'Polynomal'],
                    'Val_RMSE': [model_rf[2], model_lin[2], model_svr[2], model_dt[2], model_knn[2], model_lasso[2], model_pol[2]],
                    'Test_RMSE': [model_rf[3], model_lin[3], model_svr[3], model_dt[3], model_knn[3], model_lasso[3], model_pol[3]]}
df = pd.DataFrame(models_overview)
df.head(12)

Unnamed: 0,Model,Val_RMSE,Test_RMSE
0,RF,0.605432,0.756354
1,Lin,0.648123,0.753572
2,SVR,0.753211,0.893219
3,D T,0.681305,0.793774
4,KNN,0.703461,0.956798
5,Lasso,0.696739,0.815483
6,Polynomal,0.666103,0.878999


#### Save the models with joblib

In [37]:
def save_models():
    models = [model_rf[0], model_lin[0], model_svr[0], model_dt[0], model_knn[0], model_lasso[0], model_pol[0]]
    file_name = ["rf", "lin", "svr","dt", "knn", "lasso", "pol"]
    for i, model in enumerate(models):
        filename = f'./models/model_{file_name[i]}.joblib'
        joblib.dump(model, filename)

#save_models()