In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import time
from sklearn.linear_model import LinearRegression
import joblib
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline


In [2]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def train_random_forest_model(df, target_col, param_grid):
    start_time = time.time()
    # Split the data into training, validation, and testing sets
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)  # 0.1765 = 15/85

    # Create a random forest regressor object
    rf = RandomForestRegressor()

    # Perform grid search to find the best hyperparameters
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    # Use the best hyperparameters to create a random forest model
    best_rf = RandomForestRegressor(**grid_search.best_params_)
    best_rf.fit(X_trainval, y_trainval)

    # Use the trained model to predict the target variable on the validation set
    y_pred_val = best_rf.predict(X_val)

    # Compute the evaluation metrics on the validation set
    val_mae = mean_absolute_error(y_val, y_pred_val)
    val_mse = mean_squared_error(y_val, y_pred_val)
    val_rmse = np.sqrt(val_mse)
    val_r2 = r2_score(y_val, y_pred_val)

    # Use the trained model to predict the target variable on the test set
    y_pred_test = best_rf.predict(X_test)

    # Compute the evaluation metrics on the test set
    test_mae = mean_absolute_error(y_test, y_pred_test)
    test_mse = mean_squared_error(y_test, y_pred_test)
    test_rmse = np.sqrt(test_mse)
    test_r2 = r2_score(y_test, y_pred_test)

    # Calculate MSE for validation and test sets
    val_mse = mean_squared_error(y_val, y_pred_val)
    test_mse = mean_squared_error(y_test, y_pred_test)
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    return best_rf, grid_search.best_params_, val_mae, val_rmse, val_mse, val_r2, test_mae, test_rmse, test_mse, test_r2, elapsed_time


In [3]:
def train_linear_regression_model(df, target_col, param_grid):
    start_time = time.time()
    # Split the data into training, validation, and testing sets
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)  # 0.1765 = 15/85

    # Create a linear regression object
    lr = LinearRegression()

    # Perform grid search to find the best hyperparameters
    grid_search = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    # Use the best hyperparameters to create a linear regression model
    best_lr = LinearRegression(**grid_search.best_params_)
    best_lr.fit(X_trainval, y_trainval)

    # Use the trained model to predict the target variable on the validation set
    y_pred_val = best_lr.predict(X_val)

    # Compute the MAE, MSE, RMSE, and R2 on the validation set
    val_mae = mean_absolute_error(y_val, y_pred_val)
    val_mse = mean_squared_error(y_val, y_pred_val)
    val_rmse = np.sqrt(val_mse)
    val_r2 = r2_score(y_val, y_pred_val)

    # Use the trained model to predict the target variable on the test set
    y_pred_test = best_lr.predict(X_test)

    # Compute the MAE, MSE, RMSE, and R2 on the test set
    test_mae = mean_absolute_error(y_test, y_pred_test)
    test_mse = mean_squared_error(y_test, y_pred_test)
    test_rmse = np.sqrt(test_mse)
    test_r2 = r2_score(y_test, y_pred_test)

    end_time = time.time()
    elapsed_time = end_time - start_time

    return best_lr, grid_search.best_params_, val_mae, val_mse, val_rmse, val_r2, test_mae, test_mse, test_rmse, test_r2, elapsed_time


In [4]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def train_svr_model(df, target_col, param_grid):
    start_time = time.time()
    # Split the data into training, validation, and testing sets
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)  # 0.1765 = 15/85

    
    # Print the shape of the datasets
   # print("Shape of X: ", X.shape)
    #print("Shape of X_train: ", X_train.shape)
    #print("Shape of X_test: ", X_test.shape)
    #print("Shape of X_val: ", X_val.shape)
    
    
    # Create an SVR object
    svr = SVR()

    # Perform grid search to find the best hyperparameters
    grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and corresponding mean cross-validated score on validation set
    #print(f'Best hyperparameters: {grid_search.best_params_}')
    #print(f'Best cross-validated score on validation set: {grid_search.best_score_}')

    # Use the best hyperparameters to create an SVR model
    best_svr = SVR(**grid_search.best_params_)
    best_svr.fit(X_trainval, y_trainval)

    # Use the trained model to predict the target variable on the validation set
    y_pred_val = best_svr.predict(X_val)

    # Compute the RMSE on the validation set
    val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    
    # Compute the MAE on the validation set
    val_mae = mean_absolute_error(y_val, y_pred_val)
    
    # Compute the MSE on the validation set
    val_mse = mean_squared_error(y_val, y_pred_val)
    
    # Compute the R2 on the validation set
    val_r2 = r2_score(y_val, y_pred_val)

    # Use the trained model to predict the target variable on the test set
    y_pred_test = best_svr.predict(X_test)

    # Compute the RMSE on the test set
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    
    # Compute the MAE on the test set
    test_mae = mean_absolute_error(y_test, y_pred_test)
    
    # Compute the MSE on the test set
    test_mse = mean_squared_error(y_test, y_pred_test)
    
    # Compute the R2 on the test set
    test_r2 = r2_score(y_test, y_pred_test)
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    return best_svr, grid_search.best_params_, val_rmse, val_mae, val_mse, val_r2, test_rmse, test_mae, test_mse, test_r2, elapsed_time


In [5]:
import time
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def train_dt_model(df, target_col, param_grid, test_size=0.15, random_state=42):
    start_time = time.time()
    # Split the data into training, validation, and testing sets
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=random_state)  # 0.1765 = 15/85

    # Create the base estimator
    base_estimator = DecisionTreeRegressor()

    # Create the grid search object
    grid_search = GridSearchCV(estimator=base_estimator, param_grid=param_grid, cv=5)

    # Train the grid search object
    grid_search.fit(X_trainval, y_trainval)

    # Get the best model and its parameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Use the best model to predict the target variable on the validation set
    y_pred_val = best_model.predict(X_val)

    # Compute the evaluation metrics on the validation set
    val_mae = mean_absolute_error(y_val, y_pred_val)
    val_mse = mean_squared_error(y_val, y_pred_val)
    val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    val_r2 = r2_score(y_val, y_pred_val)

    # Use the best model to predict the target variable on the test set
    y_pred_test = best_model.predict(X_test)

    # Compute the evaluation metrics on the test set
    test_mae = mean_absolute_error(y_test, y_pred_test)
    test_mse = mean_squared_error(y_test, y_pred_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    test_r2 = r2_score(y_test, y_pred_test)

    end_time = time.time()
    elapsed_time = end_time - start_time

    return best_model, best_params, val_mae, val_mse, val_rmse, val_r2, test_mae, test_mse, test_rmse, test_r2, elapsed_time


In [6]:
import time
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


def train_dt_vot_model(df, target_col, param_grid, cv=5):
    start_time = time.time()

    # Split the data into training, validation, and testing sets
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)  # 0.1765 = 15/85

    # Create the base estimator
    base_estimator = DecisionTreeRegressor()

    # Create the voting regressor model
    model = VotingRegressor(estimators=[
        ('estimator_1', base_estimator), 
        ('estimator_2', DecisionTreeRegressor(max_depth=3)), 
        ('estimator_3', DecisionTreeRegressor(max_depth=5))
    ])

    # Train the model
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv)
    grid_search.fit(X_trainval, y_trainval)

    # Use the trained model to predict the target variable on the validation set
    y_pred_val = grid_search.predict(X_val)

    # Compute the evaluation metrics on the validation set
    val_mae = mean_absolute_error(y_val, y_pred_val)
    val_mse = mean_squared_error(y_val, y_pred_val)
    val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    val_r2 = r2_score(y_val, y_pred_val)

    # Use the trained model to predict the target variable on the test set
    y_pred_test = grid_search.predict(X_test)

    # Compute the evaluation metrics on the test set
    test_mae = mean_absolute_error(y_test, y_pred_test)
    test_mse = mean_squared_error(y_test, y_pred_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    test_r2 = r2_score(y_test, y_pred_test)

    end_time = time.time()
    elapsed_time = end_time - start_time

    return grid_search.best_estimator_, grid_search.best_params_, val_mae, val_mse, val_rmse, val_r2, test_mae, test_mse, test_rmse, test_r2, elapsed_time


In [7]:
import time
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def train_dt_bag_model(df, target_col, param_grid=None):
    start_time = time.time()
    # Split the data into training, validation, and testing sets
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)  # 0.1765 = 15/85
    
    # Create the base estimator
    base_estimator = DecisionTreeRegressor()
    
    # Create the ensemble model
    if param_grid:
        model = GridSearchCV(estimator=base_estimator, param_grid=param_grid, cv=5)
    else:
        model = BaggingRegressor(base_estimator=base_estimator, n_estimators=10, random_state=42)
    
    # Train the model
    model.fit(X_trainval, y_trainval)

    # Use the trained model to predict the target variable on the validation set
    y_pred_val = model.predict(X_val)

    # Compute the evaluation metrics on the validation set
    val_mae = mean_absolute_error(y_val, y_pred_val)
    val_mse = mean_squared_error(y_val, y_pred_val)
    val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    val_r2 = r2_score(y_val, y_pred_val)

    # Use the trained model to predict the target variable on the test set
    y_pred_test = model.predict(X_test)

    # Compute the evaluation metrics on the test set
    test_mae = mean_absolute_error(y_test, y_pred_test)
    test_mse = mean_squared_error(y_test, y_pred_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    test_r2 = r2_score(y_test, y_pred_test)
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    # Get the best parameters (if applicable)
    best_params = model.best_params_ if param_grid else None
    
    return model, best_params, val_mae, val_mse, val_rmse, val_r2, test_mae, test_mse, test_rmse, test_r2, elapsed_time


In [8]:
def train_knn_model(df, target_col, param_grid):
    start_time = time.time()
    # Split the data into training, validation, and testing sets
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)  # 0.1765 = 15/85

    # Create a KNN regressor object
    knn = KNeighborsRegressor()

    # Perform grid search to find the best hyperparameters
    grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    # Use the best hyperparameters to create a KNNRegressor model
    best_knn = KNeighborsRegressor(**grid_search.best_params_)
    best_knn.fit(X_trainval, y_trainval)

    # Use the trained model to predict the target variable on the validation set
    y_pred_val = best_knn.predict(X_val)

    # Calculate MAE on the validation set
    val_mae = mean_absolute_error(y_val, y_pred_val)

    # Calculate MSE on the validation set
    val_mse = mean_squared_error(y_val, y_pred_val)

    # Calculate RMSE on the validation set
    val_rmse = np.sqrt(val_mse)

    # Calculate R2 on the validation set
    val_r2 = r2_score(y_val, y_pred_val)

    # Use the trained model to predict the target variable on the test set
    y_pred_test = best_knn.predict(X_test)

    # Calculate MAE on the test set
    test_mae = mean_absolute_error(y_test, y_pred_test)

    # Calculate MSE on the test set
    test_mse = mean_squared_error(y_test, y_pred_test)

    # Calculate RMSE on the test set
    test_rmse = np.sqrt(test_mse)

    # Calculate R2 on the test set
    test_r2 = r2_score(y_test, y_pred_test)
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    return best_knn, grid_search.best_params_, val_mae, val_mse, val_rmse, val_r2, test_mae, test_mse, test_rmse, test_r2, elapsed_time


In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def train_lasso_model(df, target_col, param_grid):
    start_time = time.time()
    # Split the data into training, validation, and testing sets
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)  # 0.1765 = 15/85

    # Create a Lasso object
    lasso = Lasso()

    # Perform grid search to find the best hyperparameters
    grid_search = GridSearchCV(estimator=lasso, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    # Use the best hyperparameters to create a Lasso model
    best_lasso = Lasso(**grid_search.best_params_)
    best_lasso.fit(X_trainval, y_trainval)

    # Use the trained model to predict the target variable on the validation set
    y_pred_val = best_lasso.predict(X_val)

    # Compute the MAE, MSE, RMSE, and R2 on the validation set
    val_mae = mean_absolute_error(y_val, y_pred_val)
    val_mse = mean_squared_error(y_val, y_pred_val)
    val_rmse = np.sqrt(val_mse)
    val_r2 = r2_score(y_val, y_pred_val)

    # Use the trained model to predict the target variable on the test set
    y_pred_test = best_lasso.predict(X_test)

    # Compute the MAE, MSE, RMSE, and R2 on the test set
    test_mae = mean_absolute_error(y_test, y_pred_test)
    test_mse = mean_squared_error(y_test, y_pred_test)
    test_rmse = np.sqrt(test_mse)
    test_r2 = r2_score(y_test, y_pred_test)
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    return best_lasso, grid_search.best_params_, val_mae, val_mse, val_rmse, val_r2, test_mae, test_mse, test_rmse, test_r2, elapsed_time


In [10]:
def train_pol_model(df, target_col, param_grid):
    start_time = time.time()
    # Split the data into training, validation, and testing sets
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)  # 0.1765 = 15/85

    # Create a Pipeline object that includes PolynomialFeatures and LinearRegression
    pol_regression = Pipeline([('poly', PolynomialFeatures()),
                               ('linear', LinearRegression())])

    # Set up the GridSearchCV object with the given param_grid and the pol_regression pipeline
    grid_search = GridSearchCV(pol_regression, param_grid, cv=5, return_train_score=True)

    # Fit the model on the training data
    grid_search.fit(X_trainval, y_trainval)

    # Use the best estimator from the grid search to predict the target variable on the validation set
    y_pred_val = grid_search.best_estimator_.predict(X_val)

    # Compute the MAE, MSE, RMSE, and R2 on the validation set using the best estimator
    val_mae = mean_absolute_error(y_val, y_pred_val)
    val_mse = mean_squared_error(y_val, y_pred_val)
    val_rmse = np.sqrt(val_mse)
    val_r2 = r2_score(y_val, y_pred_val)

    # Use the best estimator from the grid search to predict the target variable on the test set
    y_pred_test = grid_search.best_estimator_.predict(X_test)

    # Compute the MAE, MSE, RMSE, and R2 on the test set using the best estimator
    test_mae = mean_absolute_error(y_test, y_pred_test)
    test_mse = mean_squared_error(y_test, y_pred_test)
    test_rmse = np.sqrt(test_mse)
    test_r2 = r2_score(y_test, y_pred_test)
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    return grid_search.best_estimator_, grid_search.best_params_, val_mae, val_mse, val_rmse, val_r2, test_mae, test_mse, test_rmse, test_r2, elapsed_time


### Load dataset and set target_col

In [11]:
df = pd.read_csvdf = pd.read_csv('./data/movies_only_numeric.csv')
target_col = "Rating"

#### RF 

In [12]:
# Parameters for RF
rf_param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}
# model, best_params, val_rmse, test_rmse, time
model_rf = train_random_forest_model(df, target_col, rf_param_grid)


#### Lin Reg

In [13]:
# Parameters for the function
lin_param_grid = {
    'fit_intercept': [True, False],
    'normalize': [True, False],
}
# Call the function
# model, best_params, val_rmse, test_rmse, time
model_lin = train_linear_regression_model(df, target_col, lin_param_grid)

#### SVR

In [14]:
# Define the parameter grid
svr_param_grid = {
    'C': [1.0],
    'kernel': ['rbf'],
    'gamma': ['scale'],
    'epsilon': [0.1]
}

# Train the SVR model
#model_svr = train_svr_model(df, target_col, svr_param_grid)

#### Descision Tree

In [15]:
# Define the hyperparameter grid for DecisionTreeRegressor
dt_param_grid = {
    'max_depth': [2, 4, 6],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 3],
    'max_features': ['auto', 'sqrt'],
    'criterion': ['mse', 'mae']
}



# Call the train_dt_model function with default hyperparameters
model_dt = train_dt_model(df, target_col, dt_param_grid)

#### Descision Tree - Bagging

In [16]:
dt_bag_param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4, 6]
}

model_dt_bag = train_dt_model(df, target_col, dt_bag_param_grid)



#### Descision Tree - Voting

In [17]:
# Define parameter grid
dt_vot_param_grid = {
    'weights': [[1,2,1], [1,3,1], [1,4,1], [1,5,1]],
    'n_jobs': [-1],
    'verbose': [1]
}

# Train decision tree model with the best parameters
model_dt_vot = train_dt_vot_model(df, target_col, dt_vot_param_grid)

#### KNN

In [18]:
# Load your data into a pandas DataFrame called 'df'
# Define your target column name as a string called 'target_col'

# Define the hyperparameter grid
knn_param_grid = {
    'n_neighbors': [5],
    'weights': ['uniform'],
    'p': [2],
}

# Train the KNN model with default hyperparameters
#model_knn = train_knn_model(df, target_col, param_grid)

#### Lasso

In [19]:
# Load your data into a pandas DataFrame called 'df'
# Define your target column name as a string called 'target_col'

# Define the hyperparameter grid
lasso_param_grid = {
    'alpha': [1.0],
    'fit_intercept': [True],
    'normalize': [False],
    'precompute': [False],
    'max_iter': [1000],
    'tol': [0.0001],
    'warm_start': [False],
    'positive': [False],
    'random_state': [None],
    'selection': ['cyclic']
}

# Train the Lasso model with default hyperparameters
#model_lasso = train_lasso_model(df, target_col, param_grid)

#### Polynomal

In [20]:
pol_param_grid = {
    'poly__degree': [2, 3],
    'linear__fit_intercept': [True, False],
    'linear__normalize': [True, False]
}


# Train the model with default hyperparameters
model_pol = train_pol_model(df, target_col, pol_param_grid)

#### Overview of models with tuned hyper params

In [21]:

models_overview = { 'Model': ['RF', 'Lin', 'Pol', 'DT', 'DT Vot', 'DT Bag'],
                'Val_Mae': [model_rf[2], model_lin[2], model_pol[2], model_dt[2], model_dt_vot[2], model_dt_bag[2]],
                'Val_Mse': [model_rf[3], model_lin[3], model_pol[3], model_dt[3], model_dt_vot[3], model_dt_bag[3]],
                'Val_Rmse': [model_rf[4], model_lin[4], model_pol[4], model_dt[4], model_dt_vot[4], model_dt_bag[4]],
                'Val_R2': [model_rf[5], model_lin[5], model_pol[5], model_dt[5], model_dt_vot[5], model_dt_bag[5]],
                'Test_Mae': [model_rf[6], model_lin[6], model_pol[6], model_dt[6], model_dt_vot[6], model_dt_bag[6]],
                'Test_Mse': [model_rf[7], model_lin[7], model_pol[7], model_dt[7], model_dt_vot[7], model_dt_bag[7]],
                'Test_Rmse': [model_rf[8], model_lin[8], model_pol[8], model_dt[8], model_dt_vot[8], model_dt_bag[8]], 
                'Test_R2': [model_rf[9], model_lin[9], model_pol[9], model_dt[9], model_dt_vot[9], model_dt_bag[9]],
                'Best_Params': [model_rf[1], model_lin[1], model_pol[1], model_dt[1], model_dt_vot[1], model_dt_bag[1]]}
df = pd.DataFrame(models_overview)
df.head(12)

Unnamed: 0,Model,Val_Mae,Val_Mse,Val_Rmse,Val_R2,Test_Mae,Test_Mse,Test_Rmse,Test_R2,Best_Params
0,RF,0.283971,0.375343,0.140883,0.774153,0.563977,0.73837,0.54519,0.328288,"{'max_depth': 10, 'min_samples_leaf': 2, 'min_..."
1,Lin,0.490564,0.420064,0.648123,0.326602,0.573687,0.56787,0.753572,0.300345,"{'fit_intercept': True, 'normalize': False}"
2,Pol,0.546667,0.499145,0.706502,0.199828,0.682752,1.010921,1.005445,-0.245524,"{'linear__fit_intercept': False, 'linear__norm..."
3,DT,0.5216,0.464177,0.681305,0.255885,0.607513,0.630077,0.793774,0.223702,"{'criterion': 'mse', 'max_depth': 4, 'max_feat..."
4,DT Vot,0.426909,0.315032,0.561277,0.494976,0.603543,0.629452,0.79338,0.224472,"{'n_jobs': -1, 'verbose': 1, 'weights': [1, 4,..."
5,DT Bag,0.503116,0.450275,0.671026,0.27817,0.627766,0.707227,0.840968,0.128648,"{'max_depth': 5, 'min_samples_split': 4}"


#### Save the models with joblib

In [22]:
def save_models():
    models = [model_rf[0], model_lin[0], model_pol[0], model_dt[0], model_dt_vot[0], model_dt_bag[0]]
    file_name = ["rf", "lin", "pol", "dt", "dt_vot", "dt_bag"]
    for i, model in enumerate(models):
        filename = f'./models/model_{file_name[i]}.joblib'
        joblib.dump(model, filename)

#save_models()