In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def train_models(df, target_col, models=None, hyperparams=None):
    """Train the requested regressors and tabulate their hold-out metrics.

    The frame is split 80/20 (fixed ``random_state=42``) into train and test
    sets. Each requested model is either tuned with a 5-fold ``GridSearchCV``
    (when it has an entry in ``hyperparams``) or fitted directly, then scored
    on the test split.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target_col: Name of the column to predict; every other column is
            used as a feature.
        models: Iterable of model keys to train. Recognised keys are
            ``'linreg'`` (LinearRegression) and ``'rf'``
            (RandomForestRegressor); any other key is ignored. Defaults to
            no models.
        hyperparams: Mapping from model key to the parameter grid passed to
            ``GridSearchCV``. Models without an entry are fitted with their
            default settings. Defaults to no grids.

    Returns:
        A DataFrame with one row per trained model and the columns
        ``Model``, ``MAE``, ``MSE``, ``RMSE``, ``R^2`` and ``Best Params``
        (``None`` in the last column when no grid search was run).
    """
    # None sentinels instead of []/{} defaults: a mutable default object is
    # created once and shared by every call.
    models = [] if models is None else models
    hyperparams = {} if hyperparams is None else hyperparams

    # Split data into training and testing sets
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define models to train
    models_dict = {}
    if 'linreg' in models:
        models_dict['linreg'] = LinearRegression()
    if 'rf' in models:
        models_dict['rf'] = RandomForestRegressor()

    # Train and tune models
    best_models = {}
    best_params = {}
    for model_name, model in models_dict.items():
        if model_name in hyperparams:
            grid_search = GridSearchCV(model, hyperparams[model_name], scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
            grid_search.fit(X_train, y_train)
            best_models[model_name] = grid_search.best_estimator_
            best_params[model_name] = grid_search.best_params_
        else:
            model.fit(X_train, y_train)
            best_models[model_name] = model
            best_params[model_name] = None

    # Collect test-set metrics, one row per model
    results_dict = {'Model': [], 'MAE': [], 'MSE': [], 'RMSE': [], 'R^2': [], 'Best Params': []}
    for model_name, model in best_models.items():
        test_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, test_pred)
        results_dict['Model'].append(model_name)
        results_dict['MAE'].append(mean_absolute_error(y_test, test_pred))
        results_dict['MSE'].append(mse)
        # sqrt(MSE) rather than mean_squared_error(..., squared=False): the
        # `squared` keyword was deprecated and later removed from
        # scikit-learn. For a single target column the value is the same.
        results_dict['RMSE'].append(mse ** 0.5)
        results_dict['R^2'].append(r2_score(y_test, test_pred))
        results_dict['Best Params'].append(best_params[model_name])
    return pd.DataFrame(results_dict)

# Input dataset and the column the regressors should predict
df = pd.read_csv('../data/data.csv')
target_col = "Rating"

# Model keys to train; train_models recognises 'rf' and 'linreg'
models = ['rf', 'linreg']

# Per-model search grids; 'linreg' gets an empty grid (nothing to tune)
hyperparams = {
    'rf': {
        'n_estimators': [10, 50, 100],
        'max_depth': [None, 5, 10],
    },
    'linreg': {},
}

# train_models returns the metrics table; the bare name on the last line
# is the cell's displayed value
best_models = train_models(df, target_col, models, hyperparams)
best_models

Unnamed: 0,Model,MAE,MSE,RMSE,R^2,Best Params
0,linreg,0.580813,0.590428,0.768393,0.295584,{}
1,rf,0.547674,0.514827,0.717514,0.385781,"{'max_depth': 10, 'n_estimators': 100}"
