In [14]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score
from sklearn.model_selection import ShuffleSplit, GridSearchCV

from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.ensemble import VotingRegressor

In [15]:
class RegressionGridSearch:
    """Hold a scaled train/test split of one dataset and tune/evaluate regressors on it.

    Attributes:
        X_train: Quantile-transformed training features.
        X_test: Quantile-transformed test features (scaler fitted on training data only).
        y_train: Training targets (last column of the input file).
        y_test: Test targets (last column of the input file).
    """

    def __init__(self, df_path):
        """Read the CSV at ``df_path`` and prepare the scaled 70:30 split.

        Args:
            df_path: Path to a comma-separated file. Every column except the
                first and the last is used as a feature; the last column is
                the target. The first column is skipped (presumably an
                identifier column -- TODO confirm against the data).
        """
        # Read input data from given path
        df = pd.read_csv(df_path, delimiter=',', index_col=False)

        # Split training and testing data 70:30
        X_train, X_test, y_train, y_test = train_test_split(
            df.iloc[:, 1:-1],
            df.iloc[:, -1],
            test_size=0.3,
            random_state=1,
        )

        # Scale numerical values with same method as paper being reproduced.
        # n_quantiles is capped at the number of training rows: scikit-learn
        # clamps it to that value anyway, but only after emitting a warning.
        scaler = QuantileTransformer(
            n_quantiles=min(1000, X_train.shape[0]),
            random_state=1,
        )
        # Fit on the training split only, then apply the same mapping to both splits
        scaler.fit(X_train)

        # Store X and y values in regression object instance
        self.X_train = scaler.transform(X_train)
        self.X_test = scaler.transform(X_test)
        self.y_train = y_train
        self.y_test = y_test

    def grid_search(self, regressor, parameters, print_results=False):
        """Grid-search ``parameters`` for ``regressor`` on the training split.

        Args:
            regressor: Unfitted scikit-learn estimator to tune.
            parameters: Parameter grid handed to ``GridSearchCV`` as ``param_grid``.
            print_results: When true, print the best parameters and the best
                scores (negated RMSE, so closer to zero is better).

        Returns:
            ``best_estimator_`` of the fitted ``GridSearchCV``.
        """
        # Measures used for each model
        scoring = ['neg_root_mean_squared_error', ]

        # Cross Validator used for Grid Search
        cv = ShuffleSplit(
            test_size=0.3, random_state=1
        )

        # Find the best model using grid-search with above cross-validation
        clf = GridSearchCV(
            regressor,
            param_grid=parameters,
            scoring=scoring,
            cv=cv,
            refit='neg_root_mean_squared_error'
        )
        clf.fit(X=self.X_train, y=self.y_train)

        # Print grid search results
        if print_results:
            print('Grid Search Results:')
            print(' (*) Best parameters set found on development set:', clf.best_params_)
            print(' (*) Best classifier score on development set:', clf.best_score_)
            print(' (*) Best classifier score on test set:', clf.score(self.X_test, self.y_test))

        # Return resulting estimator
        return clf.best_estimator_

    def evaluate_model(self, model, print_results=False):
        """Score a fitted ``model`` on the held-out test split.

        Args:
            model: Fitted estimator exposing ``predict``.
            print_results: When true, print every metric.

        Returns:
            Dict with keys ``'r2'``, ``'mae'``, ``'mse'``, ``'rmse'`` and ``'mdae'``.
        """
        # Use given model to predict y values
        y_true, y_pred = self.y_test, model.predict(self.X_test)

        # Extract stats
        r2 = r2_score(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
        # Take the root ourselves: the ``squared=False`` flag was deprecated in
        # scikit-learn 1.4 and removed in 1.6. The target is a single column,
        # so sqrt(MSE) is exactly the RMSE the flag used to return.
        rmse = np.sqrt(mse)
        mdae = median_absolute_error(y_true, y_pred)

        # Print model metrics
        if print_results:
            print('Evaluating regressor:')
            print(' (*) R^2 Score:', r2)
            print(' (*) Mean Absolute Error:', mae)
            print(' (*) Mean Squared Error:', mse)
            print(' (*) Root Mean Squared Error:', rmse)
            print(' (*) Median Absolute Error:', mdae)

        # Return stats in dict
        stats = {
            'r2': r2,
            'mae': mae,
            'mse': mse,
            'rmse': rmse,
            'mdae': mdae
        }
        return stats

In [16]:
def run_grid_search(regressor=None, parameters=None, ds_number=6):
    """Grid-search one regressor on one of the study's datasets and score the winner.

    Args:
        regressor: Estimator forwarded to ``RegressionGridSearch.grid_search``.
        parameters: Parameter grid forwarded alongside it.
        ds_number: Dataset to use, 1 through 5. The default (6) matches no
            dataset, so callers have to pass a value explicitly.

    Returns:
        The stats dict produced by ``RegressionGridSearch.evaluate_model`` for
        the best estimator found.

    Raises:
        ValueError: If ``ds_number`` is not one of the known dataset numbers.
    """
    # The 5 datasets from the study, keyed by the number used in this notebook
    dataset_paths = {
        1: r'./dataset/researchDataset/DS07012.csv',  # DS1
        2: r'./dataset/researchDataset/DS07310.csv',  # DS2
        3: r'./dataset/researchDataset/DS07410.csv',  # DS3
        4: r'./dataset/researchDataset/DS07510.csv',  # DS4
        5: r'./dataset/researchDataset/DS07610.csv',  # DS5
    }

    # Fail with a readable message instead of an UnboundLocalError further down
    # when the number is unknown (TypeError covers unhashable values).
    try:
        df_path = dataset_paths[ds_number]
    except (KeyError, TypeError):
        raise ValueError(
            f'Unknown ds_number {ds_number!r}; expected one of {sorted(dataset_paths)}'
        ) from None

    # Run grid search on provided regression model and params
    reg_gs = RegressionGridSearch(df_path)
    best_model = reg_gs.grid_search(regressor, parameters)
    model_stats = reg_gs.evaluate_model(best_model)
    return model_stats

In [17]:
def compute_regressor_stats(regressor, parameters, datasets=[1, 2, 3, 4, 5]):
    """Run the grid search on each requested dataset and print the resulting stats.

    Args:
        regressor: Estimator forwarded to ``run_grid_search``.
        parameters: Parameter grid forwarded to ``run_grid_search``.
        datasets: Iterable of dataset numbers to process, in order.
    """
    for ds_number in datasets:
        stats_for_dataset = run_grid_search(regressor, parameters, ds_number=ds_number)
        # Results only go to stdout for now; swap this out when the stats
        # need to feed a visualization or model comparison instead.
        print(f'Best estimator stats on dataset {ds_number}:', stats_for_dataset, sep='\n')

In [18]:
# Decision tree: search max_depth over 3, 8, ..., 48 and min_samples_split
# over 2, 4, ..., 28, on dataset 5 only.
# NOTE(review): the saved output under this cell lists datasets 3, 4 and 5,
# so it appears to predate the [5] argument -- re-run to refresh it.
regressor = DecisionTreeRegressor(random_state=1)
parameters = dict(
    max_depth=range(3, 50, 5),
    min_samples_split=range(2, 30, 2),
)
compute_regressor_stats(regressor, parameters, datasets=[5])

Best estimator stats on dataset 3:
{'r2': 0.4926700922053632, 'mae': 0.15367447949520668, 'mse': 0.04453985954009794, 'rmse': 0.21104468612144192, 'mdae': 0.11083928193309642}
Best estimator stats on dataset 4:
{'r2': 0.514867457762517, 'mae': 0.14847369070327543, 'mse': 0.042591093009905434, 'rmse': 0.206376096023511, 'mdae': 0.10507204405227522}
Best estimator stats on dataset 5:
{'r2': 0.5211559260151406, 'mae': 0.15040792895336572, 'mse': 0.0420390114385434, 'rmse': 0.20503417139233987, 'mdae': 0.11350829702566342}


In [None]:
# Random forest on all five datasets. range(100, 200, 100) yields the single
# value 100, so in practice only max_depth (10, 20, 30, 40) is searched.
regressor = RandomForestRegressor(random_state=1)
parameters = dict(
    n_estimators=range(100, 200, 100),
    max_depth=range(10, 50, 10),
)
compute_regressor_stats(regressor, parameters)

In [None]:
# Gradient boosting with a fixed ensemble size and learning rate; the grid
# covers max_depth (10, 20, 30, 40) and min_samples_split (2, 5, ..., 29).
regressor = GradientBoostingRegressor(
    n_estimators=400,
    learning_rate=0.05,
    random_state=1,
)
parameters = dict(
    max_depth=range(10, 50, 10),
    min_samples_split=range(2, 30, 3),
)
compute_regressor_stats(regressor, parameters)

In [None]:
# Histogram-based gradient boosting with a fixed iteration count and learning
# rate; the grid covers max_depth (10, 20, 30, 40) and min_samples_leaf
# (5, 15, 25, 35, 45).
regressor = HistGradientBoostingRegressor(
    max_iter=400,
    learning_rate=0.05,
    random_state=1,
)
parameters = dict(
    max_depth=range(10, 50, 10),
    min_samples_leaf=range(5, 50, 10),
)
compute_regressor_stats(regressor, parameters)

In [None]:
# Linear model trained with SGD, with early stopping after 5 epochs without improvement.
regressor = linear_model.SGDRegressor(early_stopping=True, n_iter_no_change=5, random_state=1)
parameters = {
    # 'squared_error' is the current name of the least-squares loss; the old
    # 'squared_loss' spelling was deprecated in scikit-learn 1.0 and removed
    # in 1.2, where it makes every fit in the grid fail.
    'loss': ['squared_error', 'huber', 'epsilon_insensitive'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'max_iter': range(50, 1000, 50),
    'learning_rate': ['invscaling', 'optimal', 'constant', 'adaptive'],
    'eta0': [0.1, 0.01],
    'average': [32, ]
}
compute_regressor_stats(regressor, parameters)

In [None]:
# Multi-layer perceptron: compare a two-layer and a three-layer architecture
# with tanh activations and the adam solver, at max_iter 50, 100 and 150.
regressor = MLPRegressor(random_state=1)
parameters = dict(
    hidden_layer_sizes=[(256, 100), (512, 256, 100)],
    activation=['tanh'],
    solver=['adam'],
    max_iter=range(50, 200, 50),
)
compute_regressor_stats(regressor, parameters)