In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score
from sklearn.model_selection import ShuffleSplit, GridSearchCV

from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.ensemble import VotingRegressor

In [None]:
class RegressionGridSearch:
    """Grid-search harness for scikit-learn regressors on a single CSV dataset.

    Construction reads the CSV, splits it 70:30 into train and test sets and
    scales the features with a ``QuantileTransformer`` fitted on the training
    split only. ``grid_search`` then tunes a regressor on the training split
    and ``evaluate_model`` prints metrics computed on the held-out test split.
    """

    def __init__(self, df_path):
        """Load the dataset and store scaled train/test splits on the instance.

        Args:
            df_path: Path of a comma-delimited CSV file. The first column is
                dropped, the last column is used as the target and every
                column in between is used as a feature.
        """
        # Read input data from given path
        df = pd.read_csv(df_path, delimiter=',', index_col=False)

        # Split training and testing data 70:30
        X_train, X_test, y_train, y_test = train_test_split(
            df.iloc[:, 1:-1],
            df.iloc[:, -1],
            test_size=0.3,
            random_state=1,
        )

        # Scale numerical values with same method as paper being reproduced.
        # n_quantiles is capped at the number of training rows: scikit-learn
        # applies the same cap itself but emits a warning when it has to.
        scaler = QuantileTransformer(
            n_quantiles=min(1000, len(X_train)),
            random_state=1,
        )
        # Fit on the training split only so no test information leaks in
        scaler.fit(X_train)

        # Store X and y values in regression object instance
        self.X_train = scaler.transform(X_train)
        self.X_test = scaler.transform(X_test)
        self.y_train = y_train
        self.y_test = y_test

    def grid_search(self, regressor, parameters):
        """Run an exhaustive grid search and report the best estimator.

        Args:
            regressor: Unfitted scikit-learn regressor to tune.
            parameters: Parameter grid passed to ``GridSearchCV`` as
                ``param_grid``.

        Prints the best parameters and scores, then prints the test-set
        metrics of the refitted best estimator via ``evaluate_model``.
        """
        # Measures used for each model
        scoring = ['neg_root_mean_squared_error', ]

        # Cross Validator used for Grid Search: random 70:30 shuffles of the
        # training split (number of splits left at the ShuffleSplit default)
        cv = ShuffleSplit(
            test_size=0.3, random_state=1
        )

        # Find the best model using grid-search with above cross-validation
        clf = GridSearchCV(
            regressor,
            param_grid=parameters,
            scoring=scoring,
            cv=cv,
            refit='neg_root_mean_squared_error'
        )
        clf.fit(X=self.X_train, y=self.y_train)

        # Print grid search results. Scores come from the
        # 'neg_root_mean_squared_error' scorer, i.e. they are negated RMSE
        # values, so closer to zero is better.
        print('Grid Search Results:')
        print(' (*) Best parameters set found on development set:', clf.best_params_)
        print(' (*) Best classifier score on development set:', clf.best_score_)
        print(' (*) Best classifier score on test set:', clf.score(self.X_test, self.y_test))

        # Evaluate resulting estimator
        self.evaluate_model(clf.best_estimator_)

    def evaluate_model(self, model):
        """Print regression metrics of ``model`` on the held-out test split.

        Args:
            model: Fitted estimator exposing ``predict``.
        """
        # Use given model to predict y values
        y_true = self.y_test
        y_pred = model.predict(self.X_test)

        # Compute the MSE once; the RMSE is derived from it with np.sqrt
        # because the ``squared=False`` keyword of mean_squared_error is no
        # longer accepted by recent scikit-learn releases.
        mse = mean_squared_error(y_true, y_pred)

        # Print model metrics
        print('Evaluating regressor:')
        print(' (*) Regressor minimum prediction:', np.min(y_pred))
        print(' (*) Regressor maximum prediction:', np.max(y_pred))
        print(' (*) R^2 Score Uniform Average:', r2_score(y_true, y_pred, multioutput='uniform_average'))
        print(' (*) R^2 Score Variance Weighted:', r2_score(y_true, y_pred, multioutput='variance_weighted'))
        print(' (*) Mean Absolute Error:', mean_absolute_error(y_true, y_pred))
        print(' (*) Mean Squared Error:', mse)
        print(' (*) Root Mean Squared Error:', np.sqrt(mse))
        print(' (*) Median Absolute Error:', median_absolute_error(y_true, y_pred))

In [None]:
def init_regression_df(ds_number=6):
    """Build a ``RegressionGridSearch`` for one of the six research datasets.

    Args:
        ds_number: Dataset selector, 1 through 6 (default 6).

    Returns:
        A ``RegressionGridSearch`` instance loaded from the selected CSV file.

    Raises:
        ValueError: If ``ds_number`` is not one of the supported selectors.
    """
    # NOTE(review): DS1 and DS2 point at the same file (DS07012.csv). This
    # looks like a copy-paste slip -- confirm the intended DS1 filename.
    dataset_paths = {
        1: r'./dataset/researchDataset/DS07012.csv',  # DS1
        2: r'./dataset/researchDataset/DS07012.csv',  # DS2
        3: r'./dataset/researchDataset/DS07310.csv',  # DS3
        4: r'./dataset/researchDataset/DS07410.csv',  # DS4
        5: r'./dataset/researchDataset/DS07510.csv',  # DS5
        6: r'./dataset/researchDataset/DS07610.csv',  # DS6
    }

    # Fail with a clear message instead of an UnboundLocalError when the
    # selector matches none of the known datasets.
    if ds_number not in dataset_paths:
        raise ValueError(
            'ds_number must be one of %s, got %r'
            % (sorted(dataset_paths), ds_number)
        )

    return RegressionGridSearch(df_path=dataset_paths[ds_number])

In [None]:
def run_grid_search(regressor, parameters, ds_number=6):
    """Load a dataset and run a grid search for ``regressor`` on it.

    Args:
        regressor: Unfitted scikit-learn regressor to tune.
        parameters: Parameter grid forwarded to the grid search.
        ds_number: Dataset selector, 1 through 6, forwarded to
            ``init_regression_df``. Defaults to 6, the dataset that was
            used before this parameter existed.
    """
    # Input ds_number 1 through 6 to choose one of the 6 datasets available
    reg_gs = init_regression_df(ds_number=ds_number)
    reg_gs.grid_search(regressor, parameters)

In [None]:
# Decision tree: sweep the maximum tree depth and the minimum number of
# samples required to split a node.
parameters = {
    'max_depth': range(3, 50, 5),
    'min_samples_split': range(2, 30, 2),
}
regressor = DecisionTreeRegressor(random_state=1)
run_grid_search(regressor=regressor, parameters=parameters)

In [None]:
# Random forest: sweep the maximum tree depth. The n_estimators range
# contains the single value 100, so the forest size is effectively fixed.
parameters = {
    'n_estimators': range(100, 200, 100),
    'max_depth': range(10, 50, 10),
}
regressor = RandomForestRegressor(random_state=19)
run_grid_search(regressor=regressor, parameters=parameters)

In [None]:
# Gradient boosting with a fixed ensemble size and learning rate: sweep the
# maximum tree depth and the minimum number of samples required to split.
parameters = {
    'max_depth': range(10, 50, 10),
    'min_samples_split': range(2, 30, 3),
}
regressor = GradientBoostingRegressor(
    n_estimators=400,
    learning_rate=0.05,
    random_state=17,
)
run_grid_search(regressor=regressor, parameters=parameters)

In [None]:
# Histogram-based gradient boosting with a fixed iteration budget and
# learning rate: sweep the maximum tree depth and the minimum leaf size.
parameters = {
    'max_depth': range(10, 50, 10),
    'min_samples_leaf': range(5, 50, 10),
}
regressor = HistGradientBoostingRegressor(
    max_iter=400,
    learning_rate=0.05,
    random_state=13,
)
run_grid_search(regressor=regressor, parameters=parameters)

In [None]:
# Linear model trained with SGD: sweep loss, penalty, iteration budget and
# learning-rate schedule, with early stopping enabled on the estimator.
regressor = linear_model.SGDRegressor(early_stopping=True, n_iter_no_change=5, random_state=11, )
parameters = {
    # 'squared_error' is the current name of the ordinary least-squares loss;
    # the former alias 'squared_loss' is rejected by recent scikit-learn.
    'loss': ['squared_error', 'huber', 'epsilon_insensitive'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'max_iter': range(50, 1000, 50),
    'learning_rate': ['invscaling', 'optimal', 'constant', 'adaptive'],
    'eta0': [0.1, 0.01],
    'average': [32, ]
}
run_grid_search(regressor, parameters)

In [None]:
# Multi-layer perceptron with tanh activation and the adam solver: sweep
# two hidden-layer layouts and the iteration budget.
parameters = {
    'hidden_layer_sizes': [
        (256, 100),
        (512, 256, 100),
    ],
    'activation': ['tanh'],
    'solver': ['adam'],
    'max_iter': range(50, 200, 50),
}
regressor = MLPRegressor(random_state=7)
run_grid_search(regressor=regressor, parameters=parameters)