In [43]:
import pickle
from datetime import datetime
from pprint import pp

import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split

In [44]:
class MyUtil:
    """Small helpers for pickling objects to disk and building timestamp strings."""

    @staticmethod
    def save_data(filename, data):
        """Pickle ``data`` into the file at ``filename`` (opened in binary write mode)."""
        with open(filename, "wb") as sink:
            pickle.dump(data, sink)

    @staticmethod
    def load_data(filename):
        """Return the object unpickled from the file at ``filename``."""
        with open(filename, "rb") as source:
            return pickle.load(source)

    @staticmethod
    def get_dt():
        """Return the current ``datetime.now()`` value formatted as ``YYYY-MM-DD_HH-MM``."""
        stamp_format = "%Y-%m-%d_%H-%M"
        return datetime.now().strftime(stamp_format)

In [45]:
class DataHandler:
    """Keep raw feature/target arrays with their scalers and produce scaled splits.

    Attributes:
        _X, _Y: the unscaled inputs and targets passed to the constructor.
        scalerX, scalerY: objects exposing ``fit_transform`` and ``transform``.
        X_train, X_test, Y_train, Y_test: scaled partitions; ``None`` until
            ``split_and_scale`` has been called.
    """

    def __init__(self, _X, _Y, scalerX, scalerY):
        self._X, self._Y = _X, _Y
        self.scalerX, self.scalerY = scalerX, scalerY
        # Filled in by split_and_scale().
        self.X_train = self.X_test = None
        self.Y_train = self.Y_test = None

    def split_and_scale(self, test_size, random_state):
        """Split the raw data with ``train_test_split`` and scale each partition.

        Each scaler is fitted on the training partition only; the fitted
        scaler is then applied to the matching test partition.
        """
        partitions = train_test_split(
            self._X,
            self._Y,
            test_size=test_size,
            random_state=random_state,
        )
        raw_x_train, raw_x_test, raw_y_train, raw_y_test = partitions

        feature_scaler = self.scalerX
        self.X_train = feature_scaler.fit_transform(raw_x_train)
        self.X_test = feature_scaler.transform(raw_x_test)

        target_scaler = self.scalerY
        self.Y_train = target_scaler.fit_transform(raw_y_train)
        self.Y_test = target_scaler.transform(raw_y_test)

    def get_train(self):
        """Return the ``(X_train, Y_train)`` pair."""
        train_pair = (self.X_train, self.Y_train)
        return train_pair

    def get_test(self):
        """Return the ``(X_test, Y_test)`` pair."""
        test_pair = (self.X_test, self.Y_test)
        return test_pair


In [46]:
class RegSwitcher(BaseEstimator):
    """Estimator that forwards ``fit`` and ``predict`` to a swappable ``base``.

    Because ``base`` is an ordinary constructor parameter, it can be replaced
    through ``set_params`` (for example from a parameter grid).
    """

    def __init__(self, base=None):
        self.base = base

    def fit(self, X, Y):
        """Fit the wrapped estimator on ``(X, Y)`` and return ``self``."""
        wrapped = self.base
        wrapped.fit(X, Y)
        self.is_fitted_ = True
        return self

    def predict(self, X):
        """Return the wrapped estimator's predictions for ``X``."""
        wrapped = self.base
        return wrapped.predict(X)

In [47]:
# Load the experiment table from the Excel workbook, using the "exp" column as the row index.
df = pd.read_excel("data.xlsx", index_col="exp")
# Preview the first rows.
df.head()

Unnamed: 0_level_0,m1,m2,m3,s1__autocorrelation__lag_8,s1__autocorrelation__lag_9,s1__autocorrelation__lag_7,s1__autocorrelation__lag_6,s1__autocorrelation__lag_5,s1__autocorrelation__lag_4,s1__longest_strike_above_mean,...,s1__ar_coefficient__coeff_3__k_10,s1__approximate_entropy__m_2__r_0.1,s1__lempel_ziv_complexity__bins_3,s1__partial_autocorrelation__lag_4,"s1__fft_coefficient__attr_""abs""__coeff_7","s1__agg_autocorrelation__f_agg_""var""__maxlag_40",s1__spkt_welch_density__coeff_2,y1,y2,y3
exp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E001,150.223716,1176.177278,1.142097,-0.305434,-0.519191,-0.074829,0.159896,0.38579,0.590387,14,...,0.183996,0.158567,0.204152,-0.360084,0.293617,0.499488,5.886812e-08,55.460434,1.065917,114.57862
E002,102.534268,1483.654982,1.104716,-0.243785,-0.454262,-0.021002,0.202836,0.416423,0.608972,14,...,0.18437,0.144742,0.203008,-0.344364,6.142373,0.477743,3.643621e-06,50.640306,1.285666,124.651484
E003,119.890549,1254.897451,2.162773,-0.329006,-0.543405,-0.095913,0.142612,0.373002,0.582151,14,...,0.184036,0.144268,0.208163,-0.364611,26.783283,0.506435,0.0001590028,50.832405,1.154859,57.018054
E004,162.830799,1302.043195,1.308283,-0.065152,-0.266498,0.138913,0.337187,0.521401,0.683873,16,...,0.187213,0.137326,0.193662,-0.355441,33.227591,0.460547,0.0007926165,62.476545,1.025161,132.221218
E005,165.720956,1154.482314,1.56683,-0.304881,-0.518177,-0.074836,0.159321,0.384728,0.589003,14,...,0.183978,0.128546,0.19244,-0.357588,11.43947,0.49794,0.0001462831,57.634438,1.043776,92.160269


In [48]:
# Positional split: everything except the trailing three columns is a feature;
# the trailing three columns are the targets.
_X = df.iloc[:, :-3].to_numpy()
_Y = df.iloc[:, -3:].to_numpy()
print(_X.shape, _Y.shape, sep="\n")

(100, 47)
(100, 3)


In [49]:
from sklearn.preprocessing import StandardScaler

# Bundle the raw arrays with separate StandardScaler instances for features and targets.
data_handler = DataHandler(_X=_X, _Y=_Y, scalerX=StandardScaler(), scalerY=StandardScaler())

In [50]:
# Produce one scaled split to sanity-check the handler.
data_handler.split_and_scale(test_size=0.3, random_state=2)
# get_train() returns (X_train, Y_train); the test partition comes from get_test().
# Unpacking get_train() into a name called X_test would silently store the
# scaled training targets under the wrong label.
X_train, Y_train = data_handler.get_train()
X_test, Y_test = data_handler.get_test()

print(X_train[0])

[ 1.45966997  0.09180624  1.42558589 -0.31565937 -0.36901858 -0.27462642
 -0.24168784 -0.21387188 -0.18842581 -0.17691713 -0.16138661  0.34609555
 -0.57729368 -0.12309496  0.31386586  0.49853154 -0.02907571 -0.02907571
  0.55865024  0.5321004   0.5798525   0.60704539  0.52419912 -0.71491714
 -0.18663598 -0.09845298 -0.26334793 -0.34938073  0.29698967  0.36797007
  0.3967732   0.45856338  0.62417979 -0.31489993 -0.47125572  0.77058225
 -0.17784209 -0.33041208  0.71952499 -0.11826807 -0.07758038 -0.08551676
 -0.31948663 -0.59830241 -0.47866752 -0.7457376  -0.23003802]


In [51]:
from sklearn.svm import SVR

# Show SVR's hyperparameters before and after overriding C via set_params.
svr = SVR()
print(svr.get_params())
print(svr.set_params(C=2).get_params())


{'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}
{'C': 2, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [52]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

# Demonstrate that the estimator wrapped by RegSwitcher can be swapped with
# set_params, and that its nested hyperparameters appear under "base__*".
svr = SVR()
lg = LinearRegression()
print(svr.get_params())

rs = RegSwitcher(base=svr)
print(rs.get_params())
print(rs.set_params(base=lg).get_params())

{'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}
{'base__C': 1.0, 'base__cache_size': 200, 'base__coef0': 0.0, 'base__degree': 3, 'base__epsilon': 0.1, 'base__gamma': 'scale', 'base__kernel': 'rbf', 'base__max_iter': -1, 'base__shrinking': True, 'base__tol': 0.001, 'base__verbose': False, 'base': SVR()}
{'base__copy_X': True, 'base__fit_intercept': True, 'base__n_jobs': None, 'base__positive': False, 'base__tol': 1e-06, 'base': LinearRegression()}


In [53]:
from sklearn.model_selection import ParameterGrid

# Train/test split settings to sweep: five seeds at a single test fraction.
param_grid_split = [{"random_state": [1, 2, 3, 4, 5], "test_size": [0.3]}]

# Expand the grid into one dict per concrete setting.
param_list_split = [*ParameterGrid(param_grid_split)]
pp(param_list_split)

[{'random_state': 1, 'test_size': 0.3},
 {'random_state': 2, 'test_size': 0.3},
 {'random_state': 3, 'test_size': 0.3},
 {'random_state': 4, 'test_size': 0.3},
 {'random_state': 5, 'test_size': 0.3}]


In [54]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR

# Candidate base models, each wrapped in MultiOutputRegressor because the
# target array has several columns.
base_lr = MultiOutputRegressor(estimator=LinearRegression())
base_svr = MultiOutputRegressor(estimator=SVR())
# Fix the forest's seed: without random_state every fit draws fresh randomness,
# so the cross-validation scores (and the resulting ranking) would differ from
# run to run.
base_rf = MultiOutputRegressor(estimator=RandomForestRegressor(random_state=0))
# base_gbr = MultiOutputRegressor(estimator=GradientBoostingRegressor())

# This is for testing: inspect the nested parameter names exposed by the switcher.
reg = RegSwitcher(base=base_svr)
pp(reg.get_params())

{'base__estimator__C': 1.0,
 'base__estimator__cache_size': 200,
 'base__estimator__coef0': 0.0,
 'base__estimator__degree': 3,
 'base__estimator__epsilon': 0.1,
 'base__estimator__gamma': 'scale',
 'base__estimator__kernel': 'rbf',
 'base__estimator__max_iter': -1,
 'base__estimator__shrinking': True,
 'base__estimator__tol': 0.001,
 'base__estimator__verbose': False,
 'base__estimator': SVR(),
 'base__n_jobs': None,
 'base': MultiOutputRegressor(estimator=SVR())}


In [55]:
# One sub-grid per candidate model: "base" selects the estimator placed inside
# RegSwitcher, and the "base__estimator__*" keys address hyperparameters of the
# regressor nested inside that MultiOutputRegressor.
param_grid_hyper = [
    {"base": [base_lr]},
    {"base": [base_svr], "base__estimator__C": [0.01, 0.1, 1]},
    {"base": [base_rf], "base__estimator__n_estimators": [10, 50, 200]},
    # {"base": [base_gbr], "base__estimator__max_depth": [1, 2, 3, 4, 5]},
]

In [56]:
# Start from an empty switcher; every sub-grid in param_grid_hyper supplies its
# own "base" estimator.
reg = RegSwitcher(base=None)

df_arr = []
for idx_split, param_split in enumerate(param_list_split):
    print(idx_split, param_split)

    # Re-split and re-scale the data for this split configuration.
    data_handler.split_and_scale(**param_split)
    X_train, Y_train = data_handler.get_train()

    # 3-fold grid search over all candidate models, scored with R^2
    # (alternative metric kept for reference: scoring="neg_mean_squared_error").
    gs = GridSearchCV(
        reg,
        param_grid_hyper,
        scoring="r2",
        n_jobs=-1,
        cv=3,
    ).fit(X_train, Y_train)

    # Tag every result row with the split it was computed on.
    _df = pd.DataFrame(gs.cv_results_).assign(
        id_split=idx_split,
        param_split=[param_split] * len(gs.cv_results_["params"]),
    )
    df_arr.append(_df)

0 {'random_state': 1, 'test_size': 0.3}
1 {'random_state': 2, 'test_size': 0.3}
2 {'random_state': 3, 'test_size': 0.3}
3 {'random_state': 4, 'test_size': 0.3}
4 {'random_state': 5, 'test_size': 0.3}


In [57]:
# Stack the per-split result tables; each table's own row index is kept as the
# "id_gs" column.
df_cv = pd.concat(df_arr).reset_index().rename(columns={"index": "id_gs"})

In [58]:
# Show the combined cross-validation results for all splits.
df_cv

Unnamed: 0,id_gs,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_base,param_base__estimator__C,param_base__estimator__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,id_split,param_split
0,0,0.021695,0.01477174,0.003337,0.001252202,MultiOutputRegressor(estimator=LinearRegressio...,,,{'base': MultiOutputRegressor(estimator=Linear...,0.038175,0.752024,-7.906866,-2.372222,3.92442,7,0,"{'random_state': 1, 'test_size': 0.3}"
1,1,0.010051,0.0007133905,0.004521,0.001230909,MultiOutputRegressor(estimator=SVR()),0.01,,{'base': MultiOutputRegressor(estimator=SVR())...,-0.002276,-0.041889,0.024655,-0.006503,0.02733,6,0,"{'random_state': 1, 'test_size': 0.3}"
2,2,0.013833,0.005323039,0.004528,0.001637938,MultiOutputRegressor(estimator=SVR()),0.1,,{'base': MultiOutputRegressor(estimator=SVR())...,0.302295,0.30957,0.302825,0.304896,0.003312,5,0,"{'random_state': 1, 'test_size': 0.3}"
3,3,0.007871,0.0006568503,0.005011,0.001222535,MultiOutputRegressor(estimator=SVR()),1.0,,{'base': MultiOutputRegressor(estimator=SVR())...,0.617707,0.814445,0.800537,0.744229,0.089645,3,0,"{'random_state': 1, 'test_size': 0.3}"
4,4,0.081997,0.007956818,0.003994,0.0006916958,MultiOutputRegressor(estimator=RandomForestReg...,,10.0,{'base': MultiOutputRegressor(estimator=Random...,0.64324,0.851726,0.721156,0.738707,0.086014,4,0,"{'random_state': 1, 'test_size': 0.3}"
5,5,0.409106,0.03079762,0.011504,0.002464282,MultiOutputRegressor(estimator=RandomForestReg...,,50.0,{'base': MultiOutputRegressor(estimator=Random...,0.735177,0.838299,0.712286,0.761921,0.05481,1,0,"{'random_state': 1, 'test_size': 0.3}"
6,6,1.039933,0.09878497,0.01786,0.0006411843,MultiOutputRegressor(estimator=RandomForestReg...,,200.0,{'base': MultiOutputRegressor(estimator=Random...,0.713442,0.848271,0.721348,0.76102,0.06178,2,0,"{'random_state': 1, 'test_size': 0.3}"
7,0,0.002,6.743496e-07,0.000999,3.371748e-07,MultiOutputRegressor(estimator=LinearRegressio...,,,{'base': MultiOutputRegressor(estimator=Linear...,-0.139373,0.777359,-6.449579,-1.937197,3.212609,7,1,"{'random_state': 2, 'test_size': 0.3}"
8,1,0.002,1.325077e-06,0.001,1.072147e-06,MultiOutputRegressor(estimator=SVR()),0.01,,{'base': MultiOutputRegressor(estimator=SVR())...,0.042289,0.032906,-0.041063,0.011377,0.037278,6,1,"{'random_state': 2, 'test_size': 0.3}"
9,2,0.002504,0.000408612,0.001003,4.010034e-06,MultiOutputRegressor(estimator=SVR()),0.1,,{'base': MultiOutputRegressor(estimator=SVR())...,0.328093,0.341163,0.244914,0.304724,0.042627,5,1,"{'random_state': 2, 'test_size': 0.3}"
