In this notebook we use the `./final/dataset.npz` dataset to train multiple ML models that serve as surrogate models. The comparison between the models will be presented in the next notebook.

During the training process the dataset is split into train, test and validation sets of sizes `700000`, `150000` and `150000`.

List of tested methods (with links to used implementations):
1. [Neural Networks](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html)
2. Linear regression models:
    1. [Linear Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression)
    2. [Ridge Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge)
    3. [Lasso (LARS)](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoLars.html#sklearn.linear_model.LassoLars)
    4. [Elastic Net](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html#sklearn.linear_model.ElasticNet)
3. Decision tree models:
    1. [Boosting](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html)
    2. [Bagging](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html)
    3. [Random Forest](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)
    4. [Extra Trees](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html)
4. [k-nearest neighbors](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html)

For each ML method we use [Optuna](https://optuna.readthedocs.io) to find the best-performing set of hyperparameters.

In [1]:
# Loading dataset
import numpy as np

# Open the archive once and promote both arrays to double precision.
input_and_output = np.load("../final/dataset.npz")
inputs, outputs = (
    input_and_output[key].astype(np.float64) for key in ("inputs", "outputs")
)

# Number of samples (rows) in the dataset.
dataset_size = len(inputs)
print(f"inputs shape: {inputs.shape} dtype: {inputs.dtype}, outputs shape: {outputs.shape}, dtype: {outputs.dtype}")

inputs shape: (1000000, 11) dtype: float64, outputs shape: (1000000, 200), dtype: float64


In [2]:
# In this problem we are interested in the order of magnitude rather than the absolute value of the tumour size.
# To train the models, the output is first transformed with log_10. When the tumour size is smaller than 10^-9 L,
# there is no way to detect it, so we can bound the tumour size from below by 10^-9.

# Smallest order of magnitude (log10) that is kept; anything below is clamped to it.
LOWER_LIMIT = -9


def output_transform(outputs: np.ndarray) -> np.ndarray:
    """Map raw outputs to their order of magnitude (log10), clamped at ``LOWER_LIMIT``.

    Args:
        outputs: array-like (or scalar) of raw values. It is never modified.

    Returns:
        A new array with ``log10(outputs)``, where non-positive entries and
        entries whose logarithm is below ``LOWER_LIMIT`` are set to ``LOWER_LIMIT``.
    """
    values = np.asarray(outputs)
    non_positive = values <= 0
    # Substitute 1 for non-positive entries so log10 is never evaluated on them;
    # those positions are overwritten with LOWER_LIMIT in the final step.
    safe_values = np.where(non_positive, 1, values)
    clamped_log = np.maximum(np.log10(safe_values), LOWER_LIMIT)
    return np.where(non_positive, LOWER_LIMIT, clamped_log)


def output_untransform(transformed_outputs: np.ndarray) -> np.ndarray:
    """Invert ``output_transform``: map orders of magnitude back to raw values.

    Args:
        transformed_outputs: array-like (or scalar) of log10 values. Integer
            input is converted to float so negative exponents are supported.

    Returns:
        A new array with ``10 ** transformed_outputs``, where entries at or
        below ``LOWER_LIMIT`` are mapped to 0.
    """
    values = np.asarray(transformed_outputs)
    if not np.issubdtype(values.dtype, np.floating):
        # Integer arrays cannot be raised to negative powers; promote them first.
        values = values.astype(np.float64)
    return np.where(values <= LOWER_LIMIT, 0.0, np.power(10.0, values))

# Sanity-check sample: values above, at and below the 10^-9 cut-off, plus an exact zero.
test_output = np.array([10**(-8), 10**(-5), 10**(-10), 10**(-9),1,2,10,0.0])

print(f"test output {test_output}")
print(f"transformed output: {output_transform(test_output)}")
# Printing the sample again shows that output_transform did not modify its argument.
print(f"original output is untouched after transform: {test_output}")
# Round trip: entries at or below the cut-off come back as 0.
print(f"transformed and untransformed output: {output_untransform(output_transform(test_output))}")

# Training targets: the raw outputs passed through output_transform.
outputs_order_of_magnitude = output_transform(outputs)

test output [1.e-08 1.e-05 1.e-10 1.e-09 1.e+00 2.e+00 1.e+01 0.e+00]
transformed output: [-8.      -5.      -9.      -9.       0.       0.30103  1.      -9.     ]
original output is untouched after transform: [1.e-08 1.e-05 1.e-10 1.e-09 1.e+00 2.e+00 1.e+01 0.e+00]
transformed and untransformed output: [1.e-08 1.e-05 0.e+00 0.e+00 1.e+00 2.e+00 1.e+01 0.e+00]


In [3]:
# Input data has an extra column with all ones - that we get rid of before training the model

def drop_treatment(input_data: np.ndarray) -> np.ndarray:
    """Drop the treatment data (the leading column) from an 11-column dataset.

    Arrays with any other number of columns are returned unchanged.
    """
    has_treatment_column = input_data.shape[1] == 11
    return input_data[:, 1:] if has_treatment_column else input_data

# Feature matrix used from here on: `inputs` with its leading column removed when it is 11 columns wide.
input_without_treatment = drop_treatment(inputs)

In [4]:
# Positional split: the first 70% of the rows form the training set and the
# following 15% the test set. The remaining rows are not sliced in this cell.
train_size = int(dataset_size * 0.7)
test_size = int(dataset_size * 0.15)

train_rows = slice(None, train_size)
test_rows = slice(train_size, train_size + test_size)

X_train = input_without_treatment[train_rows, :]
Y_train = outputs_order_of_magnitude[train_rows, :]
print(f"train sizes: {X_train.shape}, {Y_train.shape}")

X_test = input_without_treatment[test_rows, :]
Y_test = outputs_order_of_magnitude[test_rows, :]
print(f"test sizes: {X_test.shape}, {Y_test.shape}")

train sizes: (700000, 10), (700000, 200)
test sizes: (150000, 10), (150000, 200)


In [5]:
# scaling inputs
from sklearn.preprocessing import MinMaxScaler
import pickle
from pathlib import Path


# Column indices that are rescaled by their box-plot whiskers instead of min/max
# (presumably the log-normally distributed input parameters - confirm).
LOGNORMAL_PARAMETERS = (1, 2)

class CustomScaler:
    """Min-max scaler with whisker-based scaling for the ``LOGNORMAL_PARAMETERS`` columns.

    Every column is first scaled by a ``MinMaxScaler``. The columns listed in
    ``LOGNORMAL_PARAMETERS`` are then overwritten with a linear rescaling that
    maps the lower/upper box-plot whisker (most extreme data points within
    1.5 * IQR of the quartiles) to 0 and 1, so values beyond the whiskers fall
    outside [0, 1].

    Attributes (kept unchanged so previously pickled instances still load):
        scaler: the wrapped ``MinMaxScaler``.
        plot_loval: lower whisker per entry of ``LOGNORMAL_PARAMETERS``.
        plot_hival: upper whisker per entry of ``LOGNORMAL_PARAMETERS``.
    """

    def __init__(self):
        self.scaler = MinMaxScaler()
        self.plot_loval = [0.0] * len(LOGNORMAL_PARAMETERS)
        self.plot_hival = [1.0] * len(LOGNORMAL_PARAMETERS)

    def _whisker_span(self, i: int) -> float:
        """Return the whisker range of the i-th special column, or 1.0 when it is zero."""
        span = self.plot_hival[i] - self.plot_loval[i]
        # A constant column has identical whiskers; dividing by the zero span
        # would produce inf/NaN, so fall back to a unit span.
        return span if span != 0 else 1.0

    def transform(self, x: np.ndarray, copy=None) -> np.ndarray:
        """Scale ``x``; ``copy`` is accepted for API compatibility and ignored."""
        res = self.scaler.transform(x)
        columns = np.asarray(x)
        for i, parameter_index in enumerate(LOGNORMAL_PARAMETERS):
            res[:, parameter_index] = (columns[:, parameter_index] - self.plot_loval[i]) / self._whisker_span(i)

        return res

    def fit(self, x, copy=None):
        """Fit the min-max scaler and compute the whiskers of the special columns.

        Returns ``self`` so calls can be chained. ``copy`` is ignored.
        """
        self.scaler.fit(x)
        columns = np.asarray(x)
        for i, parameter_index in enumerate(LOGNORMAL_PARAMETERS):
            column_values = columns[:, parameter_index]

            quantile_1, quantile_3 = np.quantile(column_values, [0.25, 0.75])
            iqr = quantile_3 - quantile_1

            # Whisker fences: 1.5 * IQR beyond the quartiles.
            loval = quantile_1 - 1.5 * iqr
            hival = quantile_3 + 1.5 * iqr

            # The whiskers are the most extreme data points inside the fences.
            self.plot_loval[i] = np.min(column_values[column_values >= loval])
            self.plot_hival[i] = np.max(column_values[column_values <= hival])

        return self

    def inverse_transform(self, x, copy=None):
        """Undo ``transform``; ``copy`` is accepted for API compatibility and ignored."""
        res = self.scaler.inverse_transform(x)
        columns = np.asarray(x)
        for i, parameter_index in enumerate(LOGNORMAL_PARAMETERS):
            res[:, parameter_index] = columns[:, parameter_index] * self._whisker_span(i) + self.plot_loval[i]
        return res

# Reuse the pickled scaler when one is cached on disk; otherwise fit it on the
# training inputs and cache it for later runs.
scaler_path = Path("../final") / "scaler.pickle"
if not scaler_path.exists():
    scaler = CustomScaler().fit(X_train)
    scaler_path.write_bytes(pickle.dumps(scaler))
else:
    scaler = pickle.loads(scaler_path.read_bytes())

# Apply the same fitted scaler to both splits.
X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train, X_test))

In [6]:
# applying pca to outputs
from sklearn.decomposition import PCA


# Number of principal components kept for the output curves.
PCA_COMPONENTS=12

# The cache file name encodes the component count, so changing PCA_COMPONENTS
# triggers a new fit instead of loading a mismatched model.
pca_path = Path(f"../final/pca{PCA_COMPONENTS}.pickle")

if pca_path.exists():
    with pca_path.open("rb") as opened_file:
        pca = pickle.load(opened_file)
    Y_train_pca = pca.transform(Y_train)
else:
    # random_state pins the result when PCA picks a randomized SVD solver,
    # so refitting after deleting the cache reproduces the same components.
    pca = PCA(n_components=PCA_COMPONENTS, random_state=42)
    Y_train_pca = pca.fit_transform(Y_train)
    with pca_path.open("wb") as opened_file:
        pickle.dump(pca, opened_file)

# Fraction of the output variance captured by each retained component.
print(pca.explained_variance_ratio_)

[9.63422836e-01 3.45880317e-02 1.13584299e-03 7.00114229e-04
 8.85658857e-05 3.47212159e-05 1.46535285e-05 6.29722232e-06
 3.64216009e-06 2.04051841e-06 1.09718389e-06 6.53377990e-07]


In [7]:
# Example of limiting the running time of a block of code by scheduling a SIGALRM and installing a handler for it.

import optuna
import time
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import psutil
import os
import signal

class TimeoutError(Exception):
    """Raised by ``timeout_handler`` when a scheduled alarm goes off.

    NOTE: this name shadows the built-in ``TimeoutError`` for all code that
    resolves it through this module's globals.
    """

def timeout_handler(*args):
    """Signal handler that aborts the running code by raising ``TimeoutError``.

    The positional arguments it is called with are echoed in the exception message.
    """
    message = f"timeout {args}"
    raise TimeoutError(message)

def keyboard_interrupt_handler(*args):
    """Signal handler that sends SIGINT to the current process; its arguments are ignored."""
    own_pid = os.getpid()
    os.kill(own_pid, signal.SIGINT)
    
# Install timeout_handler for SIGALRM so that an expired alarm raises TimeoutError.
signal.signal(signal.SIGALRM, timeout_handler)

# Case 1: the alarm is scheduled for 2 s but cancelled after a 1 s sleep.
try:
    signal.alarm(2)
    time.sleep(1)
    signal.alarm(0)  # a zero delay cancels the pending alarm
    print(f"1. no exception")
except TimeoutError as e:
    print(f"1. message: {e}")
except KeyboardInterrupt as e:
    print(f"1. KEYBOARD: {e}")

# Case 2: the 5 s sleep outlasts the 2 s alarm, so the handler interrupts it.
try:
    signal.alarm(2)
    time.sleep(5)
    signal.alarm(0)
    print(f"2. no exception")
except TimeoutError as e:
    print(f"2. message: {e}")
except KeyboardInterrupt as e:
    print(f"2. keyboard: {e}")

1. no exception
2. message: timeout (14, <frame at 0x12ac55f10, file '/var/folders/xz/pys3pfc567s9bszzmlt87ytm0000gn/T/ipykernel_22073/3465348698.py', line 35, code <module>>)


In [8]:
# Hyperparameter optimization for neural network

from sklearn.neural_network import MLPRegressor

# MLPRegressor settings shared by every trial of the study.
common_params={
    "tol": 1e-5,
    "n_iter_no_change": 5,
    "random_state": 42,
    "warm_start": False
}

def objective(trial):
    """Optuna objective for ``MLPRegressor``: test-set MSE in the transformed output space.

    Trials run in four phases of 30 trials each (the last phase covers all
    later trials). Each phase uses a larger share of the training set, a longer
    time budget, more iterations and a larger batch size.

    Args:
        trial: the Optuna trial used to sample hyperparameters and to record
            the model parameters, training size and time budget as user attributes.

    Returns:
        Mean squared error between ``Y_test`` and the predictions mapped back
        through ``pca.inverse_transform``.

    Raises:
        optuna.exceptions.TrialPruned: if ``TimeoutError`` is raised during fitting.
    """
    phase = min((trial.number // 30), 3)
    training_sizes = (0.03, 0.1, 0.3, 1.0)
    training_max_duration_s = (300, 600, 1200, 3600)

    max_iter = [1000, 2000, 4000, 8000][phase]
    batch_size = [500, 500, 1000, 2000][phase]

    alpha = trial.suggest_float("alpha", 0.001, 1.0, log=True)
    learning_rate = trial.suggest_categorical("learning_rate", ["constant", "invscaling", "adaptive"])
    learning_rate_init = trial.suggest_float("learning_rate_init", 0.0001, 0.05, log=True)
    # power_t is only searched when it is used, i.e. for the "invscaling" schedule.
    power_t = trial.suggest_float("power_t", 0.1, 2.0, log=True) if learning_rate == "invscaling" else 0.5

    model_params = {
        "max_iter": max_iter,
        "batch_size": batch_size,

        "alpha": alpha,
        "learning_rate": learning_rate,
        "learning_rate_init": learning_rate_init,
        "power_t": power_t,

        "hidden_layer_sizes": [
            trial.suggest_int("layer1/3", 600, 1200, step=200),
            trial.suggest_int("layer2/3", 50, 400, step=50),
            trial.suggest_int("layer3/3", 10, 50, step=10),
        ],
        **common_params
    }

    trial.set_user_attr("model_params", model_params)

    model = MLPRegressor(**trial.user_attrs["model_params"])

    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    trial.set_user_attr("training_size", training_size)
    max_duration_s = training_max_duration_s[phase]
    trial.set_user_attr("max_duration_s", max_duration_s)

    # NOTE(review): the code that runs this study installs keyboard_interrupt_handler
    # for SIGALRM, so an expired alarm arrives as SIGINT rather than TimeoutError;
    # presumably this relies on MLPRegressor.fit stopping gracefully on
    # KeyboardInterrupt - confirm.
    signal.alarm(max_duration_s)
    try:
        model.fit(X_train_scaled[:training_size,:], Y_train_pca[:training_size,:])
    except TimeoutError:
        raise optuna.exceptions.TrialPruned()
    finally:
        # Always cancel the alarm, so a failed fit cannot leave it pending to
        # fire later in unrelated code.
        signal.alarm(0)

    Y_predict_pca = model.predict(X_test_scaled)
    Y_predict = pca.inverse_transform(Y_predict_pca)
    return mean_squared_error(Y_test, Y_predict)

# Create (or resume) the persisted study and run only the trials still missing from the budget of 100.
study = optuna.create_study(study_name="MLPRegressor", storage='sqlite:///../final/optuna.db', load_if_exists=True)
trials_due = 100 - len(study.trials)
if trials_due > 0:
    # While the study runs, an expired alarm is forwarded to the process as SIGINT.
    previous_handler = signal.signal(signal.SIGALRM, keyboard_interrupt_handler)
    try:
        study.optimize(objective, n_trials=trials_due)
    finally:
        # Restore the previous SIGALRM handler even when the optimisation is
        # aborted, so later cells do not inherit the SIGINT-forwarding handler.
        signal.signal(signal.SIGALRM, previous_handler)

print(f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}")

[32m[I 2023-03-01 13:55:56,629][0m A new study created in RDB with name: MLPRegressor[0m
[32m[I 2023-03-01 13:56:24,522][0m Trial 0 finished with value: 0.0030842948816073206 and parameters: {'alpha': 0.21188838304714536, 'learning_rate': 'constant', 'learning_rate_init': 0.0006235441962792657, 'layer1/3': 800, 'layer2/3': 100, 'layer3/3': 40}. Best is trial 0 with value: 0.0030842948816073206.[0m
[32m[I 2023-03-01 13:58:03,293][0m Trial 1 finished with value: 0.0014728136016113513 and parameters: {'alpha': 0.007141506073780994, 'learning_rate': 'adaptive', 'learning_rate_init': 0.000568931303833518, 'layer1/3': 800, 'layer2/3': 400, 'layer3/3': 30}. Best is trial 1 with value: 0.0014728136016113513.[0m
[32m[I 2023-03-01 13:58:47,599][0m Trial 2 finished with value: 0.0026766198554106623 and parameters: {'alpha': 0.06127437245280689, 'learning_rate': 'constant', 'learning_rate_init': 0.0008351837928541461, 'layer1/3': 800, 'layer2/3': 400, 'layer3/3': 50}. Best is trial 1 wi

[32m[I 2023-03-01 14:34:46,533][0m Trial 24 finished with value: 0.0009788594947372743 and parameters: {'alpha': 0.11645783266679893, 'learning_rate': 'invscaling', 'learning_rate_init': 0.001120586770214973, 'power_t': 0.5316821914063279, 'layer1/3': 1200, 'layer2/3': 300, 'layer3/3': 20}. Best is trial 13 with value: 0.0008789369963895043.[0m
[32m[I 2023-03-01 14:38:01,406][0m Trial 25 finished with value: 0.0008823974150908036 and parameters: {'alpha': 0.08575701070549284, 'learning_rate': 'invscaling', 'learning_rate_init': 0.0011823010071701223, 'power_t': 0.530992215853774, 'layer1/3': 1000, 'layer2/3': 250, 'layer3/3': 20}. Best is trial 13 with value: 0.0008789369963895043.[0m
[32m[I 2023-03-01 14:38:26,507][0m Trial 26 finished with value: 0.002556487002145088 and parameters: {'alpha': 0.0367067393705702, 'learning_rate': 'invscaling', 'learning_rate_init': 0.0029115239021935053, 'power_t': 0.6618351670274185, 'layer1/3': 1000, 'layer2/3': 150, 'layer3/3': 10}. Best is

[32m[I 2023-03-01 16:23:29,898][0m Trial 46 finished with value: 0.0004155502404863049 and parameters: {'alpha': 0.06582983320200295, 'learning_rate': 'adaptive', 'learning_rate_init': 0.00029817577570265675, 'layer1/3': 600, 'layer2/3': 50, 'layer3/3': 10}. Best is trial 39 with value: 0.00038748865520281395.[0m
[32m[I 2023-03-01 16:33:33,800][0m Trial 47 finished with value: 0.00048016329638114427 and parameters: {'alpha': 0.06966802130899713, 'learning_rate': 'adaptive', 'learning_rate_init': 0.00025511841735134937, 'layer1/3': 600, 'layer2/3': 50, 'layer3/3': 10}. Best is trial 39 with value: 0.00038748865520281395.[0m
[32m[I 2023-03-01 16:43:38,376][0m Trial 48 finished with value: 0.00043650826118268616 and parameters: {'alpha': 0.05998242331754204, 'learning_rate': 'adaptive', 'learning_rate_init': 0.0002444458190002227, 'layer1/3': 600, 'layer2/3': 50, 'layer3/3': 10}. Best is trial 39 with value: 0.00038748865520281395.[0m
[32m[I 2023-03-01 16:53:45,968][0m Trial 49

[32m[I 2023-03-01 19:15:21,717][0m Trial 61 finished with value: 0.00032641352317357776 and parameters: {'alpha': 0.014214214698847592, 'learning_rate': 'adaptive', 'learning_rate_init': 0.00017358408220318164, 'layer1/3': 800, 'layer2/3': 150, 'layer3/3': 50}. Best is trial 60 with value: 0.00019177660633728667.[0m
[32m[I 2023-03-01 19:35:30,206][0m Trial 62 finished with value: 0.0001965345529835967 and parameters: {'alpha': 0.01433757195403651, 'learning_rate': 'adaptive', 'learning_rate_init': 0.0001734953284868109, 'layer1/3': 800, 'layer2/3': 150, 'layer3/3': 50}. Best is trial 60 with value: 0.00019177660633728667.[0m
[32m[I 2023-03-01 19:55:09,784][0m Trial 63 finished with value: 0.0002613115093557262 and parameters: {'alpha': 0.013362726651909747, 'learning_rate': 'adaptive', 'learning_rate_init': 0.00016165775626516665, 'layer1/3': 800, 'layer2/3': 150, 'layer3/3': 50}. Best is trial 60 with value: 0.00019177660633728667.[0m
[32m[I 2023-03-01 20:15:21,920][0m Tria

[32m[I 2023-03-02 00:17:28,409][0m Trial 76 finished with value: 0.0001792767026702128 and parameters: {'alpha': 0.00877184931912557, 'learning_rate': 'adaptive', 'learning_rate_init': 0.00021742984347552876, 'layer1/3': 800, 'layer2/3': 250, 'layer3/3': 40}. Best is trial 76 with value: 0.0001792767026702128.[0m
[32m[I 2023-03-02 00:37:38,549][0m Trial 77 finished with value: 0.0002160831223477979 and parameters: {'alpha': 0.0041698845964792145, 'learning_rate': 'adaptive', 'learning_rate_init': 0.00021671314614117696, 'layer1/3': 800, 'layer2/3': 250, 'layer3/3': 40}. Best is trial 76 with value: 0.0001792767026702128.[0m
[32m[I 2023-03-02 00:57:48,973][0m Trial 78 finished with value: 0.00019482417510311472 and parameters: {'alpha': 0.008659017751488952, 'learning_rate': 'adaptive', 'learning_rate_init': 0.00015485382697053943, 'layer1/3': 800, 'layer2/3': 250, 'layer3/3': 40}. Best is trial 76 with value: 0.0001792767026702128.[0m
[32m[I 2023-03-02 01:17:57,692][0m Trial

[32m[I 2023-03-02 07:34:07,440][0m Trial 92 finished with value: 9.933258073415425e-05 and parameters: {'alpha': 0.003775413021086052, 'learning_rate': 'adaptive', 'learning_rate_init': 0.00026893034072704496, 'layer1/3': 800, 'layer2/3': 350, 'layer3/3': 40}. Best is trial 91 with value: 9.596215167223074e-05.[0m
[32m[I 2023-03-02 08:34:22,254][0m Trial 93 finished with value: 9.017225257266805e-05 and parameters: {'alpha': 0.0030766762112440006, 'learning_rate': 'adaptive', 'learning_rate_init': 0.00022522792412950772, 'layer1/3': 800, 'layer2/3': 350, 'layer3/3': 40}. Best is trial 93 with value: 9.017225257266805e-05.[0m
[32m[I 2023-03-02 09:34:40,331][0m Trial 94 finished with value: 8.987444574725626e-05 and parameters: {'alpha': 0.0029879519050999255, 'learning_rate': 'adaptive', 'learning_rate_init': 0.00035901468189128335, 'layer1/3': 800, 'layer2/3': 350, 'layer3/3': 30}. Best is trial 94 with value: 8.987444574725626e-05.[0m
[32m[I 2023-03-02 10:34:56,465][0m Tria

model: MLPRegressor test dataset error: 8.987444574725626e-05 best_params: {'alpha': 0.0029879519050999255, 'layer1/3': 800, 'layer2/3': 350, 'layer3/3': 30, 'learning_rate': 'adaptive', 'learning_rate_init': 0.00035901468189128335}


In [10]:
# Test how neural networks perform with a 10-minute training cap on the full dataset
from sklearn.neural_network import MLPRegressor

# One study per learning-rate schedule; each trial trains on the full training
# set with a time cap of max_duration_s seconds.
for learning_rate in ["constant", "invscaling", "adaptive"]:
    max_duration_s = 600
    common_params={
        "tol": 5e-6,
        "n_iter_no_change": 10,
        "random_state": 42,
        "warm_start": False,
        "max_iter": 10000,
        "batch_size": 2000,
        "learning_rate": learning_rate
    }

    def objective(trial):
        """Optuna objective: test-set MSE of an ``MLPRegressor`` using the current ``learning_rate``.

        Reads ``learning_rate``, ``common_params`` and ``max_duration_s`` from
        the enclosing module scope at call time.

        Raises:
            optuna.exceptions.TrialPruned: if ``TimeoutError`` is raised during fitting.
        """
        alpha = trial.suggest_float("alpha", 0.0001, 1.0, log=True)
        learning_rate_init = trial.suggest_float("learning_rate_init", 0.0001, 0.05, log=True)
        # power_t is only searched when it is used, i.e. for the "invscaling" schedule.
        power_t = trial.suggest_float("power_t", 0.1, 2.0, log=True) if learning_rate == "invscaling" else 0.5

        model_params = {
            "alpha": alpha,
            "learning_rate_init": learning_rate_init,
            "power_t": power_t,

            "hidden_layer_sizes": [
                trial.suggest_int("layer1/3", 600, 1200, step=200),
                trial.suggest_int("layer2/3", 50, 400, step=50),
                trial.suggest_int("layer3/3", 10, 50, step=10),
            ],
            **common_params
        }

        trial.set_user_attr("model_params", model_params)

        model = MLPRegressor(**trial.user_attrs["model_params"])
        trial.set_user_attr("training_size", 1.0)
        trial.set_user_attr("max_duration_s", max_duration_s)

        signal.alarm(max_duration_s)
        try:
            model.fit(X_train_scaled, Y_train_pca)
        except TimeoutError:
            raise optuna.exceptions.TrialPruned()
        finally:
            # Always cancel the alarm, so a failed fit cannot leave it pending.
            signal.alarm(0)

        Y_predict_pca = model.predict(X_test_scaled)
        Y_predict = pca.inverse_transform(Y_predict_pca)
        return mean_squared_error(Y_test, Y_predict)

    study = optuna.create_study(study_name=f"MLPRegressor_{learning_rate}_{max_duration_s}", storage='sqlite:///../final/optuna.db', load_if_exists=True)
    trials_due = 40 - len(study.trials)
    if trials_due > 0:
        # While the study runs, an expired alarm is forwarded to the process as SIGINT.
        previous_handler = signal.signal(signal.SIGALRM, keyboard_interrupt_handler)
        try:
            study.optimize(objective, n_trials=trials_due)
        finally:
            # Restore the previous handler even if the optimisation is aborted.
            signal.signal(signal.SIGALRM, previous_handler)

    print(f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}")

[32m[I 2023-03-04 23:31:57,171][0m Using an existing study with name 'MLPRegressor_constant_600' instead of creating a new one.[0m
[32m[I 2023-03-04 23:39:12,010][0m Trial 10 finished with value: 0.00044632827811190873 and parameters: {'alpha': 0.00018594913239497346, 'learning_rate_init': 0.015731157771255038, 'layer1/3': 1000, 'layer2/3': 200, 'layer3/3': 50}. Best is trial 6 with value: 0.00021952360825245112.[0m
[32m[I 2023-03-04 23:49:14,756][0m Trial 11 finished with value: 0.0001567602671812914 and parameters: {'alpha': 0.001036308109465393, 'learning_rate_init': 0.007813604456660923, 'layer1/3': 800, 'layer2/3': 250, 'layer3/3': 30}. Best is trial 11 with value: 0.0001567602671812914.[0m
[32m[I 2023-03-04 23:58:05,286][0m Trial 12 finished with value: 0.00034525974800220753 and parameters: {'alpha': 0.0008383598570330053, 'learning_rate_init': 0.013234195428356466, 'layer1/3': 800, 'layer2/3': 250, 'layer3/3': 30}. Best is trial 11 with value: 0.0001567602671812914.

[32m[I 2023-03-05 02:34:58,615][0m Trial 30 finished with value: 0.00018408387222956825 and parameters: {'alpha': 0.006596417743398532, 'learning_rate_init': 0.001087462870903596, 'layer1/3': 1000, 'layer2/3': 250, 'layer3/3': 20}. Best is trial 29 with value: 0.00012799958528685236.[0m
[32m[I 2023-03-05 02:45:01,474][0m Trial 31 finished with value: 0.00016201880164658241 and parameters: {'alpha': 0.0027474844058487137, 'learning_rate_init': 0.0018020096335111153, 'layer1/3': 800, 'layer2/3': 300, 'layer3/3': 30}. Best is trial 29 with value: 0.00012799958528685236.[0m
[32m[I 2023-03-05 02:55:04,167][0m Trial 32 finished with value: 7.95338600625236e-05 and parameters: {'alpha': 0.0021976121677802214, 'learning_rate_init': 0.0025086281095413575, 'layer1/3': 800, 'layer2/3': 250, 'layer3/3': 20}. Best is trial 32 with value: 7.95338600625236e-05.[0m
[32m[I 2023-03-05 03:05:11,374][0m Trial 33 finished with value: 0.0001764783380738596 and parameters: {'alpha': 0.002824064778

model: MLPRegressor_constant_600 test dataset error: 7.95338600625236e-05 best_params: {'alpha': 0.0021976121677802214, 'layer1/3': 800, 'layer2/3': 250, 'layer3/3': 20, 'learning_rate_init': 0.0025086281095413575}


[32m[I 2023-03-05 04:13:25,027][0m Trial 10 finished with value: 0.00014905156362845143 and parameters: {'alpha': 0.07851010613756759, 'learning_rate_init': 0.0024679885446470483, 'power_t': 0.7877040159572618, 'layer1/3': 1000, 'layer2/3': 200, 'layer3/3': 40}. Best is trial 10 with value: 0.00014905156362845143.[0m
[32m[I 2023-03-05 04:23:27,866][0m Trial 11 finished with value: 0.0008908958363721395 and parameters: {'alpha': 0.1082909475874544, 'learning_rate_init': 0.002094614855950445, 'power_t': 1.910431309953861, 'layer1/3': 1000, 'layer2/3': 200, 'layer3/3': 40}. Best is trial 10 with value: 0.00014905156362845143.[0m
[32m[I 2023-03-05 04:33:30,717][0m Trial 12 finished with value: 0.00037356818866161217 and parameters: {'alpha': 0.05451295305940302, 'learning_rate_init': 0.0020084252542920044, 'power_t': 0.7937969995781353, 'layer1/3': 800, 'layer2/3': 200, 'layer3/3': 40}. Best is trial 10 with value: 0.00014905156362845143.[0m
[32m[I 2023-03-05 04:43:33,800][0m Tr

[32m[I 2023-03-05 06:44:48,094][0m Trial 25 finished with value: 0.0001458599680097689 and parameters: {'alpha': 0.007000746945950041, 'learning_rate_init': 0.0007533688951957081, 'power_t': 1.4233644838634325, 'layer1/3': 1000, 'layer2/3': 100, 'layer3/3': 30}. Best is trial 25 with value: 0.0001458599680097689.[0m
[32m[I 2023-03-05 06:54:51,122][0m Trial 26 finished with value: 0.00041699118839577074 and parameters: {'alpha': 0.00912338966502857, 'learning_rate_init': 0.0002567127936763489, 'power_t': 1.396735609933043, 'layer1/3': 1000, 'layer2/3': 150, 'layer3/3': 20}. Best is trial 25 with value: 0.0001458599680097689.[0m
[32m[I 2023-03-05 07:04:55,753][0m Trial 27 finished with value: 9.539619281485134e-05 and parameters: {'alpha': 0.005389035139995959, 'learning_rate_init': 0.001587182673560063, 'power_t': 1.533157961904625, 'layer1/3': 1200, 'layer2/3': 50, 'layer3/3': 40}. Best is trial 27 with value: 9.539619281485134e-05.[0m
[32m[I 2023-03-05 07:14:59,383][0m Tria

model: MLPRegressor_invscaling_600 test dataset error: 8.527386328374639e-05 best_params: {'alpha': 0.0037839705738818854, 'layer1/3': 1200, 'layer2/3': 50, 'layer3/3': 40, 'learning_rate_init': 0.001572221328811536, 'power_t': 1.529276287774073}


[32m[I 2023-03-05 09:05:51,129][0m Using an existing study with name 'MLPRegressor_adaptive_600' instead of creating a new one.[0m
[32m[I 2023-03-05 09:15:54,132][0m Trial 10 finished with value: 0.001960311475482657 and parameters: {'alpha': 0.00010373156910349187, 'learning_rate_init': 0.033788340598837056, 'layer1/3': 600, 'layer2/3': 300, 'layer3/3': 40}. Best is trial 4 with value: 0.00013706229839877327.[0m
[32m[I 2023-03-05 09:25:57,593][0m Trial 11 finished with value: 0.00023999374595770997 and parameters: {'alpha': 0.02224140543470641, 'learning_rate_init': 0.0005945316844608408, 'layer1/3': 1000, 'layer2/3': 300, 'layer3/3': 50}. Best is trial 4 with value: 0.00013706229839877327.[0m
[32m[I 2023-03-05 09:33:12,753][0m Trial 12 finished with value: 0.0001625022125108751 and parameters: {'alpha': 0.002770182539674482, 'learning_rate_init': 0.012072567703886558, 'layer1/3': 600, 'layer2/3': 300, 'layer3/3': 50}. Best is trial 4 with value: 0.00013706229839877327.[0m

[32m[I 2023-03-05 11:54:04,825][0m Trial 26 finished with value: 0.00020172369637214916 and parameters: {'alpha': 0.006652842439836077, 'learning_rate_init': 0.0010538763289602022, 'layer1/3': 600, 'layer2/3': 350, 'layer3/3': 50}. Best is trial 4 with value: 0.00013706229839877327.[0m
[32m[I 2023-03-05 12:04:08,460][0m Trial 27 finished with value: 0.00041746976342834654 and parameters: {'alpha': 0.02012679413450818, 'learning_rate_init': 0.0026424887089610866, 'layer1/3': 800, 'layer2/3': 400, 'layer3/3': 30}. Best is trial 4 with value: 0.00013706229839877327.[0m
[32m[I 2023-03-05 12:14:11,687][0m Trial 28 finished with value: 0.00021735722657565272 and parameters: {'alpha': 0.001740174439555746, 'learning_rate_init': 0.011343515481995295, 'layer1/3': 1000, 'layer2/3': 300, 'layer3/3': 10}. Best is trial 4 with value: 0.00013706229839877327.[0m
[32m[I 2023-03-05 12:24:14,244][0m Trial 29 finished with value: 0.00015099666332789001 and parameters: {'alpha': 0.0005451987419

model: MLPRegressor_adaptive_600 test dataset error: 0.0001240866740246993 best_params: {'alpha': 0.006777069662715181, 'layer1/3': 800, 'layer2/3': 350, 'layer3/3': 20, 'learning_rate_init': 0.001549679834450489}


In [11]:
# Lasso (Linear with an L1 regularizer) model fit with Least Angle Regression

from sklearn.linear_model import LassoLars

# LassoLars settings shared by every trial of the study.
common_params = {
    "random_state": 42
}

def objective(trial):
    """Optuna objective for polynomial features + ``LassoLars``: test-set MSE.

    Trials run in phases of 30 trials; each phase uses a larger share of the
    training set, a longer time budget and more iterations. Trials past the
    last defined phase reuse the settings of the last phase.

    Raises:
        optuna.exceptions.TrialPruned: if ``TimeoutError`` is raised while fitting or scoring.
    """
    # The full training set is only used on machines with more than 40 GB of RAM.
    training_sizes = (0.03, 0.1, 0.3, 1.0 if psutil.virtual_memory().total > 40 * 10**9 else 0.5)
    # NOTE(review): the fifth duration is unreachable because only four phases
    # have a training size and an iteration limit - confirm the intended phase count.
    training_max_duration_s = (120, 180, 360, 600, 1200)
    max_iters = (1000, 2000, 4000, 8000)

    # Clamp to the last phase that every per-phase table defines; clamping to 4
    # would raise IndexError from trial 120 onwards.
    phase = min(trial.number // 30, len(training_sizes) - 1)

    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]
    max_iter = max_iters[phase]

    trial.set_user_attr("training_size", training_size)
    trial.set_user_attr("max_duration_s", max_duration_s)
    trial.set_user_attr("common_params", {
        "max_iter": max_iter,
        **common_params
    })

    alpha = trial.suggest_float("alpha", 0.00001, 0.1, log=True)
    degree = trial.suggest_int("polynomial degree", 1, 5)

    model = make_pipeline(
        PolynomialFeatures(degree),
        LassoLars(
            alpha=alpha,
            max_iter=max_iter,
            **common_params
        )
    )

    signal.alarm(max_duration_s)
    try:
        model.fit(X_train_scaled[:training_size,:], Y_train_pca[:training_size,:])
        Y_predict_pca = model.predict(X_test_scaled)
        Y_predict = pca.inverse_transform(Y_predict_pca)
        error = mean_squared_error(Y_test, Y_predict)
    except TimeoutError:
        raise optuna.exceptions.TrialPruned()
    finally:
        # Always cancel the alarm, so a failed trial cannot leave it pending.
        signal.alarm(0)
    return error

# Resume the persisted study and run only the trials still missing from the budget of 100.
study = optuna.create_study(
    study_name="LassoLars",
    storage='sqlite:///../final/optuna.db',
    load_if_exists=True,
)
trials_due = 100 - len(study.trials)
if trials_due > 0:
    study.optimize(objective, n_trials=trials_due)

print(f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}")

[32m[I 2023-03-01 11:36:26,138][0m Using an existing study with name 'LassoLars' instead of creating a new one.[0m


model: LassoLars test dataset error: 0.024058488345672494 best_params: {'alpha': 0.0002644100143852797, 'polynomial degree': 5}


In [12]:
# Linear Regression model (least squares)

from sklearn.linear_model import LinearRegression

# Highest polynomial degree tried; also the number of trials per phase.
MAX_POLYNOMIAL_DEGREE = 4


def objective(trial):
    """Optuna objective for polynomial features + ``LinearRegression``: test-set MSE.

    Nothing is sampled: the polynomial degree cycles through
    1..MAX_POLYNOMIAL_DEGREE with the trial number, and every
    MAX_POLYNOMIAL_DEGREE trials the phase (training-set share and time
    budget) advances. The degree is stored as a user attribute.

    Raises:
        optuna.exceptions.TrialPruned: if ``TimeoutError`` is raised while fitting or scoring.
    """
    # The full training set is only used on machines with more than 40 GB of RAM.
    training_sizes = (0.03, 0.1, 0.3, 1.0 if psutil.virtual_memory().total > 40 * 10**9 else 0.5)
    training_max_duration_s = (120, 180, 360, 600)

    # Clamp to the last defined phase; clamping to 4 would raise IndexError on
    # the four-element tables once more than 16 trials are run.
    phase = min(trial.number // MAX_POLYNOMIAL_DEGREE, len(training_sizes) - 1)

    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]

    degree = (trial.number % MAX_POLYNOMIAL_DEGREE) + 1
    trial.set_user_attr("training_size", training_size)
    trial.set_user_attr("max_duration_s", max_duration_s)
    trial.set_user_attr("polynomial degree", degree)

    model = make_pipeline(
        PolynomialFeatures(degree),
        LinearRegression()
    )

    signal.alarm(max_duration_s)
    try:
        model.fit(X_train_scaled[:training_size,:], Y_train_pca[:training_size,:])
        Y_predict_pca = model.predict(X_test_scaled)
        Y_predict = pca.inverse_transform(Y_predict_pca)
        error = mean_squared_error(Y_test, Y_predict)
    except TimeoutError:
        raise optuna.exceptions.TrialPruned()
    finally:
        # Always cancel the alarm, so a failed trial cannot leave it pending.
        signal.alarm(0)
    return error

# Resume the persisted study and run only the trials still missing from the budget of 16.
study = optuna.create_study(
    study_name="LinearRegression",
    storage='sqlite:///../final/optuna.db',
    load_if_exists=True,
)
trials_due = 16 - len(study.trials)
if trials_due > 0:
    study.optimize(objective, n_trials=trials_due)

print(f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}")

[32m[I 2023-03-01 11:36:26,240][0m Using an existing study with name 'LinearRegression' instead of creating a new one.[0m


model: LinearRegression test dataset error: 0.02956507698551257 best_params: {}


In [13]:
# Linear least squares with l2 regularization
from sklearn.linear_model import Ridge

# Ridge settings shared by every trial of the study.
common_params = {
    "random_state": 42,
    "tol": 1e-5
}

# Highest polynomial degree the study may sample.
MAX_POLYNOMIAL_DEGREE = 3

def objective(trial):
    """Optuna objective for polynomial features + ``Ridge``: test-set MSE.

    Trials run in phases of 30 trials; each phase uses a larger share of the
    training set, a longer time budget and more iterations. Trials past the
    last defined phase reuse the settings of the last phase.

    Raises:
        optuna.exceptions.TrialPruned: if ``TimeoutError`` is raised while fitting or scoring.
    """
    # The full training set is only used on machines with more than 40 GB of RAM.
    training_sizes = (0.03, 0.1, 0.3, 1.0 if psutil.virtual_memory().total > 40 * 10**9 else 0.5)
    training_max_duration_s = (240, 360, 600, 1200)
    max_iters = (1000, 2000, 4000, 8000)

    # Clamp to the last defined phase; clamping to 4 would raise IndexError on
    # the four-element tables from trial 120 onwards.
    phase = min(trial.number // 30, len(training_sizes) - 1)

    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]
    max_iter = max_iters[phase]

    trial.set_user_attr("training_size", training_size)
    trial.set_user_attr("max_duration_s", max_duration_s)
    trial.set_user_attr("common_params", {
        "max_iter": max_iter,
        **common_params
    })

    alpha = trial.suggest_float("alpha", 0.00001, 0.1, log=True)
    solver = trial.suggest_categorical("solver", ["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"])
    degree = trial.suggest_int("polynomial degree", 1, MAX_POLYNOMIAL_DEGREE)

    model = make_pipeline(
        PolynomialFeatures(degree),
        Ridge(
            alpha=alpha,
            max_iter=max_iter,
            solver=solver,
            **common_params
        )
    )

    signal.alarm(max_duration_s)
    try:
        model.fit(X_train_scaled[:training_size,:], Y_train_pca[:training_size,:])
        Y_predict_pca = model.predict(X_test_scaled)
        Y_predict = pca.inverse_transform(Y_predict_pca)
        error = mean_squared_error(Y_test, Y_predict)
    except TimeoutError:
        raise optuna.exceptions.TrialPruned()
    finally:
        # Always cancel the alarm, so a failed trial cannot leave it pending.
        signal.alarm(0)
    return error

# Resume the persisted study and run only the trials still missing from the budget of 100.
study = optuna.create_study(
    study_name="Ridge",
    storage='sqlite:///../final/optuna.db',
    load_if_exists=True,
)
trials_due = 100 - len(study.trials)
if trials_due > 0:
    study.optimize(objective, n_trials=trials_due)

print(f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}")

[32m[I 2023-03-01 11:36:26,319][0m Using an existing study with name 'Ridge' instead of creating a new one.[0m


model: Ridge test dataset error: 0.05766259340561942 best_params: {'alpha': 0.099006936464171, 'polynomial degree': 3, 'solver': 'svd'}


In [14]:
# Linear regression with combined L1 and L2 priors as regularizer
from sklearn.linear_model import ElasticNet

# Keyword arguments passed unchanged to every ElasticNet instance built by
# the objective below.
common_params = dict(random_state=42, tol=1e-5)

# Upper bound of the "polynomial degree" search range.
MAX_POLYNOMIAL_DEGREE = 3

def objective(trial):
    """Optuna objective: fit a polynomial ElasticNet pipeline and score it.

    The training subset, the time budget and ``max_iter`` follow a schedule
    that advances by one phase every 30 trials.  The model is fitted on the
    first ``training_size`` rows of ``X_train_scaled`` / ``Y_train_pca`` and
    evaluated on the test split after mapping the predictions back through
    ``pca.inverse_transform``.

    Args:
        trial: Optuna trial used to sample hyper-parameters; the per-trial
            settings are also recorded on it as user attributes.

    Returns:
        Mean squared error between ``Y_test`` and the predictions.

    Raises:
        optuna.exceptions.TrialPruned: if a ``TimeoutError`` is raised while
            fitting or scoring (presumably by a SIGALRM handler installed
            earlier in the notebook -- TODO confirm).
    """
    training_sizes = (0.03, 0.1, 0.3, 1.0)
    training_max_duration_s = (240, 360, 600, 1200)
    max_iters = (20000, 40000, 80000, 160000)

    # Clamp to the last *defined* phase.  The previous hard-coded cap of 4
    # indexed past these 4-element schedules once trial.number reached 120.
    phase = min(trial.number // 30, len(training_sizes) - 1)

    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]
    max_iter = max_iters[phase]

    trial.set_user_attr("training_size", training_size)
    trial.set_user_attr("max_duration_s", max_duration_s)
    trial.set_user_attr("common_params", {
        "max_iter": max_iter,
        **common_params
    })

    l1_ratio = trial.suggest_float("l1_ratio", 0.0, 1.0)
    alpha = trial.suggest_float("alpha", 0.00001, 1.0, log=True)
    fit_intercept = trial.suggest_categorical("fit_intercept", [True, False])
    degree = trial.suggest_int("polynomial degree", 1, MAX_POLYNOMIAL_DEGREE)

    model = make_pipeline(
        PolynomialFeatures(degree),
        ElasticNet(
            alpha=alpha,
            l1_ratio=l1_ratio,
            fit_intercept=fit_intercept,
            max_iter=max_iter,
            **common_params
        )
    )

    try:
        signal.alarm(max_duration_s)
        model.fit(X_train_scaled[:training_size, :], Y_train_pca[:training_size, :])
        Y_predict_pca = model.predict(X_test_scaled)
        Y_predict = pca.inverse_transform(Y_predict_pca)
        error = mean_squared_error(Y_test, Y_predict)
    except TimeoutError:
        raise optuna.exceptions.TrialPruned()
    finally:
        # Always cancel the pending alarm, also when fit/predict fails with
        # an unrelated error; otherwise it would fire later in other code.
        signal.alarm(0)
    return error

# Start the ElasticNet study from scratch every time this cell is run.
try:
    optuna.delete_study(study_name="ElasticNet", storage='sqlite:///../final/optuna.db')
except KeyError:
    # No study with this name is stored yet, so there is nothing to delete.
    # Any other failure (storage errors, KeyboardInterrupt, ...) is no longer
    # silently swallowed.
    pass

study = optuna.create_study(study_name="ElasticNet", storage='sqlite:///../final/optuna.db', load_if_exists=True)
trials_due = 20 - len(study.trials)
if trials_due > 0:
    study.optimize(objective, n_trials=trials_due)

print(f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}")

[32m[I 2023-03-01 11:36:26,486][0m A new study created in RDB with name: ElasticNet[0m
[32m[I 2023-03-01 11:36:28,861][0m Trial 0 finished with value: 0.1415348449670532 and parameters: {'l1_ratio': 0.05778395898737221, 'alpha': 0.00010890908496958414, 'fit_intercept': True, 'polynomial degree': 2}. Best is trial 0 with value: 0.1415348449670532.[0m
[32m[I 2023-03-01 11:36:30,327][0m Trial 1 finished with value: 0.6070698262213244 and parameters: {'l1_ratio': 0.9588414663412709, 'alpha': 0.9844233575379248, 'fit_intercept': False, 'polynomial degree': 3}. Best is trial 0 with value: 0.1415348449670532.[0m
[32m[I 2023-03-01 11:36:42,218][0m Trial 2 finished with value: 0.14912926052409606 and parameters: {'l1_ratio': 0.47305777105203706, 'alpha': 0.0021004298957074505, 'fit_intercept': False, 'polynomial degree': 2}. Best is trial 0 with value: 0.1415348449670532.[0m
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_

[32m[I 2023-03-01 12:03:38,873][0m Trial 18 finished with value: 0.06885557566370162 and parameters: {'l1_ratio': 0.8188677893858365, 'alpha': 3.577885069221632e-05, 'fit_intercept': True, 'polynomial degree': 3}. Best is trial 7 with value: 0.0688497401893624.[0m
[32m[I 2023-03-01 12:03:41,136][0m Trial 19 finished with value: 0.14150291163576503 and parameters: {'l1_ratio': 0.847656441311183, 'alpha': 1.4751266894820062e-05, 'fit_intercept': True, 'polynomial degree': 2}. Best is trial 7 with value: 0.0688497401893624.[0m


model: ElasticNet test dataset error: 0.0688497401893624 best_params: {'alpha': 3.6587810657886685e-05, 'fit_intercept': True, 'l1_ratio': 0.8232154401659981, 'polynomial degree': 3}


In [15]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

def objective(trial):
    """Optuna objective: fit one HistGradientBoostingRegressor per output and score it.

    Hyper-parameters are sampled from ``trial``; the estimator is wrapped in
    ``MultiOutputRegressor``.  The training subset and the time budget follow
    a schedule that advances by one phase every 30 trials.

    Args:
        trial: Optuna trial used to sample hyper-parameters.

    Returns:
        Mean squared error between ``Y_test`` and the predictions mapped back
        through ``pca.inverse_transform``.

    Raises:
        optuna.exceptions.TrialPruned: if fitting or scoring raises any
            ``Exception`` (the error is printed first).  This includes the
            ``TimeoutError`` presumably raised by a SIGALRM handler installed
            earlier in the notebook -- TODO confirm.
    """
    trial_params = {}
    # poisson requires y > 0 which is not true in this case
    trial_params["loss"] = trial.suggest_categorical("loss", ["squared_error", "absolute_error", "quantile"])

    if trial_params["loss"] == "quantile":
        trial_params["quantile"] = trial.suggest_float("quantile", 0, 1)

    trial_params["learning_rate"] = trial.suggest_float("learning_rate", 0.00001, 0.5, log=True)
    trial_params["max_iter"] = trial.suggest_int("max_iter", 20, 200, step=20)
    if trial.suggest_categorical("regularize", [True, False]):
        trial_params["l2_regularization"] = trial.suggest_float("l2_regularization", 0.00001, 1.0, log=True)

    model = MultiOutputRegressor(HistGradientBoostingRegressor(
        random_state=42,
        **trial_params
    ))

    training_sizes = (0.03, 0.1, 0.3, 1.0)
    training_max_duration_s = (240, 360, 600, 1200)

    # Clamp to the last *defined* phase.  The previous hard-coded cap of 4
    # indexed past these 4-element schedules once trial.number reached 120.
    phase = min(trial.number // 30, len(training_sizes) - 1)

    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]

    try:
        signal.alarm(max_duration_s)
        model.fit(X_train_scaled[:training_size, :], Y_train_pca[:training_size, :])
        Y_predict_pca = model.predict(X_test_scaled)
        Y_predict = pca.inverse_transform(Y_predict_pca)
        error = mean_squared_error(Y_test, Y_predict)
    except Exception as e:
        # KeyboardInterrupt derives from BaseException, so it is not caught
        # here and still aborts the whole study.
        print(e)
        raise optuna.exceptions.TrialPruned()
    finally:
        # Always cancel the pending alarm, also when the trial failed before
        # reaching the end of the try block.
        signal.alarm(0)
    return error

# Open the persisted study (it is created on the first run) and only execute
# the trials that are still missing from the 100-trial budget.
study = optuna.create_study(
    study_name="HistGradientBoostingRegressor",
    storage='sqlite:///../final/optuna.db',
    load_if_exists=True,
)
trials_due = 100 - len(study.trials)
if trials_due > 0:
    study.optimize(objective, n_trials=trials_due)

# Summary of the best trial found so far.
print(f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}")

[32m[I 2023-03-01 12:03:41,793][0m Using an existing study with name 'HistGradientBoostingRegressor' instead of creating a new one.[0m


model: HistGradientBoostingRegressor test dataset error: 0.008577411300669628 best_params: {'l2_regularization': 0.001871946506788565, 'learning_rate': 0.1651977983535292, 'loss': 'squared_error', 'max_iter': 160, 'regularize': True}


In [16]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import BaggingRegressor

def objective(trial):
    """Optuna objective: fit one BaggingRegressor per output and score it.

    The training subset and the time budget follow a schedule that advances
    by one phase every 30 trials.  The estimator is wrapped in
    ``MultiOutputRegressor``.

    Args:
        trial: Optuna trial used to sample hyper-parameters.

    Returns:
        Mean squared error between ``Y_test`` and the predictions mapped back
        through ``pca.inverse_transform``.

    Raises:
        optuna.exceptions.TrialPruned: if fitting or scoring raises any
            ``Exception`` (the error is printed first).  This includes the
            ``TimeoutError`` presumably raised by a SIGALRM handler installed
            earlier in the notebook -- TODO confirm.
    """
    training_sizes = (0.1, 0.1, 0.3, 1.0)
    training_max_duration_s = (240, 360, 600, 1200)

    # Clamp to the last *defined* phase.  The previous hard-coded cap of 4
    # indexed past these 4-element schedules once trial.number reached 120.
    phase = min(trial.number // 30, len(training_sizes) - 1)

    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]

    trial_params = {}

    # NOTE: the Optuna parameter is named "max_iter" although it feeds
    # n_estimators.  The name is kept because the persisted study already
    # stores its trials under it.
    trial_params["n_estimators"] = trial.suggest_int("max_iter", 10, 110, step=20)
    # The upper bound depends on the phase: at most training_size rows,
    # rounded down to a multiple of 10000.
    trial_params["max_samples"] = trial.suggest_int("max_samples", 10000, (training_size // 10000) * 10000, step=10000)
    trial_params["max_features"] = trial.suggest_int("max_features", 1, X_train_scaled.shape[1])
    trial_params["bootstrap"] = trial.suggest_categorical("bootstrap", [True, False])
    if trial_params["bootstrap"]:
        # oob_score is only sampled when bootstrap sampling is enabled.
        trial_params["oob_score"] = trial.suggest_categorical("oob_score", [True, False])
    trial_params["bootstrap_features"] = trial.suggest_categorical("bootstrap_features", [True, False])

    model = MultiOutputRegressor(BaggingRegressor(
        n_jobs=-1,
        random_state=42,
        **trial_params
    ))

    try:
        signal.alarm(max_duration_s)
        model.fit(X_train_scaled[:training_size, :], Y_train_pca[:training_size, :])
        Y_predict_pca = model.predict(X_test_scaled)
        Y_predict = pca.inverse_transform(Y_predict_pca)
        error = mean_squared_error(Y_test, Y_predict)
    except Exception as e:
        # KeyboardInterrupt derives from BaseException, so it is not caught
        # here and still aborts the whole study.
        print(e)
        raise optuna.exceptions.TrialPruned()
    finally:
        # Always cancel the pending alarm, also when the trial failed before
        # reaching the end of the try block.
        signal.alarm(0)
    return error

# try:
#     optuna.delete_study(study_name="BaggingRegressor", storage='sqlite:///../final/optuna.db')
# except:
#     pass

# Open the persisted study (it is created on the first run) and only execute
# the trials that are still missing from the 100-trial budget.
study = optuna.create_study(
    study_name="BaggingRegressor",
    storage='sqlite:///../final/optuna.db',
    load_if_exists=True,
)
trials_due = 100 - len(study.trials)
if trials_due > 0:
    study.optimize(objective, n_trials=trials_due)

# Summary of the best trial found so far.
print(f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}")

[32m[I 2023-03-01 12:03:41,895][0m Using an existing study with name 'BaggingRegressor' instead of creating a new one.[0m


model: BaggingRegressor test dataset error: 0.012123721671364409 best_params: {'bootstrap': True, 'bootstrap_features': False, 'max_features': 10, 'max_iter': 50, 'max_samples': 320000, 'oob_score': False}


In [17]:
from sklearn.ensemble import RandomForestRegressor

def objective(trial):
    """Optuna objective: fit a RandomForestRegressor and score it on the test split.

    The training subset and the time budget follow a schedule that advances
    by one phase every 30 trials.

    Args:
        trial: Optuna trial used to sample hyper-parameters.

    Returns:
        Mean squared error between ``Y_test`` and the predictions mapped back
        through ``pca.inverse_transform``.

    Raises:
        optuna.exceptions.TrialPruned: if fitting or scoring raises any
            ``Exception`` (the error is printed first).  This includes the
            ``TimeoutError`` presumably raised by a SIGALRM handler installed
            earlier in the notebook -- TODO confirm.
    """
    training_sizes = (0.01, 0.03, 0.1, 0.3, 1.0)
    training_max_duration_s = (120, 180, 360, 600, 1200)

    # Derive the cap from the schedule so the index can never run past it.
    phase = min(trial.number // 30, len(training_sizes) - 1)

    n_estimators = trial.suggest_int("n_estimators", 10, 200, step=10)
    criterion = trial.suggest_categorical("criterion", ["squared_error", "absolute_error", "friedman_mse"])
    min_samples_split = trial.suggest_int("min_samples_split", 2, 5)
    bootstrap = trial.suggest_categorical("bootstrap", [True])
    # max_samples is only sampled when bootstrap sampling is enabled.
    max_samples = trial.suggest_float("max_samples", 0.0, 0.2) if bootstrap else None
    max_features = trial.suggest_int("max_features", 1, X_train_scaled.shape[1] // 2)

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        criterion=criterion,
        min_samples_split=min_samples_split,
        bootstrap=bootstrap,
        max_samples=max_samples,
        max_features=max_features,
        n_jobs=-1,
        random_state=42
    )

    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]

    try:
        signal.alarm(max_duration_s)
        model.fit(X_train_scaled[:training_size, :], Y_train_pca[:training_size, :])
        Y_predict_pca = model.predict(X_test_scaled)
        Y_predict = pca.inverse_transform(Y_predict_pca)
        error = mean_squared_error(Y_test, Y_predict)
    except Exception as e:
        # Report why the trial is being pruned instead of dropping the error
        # silently.  KeyboardInterrupt derives from BaseException, so it is
        # not caught here and still aborts the whole study.
        print(e)
        raise optuna.exceptions.TrialPruned()
    finally:
        # Always cancel the pending alarm, also when the trial failed before
        # reaching the end of the try block.
        signal.alarm(0)
    return error

# Open the persisted study (it is created on the first run) and only execute
# the trials that are still missing from the 100-trial budget.
study = optuna.create_study(
    study_name="RandomForrest",
    storage='sqlite:///../final/optuna.db',
    load_if_exists=True,
)
trials_due = 100 - len(study.trials)
if trials_due > 0:
    study.optimize(objective, n_trials=trials_due)

# Summary of the best trial found so far.
print(f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}")

[32m[I 2023-03-01 12:03:42,031][0m A new study created in RDB with name: RandomForrest[0m
[32m[I 2023-03-01 12:04:02,168][0m Trial 0 finished with value: 0.17594851993192578 and parameters: {'n_estimators': 150, 'criterion': 'absolute_error', 'min_samples_split': 4, 'bootstrap': True, 'max_samples': 0.06166565672349294, 'max_features': 3}. Best is trial 0 with value: 0.17594851993192578.[0m
[32m[I 2023-03-01 12:04:03,733][0m Trial 1 finished with value: 0.2394594945929994 and parameters: {'n_estimators': 200, 'criterion': 'friedman_mse', 'min_samples_split': 4, 'bootstrap': True, 'max_samples': 0.05148208511841046, 'max_features': 2}. Best is trial 0 with value: 0.17594851993192578.[0m
[32m[I 2023-03-01 12:04:13,641][0m Trial 2 finished with value: 0.12898461251525709 and parameters: {'n_estimators': 70, 'criterion': 'absolute_error', 'min_samples_split': 3, 'bootstrap': True, 'max_samples': 0.10475322296497798, 'max_features': 5}. Best is trial 2 with value: 0.1289846125152

[32m[I 2023-03-01 12:05:47,004][0m Trial 27 finished with value: 0.0874541084196932 and parameters: {'n_estimators': 140, 'criterion': 'squared_error', 'min_samples_split': 3, 'bootstrap': True, 'max_samples': 0.18799189152058432, 'max_features': 5}. Best is trial 22 with value: 0.08599681361071332.[0m
[32m[I 2023-03-01 12:06:05,996][0m Trial 28 finished with value: 0.11011050688536927 and parameters: {'n_estimators': 140, 'criterion': 'absolute_error', 'min_samples_split': 4, 'bootstrap': True, 'max_samples': 0.15667177117576425, 'max_features': 4}. Best is trial 22 with value: 0.08599681361071332.[0m
[32m[I 2023-03-01 12:06:07,570][0m Trial 29 finished with value: 0.15186289192887228 and parameters: {'n_estimators': 170, 'criterion': 'friedman_mse', 'min_samples_split': 3, 'bootstrap': True, 'max_samples': 0.18999857109026694, 'max_features': 2}. Best is trial 22 with value: 0.08599681361071332.[0m
[32m[I 2023-03-01 12:09:07,641][0m Trial 30 pruned. [0m
[32m[I 2023-03-01

[32m[I 2023-03-01 12:16:12,160][0m Trial 56 finished with value: 0.057463310117426315 and parameters: {'n_estimators': 100, 'criterion': 'squared_error', 'min_samples_split': 3, 'bootstrap': True, 'max_samples': 0.19299594794731723, 'max_features': 5}. Best is trial 52 with value: 0.05690610027835616.[0m
[32m[I 2023-03-01 12:16:14,157][0m Trial 57 finished with value: 0.057970840355486136 and parameters: {'n_estimators': 80, 'criterion': 'squared_error', 'min_samples_split': 2, 'bootstrap': True, 'max_samples': 0.19475990003629096, 'max_features': 5}. Best is trial 52 with value: 0.05690610027835616.[0m
[32m[I 2023-03-01 12:16:15,979][0m Trial 58 finished with value: 0.060021973921913406 and parameters: {'n_estimators': 100, 'criterion': 'friedman_mse', 'min_samples_split': 3, 'bootstrap': True, 'max_samples': 0.19956641272401282, 'max_features': 5}. Best is trial 52 with value: 0.05690610027835616.[0m
[32m[I 2023-03-01 12:16:17,764][0m Trial 59 finished with value: 0.108214

[32m[I 2023-03-01 12:25:14,055][0m Trial 84 finished with value: 0.03570036344104194 and parameters: {'n_estimators': 80, 'criterion': 'squared_error', 'min_samples_split': 2, 'bootstrap': True, 'max_samples': 0.18826347084262673, 'max_features': 5}. Best is trial 74 with value: 0.0354479818909229.[0m
[32m[I 2023-03-01 12:25:20,232][0m Trial 85 finished with value: 0.038037315626068643 and parameters: {'n_estimators': 80, 'criterion': 'squared_error', 'min_samples_split': 2, 'bootstrap': True, 'max_samples': 0.1663775719843013, 'max_features': 5}. Best is trial 74 with value: 0.0354479818909229.[0m
[32m[I 2023-03-01 12:31:20,400][0m Trial 86 pruned. [0m
[32m[I 2023-03-01 12:31:28,201][0m Trial 87 finished with value: 0.035846418460570156 and parameters: {'n_estimators': 90, 'criterion': 'squared_error', 'min_samples_split': 2, 'bootstrap': True, 'max_samples': 0.18641782983324753, 'max_features': 5}. Best is trial 74 with value: 0.0354479818909229.[0m
[32m[I 2023-03-01 12:

model: RandomForrest test dataset error: 0.0276166738609489 best_params: {'bootstrap': True, 'criterion': 'friedman_mse', 'max_features': 4, 'max_samples': 0.1753336890830068, 'min_samples_split': 2, 'n_estimators': 60}


In [18]:
from sklearn.ensemble import ExtraTreesRegressor

def objective(trial):
    """Optuna objective: fit an ExtraTreesRegressor and score it on the test split.

    The training subset and the time budget follow a schedule that advances
    by one phase every 30 trials.

    Args:
        trial: Optuna trial used to sample hyper-parameters.

    Returns:
        Mean squared error between ``Y_test`` and the predictions mapped back
        through ``pca.inverse_transform``.

    Raises:
        optuna.exceptions.TrialPruned: if fitting or scoring raises any
            ``Exception`` (the error is printed first).  This includes the
            ``TimeoutError`` presumably raised by a SIGALRM handler installed
            earlier in the notebook -- TODO confirm.
    """
    training_sizes = (0.01, 0.03, 0.1, 0.3, 1.0)
    training_max_duration_s = (120, 180, 360, 600, 1200)

    # Derive the cap from the schedule so the index can never run past it.
    phase = min(trial.number // 30, len(training_sizes) - 1)

    n_estimators = trial.suggest_int("n_estimators", 10, 200)
    criterion = trial.suggest_categorical("criterion", ["squared_error", "absolute_error", "friedman_mse"])
    min_samples_split = trial.suggest_int("min_samples_split", 2, 5)
    bootstrap = trial.suggest_categorical("bootstrap", [True])
    # max_samples is only sampled when bootstrap sampling is enabled.
    max_samples = trial.suggest_float("max_samples", 0.0, 0.2) if bootstrap else None
    max_features = trial.suggest_int("max_features", 1, X_train_scaled.shape[1] // 2)

    model = ExtraTreesRegressor(
        n_estimators=n_estimators,
        criterion=criterion,
        min_samples_split=min_samples_split,
        bootstrap=bootstrap,
        max_samples=max_samples,
        max_features=max_features,
        n_jobs=-1,
        random_state=42
    )

    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]

    try:
        signal.alarm(max_duration_s)
        model.fit(X_train_scaled[:training_size, :], Y_train_pca[:training_size, :])
        Y_predict_pca = model.predict(X_test_scaled)
        Y_predict = pca.inverse_transform(Y_predict_pca)
        error = mean_squared_error(Y_test, Y_predict)
    except Exception as e:
        # Report why the trial is being pruned instead of dropping the error
        # silently.  KeyboardInterrupt derives from BaseException, so it is
        # not caught here and still aborts the whole study.
        print(e)
        raise optuna.exceptions.TrialPruned()
    finally:
        # Always cancel the pending alarm, also when the trial failed before
        # reaching the end of the try block.
        signal.alarm(0)
    return error

# Open the persisted study (it is created on the first run) and only execute
# the trials that are still missing from the 100-trial budget.
study = optuna.create_study(
    study_name="ExtraTreesRegressor",
    storage='sqlite:///../final/optuna.db',
    load_if_exists=True,
)
trials_due = 100 - len(study.trials)
if trials_due > 0:
    study.optimize(objective, n_trials=trials_due)

# Summary of the best trial found so far.
print(f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}")

[32m[I 2023-03-01 12:32:41,001][0m A new study created in RDB with name: ExtraTreesRegressor[0m
[32m[I 2023-03-01 12:32:46,498][0m Trial 0 finished with value: 0.322134221928035 and parameters: {'n_estimators': 153, 'criterion': 'absolute_error', 'min_samples_split': 4, 'bootstrap': True, 'max_samples': 0.01255001625674428, 'max_features': 5}. Best is trial 0 with value: 0.322134221928035.[0m
[32m[I 2023-03-01 12:32:47,571][0m Trial 1 finished with value: 0.1849629620536771 and parameters: {'n_estimators': 76, 'criterion': 'friedman_mse', 'min_samples_split': 4, 'bootstrap': True, 'max_samples': 0.06194135536751255, 'max_features': 4}. Best is trial 1 with value: 0.1849629620536771.[0m
[32m[I 2023-03-01 12:33:06,884][0m Trial 2 finished with value: 0.21881731026489612 and parameters: {'n_estimators': 132, 'criterion': 'absolute_error', 'min_samples_split': 5, 'bootstrap': True, 'max_samples': 0.09732287208421518, 'max_features': 3}. Best is trial 1 with value: 0.184962962053

[32m[I 2023-03-01 12:35:08,571][0m Trial 27 finished with value: 0.2242402511123524 and parameters: {'n_estimators': 165, 'criterion': 'squared_error', 'min_samples_split': 3, 'bootstrap': True, 'max_samples': 0.17686462190807792, 'max_features': 2}. Best is trial 12 with value: 0.10075140090060769.[0m
[32m[I 2023-03-01 12:35:10,550][0m Trial 28 finished with value: 0.16743684691665484 and parameters: {'n_estimators': 200, 'criterion': 'squared_error', 'min_samples_split': 2, 'bootstrap': True, 'max_samples': 0.12904232972219576, 'max_features': 3}. Best is trial 12 with value: 0.10075140090060769.[0m
[32m[I 2023-03-01 12:36:10,154][0m Trial 29 finished with value: 0.12418089405395998 and parameters: {'n_estimators': 149, 'criterion': 'absolute_error', 'min_samples_split': 3, 'bootstrap': True, 'max_samples': 0.15753916508997662, 'max_features': 5}. Best is trial 12 with value: 0.10075140090060769.[0m
[32m[I 2023-03-01 12:38:25,630][0m Trial 30 finished with value: 0.0953828

[32m[I 2023-03-01 13:02:12,685][0m Trial 54 finished with value: 0.0932773002121084 and parameters: {'n_estimators': 130, 'criterion': 'absolute_error', 'min_samples_split': 4, 'bootstrap': True, 'max_samples': 0.19990750237134333, 'max_features': 4}. Best is trial 31 with value: 0.08364894335516544.[0m
[32m[I 2023-03-01 13:03:28,425][0m Trial 55 finished with value: 0.08285899346329825 and parameters: {'n_estimators': 138, 'criterion': 'absolute_error', 'min_samples_split': 3, 'bootstrap': True, 'max_samples': 0.1898498603429919, 'max_features': 5}. Best is trial 55 with value: 0.08285899346329825.[0m
[32m[I 2023-03-01 13:06:02,552][0m Trial 56 finished with value: 0.08475785910919541 and parameters: {'n_estimators': 134, 'criterion': 'absolute_error', 'min_samples_split': 2, 'bootstrap': True, 'max_samples': 0.1745027881426002, 'max_features': 5}. Best is trial 55 with value: 0.08285899346329825.[0m
[32m[I 2023-03-01 13:06:04,392][0m Trial 57 finished with value: 0.0787183

[32m[I 2023-03-01 13:07:19,450][0m Trial 81 finished with value: 0.053888741670837875 and parameters: {'n_estimators': 188, 'criterion': 'friedman_mse', 'min_samples_split': 2, 'bootstrap': True, 'max_samples': 0.16444203223545256, 'max_features': 5}. Best is trial 80 with value: 0.052391459030318416.[0m
[32m[I 2023-03-01 13:07:22,954][0m Trial 82 finished with value: 0.05280660291281766 and parameters: {'n_estimators': 189, 'criterion': 'friedman_mse', 'min_samples_split': 2, 'bootstrap': True, 'max_samples': 0.16576701976756905, 'max_features': 5}. Best is trial 80 with value: 0.052391459030318416.[0m
[32m[I 2023-03-01 13:07:26,549][0m Trial 83 finished with value: 0.05219238406087736 and parameters: {'n_estimators': 189, 'criterion': 'friedman_mse', 'min_samples_split': 2, 'bootstrap': True, 'max_samples': 0.16522455142817769, 'max_features': 5}. Best is trial 83 with value: 0.05219238406087736.[0m
[32m[I 2023-03-01 13:07:29,948][0m Trial 84 finished with value: 0.0533675

model: ExtraTreesRegressor test dataset error: 0.035638099610715714 best_params: {'bootstrap': True, 'criterion': 'friedman_mse', 'max_features': 5, 'max_samples': 0.18501919599037991, 'min_samples_split': 2, 'n_estimators': 182}


In [25]:
from sklearn.neighbors import KNeighborsRegressor

# Keyword arguments merged into every KNeighborsRegressor built by the
# objective below.
common_params = dict(n_jobs=-1)

def objective(trial):
    """Optuna objective: fit a KNeighborsRegressor and score it on the test split.

    The training subset and the time budget follow a schedule that advances
    by one phase every 30 trials.  The full set of model keyword arguments is
    recorded on the trial as the ``"model_params"`` user attribute.

    Args:
        trial: Optuna trial used to sample hyper-parameters.

    Returns:
        Mean squared error between ``Y_test`` and the predictions mapped back
        through ``pca.inverse_transform``.

    Raises:
        optuna.exceptions.TrialPruned: if fitting or scoring raises any
            ``Exception`` (the error is printed first).  This includes the
            ``TimeoutError`` presumably raised by a SIGALRM handler installed
            earlier in the notebook -- TODO confirm.
    """
    training_sizes = (0.01, 0.03, 0.1, 0.3, 1.0)
    training_max_duration_s = (120, 180, 360, 600, 1200)

    # Derive the cap from the schedule so the index can never run past it.
    phase = min(trial.number // 30, len(training_sizes) - 1)
    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]

    trial_params = {}
    trial_params["n_neighbors"] = trial.suggest_int("n_neighbors", 1, 100)
    trial_params["weights"] = trial.suggest_categorical("weights", ["uniform", "distance"])
    trial_params["algorithm"] = trial.suggest_categorical("algorithm", ["ball_tree", "kd_tree", "brute"])
    if trial_params["algorithm"] != "brute":
        # leaf_size is only sampled for the tree-based algorithms.
        trial_params["leaf_size"] = trial.suggest_int("leaf_size", 10, 50)
    trial_params["p"] = trial.suggest_int("p", 1, 5)

    model_params = {
        **trial_params,
        **common_params
    }
    trial.set_user_attr("model_params", model_params)
    # Build the model from the attribute as stored on the trial.
    model = KNeighborsRegressor(**trial.user_attrs["model_params"])

    try:
        signal.alarm(max_duration_s)
        model.fit(X_train_scaled[:training_size, :], Y_train_pca[:training_size, :])
        Y_predict_pca = model.predict(X_test_scaled)
        Y_predict = pca.inverse_transform(Y_predict_pca)
        error = mean_squared_error(Y_test, Y_predict)
    except Exception as e:
        # KeyboardInterrupt derives from BaseException, so it is not caught
        # here and still aborts the whole study.
        print(e)
        raise optuna.exceptions.TrialPruned()
    finally:
        # Always cancel the pending alarm, also when the trial failed before
        # reaching the end of the try block.
        signal.alarm(0)
    return error

# Open the persisted study (it is created on the first run) and only execute
# the trials that are still missing from the 100-trial budget.
study = optuna.create_study(
    study_name="KNeighborsRegressor",
    storage='sqlite:///../final/optuna.db',
    load_if_exists=True,
)
trials_due = 100 - len(study.trials)
if trials_due > 0:
    study.optimize(objective, n_trials=trials_due)

# Summary of the best trial found so far.
print(f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}")

[32m[I 2023-03-01 13:20:53,485][0m A new study created in RDB with name: KNeighborsRegressor[0m
[32m[I 2023-03-01 13:21:55,320][0m Trial 0 finished with value: 0.2955053826591534 and parameters: {'n_neighbors': 94, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 36, 'p': 3}. Best is trial 0 with value: 0.2955053826591534.[0m
[32m[I 2023-03-01 13:22:02,371][0m Trial 1 finished with value: 0.2454018788242259 and parameters: {'n_neighbors': 89, 'weights': 'distance', 'algorithm': 'brute', 'p': 1}. Best is trial 1 with value: 0.2454018788242259.[0m
[32m[I 2023-03-01 13:22:59,623][0m Trial 2 finished with value: 0.3138173698459204 and parameters: {'n_neighbors': 89, 'weights': 'uniform', 'algorithm': 'brute', 'p': 5}. Best is trial 1 with value: 0.2454018788242259.[0m
[32m[I 2023-03-01 13:23:55,900][0m Trial 3 finished with value: 0.25448517373485685 and parameters: {'n_neighbors': 49, 'weights': 'distance', 'algorithm': 'brute', 'p': 3}. Best is trial 1 with valu

model: KNeighborsRegressor test dataset error: 0.21196498486521023 best_params: {'algorithm': 'kd_tree', 'leaf_size': 13, 'n_neighbors': 9, 'p': 3, 'weights': 'uniform'}
