In this notebook we use the `../final/dataset.npz` dataset to train multiple ML models that serve as surrogate models. The comparison between the models is presented in the next notebook.

During the training process the dataset is split into train, test and validation sets of sizes `700000`, `150000` and `150000`.

List of tested methods (with links to used implementations):
1. [Neural Networks](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html)
2. Linear regression model:
    1. [Elastic Net](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html#sklearn.linear_model.ElasticNet) L1 and L2 regularization combined
3. Decision tree models:
    1. [Boosting](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html)
    2. [Bagging](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html)
    3. [Random Forest](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)
    4. [Extra Trees](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html)
4. [k-nearest neighbors](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html)

For each ML method we use [Optuna](https://optuna.readthedocs.io) to find the best-performing set of hyperparameters.

In [1]:
# Loading dataset
from pathlib import Path

import numpy as np

# Directory that later cells pickle trained models into. `exist_ok=True` keeps
# this cell re-runnable (Restart Kernel -> Run All) when the directory exists.
Path("../final/ml_models").mkdir(parents=True, exist_ok=True)

# The archive is read for two arrays, "inputs" and "outputs"; both are cast to
# float64 before any further processing.
input_and_output = np.load("../final/dataset.npz")
inputs  = input_and_output["inputs"].astype(np.float64)
outputs = input_and_output["outputs"].astype(np.float64)

print(f"inputs shape: {inputs.shape} dtype: {inputs.dtype}, outputs shape: {outputs.shape}, dtype: {outputs.dtype}")
# Number of samples (rows); used below to size the train/test split.
dataset_size = inputs.shape[0]

inputs shape: (1000000, 11) dtype: float64, outputs shape: (1000000, 200), dtype: float64


In [2]:
# In this problem we are interested in the order of magnitude rather than the absolute value of the tumour size.
# To train the models the output is first transformed with log_10. When the tumour size is smaller than 10^-9 L,
# there is no way to find it, so the transformed tumour size is bounded from below by log10(10^-9) = -9.

LOWER_LIMIT = -9

def output_transform(outputs: np.ndarray) -> np.ndarray:
    """Map tumour sizes to their base-10 order of magnitude, clamped at ``LOWER_LIMIT``.

    Args:
        outputs: array of tumour sizes; it is copied, never modified in place.

    Returns:
        A new array of ``log10(outputs)`` in which every entry that is
        non-positive in ``outputs`` or whose logarithm falls below
        ``LOWER_LIMIT`` is set to ``LOWER_LIMIT``.
    """
    x = np.copy(outputs)
    non_positive = x <= 0
    # log10 is undefined for values <= 0: substitute a harmless 1 here and
    # overwrite those positions with the lower bound right after.
    x[non_positive] = 1
    y = np.log10(x)
    y[non_positive] = LOWER_LIMIT
    y[y < LOWER_LIMIT] = LOWER_LIMIT
    return y
    
def output_untransform(transformed_outputs: np.array) -> np.array:
    """Invert ``output_transform``: raise 10 to each value, mapping clamped entries to 0.

    Entries that are less than or equal to ``LOWER_LIMIT`` come back as 0;
    everything else comes back as ``10 ** value``. The argument itself is not
    modified.
    """
    restored = 10 ** transformed_outputs
    # Anything sitting at (or below) the lower bound is treated as "no tumour".
    restored[transformed_outputs <= LOWER_LIMIT] = 0
    return restored

# Quick sanity check of the transform pair on a handful of hand-picked sizes,
# including values at/below the lower bound and an exact zero.
test_output = np.array([10**(-8), 10**(-5), 10**(-10), 10**(-9),1,2,10,0.0])

print(f"test output {test_output}")
transformed_test_output = output_transform(test_output)
print(f"transformed output: {transformed_test_output}")
print(f"original output is untouched after transform: {test_output}")
round_trip_test_output = output_untransform(transformed_test_output)
print(f"transformed and untransformed output: {round_trip_test_output}")

# Training targets: the full output array expressed as orders of magnitude.
outputs_order_of_magnitude = output_transform(outputs)

test output [1.e-08 1.e-05 1.e-10 1.e-09 1.e+00 2.e+00 1.e+01 0.e+00]
transformed output: [-8.      -5.      -9.      -9.       0.       0.30103  1.      -9.     ]
original output is untouched after transform: [1.e-08 1.e-05 1.e-10 1.e-09 1.e+00 2.e+00 1.e+01 0.e+00]
transformed and untransformed output: [1.e-08 1.e-05 0.e+00 0.e+00 1.e+00 2.e+00 1.e+01 0.e+00]


In [3]:
# Input data has an extra column with all ones - we get rid of it before training the model

def drop_treatment(input_data: np.ndarray) -> np.ndarray:
    """Return ``input_data`` without its first (treatment) column.

    Only arrays with exactly 11 columns are trimmed; any other width is
    returned unchanged, so calling the function on already-trimmed data is a
    no-op.
    """
    has_treatment_column = input_data.shape[1] == 11
    return input_data[:, 1:] if has_treatment_column else input_data

# Feature matrix used by all later cells: `inputs` after `drop_treatment`.
input_without_treatment = drop_treatment(inputs)

In [4]:
# Splitting dataset into train, test, validate subsets.
# The first 70% of the rows become the training set and the following 15% the
# test set; the rows after that are not sliced in this cell.
train_size = int(dataset_size * 0.7)
test_size = int(dataset_size * 0.15)
test_end = train_size + test_size

X_train, Y_train = (
    input_without_treatment[:train_size, :],
    outputs_order_of_magnitude[:train_size, :],
)
print(f"train sizes: {X_train.shape}, {Y_train.shape}")

X_test, Y_test = (
    input_without_treatment[train_size:test_end, :],
    outputs_order_of_magnitude[train_size:test_end, :],
)
print(f"test sizes: {X_test.shape}, {Y_test.shape}")

train sizes: (700000, 10), (700000, 200)
test sizes: (150000, 10), (150000, 200)


In [5]:
# scaling inputs
import pickle
from pathlib import Path

from sklearn.preprocessing import MinMaxScaler


# Indices of the input columns that get whisker-based scaling instead of the
# plain min-max scaling applied to every other column.
LOGNORMAL_PARAMETERS = (1, 2)

class CustomScaler:
    """Min-max scaler with a box-plot-whisker override for selected columns.

    All columns go through a wrapped ``MinMaxScaler``. The columns listed in
    ``LOGNORMAL_PARAMETERS`` are then re-scaled so that the lowest and highest
    training values lying within 1.5 IQR of the quartiles map to 0 and 1.
    Values outside that range are not clipped and may fall outside [0, 1].
    """

    def __init__(self):
        super().__init__()
        self.scaler = MinMaxScaler()
        column_count = len(LOGNORMAL_PARAMETERS)
        # Per-column whisker ends; identity scaling until `fit` is called.
        self.plot_loval = [0.0] * column_count
        self.plot_hival = [1.0] * column_count

    def transform(self, x: np.ndarray, copy=None) -> np.ndarray:
        """Scale ``x``; ``copy`` is accepted but unused."""
        scaled = self.scaler.transform(x)
        for slot, column in enumerate(LOGNORMAL_PARAMETERS):
            low, high = self.plot_loval[slot], self.plot_hival[slot]
            scaled[:, column] = (x[:, column] - low) / (high - low)

        return scaled

    def fit(self, x, copy=None):
        """Fit the wrapped scaler and the whisker ends on ``x``; returns ``self``."""
        self.scaler.fit(x)
        for slot, column in enumerate(LOGNORMAL_PARAMETERS):
            values = x[:, column]

            first_quartile, third_quartile = np.quantile(values, [0.25, 0.75], axis=0)
            whisker_reach = 1.5 * (third_quartile - first_quartile)
            lower_fence = first_quartile - whisker_reach
            upper_fence = third_quartile + whisker_reach

            # Whisker ends are the most extreme observed values inside the fences.
            self.plot_loval[slot] = np.min(values[values >= lower_fence])
            self.plot_hival[slot] = np.max(values[values <= upper_fence])

        return self

    def inverse_transform(self, x, copy=None):
        """Undo ``transform``; ``copy`` is accepted but unused."""
        restored = self.scaler.inverse_transform(x)
        for slot, column in enumerate(LOGNORMAL_PARAMETERS):
            low, high = self.plot_loval[slot], self.plot_hival[slot]
            restored[:, column] = x[:, column] * (high - low) + low
        return restored

# Reuse the scaler cached on disk when there is one; otherwise fit a new one on
# the training inputs and cache it for later runs.
# NOTE: unpickling executes arbitrary code - only load cache files you created yourself.
scaler_path = Path("../final/scaler.pickle")
if not scaler_path.exists():
    scaler = CustomScaler().fit(X_train)
    with scaler_path.open("wb") as opened_file:
        pickle.dump(scaler, opened_file)
else:
    with scaler_path.open("rb") as scaler_file:
        scaler = pickle.load(scaler_file)

# Both subsets are scaled with the same scaler.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# applying pca to outputs
from sklearn.decomposition import PCA


# Number of principal components kept for the outputs; it is also part of the
# cache file name, so each component count gets its own cache file.
PCA_COMPONENTS = 12

pca_path = Path(f"../final/pca{PCA_COMPONENTS}.pickle")

if not pca_path.exists():
    # No cache yet: fit on the training outputs and store the fitted PCA.
    pca = PCA(n_components=PCA_COMPONENTS)
    Y_train_pca = pca.fit_transform(Y_train)
    with pca_path.open("wb") as opened_file:
        pickle.dump(pca, opened_file)
else:
    # NOTE: unpickling executes arbitrary code - only load cache files you created yourself.
    with pca_path.open("rb") as opened_file:
        pca = pickle.load(opened_file)
    Y_train_pca = pca.transform(Y_train)

print(pca.explained_variance_ratio_)

[9.63422836e-01 3.45880317e-02 1.13584299e-03 7.00114229e-04
 8.85658857e-05 3.47212159e-05 1.46535285e-05 6.29722232e-06
 3.64216009e-06 2.04051841e-06 1.09718389e-06 6.53377990e-07]


In [7]:
# Example of limiting the running time of a block of code by scheduling a SIGALRM and installing a handler for it.
import time
import optuna
import os
import signal

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline


def timeout_handler(*args):
    """Signal handler that converts a delivered signal into a ``TimeoutError``.

    The handler arguments are embedded in the exception message.
    """
    message = f"timeout {args}"
    raise TimeoutError(message)

def keyboard_interrupt_handler(*args):
    """Signal handler that sends SIGINT to the current process.

    The handler arguments are ignored.
    """
    current_pid = os.getpid()
    os.kill(current_pid, signal.SIGINT)
    
# Route SIGALRM to `timeout_handler`, so an expired alarm raises TimeoutError.
signal.signal(signal.SIGALRM, timeout_handler)

# Case 1: the work (1 s sleep) finishes before the 2 s alarm, which is then
# cancelled with `signal.alarm(0)`.
try:
    signal.alarm(2)
    time.sleep(1)
    signal.alarm(0)
    print(f"1. no exception")
except TimeoutError as e:
    print(f"1. message: {e}") 
except KeyboardInterrupt as e:
    print(f"1. KEYBOARD: {e}") 

# Case 2: the work (5 s sleep) outlasts the 2 s alarm, so the handler's
# TimeoutError interrupts the sleep.
try:
    signal.alarm(2)
    time.sleep(5)
    signal.alarm(0)
    print(f"2. no exception")
except TimeoutError as e:
    print(f"2. message: {e}")
except KeyboardInterrupt as e:
    print(f"2. keyboard: {e}") 

1. no exception
2. message: timeout (14, <frame at 0x12de859a0, file '/var/folders/xz/pys3pfc567s9bszzmlt87ytm0000gn/T/ipykernel_92947/3465348698.py', line 35, code <module>>)


In [8]:
# Hyperparameter optimization for neural network

from sklearn.neural_network import MLPRegressor

# MLPRegressor settings shared by every trial of the study below.
common_params = dict(
    tol=1e-5,
    n_iter_no_change=5,
    random_state=42,
    warm_start=False,
)

def objective(trial):
    """Optuna objective for the "MLPRegressor" study.

    Trains an ``MLPRegressor`` on a phase-dependent prefix of the scaled
    training set, predicts the test set in PCA space, maps the prediction back
    with ``pca.inverse_transform`` and scores it against ``Y_test``.

    Args:
        trial: the Optuna trial supplying the hyperparameters.

    Returns:
        Mean squared error on the test set.

    Raises:
        optuna.exceptions.TrialPruned: if a ``TimeoutError`` is raised while
            the trial runs.

    Side effects:
        Records ``training_size``, ``max_duration_s`` and ``model_params`` as
        user attributes, and pickles the model to
        ``../final/ml_models/<study name>`` when it beats the study's best value.
    """
    # Every 30 trials move to the next phase: more data, a longer time budget,
    # more iterations and larger batches. The last phase is reused afterwards.
    training_sizes = (0.03, 0.1, 0.3, 1.0)
    training_max_duration_s = (300, 600, 1200, 3600)
    phase = min(trial.number // 30, len(training_sizes) - 1)
    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]

    learning_rate = trial.suggest_categorical("learning_rate", ["constant", "invscaling", "adaptive"])
    model_params = {
        **common_params,
        "max_iter": [1000, 2000, 4000, 8000][phase],
        "batch_size": [500, 500, 1000, 2000][phase],

        "alpha": trial.suggest_float("alpha", 0.001, 1.0, log=True),
        "learning_rate": learning_rate,
        "learning_rate_init": trial.suggest_float("learning_rate_init", 0.0001, 0.05, log=True),
        # power_t is only searched for the "invscaling" schedule; 0.5 otherwise.
        "power_t": trial.suggest_float("power_t", 0.1, 2.0, log=True) if learning_rate == "invscaling" else 0.5,

        "hidden_layer_sizes": [
            trial.suggest_int("layer1/3", 600, 1200, step=200),
            trial.suggest_int("layer2/3", 50, 400, step=50),
            trial.suggest_int("layer3/3", 10, 50, step=10),
        ],
    }

    trial.set_user_attr("training_size", training_size)
    trial.set_user_attr("max_duration_s", max_duration_s)
    trial.set_user_attr("model_params", model_params)

    model = MLPRegressor(**model_params)

    try:
        signal.alarm(max_duration_s)
        try:
            model.fit(X_train_scaled[:training_size,:], Y_train_pca[:training_size,:])
        finally:
            # Always cancel the alarm, even when fit fails, so a stale alarm
            # cannot fire later in unrelated code.
            signal.alarm(0)
        Y_predict_pca = model.predict(X_test_scaled)
        Y_predict = pca.inverse_transform(Y_predict_pca)
        error = mean_squared_error(Y_test, Y_predict)
        try:
            best_value = trial.study.best_value
        except ValueError:
            # No trial has completed yet, so there is no best value to beat.
            best_value = float('inf')
        if error < best_value:
            with Path(f"../final/ml_models/{trial.study.study_name}").open("wb") as opened_file:
                pickle.dump(model, opened_file)

    except TimeoutError:
        raise optuna.exceptions.TrialPruned()
    return error

# Create the "MLPRegressor" study in the shared SQLite storage, or reopen it.
study = optuna.create_study(study_name="MLPRegressor", storage='sqlite:///../final/optuna.db', load_if_exists=True)
# Only run the trials still missing to reach 100 recorded trials.
trials_due = 100 - len(study.trials)
if trials_due > 0:
    # While this study runs, SIGALRM is routed to `keyboard_interrupt_handler`,
    # so an expired alarm arrives as SIGINT instead of raising TimeoutError
    # (presumably so MLPRegressor.fit stops training early rather than failing
    # - TODO confirm).
    previous_handler = signal.signal(signal.SIGALRM, keyboard_interrupt_handler)
    try:
        study.optimize(objective, n_trials=trials_due)
    finally:
        # Restore the previous handler even if the optimisation is interrupted.
        signal.signal(signal.SIGALRM, previous_handler)

print(f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}")

[32m[I 2023-03-09 12:11:07,504][0m Using an existing study with name 'MLPRegressor' instead of creating a new one.[0m


model: MLPRegressor test dataset error: 8.987444574725626e-05 best_params: {'alpha': 0.0029879519050999255, 'layer1/3': 800, 'layer2/3': 350, 'layer3/3': 30, 'learning_rate': 'adaptive', 'learning_rate_init': 0.00035901468189128335}


In [9]:
# Test how the neural network performs with a 10-minute training cap on the full dataset
from sklearn.neural_network import MLPRegressor

for learning_rate in ["constant", "invscaling", "adaptive"]:
    # One study per learning-rate schedule; every trial trains on the full
    # training set with a 600 s alarm.
    max_duration_s = 600
    common_params={
        "tol": 5e-6,
        "n_iter_no_change": 10,
        "random_state": 42,
        "warm_start": False,
        "max_iter": 10000,
        "batch_size": 10000,
        "learning_rate": learning_rate
    }

    def objective(trial):
        """Optuna objective: test-set MSE of an MLPRegressor trained on the full training set.

        Reads ``learning_rate``, ``common_params`` and ``max_duration_s`` from
        the enclosing loop iteration. Raises ``optuna.exceptions.TrialPruned``
        if a ``TimeoutError`` is raised while the trial runs.
        """
        alpha = trial.suggest_float("alpha", 0.0001, 1.0, log=True)
        learning_rate_init = trial.suggest_float("learning_rate_init", 0.0001, 0.05, log=True)
        # power_t is only searched for the "invscaling" schedule; 0.5 otherwise.
        power_t = trial.suggest_float("power_t", 0.1, 2.0, log=True) if learning_rate == "invscaling" else 0.5

        model_params = {
            "alpha": alpha,
            "learning_rate_init": learning_rate_init,
            "power_t": power_t,

            "hidden_layer_sizes": [
                trial.suggest_int("layer1/3", 600, 1200, step=200),
                trial.suggest_int("layer2/3", 50, 400, step=50),
                trial.suggest_int("layer3/3", 10, 50, step=10),
            ],
            **common_params
        }

        trial.set_user_attr("model_params", model_params)
        trial.set_user_attr("training_size", 1.0)
        trial.set_user_attr("max_duration_s", max_duration_s)

        model = MLPRegressor(**model_params)

        try:
            signal.alarm(max_duration_s)
            try:
                model.fit(X_train_scaled, Y_train_pca)
            finally:
                # Always cancel the alarm, even when fit fails, so a stale
                # alarm cannot fire later in unrelated code.
                signal.alarm(0)
            Y_predict_pca = model.predict(X_test_scaled)
            Y_predict = pca.inverse_transform(Y_predict_pca)
            error = mean_squared_error(Y_test, Y_predict)
        except TimeoutError:
            raise optuna.exceptions.TrialPruned()
        return error

    study = optuna.create_study(study_name=f"MLPRegressor_{learning_rate}_{max_duration_s}", storage='sqlite:///../final/optuna.db', load_if_exists=True)
    # Only run the trials still missing to reach 40 recorded trials.
    trials_due = 40 - len(study.trials)
    if trials_due > 0:
        # An expired alarm is delivered as SIGINT while the study runs.
        previous_handler = signal.signal(signal.SIGALRM, keyboard_interrupt_handler)
        try:
            study.optimize(objective, n_trials=trials_due)
        finally:
            # Restore the previous handler even if the optimisation is interrupted.
            signal.signal(signal.SIGALRM, previous_handler)

    print(f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}")

[32m[I 2023-03-09 12:11:07,699][0m Using an existing study with name 'MLPRegressor_constant_600' instead of creating a new one.[0m
[32m[I 2023-03-09 12:11:07,820][0m Using an existing study with name 'MLPRegressor_invscaling_600' instead of creating a new one.[0m


model: MLPRegressor_constant_600 test dataset error: 7.95338600625236e-05 best_params: {'alpha': 0.0021976121677802214, 'layer1/3': 800, 'layer2/3': 250, 'layer3/3': 20, 'learning_rate_init': 0.0025086281095413575}


[32m[I 2023-03-09 12:11:07,939][0m Using an existing study with name 'MLPRegressor_adaptive_600' instead of creating a new one.[0m


model: MLPRegressor_invscaling_600 test dataset error: 8.527386328374639e-05 best_params: {'alpha': 0.0037839705738818854, 'layer1/3': 1200, 'layer2/3': 50, 'layer3/3': 40, 'learning_rate_init': 0.001572221328811536, 'power_t': 1.529276287774073}
model: MLPRegressor_adaptive_600 test dataset error: 0.0001240866740246993 best_params: {'alpha': 0.006777069662715181, 'layer1/3': 800, 'layer2/3': 350, 'layer3/3': 20, 'learning_rate_init': 0.001549679834450489}


In [13]:
# Linear regression with combined L1 and L2 priors as regularizer
from sklearn.linear_model import ElasticNet

# ElasticNet settings shared by every trial of the study below.
common_params = dict(random_state=42, tol=1e-5)

# Upper bound of the searched polynomial feature degree.
MAX_POLYNOMIAL_DEGREE = 3

def objective(trial):
    """Optuna objective for the "ElasticNet" study.

    Fits a ``PolynomialFeatures`` + ``ElasticNet`` pipeline on a
    phase-dependent prefix of the scaled training set and scores its
    PCA-inverse-transformed prediction against ``Y_test``.

    Returns:
        Mean squared error on the test set.

    Raises:
        optuna.exceptions.TrialPruned: if the alarm expires (``TimeoutError``)
            during fitting or evaluation.
    """
    # Every 30 trials move to the next phase (more data, longer budget, more
    # iterations); the index is capped at the last defined phase.
    training_sizes = (0.03, 0.1, 0.3, 1.0)
    training_max_duration_s = (240, 360, 600, 1200)
    phase = min(trial.number // 30, len(training_sizes) - 1)

    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]
    max_iter = [20000, 40000, 80000, 160000][phase]

    model_params = {
        **common_params,
        "max_iter": max_iter,
        "l1_ratio": trial.suggest_float("l1_ratio", 0.0, 1.0),
        "alpha": trial.suggest_float("alpha", 0.00001, 1.0, log=True),
        "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False])
    }
    polynomial_degree = trial.suggest_int("polynomial degree", 1, MAX_POLYNOMIAL_DEGREE)

    trial.set_user_attr("training_size", training_size)
    trial.set_user_attr("max_duration_s", max_duration_s)
    trial.set_user_attr("model_params", model_params)
    trial.set_user_attr("polynomial degree", polynomial_degree)

    # `trial.user_attrs` is a dict-valued property, not a method, so the local
    # values are used directly instead of calling it.
    model = make_pipeline(
        PolynomialFeatures(polynomial_degree),
        ElasticNet(**model_params)
    )

    try:
        signal.alarm(max_duration_s)
        model.fit(X_train_scaled[:training_size,:], Y_train_pca[:training_size,:])
        Y_predict_pca = model.predict(X_test_scaled)
        Y_predict = pca.inverse_transform(Y_predict_pca)
        error = mean_squared_error(Y_test, Y_predict)
    except TimeoutError:
        raise optuna.exceptions.TrialPruned()
    finally:
        # Cancel the alarm on every exit path so it cannot fire in a later trial.
        signal.alarm(0)
    return error

# Create the "ElasticNet" study in the shared SQLite storage, or reopen it if it already exists.
study = optuna.create_study(study_name="ElasticNet", storage='sqlite:///../final/optuna.db', load_if_exists=True)
# Only run the trials still missing to reach 100 recorded trials.
trials_due = 100 - len(study.trials)
if trials_due > 0:
    study.optimize(objective, n_trials=trials_due)
    
print(f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}")

[32m[I 2023-03-09 12:11:08,618][0m Using an existing study with name 'ElasticNet' instead of creating a new one.[0m


model: ElasticNet test dataset error: 0.06892972685134742 best_params: {'alpha': 5.98103619625795e-05, 'fit_intercept': True, 'l1_ratio': 0.5905390879114973, 'polynomial degree': 3}


In [14]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

def objective(trial):
    """Optuna objective for the "HistGradientBoostingRegressor" study.

    Fits one ``HistGradientBoostingRegressor`` per PCA component (via
    ``MultiOutputRegressor``) on a phase-dependent prefix of the scaled
    training set and scores the PCA-inverse-transformed prediction against
    ``Y_test``.

    Returns:
        Mean squared error on the test set.

    Raises:
        optuna.exceptions.TrialPruned: if fitting or evaluation raises any
            ``Exception`` (including the alarm's ``TimeoutError``); the error
            is printed first.
    """
    trial_params = {}
    # poisson requires y > 0 which is not true in this case
    trial_params["loss"] = trial.suggest_categorical("loss", ["squared_error", "absolute_error", "quantile"])

    if trial_params["loss"] == "quantile":
        trial_params["quantile"] = trial.suggest_float("quantile", 0, 1)

    trial_params["learning_rate"] = trial.suggest_float("learning_rate", 0.00001, 0.5, log=True)
    trial_params["max_iter"] = trial.suggest_int("max_iter", 20, 200, step=20)
    if trial.suggest_categorical("regularize", [True, False]):
        trial_params["l2_regularization"] = trial.suggest_float("l2_regularization", 0.00001, 1.0, log=True)

    model_params = {
        "random_state": 42,
        **trial_params
    }
    trial.set_user_attr("model_params", model_params)

    # `trial.user_attrs` is a dict-valued property, not a method, so the local
    # dict is used directly instead of calling it.
    model = MultiOutputRegressor(HistGradientBoostingRegressor(**model_params))

    # Every 30 trials move to the next phase (more data, longer budget); the
    # index is capped at the last defined phase.
    training_sizes = (0.03, 0.1, 0.3, 1.0)
    training_max_duration_s = (240, 360, 600, 1200)
    phase = min(trial.number // 30, len(training_sizes) - 1)

    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]

    trial.set_user_attr("training_size", training_size)
    trial.set_user_attr("max_duration_s", max_duration_s)

    try:
        signal.alarm(max_duration_s)
        model.fit(X_train_scaled[:training_size,:], Y_train_pca[:training_size,:])
        Y_predict_pca = model.predict(X_test_scaled)
        Y_predict = pca.inverse_transform(Y_predict_pca)
        error = mean_squared_error(Y_test, Y_predict)
    except Exception as e:
        # KeyboardInterrupt is not an Exception subclass, so it still
        # propagates and stops the study.
        print(e)
        raise optuna.exceptions.TrialPruned()
    finally:
        # Cancel the alarm on every exit path so it cannot fire in a later trial.
        signal.alarm(0)
    return error

# Create the "HistGradientBoostingRegressor" study in the shared SQLite storage, or reopen it if it already exists.
study = optuna.create_study(study_name="HistGradientBoostingRegressor", storage='sqlite:///../final/optuna.db', load_if_exists=True)
# Only run the trials still missing to reach 100 recorded trials.
trials_due = 100 - len(study.trials)
if trials_due > 0:
    study.optimize(objective, n_trials=trials_due)

print(f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}")

[32m[I 2023-03-09 12:11:08,809][0m Using an existing study with name 'HistGradientBoostingRegressor' instead of creating a new one.[0m


model: HistGradientBoostingRegressor test dataset error: 0.008577411300669628 best_params: {'l2_regularization': 0.001871946506788565, 'learning_rate': 0.1651977983535292, 'loss': 'squared_error', 'max_iter': 160, 'regularize': True}


In [15]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import BaggingRegressor

def objective(trial):
    """Optuna objective for the "BaggingRegressor" study.

    Fits one ``BaggingRegressor`` per PCA component (via
    ``MultiOutputRegressor``) on a phase-dependent prefix of the scaled
    training set and scores the PCA-inverse-transformed prediction against
    ``Y_test``.

    Returns:
        Mean squared error on the test set.

    Raises:
        optuna.exceptions.TrialPruned: if fitting or evaluation raises any
            ``Exception`` (including the alarm's ``TimeoutError``); the error
            is printed first.
    """
    # Every 30 trials move to the next phase (more data, longer budget); the
    # index is capped at the last defined phase.
    training_sizes = (0.1, 0.1, 0.3, 1.0)
    training_max_duration_s = (240, 360, 600, 1200)
    phase = min(trial.number // 30, len(training_sizes) - 1)

    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]

    trial_params = {}

    # The Optuna parameter feeding n_estimators is named "max_iter"; the name is
    # kept because the stored study's best_params already use it.
    trial_params["n_estimators"] = trial.suggest_int("max_iter", 10, 110, step=20)
    # max_samples is an absolute sample count in steps of 10000, bounded by the
    # current training size rounded down to a multiple of 10000.
    trial_params["max_samples"] = trial.suggest_int("max_samples", 10000, (training_size // 10000) * 10000, step=10000)
    trial_params["max_features"] = trial.suggest_int("max_features", 1, X_train_scaled.shape[1])
    trial_params["bootstrap"] = trial.suggest_categorical("bootstrap", [True, False])
    # oob_score is only searched when bootstrap sampling is enabled.
    if trial_params["bootstrap"]:
        trial_params["oob_score"] = trial.suggest_categorical("oob_score", [True, False])
    trial_params["bootstrap_features"] = trial.suggest_categorical("bootstrap_features", [True, False])

    model_params = {
        "n_jobs": -1,
        "random_state": 42,
        **trial_params
    }

    trial.set_user_attr("training_size", training_size)
    trial.set_user_attr("max_duration_s", max_duration_s)
    trial.set_user_attr("model_params", model_params)

    # `trial.user_attrs` is a dict-valued property, not a method, so the local
    # dict is used directly instead of calling it.
    model = MultiOutputRegressor(BaggingRegressor(**model_params))

    try:
        signal.alarm(max_duration_s)
        model.fit(X_train_scaled[:training_size,:], Y_train_pca[:training_size,:])
        Y_predict_pca = model.predict(X_test_scaled)
        Y_predict = pca.inverse_transform(Y_predict_pca)
        error = mean_squared_error(Y_test, Y_predict)
    except Exception as e:
        # KeyboardInterrupt is not an Exception subclass, so it still
        # propagates and stops the study.
        print(e)
        raise optuna.exceptions.TrialPruned()
    finally:
        # Cancel the alarm on every exit path so it cannot fire in a later trial.
        signal.alarm(0)
    return error

# try:
#     optuna.delete_study(study_name="BaggingRegressor", storage='sqlite:///../final/optuna.db')
# except:
#     pass

# Create the "BaggingRegressor" study in the shared SQLite storage, or reopen it if it already exists.
study = optuna.create_study(study_name="BaggingRegressor", storage='sqlite:///../final/optuna.db', load_if_exists=True)
# Only run the trials still missing to reach 100 recorded trials.
trials_due = 100 - len(study.trials)
if trials_due > 0:
    study.optimize(objective, n_trials=trials_due)
    
print(f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}")

[32m[I 2023-03-09 12:11:08,985][0m Using an existing study with name 'BaggingRegressor' instead of creating a new one.[0m


model: BaggingRegressor test dataset error: 0.012123721671364409 best_params: {'bootstrap': True, 'bootstrap_features': False, 'max_features': 10, 'max_iter': 50, 'max_samples': 320000, 'oob_score': False}


In [16]:
from sklearn.ensemble import RandomForestRegressor

def objective(trial):
    """Optuna objective for the "RandomForrest" study.

    Fits a ``RandomForestRegressor`` on a phase-dependent prefix of the scaled
    training set and scores the PCA-inverse-transformed prediction against
    ``Y_test``.

    Returns:
        Mean squared error on the test set.

    Raises:
        optuna.exceptions.TrialPruned: if fitting or evaluation raises any
            ``Exception`` (including the alarm's ``TimeoutError``); the error
            is printed first.
    """
    # Every 30 trials move to the next phase (more data, longer budget); the
    # index is capped at the last defined phase.
    training_sizes = (0.01, 0.03, 0.1, 0.3, 1.0)
    training_max_duration_s = (120, 180, 360, 600, 1200)
    phase = min(trial.number // 30, len(training_sizes) - 1)
    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]

    model_params = {
      "n_estimators": trial.suggest_int("n_estimators", 10, 200, step=10),
      "criterion": trial.suggest_categorical("criterion", ["squared_error", "absolute_error", "friedman_mse"]),
      "min_samples_split": trial.suggest_int("min_samples_split", 2, 5),
      "bootstrap": trial.suggest_categorical("bootstrap", [True]),
      # NOTE(review): a sampled value of exactly 0.0 would presumably be
      # rejected by scikit-learn - confirm the intended lower bound.
      "max_samples": trial.suggest_float("max_samples", 0.0, 0.2),
      "max_features": trial.suggest_int("max_features", 1, X_train_scaled.shape[1] // 2),
      "n_jobs": -1,
      "random_state": 42
    }

    trial.set_user_attr("training_size", training_size)
    trial.set_user_attr("max_duration_s", max_duration_s)
    trial.set_user_attr("model_params", model_params)

    # `trial.user_attrs` is a dict-valued property, not a method, so the local
    # dict is used directly instead of calling it.
    model = RandomForestRegressor(**model_params)

    try:
        signal.alarm(max_duration_s)
        model.fit(X_train_scaled[:training_size,:], Y_train_pca[:training_size,:])
        Y_predict_pca = model.predict(X_test_scaled)
        Y_predict = pca.inverse_transform(Y_predict_pca)
        error = mean_squared_error(Y_test, Y_predict)
    except Exception as e:
        # Report why the trial is pruned instead of swallowing the error.
        # KeyboardInterrupt is not an Exception subclass, so it still
        # propagates and stops the study.
        print(e)
        raise optuna.exceptions.TrialPruned()
    finally:
        # Cancel the alarm on every exit path so it cannot fire in a later trial.
        signal.alarm(0)
    return error

# Create the "RandomForrest" study in the shared SQLite storage, or reopen it if it already exists.
study = optuna.create_study(study_name="RandomForrest", storage='sqlite:///../final/optuna.db', load_if_exists=True)
# Only run the trials still missing to reach 100 recorded trials.
trials_due = 100 - len(study.trials)
if trials_due > 0:
    study.optimize(objective, n_trials=trials_due)

print(f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}")

[32m[I 2023-03-09 12:11:09,283][0m Using an existing study with name 'RandomForrest' instead of creating a new one.[0m


model: RandomForrest test dataset error: 0.0276166738609489 best_params: {'bootstrap': True, 'criterion': 'friedman_mse', 'max_features': 4, 'max_samples': 0.1753336890830068, 'min_samples_split': 2, 'n_estimators': 60}


In [17]:
from sklearn.ensemble import ExtraTreesRegressor

def objective(trial):
    """Optuna objective for the "ExtraTreesRegressor" study.

    Fits an ``ExtraTreesRegressor`` on a phase-dependent prefix of the scaled
    training set and scores the PCA-inverse-transformed prediction against
    ``Y_test``.

    Returns:
        Mean squared error on the test set.

    Raises:
        optuna.exceptions.TrialPruned: if fitting or evaluation raises any
            ``Exception`` (including the alarm's ``TimeoutError``); the error
            is printed first.
    """
    # Every 30 trials move to the next phase (more data, longer budget); the
    # index is capped at the last defined phase.
    training_sizes = (0.01, 0.03, 0.1, 0.3, 1.0)
    training_max_duration_s = (120, 180, 360, 600, 1200)
    phase = min(trial.number // 30, len(training_sizes) - 1)
    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]
    trial.set_user_attr("training_size", training_size)
    trial.set_user_attr("max_duration_s", max_duration_s)

    model_params = {
      "n_estimators": trial.suggest_int("n_estimators", 10, 200),
      "criterion": trial.suggest_categorical("criterion", ["squared_error", "absolute_error", "friedman_mse"]),
      "min_samples_split": trial.suggest_int("min_samples_split", 2, 5),
      "bootstrap": trial.suggest_categorical("bootstrap", [True]),
      # NOTE(review): a sampled value of exactly 0.0 would presumably be
      # rejected by scikit-learn - confirm the intended lower bound.
      "max_samples": trial.suggest_float("max_samples", 0.0, 0.2),
      "max_features": trial.suggest_int("max_features", 1, X_train_scaled.shape[1] // 2),
      "n_jobs": -1,
      "random_state": 42
    }
    trial.set_user_attr("model_params", model_params)

    # `trial.user_attrs` is a dict-valued property, not a method, so the local
    # dict is used directly instead of calling it.
    model = ExtraTreesRegressor(**model_params)

    try:
        signal.alarm(max_duration_s)
        model.fit(X_train_scaled[:training_size,:], Y_train_pca[:training_size,:])
        Y_predict_pca = model.predict(X_test_scaled)
        Y_predict = pca.inverse_transform(Y_predict_pca)
        error = mean_squared_error(Y_test, Y_predict)
    except Exception as e:
        # Report why the trial is pruned instead of swallowing the error.
        # KeyboardInterrupt is not an Exception subclass, so it still
        # propagates and stops the study.
        print(e)
        raise optuna.exceptions.TrialPruned()
    finally:
        # Cancel the alarm on every exit path so it cannot fire in a later trial.
        signal.alarm(0)
    return error

# Create the "ExtraTreesRegressor" study in the shared SQLite storage, or reopen it if it already exists.
study = optuna.create_study(study_name="ExtraTreesRegressor", storage='sqlite:///../final/optuna.db', load_if_exists=True)
# Only run the trials still missing to reach 100 recorded trials.
trials_due = 100 - len(study.trials)
if trials_due > 0:
    study.optimize(objective, n_trials=trials_due)
    
print(f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}")

[32m[I 2023-03-09 12:11:09,480][0m Using an existing study with name 'ExtraTreesRegressor' instead of creating a new one.[0m


model: ExtraTreesRegressor test dataset error: 0.035638099610715714 best_params: {'bootstrap': True, 'criterion': 'friedman_mse', 'max_features': 5, 'max_samples': 0.18501919599037991, 'min_samples_split': 2, 'n_estimators': 182}


In [18]:
from sklearn.neighbors import KNeighborsRegressor

# KNeighborsRegressor settings shared by every trial of the study below.
common_params = dict(n_jobs=-1)

def objective(trial):
    """Optuna objective for the "KNeighborsRegressor" study.

    Fits a ``KNeighborsRegressor`` on a phase-dependent prefix of the scaled
    training set and scores the PCA-inverse-transformed prediction against
    ``Y_test``.

    Returns:
        Mean squared error on the test set.

    Raises:
        optuna.exceptions.TrialPruned: if fitting or evaluation raises any
            ``Exception`` (including the alarm's ``TimeoutError``); the error
            is printed first.
    """
    # Every 30 trials move to the next phase (more data, longer budget); the
    # index is capped at the last defined phase.
    training_sizes = (0.01, 0.03, 0.1, 0.3, 1.0)
    training_max_duration_s = (120, 180, 360, 600, 1200)
    phase = min(trial.number // 30, len(training_sizes) - 1)
    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]

    trial_params = {}
    trial_params["n_neighbors"] = trial.suggest_int("n_neighbors", 1, 100)
    trial_params["weights"] = trial.suggest_categorical("weights", ["uniform", "distance"])
    trial_params["algorithm"] = trial.suggest_categorical("algorithm", ["ball_tree", "kd_tree", "brute"])
    # leaf_size is only searched for the tree-based algorithms.
    if trial_params["algorithm"] != "brute":
        trial_params["leaf_size"] = trial.suggest_int("leaf_size", 10, 50)
    trial_params["p"] = trial.suggest_int("p", 1, 5)

    model_params = {
        **trial_params,
        **common_params
    }
    # Record the same bookkeeping attributes as the other studies' objectives.
    trial.set_user_attr("training_size", training_size)
    trial.set_user_attr("max_duration_s", max_duration_s)
    trial.set_user_attr("model_params", model_params)
    model = KNeighborsRegressor(**model_params)

    try:
        signal.alarm(max_duration_s)
        model.fit(X_train_scaled[:training_size,:], Y_train_pca[:training_size,:])
        Y_predict_pca = model.predict(X_test_scaled)
        Y_predict = pca.inverse_transform(Y_predict_pca)
        error = mean_squared_error(Y_test, Y_predict)
    except Exception as e:
        # KeyboardInterrupt is not an Exception subclass, so it still
        # propagates and stops the study.
        print(e)
        raise optuna.exceptions.TrialPruned()
    finally:
        # Cancel the alarm on every exit path so it cannot fire in a later trial.
        signal.alarm(0)
    return error

# Create the "KNeighborsRegressor" study in the shared SQLite storage, or reopen it if it already exists.
study = optuna.create_study(study_name="KNeighborsRegressor", storage='sqlite:///../final/optuna.db', load_if_exists=True)
# Only run the trials still missing to reach 100 recorded trials.
trials_due = 100 - len(study.trials)
if trials_due > 0:
    study.optimize(objective, n_trials=trials_due)
    
print(f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}")

[32m[I 2023-03-09 12:11:09,677][0m Using an existing study with name 'KNeighborsRegressor' instead of creating a new one.[0m


model: KNeighborsRegressor test dataset error: 0.21196498486521023 best_params: {'algorithm': 'kd_tree', 'leaf_size': 13, 'n_neighbors': 9, 'p': 3, 'weights': 'uniform'}
