# Train ML models

In this notebook we use the `./final/dataset.npz` dataset to train multiple ML models that serve as surrogate models. The comparison between the models is presented in the next notebook.

During the training process the dataset is split into train, test and validation sets of sizes `700000`, `150000` and `150000`.

List of tested methods (with links to used implementations):
1. [Neural Networks](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html)
2. Linear regression model:
    1. [Elastic Net](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html#sklearn.linear_model.ElasticNet) L1 and L2 regularization combined
3. Decision Trees models:
    1. [Boosting](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html)
    2. [Bagging](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html)
    3. [Random Forest](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)
    4. [Extra Trees](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html)
4. [k-nearest neighbors](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html)

For each ML method we use [Optuna](https://optuna.readthedocs.io) to find the best-performing set of parameters.

## Running the whole notebook may take a long time
During our experiments the run time was around 48 hours. To test whether the notebook works, set `RUN_ITERATIONS_PERCENT` to a small number (e.g. 3) to run only 3% of the iterations.


In [1]:
# Percentage of the planned Optuna trials to run for every study in this notebook.
# Pick a small number to test whether the notebook is working fine.
RUN_ITERATIONS_PERCENT = 100

In [2]:
# Loading dataset
import numpy as np
from pathlib import Path
import pickle

# Directory holding the dataset; the Optuna database lives next to it
DATA_PATH = Path("../final/").resolve()
SQLITE_DB = f"sqlite:///{DATA_PATH}/optuna.db"

(DATA_PATH / "ml_models").mkdir(parents=True, exist_ok=True)

# np.load on an .npz archive returns a lazy NpzFile that keeps the file open;
# the context manager closes it as soon as both arrays are materialised.
# copy=False skips a second full-size copy when the data is already float64.
with np.load(DATA_PATH / "dataset.npz") as input_and_output:
    inputs = input_and_output["inputs"].astype(np.float64, copy=False)
    outputs = input_and_output["outputs"].astype(np.float64, copy=False)

print(
    f"inputs shape: {inputs.shape} dtype: {inputs.dtype}, outputs shape: {outputs.shape}, dtype: {outputs.dtype}"
)
dataset_size = inputs.shape[0]

inputs shape: (1000000, 11) dtype: float64, outputs shape: (1000000, 200), dtype: float64


In [3]:
# In this problem we are interested in the order of magnitude rather than the absolute value of the tumour size.
# To train the models, the output is first transformed with log_10. When the tumour size is smaller than 10^-9 L,
# there is no way to find it, so we can limit the lower bound of the tumour size with 10^-9.
# NOTE(review): the text above says 10^-9, but LOWER_LIMIT below is -7 (i.e. 10^-7) - confirm which one is intended.

# Smallest log10 value kept by the transform; anything below is clamped to it
LOWER_LIMIT = -7


def output_transform(outputs: np.ndarray) -> np.ndarray:
    """Map raw outputs to log10 scale, clamped from below at ``LOWER_LIMIT``.

    Non-positive entries have no logarithm and are mapped to ``LOWER_LIMIT``
    directly. The input array is not modified.

    Args:
        outputs: array-like of raw output values.

    Returns:
        A new array with ``max(log10(x), LOWER_LIMIT)`` for positive ``x`` and
        ``LOWER_LIMIT`` everywhere else.
    """
    values = np.asarray(outputs)
    non_positive = values <= 0
    # Feed log10 a harmless 1 where the value is non-positive; those entries
    # are overwritten below, so no warning or -inf/NaN is ever produced.
    result = np.log10(np.where(non_positive, 1, values))
    np.maximum(result, LOWER_LIMIT, out=result)
    result[non_positive] = LOWER_LIMIT
    return result


def output_untransform(transformed_outputs: np.ndarray) -> np.ndarray:
    """Invert ``output_transform``: return ``10**y``, with clamped entries set to 0.

    Every entry at or below ``LOWER_LIMIT`` is treated as clamped and becomes 0.

    Args:
        transformed_outputs: array-like of log10-scale values.

    Returns:
        A new array of values on the original (linear) scale.
    """
    transformed = np.asarray(transformed_outputs)
    at_lower_limit = transformed <= LOWER_LIMIT
    # A float base keeps integer input working: NumPy refuses to raise an
    # integer to a negative integer power.
    restored = np.power(10.0, transformed)
    restored[at_lower_limit] = 0
    return restored


# Sanity check of the transform pair on a few hand-picked values,
# including ones below the lower limit and an exact zero
test_output = np.array([10 ** (-8), 10 ** (-5), 10 ** (-10), 10 ** (-9), 1, 2, 10, 0.0])

print(f"test output {test_output}")
print(f"transformed output: {output_transform(test_output)}")
# Printing the input again shows that the transform did not modify it in place
print(f"original output is untouched after transform: {test_output}")
print(
    f"transformed and untransformed output: {output_untransform(output_transform(test_output))}"
)

# Transformed outputs of the whole dataset; the train/test targets are sliced from this array
outputs_order_of_magnitude = output_transform(outputs)

test output [1.e-08 1.e-05 1.e-10 1.e-09 1.e+00 2.e+00 1.e+01 0.e+00]
transformed output: [-7.      -5.      -7.      -7.       0.       0.30103  1.      -7.     ]
original output is untouched after transform: [1.e-08 1.e-05 1.e-10 1.e-09 1.e+00 2.e+00 1.e+01 0.e+00]
transformed and untransformed output: [0.e+00 1.e-05 0.e+00 0.e+00 1.e+00 2.e+00 1.e+01 0.e+00]


In [4]:
# Input data has an extra column with all ones - that we get rid of before training the model


def drop_treatment(input_data: np.ndarray) -> np.ndarray:
    """Return ``input_data`` without its first column when it is 11 columns wide.

    Arrays of any other width are handed back unchanged (the same object).
    """
    column_count = input_data.shape[1]
    has_treatment_column = column_count == 11
    return input_data[:, 1:] if has_treatment_column else input_data


# Model inputs used from here on: the loaded inputs with the first column removed when they are 11 columns wide
input_without_treatment = drop_treatment(inputs)

In [5]:
# Split the samples in their stored order: the first 70% form the train subset,
# the following 15% the test subset. The remaining rows are not sliced here.
train_size = int(dataset_size * 0.7)
test_size = int(dataset_size * 0.15)

train_rows = slice(None, train_size)
test_rows = slice(train_size, train_size + test_size)

X_train, Y_train = (
    input_without_treatment[train_rows],
    outputs_order_of_magnitude[train_rows],
)
print(f"train sizes: {X_train.shape}, {Y_train.shape}")

X_test, Y_test = (
    input_without_treatment[test_rows],
    outputs_order_of_magnitude[test_rows],
)
print(f"test sizes: {X_test.shape}, {Y_test.shape}")

train sizes: (700000, 10), (700000, 200)
test sizes: (150000, 10), (150000, 200)


In [6]:
# scaling inputs
from custom_scaler import get_scaler

# get_scaler is a project-local helper that receives the data directory and the training inputs.
# NOTE(review): presumably it fits a scaler on X_train or loads a cached one from DATA_PATH - confirm in custom_scaler.
scaler = get_scaler(DATA_PATH, X_train)

# The same scaler object transforms both the train and the test inputs
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Compress the training targets with PCA; the fitted decomposition is cached on disk
from sklearn.decomposition import PCA


PCA_COMPONENTS = 16

pca_path = DATA_PATH / f"pca{PCA_COMPONENTS}.pickle"

if not pca_path.exists():
    # No cached decomposition yet: fit it on the training targets and store it
    pca = PCA(n_components=PCA_COMPONENTS)
    Y_train_pca = pca.fit_transform(Y_train)
    with pca_path.open("wb") as opened_file:
        pickle.dump(pca, opened_file)
else:
    # Reuse the decomposition pickled by an earlier run
    with pca_path.open("rb") as opened_file:
        pca = pickle.load(opened_file)
    Y_train_pca = pca.transform(Y_train)

print(pca.explained_variance_ratio_)
from functools import reduce

# Share of variance still unexplained after keeping the first 1, 2, ... components
unexplained = 1.0
unexplained_after_each_component = []
for component_ratio in pca.explained_variance_ratio_:
    unexplained = unexplained - component_ratio
    unexplained_after_each_component.append(unexplained)
print(unexplained_after_each_component)

[9.65866883e-01 3.10364783e-02 2.12998980e-03 6.91833518e-04
 1.62496763e-04 5.55698571e-05 2.52885361e-05 1.32508249e-05
 6.56014178e-06 3.97412630e-06 2.41222953e-06 1.56883509e-06
 1.00395773e-06 6.69877073e-07 4.67876722e-07 3.31892112e-07]
[0.03413311696416754, 0.003096638699825127, 0.0009666489008957665, 0.00027481538295305835, 0.0001123186198628594, 5.67487627470774e-05, 3.146022667338151e-05, 1.8209401804249853e-05, 1.1649260025332503e-05, 7.675133723264175e-06, 5.262904194529818e-06, 3.6940691008915433e-06, 2.6901113730976307e-06, 2.0202342996103437e-06, 1.5523575778771536e-06, 1.2204654663490287e-06]


In [8]:
# Example of limiting the running time of a loop by scheduling a SIGALRM and adding a handler for it.

import optuna
import time
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import psutil
import os
import signal


def timeout_handler(*args):
    """Signal handler that turns the received signal into a TimeoutError.

    The handler arguments are embedded in the exception message.
    """
    message = f"timeout {args}"
    raise TimeoutError(message)


def keyboard_interrupt_handler(*args):
    """Signal handler that sends SIGINT to the current process.

    The handler arguments are ignored.
    """
    own_pid = os.getpid()
    os.kill(own_pid, signal.SIGINT)


# From here on an expired alarm raises TimeoutError through timeout_handler
signal.signal(signal.SIGALRM, timeout_handler)

# Demo 1: the work (1 s) finishes before the alarm (2 s); alarm(0) cancels the pending alarm
try:
    signal.alarm(2)
    time.sleep(1)
    signal.alarm(0)
    print(f"1. no exception")
except TimeoutError as e:
    print(f"1. message: {e}")
except KeyboardInterrupt as e:
    print(f"1. KEYBOARD: {e}")

# Demo 2: the work (5 s) outlasts the alarm (2 s), so the handler interrupts the sleep
try:
    signal.alarm(2)
    time.sleep(5)
    signal.alarm(0)
    print(f"2. no exception")
except TimeoutError as e:
    print(f"2. message: {e}")
except KeyboardInterrupt as e:
    print(f"2. keyboard: {e}")

1. no exception
2. message: timeout (14, <frame at 0x7f70b67803e0, file '/tmp/ipykernel_377653/1902325977.py', line 35, code <module>>)


In [9]:
def save_best_model(model, error, trial):
    """Pickle ``model`` when ``error`` beats the best value of the trial's study.

    The file is written to ``DATA_PATH`` and named after the study, so each
    study keeps a single pickle that is overwritten on improvement.

    Args:
        model: fitted model to persist.
        error: objective value reached by ``model`` (lower is better).
        trial: Optuna trial; its study supplies the best value and the file name.
    """
    try:
        best_value = trial.study.best_value
    except ValueError:
        # Optuna raises ValueError while the study has no completed trial yet.
        # Catching only that keeps Ctrl+C and real storage errors visible.
        best_value = float("inf")
    if error < best_value:
        target = DATA_PATH / f"{trial.study.study_name}.pickle"
        print(f"Updating {trial.study.study_name}.pickle")
        with target.open("wb") as file_obj:
            pickle.dump(model, file_obj)


def mse_on_test_dataset(model):
    """Return the mean squared error of ``model`` on the test subset.

    The model predicts in PCA space from the scaled test inputs; the
    prediction is mapped back with the PCA inverse transform before it is
    compared with ``Y_test``.
    """
    return mean_squared_error(
        Y_test, pca.inverse_transform(model.predict(X_test_scaled))
    )

In [10]:
# Hyperparameter optimization for neural network

from sklearn.neural_network import MLPRegressor
from threadpoolctl import threadpool_limits


# MLPRegressor settings shared by every trial of the study below.
# The objective merges them into its per-trial parameters and overrides "random_state".
common_params = {
    "tol": 3e-6,
    "n_iter_no_change": 5,
    "random_state": 42,
    "warm_start": False,
    "batch_size": 10000,
    "max_iter": 5000,
}


def objective(trial):
    """Optuna objective: fit an MLPRegressor for one trial and return its test MSE.

    Trials are grouped into phases of 30; each phase trains on a larger prefix
    of the training set and gets a longer time budget. The budget is enforced
    with ``signal.alarm``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error on the test subset.

    Raises:
        optuna.exceptions.TrialPruned: when a TimeoutError interrupts the fit.
    """
    training_sizes = (0.05, 0.1, 0.3, 1.0)
    training_max_duration_s = (300, 600, 1200, 3600)
    # The last phase is reused for every later trial
    phase = min(trial.number // 30, len(training_sizes) - 1)
    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]

    # The schedule is fixed; sampling it was tried before:
    # learning_rate = trial.suggest_categorical("learning_rate", ["constant", "invscaling", "adaptive"])
    learning_rate = "constant"
    model_params = {
        **common_params,
        "alpha": trial.suggest_float("alpha", 0.001, 1.0, log=True),
        "learning_rate": learning_rate,
        "learning_rate_init": trial.suggest_float(
            "learning_rate_init", 0.0001, 0.1, log=True
        ),
        # power_t is only sampled for the "invscaling" schedule
        "power_t": trial.suggest_float("power_t", 0.1, 2.0, log=True)
        if learning_rate == "invscaling"
        else 0.5,
        "hidden_layer_sizes": [
            trial.suggest_int("layer1/3", 600, 1200, step=200),
            trial.suggest_int("layer2/3", 200, 600, step=50),
            trial.suggest_int("layer3/3", 10, 30, step=5),
        ],
        # Overrides the fixed seed coming from common_params
        "random_state": trial.suggest_categorical("random_state", list(range(0, 32))),
    }

    trial.set_user_attr("training_size", training_size)
    trial.set_user_attr("max_duration_s", max_duration_s)
    trial.set_user_attr("model_params", model_params)

    model = MLPRegressor(**trial.user_attrs["model_params"])

    try:
        signal.alarm(max_duration_s)
        try:
            with threadpool_limits(limits=8, user_api="blas"):
                model.fit(
                    X_train_scaled[:training_size, :], Y_train_pca[:training_size, :]
                )
        finally:
            # Cancel the alarm on every exit path, so a fit that fails for
            # another reason cannot leave a timer firing in unrelated code
            signal.alarm(0)
        error = mse_on_test_dataset(model)
        save_best_model(model, error, trial)
    except TimeoutError:
        raise optuna.exceptions.TrialPruned()
    return error


# Create the "MLPRegressor" study in the SQLite storage, or reopen it if it already exists
study = optuna.create_study(
    study_name="MLPRegressor",
    storage=SQLITE_DB,
    load_if_exists=True,
)
# Planned trial count (110, scaled by RUN_ITERATIONS_PERCENT) minus the trials already recorded
trials_due = 110 * RUN_ITERATIONS_PERCENT // 100 - len(study.trials)
if trials_due > 0:
    # NOTE(review): while this handler is installed an expired alarm presumably surfaces as
    # KeyboardInterrupt rather than TimeoutError - confirm that this is the intended behaviour.
    previous_handler = signal.signal(signal.SIGALRM, keyboard_interrupt_handler)
    try:
        study.optimize(objective, n_trials=trials_due)
    finally:
        # Restore the earlier handler even when optimize is interrupted,
        # so that later cells do not silently inherit this one
        signal.signal(signal.SIGALRM, previous_handler)

print(
    f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}"
)

[32m[I 2023-04-07 12:01:26,313][0m Using an existing study with name 'MLPRegressor' instead of creating a new one.[0m


model: MLPRegressor test dataset error: 8.987444574725626e-05 best_params: {'alpha': 0.0029879519050999255, 'layer1/3': 800, 'layer2/3': 350, 'layer3/3': 30, 'learning_rate': 'adaptive', 'learning_rate_init': 0.00035901468189128335}


In [11]:
# Test how the neural network performs with a training cap of TIMEOUT_SECONDS on the full dataset
# NOTE(review): this comment used to say "10 minute", but 300 seconds is 5 minutes - confirm the intended cap
TIMEOUT_SECONDS = 300

from sklearn.neural_network import MLPRegressor
from threadpoolctl import threadpool_limits

# One independent study per learning-rate schedule; every trial trains on the full
# training set and is capped at TIMEOUT_SECONDS.
for learning_rate in ["constant", "invscaling", "adaptive"]:
    # Settings shared by all trials of this schedule; the objective overrides
    # "batch_size" and "random_state" with sampled values
    common_params = {
        "tol": 1e-5,
        "n_iter_no_change": 5,
        "random_state": 42,
        "warm_start": False,
        "max_iter": 2000,
        "batch_size": 10000,
        "learning_rate": learning_rate,
    }

    def objective(trial):
        """Optuna objective: fit an MLPRegressor on the full training set.

        Returns the mean squared error on the test subset. Raises
        ``optuna.exceptions.TrialPruned`` when a TimeoutError interrupts the fit.
        """
        alpha = trial.suggest_float("alpha", 0.0001, 1.0, log=True)
        learning_rate_init = trial.suggest_float(
            "learning_rate_init", 0.0001, 0.05, log=True
        )
        # power_t is only sampled for the "invscaling" schedule
        power_t = (
            trial.suggest_float("power_t", 0.1, 2.0, log=True)
            if learning_rate == "invscaling"
            else 0.5
        )

        model_params = {
            **common_params,
            "alpha": alpha,
            "learning_rate_init": learning_rate_init,
            "power_t": power_t,
            "hidden_layer_sizes": [
                trial.suggest_int("layer1/3", 600, 1200, step=200),
                trial.suggest_int("layer2/3", 200, 600, step=50),
                trial.suggest_int("layer3/3", 10, 50, step=5),
            ],
            "batch_size": trial.suggest_int("batch_size", 10000, 25000, step=5000),
            "random_state": trial.suggest_categorical(
                "random_state", list(range(0, 32))
            ),
        }
        print(model_params)

        trial.set_user_attr("model_params", model_params)

        model = MLPRegressor(**trial.user_attrs["model_params"])
        trial.set_user_attr("training_size", 1.0)
        trial.set_user_attr("max_duration_s", TIMEOUT_SECONDS)

        try:
            signal.alarm(TIMEOUT_SECONDS)
            try:
                with threadpool_limits(limits=8, user_api="blas"):
                    model.fit(X_train_scaled, Y_train_pca)
            finally:
                # Cancel the alarm on every exit path, so a fit that fails for
                # another reason cannot leave a timer firing in unrelated code
                signal.alarm(0)
            error = mse_on_test_dataset(model)
            save_best_model(model, error, trial)
        except TimeoutError:
            raise optuna.exceptions.TrialPruned()
        return error

    study = optuna.create_study(
        study_name=f"MLPRegressor_{learning_rate}_{TIMEOUT_SECONDS}",
        storage=SQLITE_DB,
        load_if_exists=True,
    )
    # Planned trial count (40, scaled by RUN_ITERATIONS_PERCENT) minus the trials already recorded
    trials_due = 40 * RUN_ITERATIONS_PERCENT // 100 - len(study.trials)
    if trials_due > 0:
        # NOTE(review): while this handler is installed an expired alarm presumably surfaces as
        # KeyboardInterrupt rather than TimeoutError - confirm that this is the intended behaviour.
        previous_handler = signal.signal(signal.SIGALRM, keyboard_interrupt_handler)
        try:
            study.optimize(objective, n_trials=trials_due)
        finally:
            # Restore the earlier handler even when optimize is interrupted
            signal.signal(signal.SIGALRM, previous_handler)

    print(
        f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}"
    )

[32m[I 2023-04-07 12:01:26,445][0m A new study created in RDB with name: MLPRegressor_constant_300[0m


{'tol': 1e-05, 'n_iter_no_change': 5, 'random_state': 30, 'warm_start': False, 'max_iter': 2000, 'batch_size': 25000, 'learning_rate': 'constant', 'alpha': 0.006101610233741692, 'learning_rate_init': 0.006839644538652477, 'power_t': 0.5, 'hidden_layer_sizes': [800, 400, 50]}


[32m[I 2023-04-07 12:06:29,209][0m Trial 0 finished with value: 0.0102477563408473 and parameters: {'alpha': 0.006101610233741692, 'learning_rate_init': 0.006839644538652477, 'layer1/3': 800, 'layer2/3': 400, 'layer3/3': 50, 'batch_size': 25000, 'random_state': 30}. Best is trial 0 with value: 0.0102477563408473.[0m
[32m[I 2023-04-07 12:06:29,239][0m Using an existing study with name 'MLPRegressor_invscaling_300' instead of creating a new one.[0m
[32m[I 2023-04-07 12:06:29,283][0m Using an existing study with name 'MLPRegressor_adaptive_300' instead of creating a new one.[0m


Updating MLPRegressor_constant_300.pickle
model: MLPRegressor_constant_300 test dataset error: 0.0102477563408473 best_params: {'alpha': 0.006101610233741692, 'batch_size': 25000, 'layer1/3': 800, 'layer2/3': 400, 'layer3/3': 50, 'learning_rate_init': 0.006839644538652477, 'random_state': 30}
model: MLPRegressor_invscaling_300 test dataset error: 0.013776365727441831 best_params: {'alpha': 0.04361488288211113, 'batch_size': 20000, 'layer1/3': 1200, 'layer2/3': 550, 'layer3/3': 50, 'learning_rate_init': 0.0005348826277957722, 'power_t': 0.4428082303763544, 'random_state': 25}
model: MLPRegressor_adaptive_300 test dataset error: 0.07229927344000217 best_params: {'alpha': 0.002743329373331669, 'batch_size': 25000, 'layer1/3': 1000, 'layer2/3': 500, 'layer3/3': 40, 'learning_rate_init': 0.00014888516824883286, 'random_state': 8}


In [12]:
# Remove the "MLPRegressor_constant_300" study from the Optuna storage.
# NOTE(review): presumably this discards the study created by the run above; the call is unconditional,
# so check how it behaves when the study does not exist before re-running this cell.
optuna.delete_study(study_name="MLPRegressor_constant_300", storage=SQLITE_DB)

In [13]:
# Linear regression with combined L1 and L2 priors as regularizer
from sklearn.linear_model import ElasticNet

# ElasticNet settings shared by every trial of the study below
common_params = {"random_state": 42, "tol": 1e-5}

# Upper bound of the polynomial feature degree sampled by the objective
MAX_POLYNOMIAL_DEGREE = 3


def objective(trial):
    """Optuna objective: fit PolynomialFeatures + ElasticNet and return the test MSE.

    Trials are grouped into phases of 30; each phase trains on a larger prefix
    of the training set with a longer time budget and a higher ``max_iter``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error on the test subset.

    Raises:
        optuna.exceptions.TrialPruned: when a TimeoutError interrupts the fit.
    """
    training_sizes = (0.03, 0.1, 0.3, 1.0)
    training_max_duration_s = (240, 360, 600, 1200)
    max_iters = (20000, 40000, 80000, 160000)

    # Clamp to the last defined phase. The previous bound of 4 pointed one past
    # the end of these 4-element tables for trial numbers of 120 and above.
    phase = min(trial.number // 30, len(training_sizes) - 1)

    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]
    max_iter = max_iters[phase]

    trial.set_user_attr("training_size", training_size)
    trial.set_user_attr("max_duration_s", max_duration_s)
    trial.set_user_attr(
        "model_params",
        {
            **common_params,
            "max_iter": max_iter,
            "l1_ratio": trial.suggest_float("l1_ratio", 0.0, 1.0),
            "alpha": trial.suggest_float("alpha", 0.00001, 1.0, log=True),
            "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
        },
    )

    trial.set_user_attr(
        "polynomial degree",
        trial.suggest_int("polynomial degree", 1, MAX_POLYNOMIAL_DEGREE),
    )

    model = make_pipeline(
        PolynomialFeatures(trial.user_attrs["polynomial degree"]),
        ElasticNet(**trial.user_attrs["model_params"]),
    )

    try:
        signal.alarm(max_duration_s)
        try:
            model.fit(X_train_scaled[:training_size, :], Y_train_pca[:training_size, :])
        finally:
            # Cancel the alarm on every exit path, so a fit that fails for
            # another reason cannot leave a timer firing in unrelated code
            signal.alarm(0)
        error = mse_on_test_dataset(model)
        save_best_model(model, error, trial)
    except TimeoutError:
        raise optuna.exceptions.TrialPruned()
    return error


# Create the "ElasticNet" study in the SQLite storage, or reopen it if it already exists
study = optuna.create_study(
    study_name="ElasticNet", storage=SQLITE_DB, load_if_exists=True
)
# Planned trial count (100, scaled by RUN_ITERATIONS_PERCENT) minus the trials already recorded
trials_due = 100 * RUN_ITERATIONS_PERCENT // 100 - len(study.trials)
if trials_due > 0:
    # Only the missing trials are run, so re-executing the cell continues the stored study
    study.optimize(objective, n_trials=trials_due)

print(
    f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}"
)

[32m[I 2023-04-07 12:06:29,388][0m Using an existing study with name 'ElasticNet' instead of creating a new one.[0m


model: ElasticNet test dataset error: 0.06892972685134742 best_params: {'alpha': 5.98103619625795e-05, 'fit_intercept': True, 'l1_ratio': 0.5905390879114973, 'polynomial degree': 3}


In [14]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import HistGradientBoostingRegressor


def objective(trial):
    """Optuna objective: fit one HistGradientBoostingRegressor per output and return the test MSE.

    Trials are grouped into phases of 30; each phase trains on a larger prefix
    of the training set with a longer time budget.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error on the test subset.

    Raises:
        optuna.exceptions.TrialPruned: when fitting or evaluation fails,
            including a timeout. KeyboardInterrupt is re-raised unchanged.
    """
    trial_params = {
        "loss": trial.suggest_categorical(
            # poisson requires y > 0 which is not true in this case
            "loss",
            ["squared_error", "absolute_error", "quantile"],
        ),
        "learning_rate": trial.suggest_float("learning_rate", 0.00001, 0.5, log=True),
        "max_iter": trial.suggest_int("max_iter", 20, 200, step=20),
    }

    # The quantile level is only sampled for the quantile loss
    if trial_params["loss"] == "quantile":
        trial_params["quantile"] = trial.suggest_float("quantile", 0, 1)

    if trial.suggest_categorical("regularize", [True, False]):
        trial_params["l2_regularization"] = trial.suggest_float(
            "l2_regularization", 0.00001, 1.0, log=True
        )

    trial.set_user_attr("model_params", {"random_state": 42, **trial_params})

    model = MultiOutputRegressor(
        HistGradientBoostingRegressor(**trial.user_attrs["model_params"])
    )

    training_sizes = (0.03, 0.1, 0.3, 1.0)
    training_max_duration_s = (240, 360, 600, 1200)

    # Clamp to the last defined phase. The previous bound of 4 pointed one past
    # the end of these 4-element tables for trial numbers of 120 and above.
    phase = min(trial.number // 30, len(training_sizes) - 1)

    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]

    trial.set_user_attr("training_size", training_size)
    trial.set_user_attr("max_duration_s", max_duration_s)

    try:
        signal.alarm(max_duration_s)
        try:
            model.fit(X_train_scaled[:training_size, :], Y_train_pca[:training_size, :])
        finally:
            # Cancel the alarm on every exit path, so a fit that fails for
            # another reason cannot leave a timer firing in unrelated code
            signal.alarm(0)
        error = mse_on_test_dataset(model)
        save_best_model(model, error, trial)
    except KeyboardInterrupt:
        raise
    except Exception as e:
        print(e)
        raise optuna.exceptions.TrialPruned()
    return error


# Create the "HistGradientBoostingRegressor" study in the SQLite storage, or reopen it if it already exists
study = optuna.create_study(
    study_name="HistGradientBoostingRegressor",
    storage=SQLITE_DB,
    load_if_exists=True,
)
# Planned trial count (100, scaled by RUN_ITERATIONS_PERCENT) minus the trials already recorded
trials_due = 100 * RUN_ITERATIONS_PERCENT // 100 - len(study.trials)
if trials_due > 0:
    # Only the missing trials are run, so re-executing the cell continues the stored study
    study.optimize(objective, n_trials=trials_due)

print(
    f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}"
)

[32m[I 2023-04-07 12:06:29,480][0m Using an existing study with name 'HistGradientBoostingRegressor' instead of creating a new one.[0m


model: HistGradientBoostingRegressor test dataset error: 0.008577411300669628 best_params: {'l2_regularization': 0.001871946506788565, 'learning_rate': 0.1651977983535292, 'loss': 'squared_error', 'max_iter': 160, 'regularize': True}


In [15]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import BaggingRegressor


def objective(trial):
    """Optuna objective: fit one BaggingRegressor per output and return the test MSE.

    Trials are grouped into phases of 30; each phase has its own training-set
    fraction and time budget.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error on the test subset.

    Raises:
        optuna.exceptions.TrialPruned: when fitting or evaluation fails,
            including a timeout. KeyboardInterrupt is re-raised unchanged.
    """
    training_sizes = (0.1, 0.1, 0.3, 1.0)
    training_max_duration_s = (240, 360, 600, 1200)

    # Clamp to the last defined phase. The previous bound of 4 pointed one past
    # the end of these 4-element tables for trial numbers of 120 and above.
    phase = min(trial.number // 30, len(training_sizes) - 1)

    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]

    trial_params = {
        # NOTE(review): the Optuna parameter is stored as "max_iter" although it feeds n_estimators;
        # renaming it would no longer match the trials already in the storage.
        "n_estimators": trial.suggest_int("max_iter", 10, 110, step=20),
        # Absolute sample count in steps of 10000, capped by the current training size
        "max_samples": trial.suggest_int(
            "max_samples", 10000, (training_size // 10000) * 10000, step=10000
        ),
        "max_features": trial.suggest_int("max_features", 1, X_train_scaled.shape[1]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        "bootstrap_features": trial.suggest_categorical(
            "bootstrap_features", [True, False]
        ),
    }

    # oob_score is only sampled when bootstrap sampling is enabled
    if trial_params["bootstrap"]:
        trial_params["oob_score"] = trial.suggest_categorical(
            "oob_score", [True, False]
        )

    trial.set_user_attr("training_size", training_size)
    trial.set_user_attr("max_duration_s", max_duration_s)
    trial.set_user_attr(
        "model_params", {"n_jobs": -1, "random_state": 42, **trial_params}
    )

    model = MultiOutputRegressor(BaggingRegressor(**trial.user_attrs["model_params"]))

    try:
        signal.alarm(max_duration_s)
        try:
            model.fit(X_train_scaled[:training_size, :], Y_train_pca[:training_size, :])
        finally:
            # Cancel the alarm on every exit path, so a fit that fails for
            # another reason cannot leave a timer firing in unrelated code
            signal.alarm(0)
        error = mse_on_test_dataset(model)
        save_best_model(model, error, trial)
    except KeyboardInterrupt:
        raise
    except Exception as e:
        print(e)
        raise optuna.exceptions.TrialPruned()
    return error


# try:
#     optuna.delete_study(study_name="BaggingRegressor", storage=SQLITE_DB)
# except:
#     pass

# Create the "BaggingRegressor" study in the SQLite storage, or reopen it if it already exists
study = optuna.create_study(
    study_name="BaggingRegressor",
    storage=SQLITE_DB,
    load_if_exists=True,
)
# Planned trial count (100, scaled by RUN_ITERATIONS_PERCENT) minus the trials already recorded
trials_due = 100 * RUN_ITERATIONS_PERCENT // 100 - len(study.trials)
if trials_due > 0:
    # Only the missing trials are run, so re-executing the cell continues the stored study
    study.optimize(objective, n_trials=trials_due)

print(
    f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}"
)

[32m[I 2023-04-07 12:06:29,631][0m Using an existing study with name 'BaggingRegressor' instead of creating a new one.[0m


model: BaggingRegressor test dataset error: 0.012123721671364409 best_params: {'bootstrap': True, 'bootstrap_features': False, 'max_features': 10, 'max_iter': 50, 'max_samples': 320000, 'oob_score': False}


In [16]:
from sklearn.ensemble import RandomForestRegressor


def objective(trial):
    """Optuna objective: fit a RandomForestRegressor and return the test MSE.

    Trials are grouped into phases of 30; each phase trains on a larger prefix
    of the training set with a longer time budget. The budget covers both the
    fit and the evaluation on the test subset. The model is not saved.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error on the test subset.

    Raises:
        optuna.exceptions.TrialPruned: when fitting or evaluation fails,
            including a timeout. KeyboardInterrupt is re-raised unchanged.
    """
    training_sizes = (0.01, 0.03, 0.1, 0.3, 1.0)
    training_max_duration_s = (120, 180, 360, 600, 1200)
    # The last phase is reused for every later trial
    phase = min(trial.number // 30, len(training_sizes) - 1)
    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]

    trial.set_user_attr("training_size", training_size)
    trial.set_user_attr("max_duration_s", max_duration_s)
    trial.set_user_attr(
        "model_params",
        {
            "n_estimators": trial.suggest_int("n_estimators", 10, 200, step=10),
            "criterion": trial.suggest_categorical(
                "criterion", ["squared_error", "absolute_error", "friedman_mse"]
            ),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 5),
            "bootstrap": trial.suggest_categorical("bootstrap", [True]),
            # NOTE(review): the range starts at 0.0 - check whether the estimator accepts a
            # zero fraction; if not, such trials end up pruned by the handler below.
            "max_samples": trial.suggest_float("max_samples", 0.0, 0.2),
            "max_features": trial.suggest_int(
                "max_features", 1, X_train_scaled.shape[1] // 2
            ),
            "n_jobs": -1,
            "random_state": 42,
        },
    )

    model = RandomForestRegressor(**trial.user_attrs["model_params"])

    try:
        signal.alarm(max_duration_s)
        try:
            model.fit(X_train_scaled[:training_size, :], Y_train_pca[:training_size, :])
            error = mse_on_test_dataset(model)
        finally:
            # Cancel the alarm on every exit path, so a failure for another
            # reason cannot leave a timer firing in unrelated code
            signal.alarm(0)
    except KeyboardInterrupt:
        raise
    except Exception as e:
        # Report why the trial is dropped instead of swallowing the error silently
        print(e)
        raise optuna.exceptions.TrialPruned()
    return error


# Create the "RandomForrest" study in the SQLite storage, or reopen it if it already exists
study = optuna.create_study(
    study_name="RandomForrest",
    storage=SQLITE_DB,
    load_if_exists=True,
)
# Planned trial count (100, scaled by RUN_ITERATIONS_PERCENT) minus the trials already recorded
trials_due = 100 * RUN_ITERATIONS_PERCENT // 100 - len(study.trials)
if trials_due > 0:
    # Only the missing trials are run, so re-executing the cell continues the stored study
    study.optimize(objective, n_trials=trials_due)

print(
    f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}"
)

[32m[I 2023-04-07 12:06:29,710][0m Using an existing study with name 'RandomForrest' instead of creating a new one.[0m


model: RandomForrest test dataset error: 0.0276166738609489 best_params: {'bootstrap': True, 'criterion': 'friedman_mse', 'max_features': 4, 'max_samples': 0.1753336890830068, 'min_samples_split': 2, 'n_estimators': 60}


In [17]:
from sklearn.ensemble import ExtraTreesRegressor


def objective(trial):
    """Optuna objective: fit an ExtraTreesRegressor and return the test MSE.

    Trials are grouped into phases of 30; each phase trains on a larger prefix
    of the training set with a longer time budget. The budget covers the fit only.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error on the test subset.

    Raises:
        optuna.exceptions.TrialPruned: when fitting or evaluation fails,
            including a timeout. KeyboardInterrupt is re-raised unchanged.
    """
    training_sizes = (0.01, 0.03, 0.1, 0.3, 1.0)
    training_max_duration_s = (120, 180, 360, 600, 1200)
    # The last phase is reused for every later trial
    phase = min(trial.number // 30, len(training_sizes) - 1)
    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]
    trial.set_user_attr("training_size", training_size)
    trial.set_user_attr("max_duration_s", max_duration_s)

    trial.set_user_attr(
        "model_params",
        {
            "n_estimators": trial.suggest_int("n_estimators", 10, 200),
            "criterion": trial.suggest_categorical(
                "criterion", ["squared_error", "absolute_error", "friedman_mse"]
            ),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 5),
            "bootstrap": trial.suggest_categorical("bootstrap", [True]),
            # NOTE(review): the range starts at 0.0 - check whether the estimator accepts a
            # zero fraction; if not, such trials end up pruned by the handler below.
            "max_samples": trial.suggest_float("max_samples", 0.0, 0.2),
            "max_features": trial.suggest_int(
                "max_features", 1, X_train_scaled.shape[1] // 2
            ),
            "n_jobs": -1,
            "random_state": 42,
        },
    )

    model = ExtraTreesRegressor(**trial.user_attrs["model_params"])

    try:
        signal.alarm(max_duration_s)
        try:
            model.fit(X_train_scaled[:training_size, :], Y_train_pca[:training_size, :])
        finally:
            # Cancel the alarm on every exit path, so a fit that fails for
            # another reason cannot leave a timer firing in unrelated code
            signal.alarm(0)
        error = mse_on_test_dataset(model)
        save_best_model(model, error, trial)
    except KeyboardInterrupt:
        raise
    except Exception as e:
        # Report why the trial is dropped instead of swallowing the error silently
        print(e)
        raise optuna.exceptions.TrialPruned()
    return error


# Create the "ExtraTreesRegressor" study in the SQLite storage, or reopen it if it already exists
study = optuna.create_study(
    study_name="ExtraTreesRegressor",
    storage=SQLITE_DB,
    load_if_exists=True,
)
# Planned trial count (100, scaled by RUN_ITERATIONS_PERCENT) minus the trials already recorded
trials_due = 100 * RUN_ITERATIONS_PERCENT // 100 - len(study.trials)
if trials_due > 0:
    # Only the missing trials are run, so re-executing the cell continues the stored study
    study.optimize(objective, n_trials=trials_due)

print(
    f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}"
)

[32m[I 2023-04-07 12:06:29,797][0m Using an existing study with name 'ExtraTreesRegressor' instead of creating a new one.[0m


model: ExtraTreesRegressor test dataset error: 0.035638099610715714 best_params: {'bootstrap': True, 'criterion': 'friedman_mse', 'max_features': 5, 'max_samples': 0.18501919599037991, 'min_samples_split': 2, 'n_estimators': 182}


In [18]:
from sklearn.neighbors import KNeighborsRegressor

# KNeighborsRegressor settings shared by every trial of the study below
common_params = {"n_jobs": -1}


def objective(trial):
    """Optuna objective: fit a KNeighborsRegressor and return the test MSE.

    Trials are grouped into phases of 30; each phase trains on a larger prefix
    of the training set with a longer time budget. The budget covers both the
    fit and the evaluation on the test subset. The model is not saved.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error on the test subset.

    Raises:
        optuna.exceptions.TrialPruned: when fitting or evaluation fails,
            including a timeout. KeyboardInterrupt is re-raised unchanged.
    """
    training_sizes = (0.01, 0.03, 0.1, 0.3, 1.0)
    training_max_duration_s = (120, 180, 360, 600, 1200)
    # The last phase is reused for every later trial
    phase = min(trial.number // 30, len(training_sizes) - 1)
    training_size = int(training_sizes[phase] * X_train_scaled.shape[0])
    max_duration_s = training_max_duration_s[phase]

    trial_params = {
        "n_neighbors": trial.suggest_int("n_neighbors", 1, 100),
        "weights": trial.suggest_categorical("weights", ["uniform", "distance"]),
        "algorithm": trial.suggest_categorical(
            "algorithm", ["ball_tree", "kd_tree", "brute"]
        ),
    }
    # leaf_size is only sampled for the tree-based algorithms
    if trial_params["algorithm"] != "brute":
        trial_params["leaf_size"] = trial.suggest_int("leaf_size", 10, 50)
    trial_params["p"] = trial.suggest_int("p", 1, 5)

    model_params = {**trial_params, **common_params}
    trial.set_user_attr("model_params", model_params)
    model = KNeighborsRegressor(**trial.user_attrs["model_params"])

    try:
        signal.alarm(max_duration_s)
        try:
            model.fit(X_train_scaled[:training_size, :], Y_train_pca[:training_size, :])
            error = mse_on_test_dataset(model)
        finally:
            # Cancel the alarm on every exit path, so a failure for another
            # reason cannot leave a timer firing in unrelated code
            signal.alarm(0)
    except KeyboardInterrupt:
        raise
    except Exception as e:
        print(e)
        raise optuna.exceptions.TrialPruned()
    return error


# Create the "KNeighborsRegressor" study in the SQLite storage, or reopen it if it already exists
study = optuna.create_study(
    study_name="KNeighborsRegressor",
    storage=SQLITE_DB,
    load_if_exists=True,
)
# Planned trial count (100, scaled by RUN_ITERATIONS_PERCENT) minus the trials already recorded
trials_due = 100 * RUN_ITERATIONS_PERCENT // 100 - len(study.trials)
if trials_due > 0:
    # Only the missing trials are run, so re-executing the cell continues the stored study
    study.optimize(objective, n_trials=trials_due)

print(
    f"model: {study.study_name} test dataset error: {study.best_value} best_params: {study.best_params}"
)

[32m[I 2023-04-07 12:06:29,878][0m Using an existing study with name 'KNeighborsRegressor' instead of creating a new one.[0m


model: KNeighborsRegressor test dataset error: 0.21196498486521023 best_params: {'algorithm': 'kd_tree', 'leaf_size': 13, 'n_neighbors': 9, 'p': 3, 'weights': 'uniform'}
