In this notebook we hand-tune the hyperparameters of the best-performing ML model (the neural network implementation `MLPRegressor`), which was selected from multiple models in the notebook `02. Training ML models`, to obtain an even better-performing one.


In [1]:
# Configuration. LOWER_LIMIT is the base-10 exponent below which an output is
# treated as zero; PCA_COMPONENTS is the number of principal components kept
# for the outputs. Both values are embedded in the cache file names below.
LOWER_LIMIT = -9
PCA_COMPONENTS = 12

# loading dataset
import numpy as np

# np.load returns an NpzFile that keeps the archive open until it is closed.
# astype() produces independent arrays, so the archive can be released as
# soon as both arrays have been read. (The inputs were previously read and
# converted twice; once is enough.)
with np.load("../final/dataset.npz") as input_and_output:
    inputs = input_and_output["inputs"].astype(np.float64)
    outputs = input_and_output["outputs"].astype(np.float64)
dataset_size = inputs.shape[0]

print("loaded dataset")

# transforming time profiles to their orders of magnitude (log10)

def output_transform(outputs: np.ndarray, lower_limit: "float | None" = None) -> np.ndarray:
    """Convert absolute values to their base-10 orders of magnitude.

    Args:
        outputs: Array of absolute values. It is copied, not modified.
        lower_limit: Smallest exponent that may appear in the result. When
            omitted, the module-level ``LOWER_LIMIT`` is used.

    Returns:
        A new array holding ``log10(outputs)``, in which every non-positive
        input and every exponent below ``lower_limit`` is replaced by
        ``lower_limit``.
    """
    if lower_limit is None:
        lower_limit = LOWER_LIMIT

    x = np.copy(outputs)
    non_positive = x <= 0
    # log10 is undefined for values <= 0: feed it a harmless 1 at those
    # positions and overwrite the result with the floor afterwards.
    x[non_positive] = 1
    y = np.log10(x)
    y[non_positive] = lower_limit
    y[y < lower_limit] = lower_limit
    return y
    
def output_untransform(transformed_outputs: np.ndarray, lower_limit: "float | None" = None) -> np.ndarray:
    """Convert base-10 orders of magnitude back to absolute values.

    Args:
        transformed_outputs: Array of exponents. It is not modified.
        lower_limit: Exponents at or below this value are mapped to 0. When
            omitted, the module-level ``LOWER_LIMIT`` is used.

    Returns:
        A new array holding ``10 ** transformed_outputs`` with the floored
        positions set to 0.
    """
    if lower_limit is None:
        lower_limit = LOWER_LIMIT

    exponents = np.asarray(transformed_outputs)
    at_or_below_limit = exponents <= lower_limit
    # A float base keeps this valid for integer exponent arrays as well:
    # NumPy refuses to raise an integer to a negative integer power.
    z = 10.0 ** exponents
    z[at_or_below_limit] = 0
    return z

def apply_size_limit(outputs: np.ndarray, lower_limit: "float | None" = None) -> np.ndarray:
    """Floor an array of base-10 exponents at the lower limit.

    Args:
        outputs: Array of exponents. It is copied, not modified.
        lower_limit: Values below this are raised to it. When omitted, the
            module-level ``LOWER_LIMIT`` is used.

    Returns:
        A copy of ``outputs`` in which no element is smaller than
        ``lower_limit``.
    """
    if lower_limit is None:
        lower_limit = LOWER_LIMIT

    x = np.copy(outputs)
    x[x < lower_limit] = lower_limit
    return x

def apply_absolute_size_limit(outputs: np.ndarray, lower_limit: "float | None" = None) -> np.ndarray:
    """Zero out absolute values that fall below ``10 ** lower_limit``.

    Args:
        outputs: Array of absolute values. It is copied, not modified.
        lower_limit: Base-10 exponent of the cut-off. When omitted, the
            module-level ``LOWER_LIMIT`` is used.

    Returns:
        A copy of ``outputs`` in which every element smaller than
        ``10 ** lower_limit`` is replaced by 0.
    """
    if lower_limit is None:
        lower_limit = LOWER_LIMIT

    limit = 10.0 ** lower_limit
    x = np.copy(outputs)
    x[x < limit] = 0
    return x

# Targets in log10 space: output_transform floors non-positive values and
# anything below LOWER_LIMIT at LOWER_LIMIT.
outputs_order_of_magnitude = output_transform(outputs)
print("transformed to orders of magnitude")

# dropping treatment column in input

def drop_treatment(input_data: np.ndarray) -> np.ndarray:
    """Return the inputs without the treatment column.

    An array with 11 columns loses its first column; an array of any other
    width is handed back unchanged.
    """
    has_treatment_column = input_data.shape[1] == 11
    return input_data[:, 1:] if has_treatment_column else input_data

# Model inputs without the treatment column.
input_without_treatment = drop_treatment(inputs)

print("dropped treatment column")

# Positional split: the first 70% of the rows form the training set and the
# next 15% the test set. Rows beyond that are not touched here.
train_size = int(dataset_size * 0.7)
test_size = int(dataset_size * 0.15)
train_rows = slice(None, train_size)
test_rows = slice(train_size, train_size + test_size)

X_train = input_without_treatment[train_rows, :]
Y_train = outputs_order_of_magnitude[train_rows, :]
Y_train_absolute = apply_absolute_size_limit(outputs[train_rows, :])
print(f"train sizes: {X_train.shape}, {Y_train.shape}")

X_test = input_without_treatment[test_rows, :]
Y_test = outputs_order_of_magnitude[test_rows, :]
Y_test_absolute = apply_absolute_size_limit(outputs[test_rows, :])
print(f"test sizes: {X_test.shape}, {Y_test.shape}")
print("train test split")

# scaling inputs

import pickle
from pathlib import Path

# Indices of the input columns that receive whisker-based scaling.
# NOTE(review): presumably the lognormally distributed parameters — confirm.
LOGNORMAL_PARAMETERS = (1, 2)

class CustomScaler:
    """Min-max scaler that rescales selected columns by their box-plot whiskers.

    Every column is first scaled by a ``MinMaxScaler``. The columns listed in
    ``LOGNORMAL_PARAMETERS`` are then overwritten with a linear scaling whose
    0 and 1 are the smallest and largest fitted values lying within 1.5
    interquartile ranges of the quartiles, so values beyond those whiskers
    map outside [0, 1].

    The attribute names (``scaler``, ``plot_loval``, ``plot_hival``) are part
    of the pickled state and must stay stable for cached scalers to load.
    """

    def __init__(self):
        # MinMaxScaler is not imported anywhere else in the notebook, so
        # constructing a scaler (needed whenever the pickle cache is absent)
        # used to fail with NameError. Importing it here keeps the class
        # self-contained.
        from sklearn.preprocessing import MinMaxScaler

        self.scaler = MinMaxScaler()
        # One (low, high) whisker pair per entry of LOGNORMAL_PARAMETERS.
        self.plot_loval = [0.0] * len(LOGNORMAL_PARAMETERS)
        self.plot_hival = [1.0] * len(LOGNORMAL_PARAMETERS)

    def transform(self, x: np.ndarray, copy=None) -> np.ndarray:
        """Scale ``x`` with the fitted parameters and return a new array.

        ``copy`` is accepted for signature compatibility and ignored.
        A column whose two whiskers coincide has a zero span and therefore
        produces inf/nan values.
        """
        res = self.scaler.transform(x)
        for i, parameter_index in enumerate(LOGNORMAL_PARAMETERS):
            span = self.plot_hival[i] - self.plot_loval[i]
            res[:, parameter_index] = (x[:, parameter_index] - self.plot_loval[i]) / span

        return res

    def fit(self, x, copy=None):
        """Fit the min-max scaler and the whisker bounds on ``x``.

        ``copy`` is accepted for signature compatibility and ignored.
        Returns ``self`` so that calls can be chained.
        """
        self.scaler.fit(x)
        for i, parameter_index in enumerate(LOGNORMAL_PARAMETERS):
            column_values = x[:, parameter_index]

            quantile_1, quantile_3 = np.quantile(column_values, [0.25, 0.75], axis=0)
            iqr = quantile_3 - quantile_1

            # Box-plot fences ...
            loval = quantile_1 - 1.5 * iqr
            hival = quantile_3 + 1.5 * iqr

            # ... and the extreme observed values that still lie inside them.
            self.plot_loval[i] = np.min(column_values[column_values >= loval])
            self.plot_hival[i] = np.max(column_values[column_values <= hival])

        return self

    def inverse_transform(self, x, copy=None):
        """Map scaled values back to the original units; inverse of ``transform``.

        ``copy`` is accepted for signature compatibility and ignored.
        """
        res = self.scaler.inverse_transform(x)
        for i, parameter_index in enumerate(LOGNORMAL_PARAMETERS):
            span = self.plot_hival[i] - self.plot_loval[i]
            res[:, parameter_index] = x[:, parameter_index] * span + self.plot_loval[i]
        return res

# The input scaler is fitted once on the training inputs and cached on disk;
# later runs reuse the cached fit.
# NOTE: unpickling executes code stored in the file — only load caches that
# were produced by this project.
scaler_path = Path("../final/scaler.pickle")
if not scaler_path.exists():
    scaler = CustomScaler().fit(X_train)
    with scaler_path.open("wb") as cache_file:
        pickle.dump(scaler, cache_file)
else:
    with scaler_path.open("rb") as cache_file:
        scaler = pickle.load(cache_file)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("scaled")

# applying principal component analysis

from sklearn.decomposition import PCA

# The PCA of the training targets is cached per (component count, lower
# limit) pair; a fresh fit happens only when no cache file exists.
pca_path = Path(f"../final/pca{PCA_COMPONENTS}_{LOWER_LIMIT}.pickle")

if not pca_path.exists():
    pca = PCA(n_components=PCA_COMPONENTS)
    Y_train_pca = pca.fit_transform(Y_train)
    with pca_path.open("wb") as opened_file:
        pickle.dump(pca, opened_file)
else:
    with pca_path.open("rb") as opened_file:
        pca = pickle.load(opened_file)
    Y_train_pca = pca.transform(Y_train)

from functools import reduce

# Start from 1.0 and subtract each component's explained share in turn;
# what remains is the share of variance the kept components do not explain.
unexplained_variance_ratio = 1.0
for component_ratio in pca.explained_variance_ratio_:
    unexplained_variance_ratio = unexplained_variance_ratio - component_ratio
print(f"applied pca with {PCA_COMPONENTS} components. Unexplained variance ratio: {unexplained_variance_ratio}")

import time
from sklearn.metrics import mean_squared_error

loaded dataset
transformed to orders of magnitude
dropped treatment column
train sizes: (700000, 10), (700000, 200)
test sizes: (150000, 10), (150000, 200)
train test split
scaled
applied pca with 12 components. Unexplained variance ratio: 1.5039885656489999e-06


In [2]:
from sklearn.neural_network import MLPRegressor
from threadpoolctl import threadpool_limits
from cpuinfo import get_cpu_info
import json


# Evaluate the stored MLPRegressor of each tuning stage k and write a JSON
# summary next to its pickle. The training statements are kept as comments;
# this cell only scores models that already exist on disk.
hidden_layer_sizes = [600, 100, 40]
layers_tag = "_".join(str(i) for i in hidden_layer_sizes)

# Neither value changes between iterations, so compute them once. The CPU
# description was previously queried six times per iteration.
cpu_info = get_cpu_info()
pca_unexplained_variance_ratio = reduce(lambda a, b: a - b, pca.explained_variance_ratio_, 1.0)
test_end = train_size + test_size

training_start = time.time()
for k in range(5):
    last_file = f"../final/MLPRegressor_{layers_tag}_{PCA_COMPONENTS}_{LOWER_LIMIT}_{k}.pickle"
    info_filename = f"../final/MLPRegressor_{layers_tag}_{PCA_COMPONENTS}_{LOWER_LIMIT}_{k}.json"

    if not Path(last_file).exists():
        # With training disabled there is nothing to score for this stage.
        # Falling through would raise NameError for k == 0, or re-score the
        # previous stage's model and save its numbers under this stage's name.
        print(f"no stored model at {last_file}, skipping")
        continue

    print(f"loading previous {last_file}")
    # NOTE: unpickling executes code stored in the file — only load models
    # that were produced by this project.
    with Path(last_file).open("rb") as opened_file:
        model = pickle.load(opened_file)
#         with Path(info_filename).open('r') as opened_file:
#             print(opened_file.read())
#         continue

    if k > 0:
        # Only the disabled warm-start code below reads old_model.
        old_model = model
    # Stage schedule: regularisation, learning rate, tolerance and epsilon are
    # halved with every stage k.
    # NOTE(review): these values are written to the JSON as-is; a pickle
    # trained under an earlier version of this cell may have used different
    # ones — confirm before relying on the recorded parameters.
    model_params = {
        "alpha": 0.0040005316095293 / (2 ** k),
        "batch_size": 2000,
        "hidden_layer_sizes": hidden_layer_sizes,
        "learning_rate": "constant",
        "learning_rate_init": 0.00016798744315656234 / (2 ** k),
        "max_iter": 400,
        "n_iter_no_change": 5,
        "random_state": 42,
        "tol": 1e-05 / (2**k),
        "epsilon": 1e-08 / (2**k),
        "verbose": True,
        "warm_start": k > 0
    }
#     model = MLPRegressor(**model_params)
#     if k > 0:
#         for variable_name in ("coefs_", "t_", "n_outputs_", "n_layers_", "out_activation_", "intercepts_", "n_iter_", "loss_curve_", "best_loss_", "_no_improvement_count"):
#             setattr(model, variable_name, getattr(old_model, variable_name))

    with threadpool_limits(limits=cpu_info["count"], user_api='blas'):
#         model.fit(X_train_scaled, Y_train_pca)
        # Predict and undo the PCA once per dataset; both error metrics are
        # computed from the same reconstruction (neither helper modifies it).
        test_reconstruction = pca.inverse_transform(model.predict(X_test_scaled))
        error_test = mean_squared_error(Y_test, apply_size_limit(test_reconstruction))
        error_test_absolute = mean_squared_error(Y_test_absolute, output_untransform(test_reconstruction))
        del test_reconstruction  # free the array before building the larger one

        train_reconstruction = pca.inverse_transform(model.predict(X_train_scaled))
        error_train = mean_squared_error(Y_train, apply_size_limit(train_reconstruction))
        error_train_absolute = mean_squared_error(Y_train_absolute, output_untransform(train_reconstruction))
        del train_reconstruction

    print(f"error test: {error_test}, error train: {error_train} training_time: {time.time() - training_start:.1f}")

#     with Path(last_file).open("wb") as opened_file:
#         print(f"saving {last_file}")
#         pickle.dump(model, opened_file)

    # Serialise before opening the file so that a failure cannot leave an
    # existing summary truncated.
    info = json.dumps({
        "cpu_info": {key: cpu_info[key] for key in ["arch", "bits", "brand_raw", "count", "l2_cache_size"]},
        "pca_components": PCA_COMPONENTS,
        "pca_unexplained_variance_ratio": pca_unexplained_variance_ratio,
        "tumour_lower_size_limit_l": 10 ** LOWER_LIMIT,
        "tumour_lower_size_limit_log10_l": LOWER_LIMIT,
        "model_params": model_params,
        # Row ranges come from the actual split, so they stay correct if the
        # dataset size changes.
        "test_dataset": f"[{train_size}:{test_end}] of ../final/dataset.npz",
        "test_error_orders_of_magnitude": error_test,
        "test_error_absolute": error_test_absolute,
        "train_dataset": f"[:{train_size}] of ../final/dataset.npz",
        "train_error_orders_of_magnitude": error_train,
        "train_error_absolute": error_train_absolute
    }, sort_keys=True, indent=4)
    print(f"saving info to file: {info_filename} {info}")
    with Path(info_filename).open('w') as opened_file:
        opened_file.write(info)

loading previous ../final/MLPRegressor_600_100_40_12_-9_0.pickle
error test: 5.6158462752489024e-05, error train: 5.167492621982182e-05 training_time: 118.3
saving info to file: ../final/MLPRegressor_600_100_40_12_-9_0.json {
    "cpu_info": {
        "arch": "X86_64",
        "bits": 64,
        "brand_raw": "Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz",
        "count": 8,
        "l2_cache_size": 262144
    },
    "model_params": {
        "alpha": 0.0040005316095293,
        "batch_size": 2000,
        "epsilon": 1e-08,
        "hidden_layer_sizes": [
            600,
            100,
            40
        ],
        "learning_rate": "constant",
        "learning_rate_init": 0.00016798744315656234,
        "max_iter": 300,
        "n_iter_no_change": 5,
        "random_state": 42,
        "tol": 1e-05,
        "verbose": true,
        "warm_start": false
    },
    "pca_components": 12,
    "pca_unexplained_variance_ratio": 1.5039885656489999e-06,
    "test_dataset": "[700000:850000