In this notebook we hand-tune the hyperparameters of the best-performing ML model (the neural-network implementation `MLPRegressor`), which was selected from multiple models in the notebook `02. Training ML models`, to obtain an even better-performing one.


In [1]:
#loading dataset
import numpy as np

# Load the simulation dataset. The archive is opened in a `with` block so the
# underlying file handle is closed as soon as both arrays have been copied out
# (`astype` returns new arrays, so they stay valid after the archive is closed).
with np.load("../final/dataset.npz") as input_and_output:
    inputs  = input_and_output["inputs"].astype(np.float64)
    outputs = input_and_output["outputs"].astype(np.float64)
dataset_size = inputs.shape[0]

# Inputs and outputs are sliced with the same row indices later on.
assert outputs.shape[0] == dataset_size, "inputs and outputs must have the same number of rows"

print("loaded dataset")

loaded dataset


In [3]:
for LOWER_LIMIT in [-7]:
    for PCA_COMPONENTS in [13, 11, 14, 10, 16]:
        # transforming time profiles to its orders of magnitude
        def output_transform(outputs: np.ndarray, lower_limit=None) -> np.ndarray:
            """Convert raw outputs to their base-10 orders of magnitude.

            Args:
                outputs: array of raw values; it is not modified.
                lower_limit: smallest order of magnitude to keep. Defaults to the
                    notebook-level ``LOWER_LIMIT`` when omitted.

            Returns:
                A new array holding ``log10(outputs)``, where non-positive inputs
                and every result below ``lower_limit`` are set to ``lower_limit``.
            """
            if lower_limit is None:
                lower_limit = LOWER_LIMIT
            x = np.copy(outputs)
            zeros_in_output = x <= 0
            # Temporarily replace non-positive entries so log10 is defined for them.
            x[zeros_in_output] = 1
            y = np.log10(x)
            y[zeros_in_output] = lower_limit
            y[y < lower_limit] = lower_limit
            return y

        def output_untransform(transformed_outputs: np.ndarray, lower_limit=None) -> np.ndarray:
            """Convert orders of magnitude back to absolute values.

            Args:
                transformed_outputs: array of base-10 exponents.
                lower_limit: exponents at or below this value map to 0. Defaults
                    to the notebook-level ``LOWER_LIMIT`` when omitted.

            Returns:
                A new array holding ``10 ** transformed_outputs``, with entries
                whose exponent is ``<= lower_limit`` set to 0.
            """
            if lower_limit is None:
                lower_limit = LOWER_LIMIT
            transformed_outputs = np.asarray(transformed_outputs)
            lower_limits = transformed_outputs <= lower_limit
            # A float base keeps this valid for integer exponent arrays as well.
            z = np.power(10.0, transformed_outputs)
            z[lower_limits] = 0
            return z

        def apply_size_limit(outputs: np.ndarray, lower_limit=None) -> np.ndarray:
            """Clamp order-of-magnitude values from below.

            Args:
                outputs: array of base-10 exponents; it is not modified.
                lower_limit: floor applied to the values. Defaults to the
                    notebook-level ``LOWER_LIMIT`` when omitted.

            Returns:
                A copy of ``outputs`` where every value below ``lower_limit`` is
                replaced by ``lower_limit``.
            """
            if lower_limit is None:
                lower_limit = LOWER_LIMIT
            x = np.copy(outputs)
            x[x < lower_limit] = lower_limit
            return x

        def apply_absolute_size_limit(outputs: np.ndarray, lower_limit=None) -> np.ndarray:
            """Zero out absolute values smaller than ``10 ** lower_limit``.

            Args:
                outputs: array of absolute (untransformed) values; not modified.
                lower_limit: base-10 exponent of the threshold. Defaults to the
                    notebook-level ``LOWER_LIMIT`` when omitted.

            Returns:
                A copy of ``outputs`` where every value below ``10 ** lower_limit``
                is replaced by 0.
            """
            if lower_limit is None:
                lower_limit = LOWER_LIMIT
            # Float base: also valid when lower_limit is a negative NumPy integer.
            limit = 10.0 ** lower_limit
            x = np.copy(outputs)
            x[x < limit] = 0
            return x

        # Log10-scaled copy of the raw outputs; the train/test targets below are sliced from it.
        outputs_order_of_magnitude = output_transform(outputs)
        print("transformed to orders of magnitude")

        # dropping treatment column in input

        def drop_treatment(input_data: np.ndarray) -> np.ndarray:
            """Return the dataset without its treatment column.

            An array with exactly 11 columns loses its first column; an array
            with any other column count is returned unchanged.
            """
            has_treatment_column = input_data.shape[1] == 11
            return input_data[:, 1:] if has_treatment_column else input_data

        # Model inputs: `inputs` with the first column removed when it has 11 columns.
        input_without_treatment = drop_treatment(inputs)

        print("dropped treatment column")

        # Splitting the data by row position: the first 70% of rows form the train set
        # and the following 15% the test set. The remaining rows are not used here.
        train_size = int(dataset_size * 0.7)
        test_size = int(dataset_size * 0.15)
        train_rows = slice(0, train_size)
        test_rows = slice(train_size, train_size + test_size)

        # Train set: scaled-log targets plus thresholded absolute targets.
        X_train = input_without_treatment[train_rows]
        Y_train = outputs_order_of_magnitude[train_rows]
        Y_train_absolute = apply_absolute_size_limit(outputs[train_rows])
        print(f"train sizes: {X_train.shape}, {Y_train.shape}")

        # Test set: same three arrays for the following row range.
        X_test = input_without_treatment[test_rows]
        Y_test = outputs_order_of_magnitude[test_rows]
        Y_test_absolute = apply_absolute_size_limit(outputs[test_rows])
        print(f"test sizes: {X_test.shape}, {Y_test.shape}")

        print("train test split")

        # scaling inputs

        import pickle
        from pathlib import Path

        # Column indices that get whisker-based scaling instead of plain min-max
        # scaling (presumably the log-normally distributed parameters — confirm).
        LOGNORMAL_PARAMETERS = (1, 2)

        class CustomScaler:
            """Min-max scaler with outlier-robust scaling for selected columns.

            All columns are scaled with a ``MinMaxScaler``. The columns listed in
            ``LOGNORMAL_PARAMETERS`` are then re-scaled so that the box-plot whisker
            range of the fitted data (the most extreme values lying within
            1.5 * IQR of the quartiles) maps to [0, 1]; values outside the
            whiskers fall outside [0, 1].

            Attributes (kept stable so previously pickled instances still load):
                scaler: the wrapped ``MinMaxScaler``.
                plot_loval: lower whisker value per entry of ``LOGNORMAL_PARAMETERS``.
                plot_hival: upper whisker value per entry of ``LOGNORMAL_PARAMETERS``.
            """

            def __init__(self):
                # Imported here because no other cell imports MinMaxScaler; without
                # it, building a fresh scaler (no cached pickle) raised NameError.
                from sklearn.preprocessing import MinMaxScaler

                self.scaler = MinMaxScaler()
                self.plot_loval = [0.0] * len(LOGNORMAL_PARAMETERS)
                self.plot_hival = [1.0] * len(LOGNORMAL_PARAMETERS)

            def _whisker_span(self, i: int) -> float:
                """Return the whisker range of entry ``i``, or 1.0 when it is zero.

                A zero range (constant column) would otherwise cause a division by
                zero in ``transform``; using 1.0 keeps both directions finite.
                """
                span = self.plot_hival[i] - self.plot_loval[i]
                return span if span != 0 else 1.0

            def transform(self, x: np.ndarray, copy=None) -> np.ndarray:
                """Scale ``x``; ``copy`` is accepted for API compatibility and ignored."""
                x = np.asarray(x)
                res = self.scaler.transform(x)
                for i, parameter_index in enumerate(LOGNORMAL_PARAMETERS):
                    # Overwrite the min-max result for this column with whisker scaling.
                    res[:, parameter_index] = (x[:, parameter_index] - self.plot_loval[i]) / self._whisker_span(i)

                return res

            def fit(self, x, copy=None):
                """Fit the min-max scaler and the whisker bounds on ``x``; returns ``self``."""
                x = np.asarray(x)
                self.scaler.fit(x)
                for i, parameter_index in enumerate(LOGNORMAL_PARAMETERS):
                    column_values = x[:, parameter_index]

                    quantile_1, quantile_3 = np.quantile(column_values, [0.25, 0.75], axis=0)
                    iqr = quantile_3 - quantile_1

                    # Fences at 1.5 * IQR beyond the quartiles.
                    loval = quantile_1 - 1.5 * iqr
                    hival = quantile_3 + 1.5 * iqr

                    # Whisker ends: the most extreme observed values inside the fences.
                    self.plot_hival[i] = float(np.max(column_values[column_values <= hival]))
                    self.plot_loval[i] = float(np.min(column_values[column_values >= loval]))

                return self

            def inverse_transform(self, x, copy=None):
                """Undo ``transform``; ``copy`` is accepted for API compatibility and ignored."""
                x = np.asarray(x)
                res = self.scaler.inverse_transform(x)
                for i, parameter_index in enumerate(LOGNORMAL_PARAMETERS):
                    res[:, parameter_index] = x[:, parameter_index] * self._whisker_span(i) + self.plot_loval[i]
                return res

        # Reuse the fitted input scaler across runs: load it if it was saved
        # before, otherwise fit it on the training inputs and save it.
        scaler_path = Path("../final/scaler.pickle")
        if scaler_path.exists():
            # SECURITY: pickle.load executes arbitrary code from the file; only load
            # pickles produced by this project. Unpickling also looks the class up
            # by name, so CustomScaler must already be defined at this point.
            with scaler_path.open("rb") as scaler_file:
                scaler = pickle.load(scaler_file)
        else:
            scaler = CustomScaler().fit(X_train)
            with scaler_path.open("wb") as opened_file:
                pickle.dump(scaler, opened_file)

        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        print("scaled")

        # applying principal component analysis

        from sklearn.decomposition import PCA

        # One cached PCA per (component count, lower limit) combination.
        pca_path = Path(f"../final/pca{PCA_COMPONENTS}_{LOWER_LIMIT}.pickle")

        if not pca_path.exists():
            # Nothing cached yet: fit on the training targets, then persist the fit.
            pca = PCA(n_components=PCA_COMPONENTS)
            Y_train_pca = pca.fit_transform(Y_train)
            with pca_path.open("wb") as opened_file:
                pickle.dump(pca, opened_file)
        else:
            # Cached decomposition available: load it and project the training targets.
            with pca_path.open("rb") as opened_file:
                pca = pickle.load(opened_file)
            Y_train_pca = pca.transform(Y_train)

        from functools import reduce
        # 1.0 minus each component's explained variance ratio, subtracted in order.
        unexplained_variance_ratio = reduce(lambda a, b: a - b, pca.explained_variance_ratio_, 1.0)
        print(f"applied pca with {PCA_COMPONENTS} components. Unexplained variance ratio: {unexplained_variance_ratio}")

        import time
        from sklearn.metrics import mean_squared_error

        from sklearn.neural_network import MLPRegressor
        from threadpoolctl import threadpool_limits
        from cpuinfo import get_cpu_info
        import json


        hidden_layer_sizes = [600, 100, 40]
        training_start = time.time()
        for k in range(5):
            last_file = f"../final/MLPRegressor_{'_'.join(str(i) for i in hidden_layer_sizes)}_{PCA_COMPONENTS}_{LOWER_LIMIT}_{k}.pickle"
            info_filename = f"../final/MLPRegressor_{'_'.join(str(i) for i in hidden_layer_sizes)}_{PCA_COMPONENTS}_{LOWER_LIMIT}_{k}.json"

            if Path(last_file).exists():
                print(f"loading previous {last_file}")
                with Path(last_file).open("rb") as opened_file:
                    model = pickle.load(opened_file)
#                 with Path(info_filename).open('r') as opened_file:
#                     print(opened_file.read())
#                 continue

            if k > 0:
                old_model = model
            model_params = {
                "alpha": 0.0040005316095293 / (2 ** k),
                "batch_size": 2000,
                "hidden_layer_sizes": hidden_layer_sizes,
                "learning_rate": "constant",
                "learning_rate_init": 0.00016798744315656234 / (2 ** k),
                "max_iter": 400,
                "n_iter_no_change": 5,
                "random_state": 42,
                "tol": 1e-05 / (2**k),
                "epsilon": 1e-08 / (2**k),
                "verbose": True,
                "warm_start": k > 0
            }
#             model = MLPRegressor(**model_params)
#             if k > 0:
#                 for variable_name in ("coefs_", "t_", "n_outputs_", "n_layers_", "out_activation_", "intercepts_", "n_iter_", "loss_curve_", "best_loss_", "_no_improvement_count"):
#                     setattr(model, variable_name, getattr(old_model, variable_name))

            with threadpool_limits(limits=get_cpu_info()["count"], user_api='blas'):
#                 model.fit(X_train_scaled, Y_train_pca)
                test_result = pca.inverse_transform(model.predict(X_test_scaled))
                error_test  = mean_squared_error(Y_test,  apply_size_limit(test_result))
                error_test_absolute  = mean_squared_error(Y_test_absolute,  output_untransform(test_result))
                
                train_result = pca.inverse_transform(model.predict(X_train_scaled))
                error_train = mean_squared_error(Y_train, apply_size_limit(train_result))
                error_train_absolute = mean_squared_error(Y_train_absolute, output_untransform(train_result))

            print(f"error test: {error_test}, error train: {error_train} training_time: {time.time() - training_start:.1f}")

#             with Path(last_file).open("wb") as opened_file:
#                 print(f"saving {last_file}")
#                 pickle.dump(model, opened_file)
            with Path(info_filename).open('w') as opened_file:
                info = json.dumps({
                    "cpu_info": {key: get_cpu_info()[key] for key in ["arch", "bits", "brand_raw", "count", "l2_cache_size"]},
                    "pca_components": PCA_COMPONENTS,
                    "pca_unexplained_variance_ratio": reduce(lambda a, b: a - b, pca.explained_variance_ratio_, 1.0),
                    "tumour_lower_size_limit_l": 10 ** LOWER_LIMIT,
                    "tumour_lower_size_limit_log10_l": LOWER_LIMIT,
                    "model_params": model_params,
                    "test_dataset": "[700000:850000] of ../final/dataset.npz",
                    "test_error_orders_of_magnitude": error_test,
                    "test_error_absolute": error_test_absolute,
                    "train_dataset": "[:700000] of ../final/dataset.npz",
                    "train_error_orders_of_magnitude": error_train,
                    "train_error_absolute": error_train_absolute
                }, sort_keys=True, indent=4)
                print(f"saving info to file: {info_filename} {info}")
                opened_file.write(info)

transformed to orders of magnitude
dropped treatment column
train sizes: (700000, 10), (700000, 200)
test sizes: (150000, 10), (150000, 200)
train test split
scaled
applied pca with 13 components. Unexplained variance ratio: 2.7163198894389894e-06
loading previous ../final/MLPRegressor_600_100_40_13_-7_0.pickle
error test: 0.00011813761815683225, error train: 0.00011329371104178528 training_time: 45.5
saving info to file: ../final/MLPRegressor_600_100_40_13_-7_0.json {
    "cpu_info": {
        "arch": "X86_64",
        "bits": 64,
        "brand_raw": "Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz",
        "count": 8,
        "l2_cache_size": 262144
    },
    "model_params": {
        "alpha": 0.0040005316095293,
        "batch_size": 2000,
        "epsilon": 1e-08,
        "hidden_layer_sizes": [
            600,
            100,
            40
        ],
        "learning_rate": "constant",
        "learning_rate_init": 0.00016798744315656234,
        "max_iter": 400,
        "n_iter_

error test: 6.869550937549261e-05, error train: 6.53459505601816e-05 training_time: 113.7
saving info to file: ../final/MLPRegressor_600_100_40_11_-7_1.json {
    "cpu_info": {
        "arch": "X86_64",
        "bits": 64,
        "brand_raw": "Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz",
        "count": 8,
        "l2_cache_size": 262144
    },
    "model_params": {
        "alpha": 0.00200026580476465,
        "batch_size": 2000,
        "epsilon": 5e-09,
        "hidden_layer_sizes": [
            600,
            100,
            40
        ],
        "learning_rate": "constant",
        "learning_rate_init": 8.399372157828117e-05,
        "max_iter": 400,
        "n_iter_no_change": 5,
        "random_state": 42,
        "tol": 5e-06,
        "verbose": true,
        "warm_start": true
    },
    "pca_components": 11,
    "pca_unexplained_variance_ratio": 5.323364604440634e-06,
    "test_dataset": "[700000:850000] of ../final/dataset.npz",
    "test_error_absolute": 1.399597156131

error test: 6.25432036238347e-05, error train: 5.8473181143412716e-05 training_time: 186.7
saving info to file: ../final/MLPRegressor_600_100_40_14_-7_2.json {
    "cpu_info": {
        "arch": "X86_64",
        "bits": 64,
        "brand_raw": "Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz",
        "count": 8,
        "l2_cache_size": 262144
    },
    "model_params": {
        "alpha": 0.001000132902382325,
        "batch_size": 2000,
        "epsilon": 2.5e-09,
        "hidden_layer_sizes": [
            600,
            100,
            40
        ],
        "learning_rate": "constant",
        "learning_rate_init": 4.1996860789140585e-05,
        "max_iter": 400,
        "n_iter_no_change": 5,
        "random_state": 42,
        "tol": 2.5e-06,
        "verbose": true,
        "warm_start": true
    },
    "pca_components": 14,
    "pca_unexplained_variance_ratio": 2.035651196169074e-06,
    "test_dataset": "[700000:850000] of ../final/dataset.npz",
    "test_error_absolute": 1.40740

error test: 5.279655548695672e-05, error train: 4.858001369342426e-05 training_time: 202.0
saving info to file: ../final/MLPRegressor_600_100_40_10_-7_3.json {
    "cpu_info": {
        "arch": "X86_64",
        "bits": 64,
        "brand_raw": "Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz",
        "count": 8,
        "l2_cache_size": 262144
    },
    "model_params": {
        "alpha": 0.0005000664511911625,
        "batch_size": 2000,
        "epsilon": 1.25e-09,
        "hidden_layer_sizes": [
            600,
            100,
            40
        ],
        "learning_rate": "constant",
        "learning_rate_init": 2.0998430394570292e-05,
        "max_iter": 400,
        "n_iter_no_change": 5,
        "random_state": 42,
        "tol": 1.25e-06,
        "verbose": true,
        "warm_start": true
    },
    "pca_components": 10,
    "pca_unexplained_variance_ratio": 7.759383996007976e-06,
    "test_dataset": "[700000:850000] of ../final/dataset.npz",
    "test_error_absolute": 1.30

error test: 4.8976780120705936e-05, error train: 4.500311607361084e-05 training_time: 294.6
saving info to file: ../final/MLPRegressor_600_100_40_16_-7_4.json {
    "cpu_info": {
        "arch": "X86_64",
        "bits": 64,
        "brand_raw": "Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz",
        "count": 8,
        "l2_cache_size": 262144
    },
    "model_params": {
        "alpha": 0.00025003322559558126,
        "batch_size": 2000,
        "epsilon": 6.25e-10,
        "hidden_layer_sizes": [
            600,
            100,
            40
        ],
        "learning_rate": "constant",
        "learning_rate_init": 1.0499215197285146e-05,
        "max_iter": 400,
        "n_iter_no_change": 5,
        "random_state": 42,
        "tol": 6.25e-07,
        "verbose": true,
        "warm_start": true
    },
    "pca_components": 16,
    "pca_unexplained_variance_ratio": 1.2306372410059777e-06,
    "test_dataset": "[700000:850000] of ../final/dataset.npz",
    "test_error_absolute": 1

In [4]:
from IPython.display import HTML, display

# Render one HTML table of the stored metrics: for each metric, one row per PCA
# component count and one column per refinement iteration.
import json  # also imported by the training cell; repeated so this cell does not rely on it

iterations = 5
LOWER_LIMIT = -7
pca_component_options = [10, 11, 12, 13, 14, 16]
metric_labels = ["test_error_orders_of_magnitude", "train_error_orders_of_magnitude", "test_error_absolute", "train_error_absolute"]

# Read every summary file once, closing each handle promptly, instead of
# re-opening it for every metric.
# NOTE: `hidden_layer_sizes` is defined by the training cell above.
run_infos = {}
for PCA_COMPONENTS in pca_component_options:
    for k in range(iterations):
        info_filename = f"../final/MLPRegressor_{'_'.join(str(i) for i in hidden_layer_sizes)}_{PCA_COMPONENTS}_{LOWER_LIMIT}_{k}.json"
        with open(info_filename) as info_file:
            run_infos[PCA_COMPONENTS, k] = json.load(info_file)

# Renders as "1e-07" for -7 and stays well-formed for two-digit exponents.
limit_text = f"{10.0 ** LOWER_LIMIT:.0e}"
iteration_headers = ''.join(f'<th>iteration{i}</th>' for i in range(iterations))

html_parts = ["<table>"]
for label in metric_labels:
    html_parts.append(f"<tr><th colspan='{iterations+2}'>{label}</th></tr><tr><th>LIMIT</th><th>PCA</th>{iteration_headers}</tr>")
    for PCA_COMPONENTS in pca_component_options:
        html_parts.append(f"<tr><td>{limit_text}</td><td>{PCA_COMPONENTS}</td>")
        for k in range(iterations):
            html_parts.append(f"<td>{run_infos[PCA_COMPONENTS, k][label]}</td>")
        html_parts.append("</tr>")
html_parts.append("</table>")
html = "".join(html_parts)
display(HTML(html))

test_error_orders_of_magnitude,test_error_orders_of_magnitude,test_error_orders_of_magnitude,test_error_orders_of_magnitude,test_error_orders_of_magnitude,test_error_orders_of_magnitude,test_error_orders_of_magnitude
LIMIT,PCA,iteration0,iteration1,iteration2,iteration3,iteration4
1e-07,10,6.872341093644454e-05,6.817219272590224e-05,5.855298348233891e-05,5.279655548695672e-05,5.05472751793201e-05
1e-07,11,7.663916193435797e-05,6.869550937549261e-05,5.6794118611448956e-05,5.017934667577738e-05,4.766670040913866e-05
1e-07,12,6.564965133072742e-05,6.182032128073571e-05,6.39486364816562e-05,5.585245307461e-05,5.08831237005242e-05
1e-07,13,0.00011813761815683225,6.904388293575632e-05,6.828909641597812e-05,6.146148122620193e-05,5.642585465093235e-05
1e-07,14,7.389150773852237e-05,8.348283364959819e-05,6.25432036238347e-05,5.7537412093511924e-05,5.4798156944876406e-05
1e-07,16,6.150291180409598e-05,5.808409724737274e-05,5.655322770004823e-05,5.380426177647104e-05,4.8976780120705936e-05
train_error_orders_of_magnitude,train_error_orders_of_magnitude,train_error_orders_of_magnitude,train_error_orders_of_magnitude,train_error_orders_of_magnitude,train_error_orders_of_magnitude,train_error_orders_of_magnitude
LIMIT,PCA,iteration0,iteration1,iteration2,iteration3,iteration4
1e-07,10,6.314488028094433e-05,6.353843709417273e-05,5.406155010518911e-05,4.858001369342426e-05,4.639399386297132e-05
1e-07,11,7.343942184578304e-05,6.53459505601816e-05,5.3626920337633704e-05,4.738456739098163e-05,4.510225961217227e-05
