Date: 19.05.2023 \
Author: Reto Hendry

This script runs the AutoML tool (H2O AutoML) with the best parameters and saves a model from which the feature importance can be extracted.

The best parameters are a resample cube of 3 and 70 features. This can be seen in the notebook "metric_analysis_autoML_tools.ipynb".

In [24]:
import numpy as np
import pandas as pd
import h2o
import os
import sys
import datetime

from h2o.automl import H2OAutoML
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.base import BaseEstimator

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
from functions.function_get_label_df import get_label_df
from functions.function_get_component_array import get_component_array
from functions.function_get_best_features import get_best_features_sorted
from functions.function_resample_4d_array import resample_4d_array

## Save the best model which supports feature importance extraction

In [25]:
# --- parameters to define -------------------------------------------------
component = [1]            # component(s) handed to get_component_array
resample_cube = 3          # resampling setting handed to resample_4d_array
number_of_features = 70    # how many of the top-ranked features are kept

##############################################

# both loaders read from the same data directory
data_dir = "../../data/"

# labels (the "Cond" column is used as the target later in the notebook)
df_label = get_label_df(data_path_optional=data_dir)

# MVPA data: one 4D array per requested component, stacked into a 5D array
component_array_5d = get_component_array(component, data_path_optional=data_dir)
print(f"shape of component_array_5d: {component_array_5d.shape}")

# only the first (and here only) component is resampled
first_component_4d = component_array_5d[0]
sample_array_4d = resample_4d_array(first_component_4d, resample_cube)
print(f"shape of resampled sample_array_4d: {sample_array_4d.shape}")

# ranked feature list, best first; the head of the list is the selection
# NOTE(review): the ranking is computed on ALL samples and labels, i.e. before
# the train/test split performed later in the notebook - check whether this
# leaks test information into the feature selection.
best_feature_list = get_best_features_sorted(sample_array_4d, df_label)
best_features = best_feature_list[:number_of_features]

  df_label = (pd.read_excel(os.path.join(data_path, label_file),


shape of component_array_5d: (1, 90, 91, 109, 91)
Resampling in progress...


100%|██████████| 90/90 [00:00<00:00, 109.76it/s]


shape of resampled sample_array_4d: (90, 31, 37, 31)
Calculating list of best features ...


100%|██████████| 35556/35556 [00:34<00:00, 1044.55it/s]


In [26]:
# name of the target column in df_label
y_label = "Cond"

# flatten each sample to one row, then keep only the columns whose positions
# are listed in best_features
n_samples = sample_array_4d.shape[0]
flat_samples = sample_array_4d.reshape(n_samples, -1)
sample_df = pd.DataFrame(flat_samples).iloc[:, best_features]

# stratified 80/20 hold-out split with a fixed seed
labels = df_label[y_label]
x_train, x_test, y_train, y_test = train_test_split(
    sample_df,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# H2O AutoML needs string column names (the h2o dataframes convert all
# types to strings), so the integer column labels are converted up front
for feature_frame in (x_train, x_test):
    feature_frame.columns = feature_frame.columns.astype(str)

# start the h2o server, or connect to one already running at this address
h2o.init(
    ip="localhost",
    port=54323,
    nthreads=-1,
    min_mem_size=64,  # 64 GB
    max_mem_size=160,  # 160 GB
)

# features and target are combined column-wise before the upload to H2O
x_train_h2o = h2o.H2OFrame(pd.concat([x_train, y_train], axis=1))
x_test_h2o = h2o.H2OFrame(pd.concat([x_test, y_test], axis=1))

x_features = x_train.columns.tolist()

# mark the target as categorical in both frames
x_train_h2o[y_label] = x_train_h2o[y_label].asfactor()
x_test_h2o[y_label] = x_test_h2o[y_label].asfactor()

Checking whether there is an H2O instance running at http://localhost:54323. connected.


0,1
H2O_cluster_uptime:,21 mins 01 secs
H2O_cluster_timezone:,Europe/Zurich
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.1
H2O_cluster_version_age:,3 months and 10 days
H2O_cluster_name:,H2O_from_python_tahendry_a3aarn
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,160.0 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [27]:
# define the H2O AutoML run
h2o_model = H2OAutoML(
    max_runtime_secs=7200,  # overall time budget: 2 hours
    max_models=100,
    nfolds=9,  # 9-fold cross-validation on the training frame
    balance_classes=True,  # oversample minority classes (this is NOT stratified sampling)
    seed=1,  # reproducibility
    stopping_metric="auc",
    stopping_rounds=10,  # stop training if the score doesn't improve for 10 rounds
    verbosity="info",
    # NOTE(review): presumably excluded because H2O Deep Learning is not
    # reproducible by default - confirm this is the intent.
    exclude_algos=["DeepLearning"],
)

# the tool trained here is H2O AutoML, so the log line names it correctly
print(f"Fitting H2O AutoML model on {len(best_features)} features on component {component}...")
h2o_model.train(x=x_features, y=y_label, training_frame=x_train_h2o)

# top model of the AutoML leaderboard
best_model = h2o_model.leader

Fitting tpot model on 70 features on component [1]...
AutoML progress: |
16:41:58.477: Project: AutoML_1_20230519_164158
16:41:58.486: Setting stopping tolerance adaptively based on the training frame: 0.05
16:41:58.487: Build control seed: 1
16:41:58.487: training frame: Frame key: AutoML_1_20230519_164158_training_py_10_sid_966a    cols: 71    rows: 72  chunks: 1    size: 45711  checksum: 2728487036948607530
16:41:58.487: validation frame: NULL
16:41:58.487: leaderboard frame: NULL
16:41:58.487: blending frame: NULL
16:41:58.487: response column: Cond
16:41:58.487: fold column: null
16:41:58.487: weights column: null
16:41:58.509: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (7g, 30w)]}, {GLM : [def_1 (1g, 10w)]}, {DRF : [def_1 (2g, 10w), XRT (3g, 10w)]}, {GBM : [def_5 (1g, 10w), def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w), def_1 (3g, 10w), grid_1 (4g, 60w), lr_annealing (7g, 10w)]}, {DeepLearning : [def_1 

In [28]:
# cross-validation summary of the best model; the metric names sit in a
# column whose header is the empty string
cv_metrics = best_model.cross_validation_metrics_summary().as_data_frame()
accuracy_row = cv_metrics.loc[cv_metrics[""] == "accuracy"]
cv_mean_accuracy = accuracy_row["mean"].values[0]
cv_std = accuracy_row["sd"].values[0]

### calculate test metrics
# predicted class labels for the held-out test frame
test_pred = best_model.predict(x_test_h2o).as_data_frame()["predict"]
test_accuracy = accuracy_score(y_test, test_pred)
test_f1_score = f1_score(y_test, test_pred, average="weighted")
test_precision_score = precision_score(y_test, test_pred, average="weighted")

# print all the results, one "name: value" line each
results = {
    "cv_mean_accuracy": cv_mean_accuracy,
    "cv_std": cv_std,
    "test_accuracy": test_accuracy,
    "test_f1_score": test_f1_score,
    "test_precision_score": test_precision_score,
}
for metric_name, metric_value in results.items():
    print(f"{metric_name}: {metric_value}")

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
cv_mean_accuracy: 0.89483327
cv_std: 0.095870316
test_accuracy: 0.9444444444444444
test_f1_score: 0.9442724458204333
test_precision_score: 0.9500000000000001


In [29]:
# full leaderboard of the AutoML run, including the optional extra columns
# (training time, prediction time per row, algorithm name)
lb = h2o.automl.get_leaderboard(
    h2o_model,
    extra_columns="ALL",
)
lb

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse,training_time_ms,predict_time_per_row_ms,algo
StackedEnsemble_BestOfFamily_1_AutoML_1_20230519_164158,0.892664,0.418394,0.885922,0.165251,0.367076,0.134745,6722,0.205516,StackedEnsemble
GLM_1_AutoML_1_20230519_164158,0.890347,0.41428,0.897775,0.205792,0.374644,0.140358,80,0.223664,GLM
StackedEnsemble_AllModels_1_AutoML_1_20230519_164158,0.881853,0.428248,0.869833,0.218533,0.375379,0.140909,7093,0.58007,StackedEnsemble
GBM_grid_1_AutoML_1_20230519_164158_model_31,0.866409,0.46841,0.849624,0.204247,0.390463,0.152462,250,0.140345,GBM
XGBoost_lr_search_selection_AutoML_1_20230519_164158_select_grid_model_2,0.864093,0.475633,0.8377,0.181853,0.392344,0.153934,464,0.127187,XGBoost
GBM_grid_1_AutoML_1_20230519_164158_model_4,0.856371,0.471172,0.836557,0.205019,0.392596,0.154132,220,0.135254,GBM
GBM_grid_1_AutoML_1_20230519_164158_model_26,0.854054,0.505611,0.854513,0.205792,0.403218,0.162585,302,0.11289,GBM
GBM_grid_1_AutoML_1_20230519_164158_model_18,0.850193,0.541393,0.804062,0.192278,0.4121,0.169826,497,0.155068,GBM
GBM_grid_1_AutoML_1_20230519_164158_model_25,0.844015,0.484889,0.842154,0.209653,0.399824,0.159859,305,0.138177,GBM
GBM_grid_1_AutoML_1_20230519_164158_model_10,0.840154,0.53479,0.827824,0.206564,0.426978,0.18231,274,0.141963,GBM


In [31]:
# Save the highest-ranked model that supports feature importance.
# Stacked ensembles are skipped (they are assumed not to expose variable
# importance - which is why the leader itself is not saved). The model is
# looked up by its id prefix instead of a fixed leaderboard row, so the cell
# still picks the right model when several ensembles rank first.
leaderboard_df = lb.as_data_frame()
is_ensemble = leaderboard_df["model_id"].str.startswith("StackedEnsemble")
if is_ensemble.all():
    raise ValueError(
        "the leaderboard contains no model that supports feature importance"
    )
model_for_fi = h2o.get_model(leaderboard_df.loc[~is_ensemble, "model_id"].iloc[0])

# a timestamp gives every saved model its own directory
# (named model_timestamp rather than `id`, which would shadow the builtin)
model_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
model_dir = f"../param_sweep_best_models/h2o_best_models/h2o_model_{model_timestamp}"

# h2o.save_model returns the path of the written model file; it is kept so
# the model can be reloaded later with h2o.load_model
saved_model_path = h2o.save_model(
    model=model_for_fi,
    path=model_dir,
    force=True,  # overwrite existing model
)

print(f"saved model to {model_dir}")

saved model to ../param_sweep_best_models/h2o_best_models/h2o_model_20230519_165118
