# h2o automl training pipeline (binomial) with stacked ensembles

this notebook trains models to predict `outcome_lead` using all variables except:
- `icu_stay_id`
- `time_window_index`

for each dataset (`oneicu`, `eicu`, `mimiciv`), we:
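1. load the previously created train csv (`ml_<dataset>_train.csv`) from `../data/machine_learning` and split it by patient into base / blending / leaderboard sets (the test csv is not read here)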
2. train h2o automl (stacked ensembles **included**, nothing excluded)
3. save the **best overall model** and the **best glm** into `../output/models/<dataset>/`
4. write a manifest for later evaluation


In [1]:
from pathlib import Path
import json
import logging

import numpy as np
import pandas as pd

import h2o
from h2o.automl import H2OAutoML
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

import gc

# paths (relative to the notebook's working directory)
data_dir = Path("../data/machine_learning")  # input: ml_<dataset>_train.csv files
models_root = Path("../output/models")  # output: one sub-directory per dataset
models_root.mkdir(parents=True, exist_ok=True)

# datasets to train
datasets = ["oneicu", "eicu", "mimiciv"]

# columns
id_col = "icu_stay_id"  # used for the patient-level split, never as a feature
time_index_col = "time_window_index"  # excluded from the features
target = "outcome_lead"

# automl settings (adjust as you like)
aml_seed = 813
aml_max_runtime_secs = 3 * 60 * 60  # 3 hours, applied to each dataset's automl run
aml_sort_metric = "AUC"

# logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

# start h2o
h2o.init()
h2o.no_progress()  # hide per-iteration progress; comment this line to see progress

Checking whether there is an H2O instance running at http://localhost:54321. connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,1 day 1 hour 20 mins
H2O_cluster_timezone:,Asia/Tokyo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,5 months and 8 days
H2O_cluster_name:,H2O_from_python_kinoshitatakashihiroshi_xo9ujw
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,5.248 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [2]:
def import_train_pd(dataset_name):
    """
    load the training csv of one dataset as a pandas dataframe.

    looks for `ml_<dataset_name>_train.csv` inside `data_dir` and raises
    FileNotFoundError when that file is absent.
    """
    csv_path = data_dir / f"ml_{dataset_name}_train.csv"
    if csv_path.exists():
        # low_memory=False: parse the whole file at once instead of chunk-wise type inference
        return pd.read_csv(csv_path, low_memory=False)
    raise FileNotFoundError(f"missing train file for {dataset_name} under {data_dir}")


def split_patient_level_pd(df, id_col, seed=813, blend_frac=0.10, lb_frac=0.10):
    """
    partition the rows of `df` into (base, blend, leaderboard) frames that share no id.

    the unique values of `id_col` are shuffled with a generator seeded by `seed`;
    the first int(n * blend_frac) ids form the blend set, the next int(n * lb_frac)
    ids the leaderboard set, and the remaining ids the base set.

    returns the three frames in the order base, blend, leaderboard, each with its
    index reset to 0..k-1.
    """
    shuffled_ids = df[id_col].unique()
    np.random.default_rng(seed).shuffle(shuffled_ids)  # shuffles in place

    total = len(shuffled_ids)
    blend_end = int(total * blend_frac)
    lb_end = blend_end + int(total * lb_frac)

    def rows_for(id_chunk):
        # keep every row whose id belongs to this chunk of the shuffled ids
        return df.loc[df[id_col].isin(set(id_chunk)), :].reset_index(drop=True)

    blend_pd = rows_for(shuffled_ids[:blend_end])
    lb_pd = rows_for(shuffled_ids[blend_end:lb_end])
    base_pd = rows_for(shuffled_ids[lb_end:])

    return base_pd, blend_pd, lb_pd


def to_h2o_binomial_numeric(df_pd, target, features):
    """
    upload a pandas dataframe as an H2OFrame with numeric features and a factor target.

    every column named in `features` is cast with `asnumeric()`, and the `target`
    column is cast with `asfactor()` so it is handled as a categorical (binomial)
    response. returns the resulting H2OFrame.
    """
    frame = h2o.H2OFrame(df_pd)
    for col in features:
        frame[col] = frame[col].asnumeric()
    frame[target] = frame[target].asfactor()
    return frame


def feature_columns(hf, target, drop_cols):
    """
    return the feature column names of `hf`: every column except `target` and `drop_cols`.

    parameters
    - hf: any object exposing a `columns` sequence of names
    - target: name of the response column to exclude
    - drop_cols: names to exclude; an iterable of names, a single name, or None.
      names that do not exist in `hf` are ignored.

    the order of `hf.columns` is preserved in the returned list.
    """
    if drop_cols is None:
        drop_cols = ()
    elif isinstance(drop_cols, str):
        # a bare string would otherwise be matched by substring ("id" in "icu_stay_id")
        drop_cols = (drop_cols,)
    # build the exclusion set once instead of scanning drop_cols for every column
    excluded = {target, *drop_cols}
    return [c for c in hf.columns if c not in excluded]


def best_glm_from_automl(aml):
    """
    fetch the best glm trained during automl.

    the glm rows of the leaderboard are ranked by auc (higher is better). when the
    leaderboard has no `auc` column the ranking falls back to aucpr (higher is
    better) and then to logloss (lower is better). if none of these columns exist,
    the leaderboard's own order is kept.

    parameters
    - aml: a trained automl object whose `leaderboard.as_data_frame()` has a `model_id` column

    returns the model obtained from `h2o.get_model` for the top-ranked glm id.
    raises RuntimeError if the leaderboard contains no glm model.
    """
    lb_df = aml.leaderboard.as_data_frame()
    glm_rows = lb_df[lb_df["model_id"].str.contains("GLM_", na=False)]

    if glm_rows.empty:
        raise RuntimeError("automl did not produce a glm model. consider increasing max_runtime_secs or nfolds.")

    # (metric, ascending) in order of preference; the first one present is used
    for metric, ascending in (("auc", False), ("aucpr", False), ("logloss", True)):
        if metric in glm_rows.columns:
            # mergesort is stable, so tied rows keep their leaderboard order
            glm_rows = glm_rows.sort_values(metric, ascending=ascending, kind="mergesort")
            break

    best_glm_id = glm_rows.iloc[0]["model_id"]
    return h2o.get_model(best_glm_id)


def save_models(dataset_name, leader, glm_model, x):
    """
    persist both models of one dataset and describe them in a manifest.

    the leader and the glm are written with `h2o.save_model` (force=True) into
    `models_root/<dataset_name>/`. a `manifest.json` is written next to them,
    recording the saved paths, the feature list `x`, the target, the dropped
    columns and the automl settings.

    returns (leader_path, glm_path) as reported by `h2o.save_model`.
    """
    dataset_dir = models_root / dataset_name
    dataset_dir.mkdir(parents=True, exist_ok=True)
    save_dir = dataset_dir.as_posix()

    leader_path = h2o.save_model(model=leader, path=save_dir, force=True)
    glm_path = h2o.save_model(model=glm_model, path=save_dir, force=True)

    automl_settings = {
        "seed": aml_seed,
        "nfolds": 0,  # hard-coded here; matches the nfolds=0 passed to H2OAutoML in the training loop
        "sort_metric": aml_sort_metric,
    }
    manifest = {
        "dataset": dataset_name,
        "leader_model_path": leader_path,
        "glm_model_path": glm_path,
        "features": x,
        "target": target,
        "drop_columns": [id_col, time_index_col],
        "automl_settings": automl_settings,
    }

    manifest_file = dataset_dir / "manifest.json"
    with manifest_file.open("w", encoding="utf-8") as fh:
        json.dump(manifest, fh, indent=2)

    return leader_path, glm_path


In [3]:
# per-dataset summary (saved model paths and feature count), filled by the loop below
results = {}

# train, report and save one automl run per dataset
for name in datasets:
    logging.info("=== training dataset: %s ===", name)
    train_pd = import_train_pd(name)

    # features = every column except the target and the id / time-index columns
    x = [c for c in train_pd.columns if c not in (target, id_col, time_index_col)]
    if not x:
        raise ValueError(f"no feature columns found for {name}.")

    # disjoint patient-level splits for base, blending, leaderboard
    # (10% of the ids each for blending and leaderboard, the remainder for base training)
    base_pd, blend_pd, lb_pd = split_patient_level_pd(
        train_pd, id_col=id_col, seed=aml_seed, blend_frac=0.10, lb_frac=0.10
    )

    # upload each split as an H2OFrame (numeric features, factor target)
    base_train = to_h2o_binomial_numeric(base_pd, target, x)
    blend      = to_h2o_binomial_numeric(blend_pd, target, x)
    lb_frame   = to_h2o_binomial_numeric(lb_pd, target, x)

    # the pandas copies are not used again in this iteration; release them before training
    del train_pd, base_pd, blend_pd, lb_pd
    gc.collect()

    # NOTE(review): nfolds=0 disables cross-validation; the stacked ensembles are then
    # presumably fitted on blending_frame (holdout stacking) -- confirm against the h2o docs.
    # max_models / max_runtime_secs_per_model are set only here and are not written to the manifest.
    aml = H2OAutoML(
        seed=aml_seed,
        nfolds=0,
        max_models=20,
        max_runtime_secs=aml_max_runtime_secs,
        max_runtime_secs_per_model=1800,
        sort_metric=aml_sort_metric,
    )
    # models are ranked on lb_frame, which shares no patient id with the training or blending data
    aml.train(
        x=x,
        y=target,
        training_frame=base_train,
        blending_frame=blend,
        leaderboard_frame=lb_frame,
    )

    # print full leaderboard for this dataset
    lb = aml.leaderboard
    lb_df = lb.as_data_frame(use_pandas=True, use_multi_thread=True)
    print(f"\n=== leaderboard for {name} (rows={lb.nrows}) ===")
    print(lb_df.to_string(index=True))

    # leader and best glm
    leader = aml.leader
    glm_model = best_glm_from_automl(aml)  # raises RuntimeError if automl produced no glm

    # save models + manifest
    leader_path, glm_path = save_models(name, leader, glm_model, x)

    # record summary
    results[name] = {
        "leader_model_path": leader_path,
        "glm_model_path": glm_path,
        "n_features": len(x),
    }

    # clear the h2o cluster and drop the python references before the next dataset;
    # this runs only after the models have been saved to disk above
    h2o.remove_all()
    del aml, leader, glm_model, base_train, blend, lb_frame, lb, lb_df
    gc.collect()

2025-09-04 22:57:06,415 INFO === training dataset: oneicu ===



22:57:45.12: AutoML: XGBoost is not available; skipping it.


=== leaderboard for oneicu (rows=22) ===
                                                   model_id       auc   logloss     aucpr  mean_per_class_error      rmse       mse
0      StackedEnsemble_AllModels_1_AutoML_1_20250904_225743  0.976367  0.129648  0.902373              0.104635  0.194257  0.037736
1   StackedEnsemble_BestOfFamily_1_AutoML_1_20250904_225743  0.976210  0.130020  0.901833              0.100431  0.194601  0.037870
2                            GBM_3_AutoML_1_20250904_225743  0.975939  0.130802  0.900988              0.104147  0.194995  0.038023
3                            GBM_1_AutoML_1_20250904_225743  0.975919  0.130892  0.900698              0.100128  0.195254  0.038124
4               GBM_grid_1_AutoML_1_20250904_225743_model_5  0.975917  0.130698  0.901287              0.102391  0.194842  0.037963
5                            GBM_2_AutoML_1_20250904_225743  0.975907  0.130712  0.901166              0



2025-09-04 23:14:00,817 INFO === training dataset: eicu ===



23:14:12.389: AutoML: XGBoost is not available; skipping it.


=== leaderboard for eicu (rows=22) ===
                                                   model_id       auc   logloss     aucpr  mean_per_class_error      rmse       mse
0      StackedEnsemble_AllModels_1_AutoML_2_20250904_231411  0.946315  0.168314  0.763660              0.164269  0.221667  0.049136
1   StackedEnsemble_BestOfFamily_1_AutoML_2_20250904_231411  0.946045  0.168570  0.762705              0.165539  0.221861  0.049222
2                            GBM_3_AutoML_2_20250904_231411  0.946013  0.169021  0.761668              0.165520  0.222137  0.049345
3                            GBM_2_AutoML_2_20250904_231411  0.945903  0.169034  0.761859              0.163105  0.222045  0.049304
4               GBM_grid_1_AutoML_2_20250904_231411_model_5  0.945720  0.168990  0.762445              0.154708  0.221924  0.049250
5                            GBM_1_AutoML_2_20250904_231411  0.945712  0.169520  0.759074              0.



2025-09-04 23:21:52,323 INFO === training dataset: mimiciv ===



23:22:00.439: AutoML: XGBoost is not available; skipping it.


=== leaderboard for mimiciv (rows=22) ===
                                                   model_id       auc   logloss     aucpr  mean_per_class_error      rmse       mse
0      StackedEnsemble_AllModels_1_AutoML_3_20250904_232200  0.828924  0.258614  0.410010              0.280202  0.273937  0.075042
1   StackedEnsemble_BestOfFamily_1_AutoML_3_20250904_232200  0.828051  0.259190  0.409895              0.275477  0.274164  0.075166
2               GBM_grid_1_AutoML_3_20250904_232200_model_5  0.827274  0.259696  0.405491              0.282342  0.274495  0.075348
3                            GBM_2_AutoML_3_20250904_232200  0.825777  0.260702  0.398696              0.278549  0.275217  0.075744
4                            GLM_1_AutoML_3_20250904_232200  0.824442  0.269470  0.397602              0.276640  0.277073  0.076769
5      DeepLearning_grid_2_AutoML_3_20250904_232200_model_2  0.823670  0.265014  0.391196             





In [4]:
# show the per-dataset summary collected by the training loop (saved model paths, feature count)
results

{'oneicu': {'leader_model_path': '/Users/kinoshitatakashihiroshi/Dropbox/VS_Code/research-database-description-2024/output/models/oneicu/StackedEnsemble_AllModels_1_AutoML_1_20250904_225743',
  'glm_model_path': '/Users/kinoshitatakashihiroshi/Dropbox/VS_Code/research-database-description-2024/output/models/oneicu/GLM_1_AutoML_1_20250904_225743',
  'n_features': 70},
 'eicu': {'leader_model_path': '/Users/kinoshitatakashihiroshi/Dropbox/VS_Code/research-database-description-2024/output/models/eicu/StackedEnsemble_AllModels_1_AutoML_2_20250904_231411',
  'glm_model_path': '/Users/kinoshitatakashihiroshi/Dropbox/VS_Code/research-database-description-2024/output/models/eicu/GLM_1_AutoML_2_20250904_231411',
  'n_features': 22},
 'mimiciv': {'leader_model_path': '/Users/kinoshitatakashihiroshi/Dropbox/VS_Code/research-database-description-2024/output/models/mimiciv/StackedEnsemble_AllModels_1_AutoML_3_20250904_232200',
  'glm_model_path': '/Users/kinoshitatakashihiroshi/Dropbox/VS_Code/rese

In [5]:
# best-effort shutdown of the h2o cluster used by this notebook
try:
    h2o.cluster().shutdown(prompt=False)
except Exception as exc:
    # a failed shutdown must not break the notebook, but report it instead of hiding it
    logging.warning("h2o shutdown skipped: %s", exc)

H2O session _sid_b19b closed.
