# ROC evaluation for H2O AutoML leaders and GLMs (OneICU / MIMIC‑IV / eICU)

This notebook:
- loads `leader` (best) and `glm` models saved by the H2O AutoML training pipeline for each dataset
- imports the corresponding **test** split from `../data/machine_learning`
- computes **AUROC** for both models (GLM and leader) on each dataset
- draws two ROC figures with **seaborn** (white background, **no grid lines**):
  1) GLM models compared across datasets
  2) Best (leader) models compared across datasets
- saves figures as half‑column width (3.35 × 3.35 in) **TIFF** files at **600 dpi** under `../output/machine_learning`

Datasets and display settings:
- Datasets: `oneicu`, `mimiciv`, `eicu`
- Display labels: **OneICU**, **MIMIC‑IV**, **eICU**
- Colors: `oneicu=#6CC5B0FF`, `mimiciv=#4269D0FF`, `eicu=#FF8AB7FF`

Requirements: `h2o` (plus a Java runtime for the local H2O server), `seaborn`, `matplotlib`, `numpy`, `pandas`, `scikit-learn`

In [1]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc
import h2o

import logging

# --- locations (relative to the notebook's working directory) ---------------
data_dir = Path("..") / "data" / "machine_learning"      # test splits (CSV)
models_root = Path("..") / "output" / "models"           # one sub-folder per dataset
figs_dir = Path("..") / "output" / "machine_learning"    # figure destination
figs_dir.mkdir(parents=True, exist_ok=True)

# --- datasets and how they are displayed ------------------------------------
datasets = ["oneicu", "mimiciv", "eicu"]
labels = dict(oneicu="OneICU", mimiciv="MIMIC-IV", eicu="eICU")
colors = dict(oneicu="#6CC5B0FF", mimiciv="#4269D0FF", eicu="#FF8AB7FF")

# name of the binary outcome column in every test split
target = "outcome_lead"

# --- figure geometry (inches) and export resolution -------------------------
fig_width_in = fig_height_in = 3.35
fig_dpi = 600

# --- plot theme: white background, explicit font sizes, no grid -------------
_rc_overrides = {
    "font.size": 10,        # base font size
    "axes.titlesize": 12,   # axes title
    "axes.labelsize": 10,   # x/y labels
    "legend.fontsize": 8,   # legend
    "xtick.labelsize": 8,   # ticks
    "ytick.labelsize": 8,
}
sns.set_theme(style="white", context="notebook", rc=_rc_overrides)
plt.rcParams.update({"axes.grid": False})

# --- H2O: connect to (or start) a local cluster, silence progress bars ------
h2o.init()
h2o.no_progress()

# --- logging: timestamped INFO-level messages -------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "21.0.3" 2024-04-16; OpenJDK Runtime Environment Homebrew (build 21.0.3); OpenJDK 64-Bit Server VM Homebrew (build 21.0.3, mixed mode, sharing)
  Starting server from /Users/kinoshitatakashihiroshi/Dropbox/VS_Code/OneICU_profile_paper/.venv/lib/python3.12/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/c8/r1g7pwls4d504kywff1wyfl80000gn/T/tmpk9xttavn
  JVM stdout: /var/folders/c8/r1g7pwls4d504kywff1wyfl80000gn/T/tmpk9xttavn/h2o_kinoshitatakashihiroshi_started_from_python.out
  JVM stderr: /var/folders/c8/r1g7pwls4d504kywff1wyfl80000gn/T/tmpk9xttavn/h2o_kinoshitatakashihiroshi_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest

0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Asia/Tokyo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,5 months and 8 days
H2O_cluster_name:,H2O_from_python_kinoshitatakashihiroshi_bgj2eu
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,5.984 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [2]:
def import_test(name: str, feats: list[str]):
    """Import the test split of one dataset as an H2O frame ready for scoring.

    Reads ``ml_{name}_test.csv`` from ``data_dir``, keeps only the training
    features plus the target column, casts every feature to numeric and the
    target to a factor (so the task is treated as binomial).

    Args:
        name: dataset key used in the file name (e.g. ``"oneicu"``).
        feats: feature column names the models were trained on.

    Returns:
        The frame returned by ``h2o.import_file``, restricted to
        ``feats + [target]`` with the casts described above applied.

    Raises:
        FileNotFoundError: if the test CSV does not exist.
        ValueError: if the CSV lacks any column listed in ``feats``
            (raised by ``ensure_columns``).
    """
    test_path = data_dir / f"ml_{name}_test.csv"
    if not test_path.exists():
        raise FileNotFoundError(f"missing test file for {name}: {test_path}")
    test = h2o.import_file(test_path.as_posix())

    # Materialize once so a one-shot iterable is not exhausted by validation.
    feature_cols = list(feats)

    # Validate and select through the shared helper so that a schema mismatch
    # surfaces as a readable error naming the missing columns, instead of a
    # failure inside the frame slice.
    test = ensure_columns(test, feature_cols)

    for c in feature_cols:
        test[c] = test[c].asnumeric()
    test[target] = test[target].asfactor()

    return test


def load_manifest(name: str) -> dict:
    """Read the training manifest (``manifest.json``) of one dataset.

    The file is looked up at ``models_root / name / "manifest.json"`` and
    decoded as UTF-8 JSON.

    Args:
        name: dataset key, used as the sub-folder name under ``models_root``.

    Returns:
        The parsed JSON content of the manifest.

    Raises:
        FileNotFoundError: if the manifest file does not exist.
    """
    manifest_file = models_root / name / "manifest.json"
    if manifest_file.exists():
        return json.loads(manifest_file.read_text(encoding="utf-8"))
    raise FileNotFoundError(f"missing manifest for {name}: {manifest_file}")


def load_models_and_features(name: str):
    """Load the leader model, the GLM and the feature list of one dataset.

    Model locations and the feature list are read from the dataset's training
    manifest (see ``load_manifest``). The manifest is validated before any
    model is loaded, so an incomplete manifest fails fast with a clear message.

    Args:
        name: dataset key (sub-folder name under ``models_root``).

    Returns:
        ``(leader_model, glm_model, features)`` where ``features`` is a list
        of column names.

    Raises:
        FileNotFoundError: if the manifest is missing (from ``load_manifest``).
        KeyError: if the manifest lacks ``leader_model_path`` or
            ``glm_model_path``.
        ValueError: if the manifest has no non-empty ``features`` list.
    """
    man = load_manifest(name)

    absent = [k for k in ("leader_model_path", "glm_model_path") if k not in man]
    if absent:
        raise KeyError(f"manifest for {name} is missing required keys: {absent}")

    # An empty feature list would reduce the test frame to the target column
    # only; reject it here instead of letting scoring fail later.
    feats = man.get("features")
    if not isinstance(feats, (list, tuple)) or not feats:
        raise ValueError(
            f"manifest for {name} has no usable 'features' list; "
            f"cannot align the test frame with the trained models"
        )

    leader = h2o.load_model(man["leader_model_path"])
    glm_model = h2o.load_model(man["glm_model_path"])
    return leader, glm_model, list(feats)


def ensure_columns(test, feats):
    """Restrict a test frame to the training features plus the target column.

    Both the features and the target are checked for presence before the
    frame is sliced, so a schema mismatch is reported by column name.

    Args:
        test: frame exposing ``.columns`` and ``frame[:, names]`` selection.
        feats: feature column names used in training.

    Returns:
        ``test[:, feats + [target]]`` (features in the given order, target last).

    Raises:
        ValueError: if any feature column, or the target column, is absent.
            At most the first five missing feature names are listed.
    """
    feats = list(feats)
    present = set(test.columns)

    missing = [c for c in feats if c not in present]
    if missing:
        raise ValueError(
            f"test frame is missing columns used in training: "
            f"{missing[:5]}{'...' if len(missing) > 5 else ''}"
        )
    # The target is always selected below, so it must be validated as well.
    if target not in present:
        raise ValueError(f"test frame is missing the target column: {target}")

    return test[:, feats + [target]]


def roc_from_model(model, test):
    """Compute ROC arrays and AUROC for an H2O model on an H2O test frame.

    The model scores ``test``; the ``p1`` column of the prediction frame is
    used as the positive-class score and the ``target`` column of ``test`` as
    the ground truth.

    Args:
        model: object with ``predict(frame)`` returning a prediction frame.
        test: frame containing the ``target`` column.

    Returns:
        ``(fpr, tpr, roc_auc)`` as produced by ``roc_curve`` and ``auc``.

    Raises:
        ValueError: if the prediction frame has no ``p1`` column, or if the
            target contains missing / non-numeric labels.
    """
    pred = model.predict(test)
    if "p1" not in pred.columns:
        raise ValueError(
            f"prediction frame has no 'p1' column (got {list(pred.columns)}); "
            f"the target is expected to be a binary 0/1 factor"
        )

    # y_true as integers 0/1 (handles string factors "0"/"1")
    y_raw = test[target].as_data_frame(use_pandas=True, use_multi_thread=True)[target]
    y_num = pd.to_numeric(y_raw, errors="coerce")
    # Coercion turns unparsable or missing labels into NaN; report them
    # explicitly instead of letting the integer cast fail opaquely.
    n_bad = int(y_num.isna().sum())
    if n_bad:
        raise ValueError(
            f"target column {target!r} has {n_bad} missing or non-numeric "
            f"label(s); cannot compute ROC"
        )
    y_true = y_num.astype(int).values

    # predicted prob for positive class
    y_score = (
        pred["p1"]
        .as_data_frame(use_pandas=True, use_multi_thread=True)
        .values.ravel()
        .astype(float)
    )

    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)
    return fpr, tpr, roc_auc


def plot_roc_curves(curves, title: str, out_path: Path):
    """Plot ROC curves with seaborn (white background, no grid), save as TIFF.

    Args:
        curves: iterable of dicts with keys ``fpr``, ``tpr``, ``label``,
            ``auc`` and ``color``; one line is drawn per dict.
        title: axes title.
        out_path: destination file; written as TIFF at ``fig_dpi``.

    The figure is closed even if drawing or saving fails, so no figure is
    left open in the kernel.
    """
    fig, ax = plt.subplots(figsize=(fig_width_in, fig_height_in))
    try:
        for c in curves:
            sns.lineplot(
                x=c["fpr"], y=c["tpr"], ax=ax,
                label=f"{c['label']} (AUROC={c['auc']:.3f})",
                color=c["color"], linewidth=1.8,
                # ROC curves contain vertical segments (many TPR values for
                # one FPR). seaborn's default estimator would average those
                # points and add a confidence band, distorting the curve;
                # draw every point, in the order roc_curve produced it.
                estimator=None, sort=False,
            )

        # diagonal baseline
        ax.plot([0, 1], [0, 1], linestyle="--", linewidth=1.0, color="#888888")

        # cosmetics: white background, no grid, clean spines
        ax.set_xlim(0.0, 1.0)
        ax.set_ylim(0.0, 1.0)
        ax.set_xlabel("False Positive Rate")
        ax.set_ylabel("True Positive Rate")
        ax.set_title(title)
        ax.grid(False)
        sns.despine(ax=ax)
        ax.legend(loc="lower right", frameon=False)

        fig.tight_layout()
        fig.savefig(out_path, dpi=fig_dpi, format="tiff")
    finally:
        plt.close(fig)


In [3]:
# accumulators: one ROC record per dataset and model family, plus summary rows
curves_glm, curves_leader, rows = [], [], []


def _curve_record(name, fpr, tpr, roc_auc):
    """Bundle ROC arrays with the dataset's display label and color."""
    return {
        "dataset": name,
        "label": labels[name],
        "color": colors[name],
        "fpr": fpr,
        "tpr": tpr,
        "auc": roc_auc,
    }


for name in datasets:
    logging.info("=== dataset: %s | inference started ===", name)

    leader, glm_model, feats = load_models_and_features(name)

    # test frame restricted to the training features + target
    test_use = import_test(name, feats)

    # glm roc
    fpr_g, tpr_g, auc_g = roc_from_model(glm_model, test_use)
    curves_glm.append(_curve_record(name, fpr_g, tpr_g, auc_g))

    # leader (best model) roc
    fpr_l, tpr_l, auc_l = roc_from_model(leader, test_use)
    curves_leader.append(_curve_record(name, fpr_l, tpr_l, auc_l))

    rows.append(
        dict(
            dataset=labels[name],
            glm_auroc=round(auc_g, 4),
            best_model_auroc=round(auc_l, 4),
            n_test_rows=test_use.nrows,
        )
    )

# compact summary table, one row per dataset
summary_df = pd.DataFrame(rows)
print(summary_df.to_string(index=False))

# one figure per model family, all datasets overlaid
glm_tiff = figs_dir / "roc_glm_three_datasets.tiff"
best_tiff = figs_dir / "roc_best_three_datasets.tiff"
plot_roc_curves(curves_glm, "Logistic regression models", glm_tiff)
plot_roc_curves(curves_leader, "Best machine learning models", best_tiff)

print("saved figures:")
for saved in (glm_tiff, best_tiff):
    print(" -", saved)


2025-09-05 01:02:14,763 INFO === dataset: oneicu | inference started ===




2025-09-05 01:02:33,953 INFO === dataset: mimiciv | inference started ===




2025-09-05 01:02:41,190 INFO === dataset: eicu | inference started ===




 dataset  glm_auroc  best_model_auroc  n_test_rows
  OneICU     0.9696            0.9772       357631
MIMIC-IV     0.8321            0.8371       223016
    eICU     0.9353            0.9496       236381






saved figures:
 - ../output/machine_learning/roc_glm_three_datasets.tiff
 - ../output/machine_learning/roc_best_three_datasets.tiff
