In [17]:
!pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o

Looking in links: http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html


### Step 1: Install and Import H2O

In [23]:
import h2o
from h2o.automl import H2OAutoML
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,25 days 22 hours 25 mins
H2O_cluster_timezone:,America/Chicago
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,4 months and 20 days
H2O_cluster_name:,H2O_from_python_christinadong_8atvgy
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.917 Gb
H2O_cluster_total_cores:,14
H2O_cluster_allowed_cores:,14


### Step 2: Load Train/Test Data into H2O

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [25]:
df = pd.read_csv('/Users/christinadong/Documents/MLops/mlops-final/data/healthcare-dataset-stroke-data.csv')
#  Train-Test Split
# ============================
# Drop ID column if it exists
if 'id' in df.columns:
    df = df.drop(columns=['id'])
    
target_col = 'stroke'
X = df.drop(columns=[target_col])
y = df[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\nTrain set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


Train set shape: (4088, 10)
Test set shape: (1022, 10)


In [26]:
# Join X and y so H2OFrame has the target in the same table
train_pd = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
test_pd  = pd.concat([X_test.reset_index(drop=True),  y_test.reset_index(drop=True)],  axis=1)

train_h2o = h2o.H2OFrame(train_pd)
test_h2o  = h2o.H2OFrame(test_pd)

# Cast columns: make target categorical; make string/object columns categorical too
y = target_col
x = [c for c in train_h2o.columns if c != y]

# If this is classification, ensure factor
train_h2o[y] = train_h2o[y].asfactor()
test_h2o[y]  = test_h2o[y].asfactor()

# Optional: make object/string columns categorical
for c in x:
    if train_pd[c].dtype == "object":
        train_h2o[c] = train_h2o[c].asfactor()
        test_h2o[c]  = test_h2o[c].asfactor()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


### Step 3: Run AutoML

In [27]:
aml = H2OAutoML(
    max_runtime_secs=600, 
    seed=42,
    nfolds=5,
    balance_classes=True,
    sort_metric="AUCPR",
    stopping_metric="AUCPR",
    verbosity="info"
)
aml.train(x=x, y=y, training_frame=train_h2o)

AutoML progress: |
19:49:41.290: Project: AutoML_6_20250816_194941
19:49:41.290: Setting stopping tolerance adaptively based on the training frame: 0.015640281177246362
19:49:41.290: Build control seed: 42
19:49:41.290: training frame: Frame key: AutoML_6_20250816_194941_training_py_27_sid_b7e9    cols: 11    rows: 4088  chunks: 1    size: 44184  checksum: 4140185812486358258
19:49:41.290: validation frame: NULL
19:49:41.290: leaderboard frame: NULL
19:49:41.290: blending frame: NULL
19:49:41.290: response column: stroke
19:49:41.290: fold column: null
19:49:41.290: weights column: null
19:49:41.291: AutoML: XGBoost is not available; skipping it.
19:49:41.291: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (6g, 30w)]}, {GLM : [def_1 (1g, 10w)]}, {DRF : [def_1 (2g, 10w), XRT (3g, 10w)]}, {GBM : [def_5 (1g, 10w), def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w), def_1 (3g, 10w), grid_1 (4g, 60w), lr_annealing (6g, 10w

Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,42.0,42.0,8352.0,4.0,4.0,4.0,7.0,15.0,11.214286

Unnamed: 0,0,1,Error,Rate
0,2882.0,1007.0,0.2589,(1007.0/3889.0)
1,273.0,3615.0,0.0702,(273.0/3888.0)
Total,3155.0,4622.0,0.1646,(1280.0/7777.0)

metric,threshold,value,idx
max f1,0.0451149,0.8495887,290.0
max f2,0.0188047,0.916955,347.0
max f0point5,0.1060223,0.8364129,188.0
max accuracy,0.0652451,0.8365694,254.0
max precision,0.4533989,1.0,0.0
max recall,0.0121394,1.0,361.0
max specificity,0.4533989,1.0,0.0
max absolute_mcc,0.0451149,0.683112,290.0
max min_per_class_accuracy,0.0697953,0.8259193,246.0
max mean_per_class_accuracy,0.0652451,0.8365723,254.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100296,0.3738567,1.9746129,1.9746129,0.9871795,0.40152,0.9871795,0.40152,0.0198045,0.0198045,97.4612879,97.4612879,0.0195474
2,0.0203163,0.3429758,1.9252476,1.9496178,0.9625,0.3558838,0.9746835,0.3784131,0.0198045,0.0396091,92.5247557,94.9617779,0.0385805
3,0.0304745,0.3278892,2.0002572,1.9664976,1.0,0.3356479,0.9831224,0.364158,0.0203189,0.059928,100.0257202,96.6497586,0.0588994
4,0.0405041,0.3171271,2.0002572,1.9748571,1.0,0.3193547,0.9873016,0.3530639,0.0200617,0.0799897,100.0257202,97.485711,0.0789612
5,0.0514337,0.281831,1.8590626,1.9502508,0.9294118,0.3013831,0.975,0.3420817,0.0203189,0.1003086,85.9062576,95.0250772,0.0977373
6,0.1023531,0.2154002,1.7830576,1.8670742,0.8914141,0.2460963,0.9334171,0.2943301,0.0907922,0.1911008,78.3057556,86.7074247,0.1774726
7,0.1500579,0.1874631,1.7953791,1.8442817,0.8975741,0.2027298,0.9220223,0.2652095,0.0856481,0.276749,79.5379106,84.4281704,0.2533496
8,0.2018773,0.1693887,1.8414278,1.8435492,0.9205955,0.1776893,0.9216561,0.2427442,0.0954218,0.3721708,84.1427846,84.3549153,0.3405431
9,0.3015302,0.1352883,1.7111878,1.799805,0.8554839,0.1500184,0.8997868,0.2120992,0.1705247,0.5426955,71.1187774,79.9804987,0.4822686
10,0.4019545,0.10353,1.6032791,1.750705,0.8015365,0.1183595,0.8752399,0.1886793,0.1610082,0.7037037,60.327914,75.070496,0.6034209

Unnamed: 0,0,1,Error,Rate
0,3585.0,304.0,0.0782,(304.0/3889.0)
1,114.0,85.0,0.5729,(114.0/199.0)
Total,3699.0,389.0,0.1023,(418.0/4088.0)

metric,threshold,value,idx
max f1,0.1198857,0.2891156,142.0
max f2,0.0714271,0.4164058,219.0
max f0point5,0.205021,0.2601626,61.0
max accuracy,0.3842821,0.9522994,5.0
max precision,0.4418487,1.0,0.0
max recall,0.0009482,1.0,397.0
max specificity,0.4418487,1.0,0.0
max absolute_mcc,0.0993673,0.2709549,173.0
max min_per_class_accuracy,0.0502757,0.7487437,260.0
max mean_per_class_accuracy,0.0280742,0.7656397,310.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100294,0.2666467,8.5177105,8.5177105,0.4146341,0.316389,0.4146341,0.316389,0.0854271,0.0854271,751.7710504,751.7710504,0.0792559
2,0.0200587,0.2158186,4.5093761,6.5135433,0.2195122,0.2393438,0.3170732,0.2778664,0.0452261,0.1306533,350.9376149,551.3543326,0.1162537
3,0.0300881,0.1916068,4.5093761,5.8454876,0.2195122,0.2039552,0.2845528,0.2532293,0.0452261,0.1758794,350.9376149,484.5487601,0.1532515
4,0.0401174,0.1781861,2.505209,5.0104179,0.1219512,0.1847303,0.2439024,0.2361046,0.0251256,0.201005,150.5208972,401.0417943,0.1691202
5,0.0501468,0.1612327,4.0083344,4.8100012,0.195122,0.1705133,0.2341463,0.2229863,0.040201,0.241206,300.8334355,381.0001226,0.2008358
6,0.1000489,0.115854,3.9272835,4.3697215,0.1911765,0.1362964,0.2127139,0.1797474,0.1959799,0.4371859,292.7283476,336.9721468,0.3543883
7,0.1501957,0.0900534,2.7056257,3.8141195,0.1317073,0.1031933,0.1856678,0.1541878,0.1356784,0.5728643,170.5625689,281.4119457,0.4442966
8,0.2000978,0.0699427,1.9132919,3.3400745,0.0931373,0.079223,0.1625917,0.1354924,0.0954774,0.6683417,91.329195,234.0074455,0.4922039
9,0.3001468,0.0435644,1.2556671,2.645272,0.0611247,0.0558338,0.1287694,0.1089395,0.1256281,0.7939698,25.5667088,164.5272,0.519092
10,0.3999511,0.0272005,1.0573456,2.2490188,0.0514706,0.0347555,0.1094801,0.0904276,0.1055276,0.8994975,5.7345551,124.9018794,0.5251082

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.8762079,0.0675440,0.8386308,0.9303178,0.9217604,0.7747858,0.9155447
aic,,0.0,,,,,
auc,0.8488876,0.0386519,0.8165709,0.8963701,0.8838980,0.8139921,0.8336068
err,0.1237921,0.0675440,0.1613692,0.0696821,0.0782396,0.2252142,0.0844553
err_count,101.2,55.1788,132.0,57.0,64.0,184.0,69.0
f0point5,0.2621635,0.0680947,0.1893287,0.3171642,0.3114187,0.1860465,0.3068592
f1,0.3143576,0.0573467,0.25,0.3736264,0.36,0.2580645,0.3300971
f2,0.4054349,0.0413604,0.3678930,0.4545455,0.4265403,0.4210526,0.3571429
lift_top_group,7.24316,2.559299,4.4336042,5.6805553,9.825826,6.189394,10.08642
loglikelihood,,0.0,,,,,

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
,2025-08-16 19:50:56,1 min 10.909 sec,0.0,0.6735225,1.5360139,0.5,0.4999357,1.0,0.5000643
,2025-08-16 19:50:56,1 min 10.916 sec,5.0,0.6600246,1.3805707,0.8725088,0.8444809,1.8573817,0.1963482
,2025-08-16 19:50:56,1 min 10.922 sec,10.0,0.6472321,1.2795387,0.8822187,0.8603506,1.9261736,0.1901762
,2025-08-16 19:50:56,1 min 10.929 sec,15.0,0.6356801,1.2033966,0.8866377,0.8640736,1.9502508,0.1933908
,2025-08-16 19:50:56,1 min 10.936 sec,20.0,0.6279096,1.1617033,0.8918899,0.8696068,1.9252476,0.1939051
,2025-08-16 19:50:56,1 min 10.943 sec,25.0,0.6200879,1.1247493,0.8964447,0.8752156,1.9746129,0.1897904
,2025-08-16 19:50:56,1 min 10.951 sec,30.0,0.614097,1.0981361,0.9006744,0.8808444,1.9489686,0.1832326
,2025-08-16 19:50:56,1 min 10.958 sec,35.0,0.6092045,1.0747616,0.9041368,0.883612,1.9794212,0.1760319
,2025-08-16 19:50:56,1 min 10.965 sec,40.0,0.60503,1.0555195,0.9080957,0.8868378,1.9746129,0.1693455
,2025-08-16 19:50:56,1 min 10.969 sec,42.0,0.6029978,1.0481216,0.9096188,0.8885604,1.9746129,0.1645879

variable,relative_importance,scaled_importance,percentage
age,2578.2583008,1.0,0.6711542
avg_glucose_level,404.95401,0.1570649,0.1054148
bmi,266.2915649,0.1032835,0.0693192
hypertension,226.3267975,0.0877828,0.0589158
ever_married,195.428421,0.0757986,0.0508726
smoking_status,74.3838654,0.0288504,0.0193631
work_type,44.5692711,0.0172866,0.011602
heart_disease,25.3855915,0.009846,0.0066082
gender,16.4829903,0.0063931,0.0042907
Residence_type,9.4479218,0.0036645,0.0024594


### Step 4: Leaderboard + pick the best model

In [28]:
lb = aml.leaderboard
lb.head(rows=lb.nrows)
leader = aml.leader
leader.model_id

'GBM_grid_1_AutoML_6_20250816_194941_model_188'

### Step 5: Evaluate on test set

In [29]:
perf = leader.model_performance(test_h2o)
print(perf)                    

# Pull common metrics
test_auc    = perf.auc()
test_logloss= perf.logloss()
test_f1_opt = perf.F1()  # best F1 across thresholds
# Find the row with maximum F1
best_f1_row = max(test_f1_opt, key=lambda x: x[1])  # x[1] = F1 value

thresh = best_f1_row[0]
test_f1_opt = best_f1_row[1]
print({"AUC": test_auc, "LogLoss": test_logloss, "Best F1": test_f1_opt, "Threshold": thresh})

# Confusion matrix at F1-optimal threshold
cm = perf.confusion_matrix(metrics="f1", thresholds=[thresh])
cm

ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.04162539499875424
RMSE: 0.2040230256582679
LogLoss: 0.16167443264747958
Mean Per-Class Error: 0.22635802469135802
AUC: 0.8373559670781893
AUCPR: 0.2677956860964926
Gini: 0.6747119341563785

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.08378066947222473
       0    1    Error    Rate
-----  ---  ---  -------  --------------
0      843  129  0.1327   (129.0/972.0)
1      16   34   0.32     (16.0/50.0)
Total  859  163  0.1419   (145.0/1022.0)

Maximum Metrics: Maximum metrics at their respective thresholds
metric                       threshold    value     idx
---------------------------  -----------  --------  -----
max f1                       0.0837807    0.319249  140
max f2                       0.0814816    0.469169  147
max f0point5                 0.236792     0.357143  11
max accuracy                 0.293372     0.95499   5
max precision                0.386469     1         0
max recall                   

[Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.08378066947222473
       0    1    Error    Rate
-----  ---  ---  -------  --------------
0      843  129  0.1327   (129.0/972.0)
1      16   34   0.32     (16.0/50.0)
Total  859  163  0.1419   (145.0/1022.0),
 Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.08378066947222473
       0    1    Error    Rate
-----  ---  ---  -------  --------------
0      843  129  0.1327   (129.0/972.0)
1      16   34   0.32     (16.0/50.0)
Total  859  163  0.1419   (145.0/1022.0)]

### Step 6: Save the model for deployment

In [30]:
model_dir = "models_h2o"
native_path = h2o.save_model(leader, path=model_dir, force=True)
mojo_path   = leader.download_mojo(path=model_dir, get_genmodel_jar=True)
print(native_path, mojo_path)

/Users/christinadong/Documents/MLops/auto_ml/models_h2o/GBM_grid_1_AutoML_6_20250816_194941_model_188 /Users/christinadong/Documents/MLops/mlops-final/models/models_h2o/GBM_grid_1_AutoML_6_20250816_194941_model_188.zip


### Step 7: Get predictions on test set

In [31]:
pred = leader.predict(test_h2o)  # columns: p0, p1, predict
pred.head()
# Optionally join predictions back to original rows:
pred_with_labels = test_h2o.cbind(pred)
pred_with_labels.head()

gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,predict,p0,p1
Male,63.0,0,0,Yes,Private,Rural,78.23,34.8,never smoked,0,0,0.970624,0.0293761
Female,43.0,0,0,Yes,Private,Urban,86.67,33.3,never smoked,0,0,0.994789,0.00521058
Female,23.0,0,0,No,Private,Urban,126.67,28.7,smokes,0,0,0.998116,0.00188383
Female,21.0,0,0,No,Private,Urban,208.17,24.9,never smoked,0,0,0.997395,0.00260484
Male,67.0,0,0,Yes,Private,Rural,113.34,26.3,formerly smoked,0,1,0.948039,0.0519612
Male,0.16,0,0,No,children,Urban,114.71,17.4,Unknown,0,0,0.997885,0.0021152
Female,47.0,0,0,Yes,Private,Urban,210.95,50.1,Unknown,0,0,0.973011,0.0269893
Female,20.0,0,0,No,Govt_job,Rural,73.0,20.8,never smoked,0,0,0.99832,0.00167975
Female,15.0,0,0,No,Private,Rural,79.2,22.4,never smoked,0,0,0.998017,0.00198261
Female,2.0,0,0,No,children,Rural,100.66,18.5,Unknown,0,0,0.998375,0.00162478


### Step 8: Change at least two features and re-validate

In [38]:
changed = test_h2o[:] 
num_cols = [c for c in x if test_pd[c].dtype != "object"]

# pick two numeric features to perturb
feat_a, feat_b = ("bmi", "avg_glucose_level") if {"bmi","avg_glucose_level"}.issubset(set(num_cols)) else (num_cols[0], num_cols[1])

# add small noise / shift  (tune scale as you like)
changed[feat_a] = changed[feat_a] * 1.10           # +10%
changed[feat_b] = changed[feat_b] + 5              # +5 absolute units

perf_changed = leader.model_performance(changed)

# Find the row with maximum F1
test_f1_opt = perf_changed.F1()
best_f1_row = max(test_f1_opt, key=lambda x: x[1])  # x[1] = F1 value
test_f1_opt = best_f1_row[1]
print("Changed data metrics:", {
    "AUC": perf_changed.auc(),
    "LogLoss": perf_changed.logloss(),
    "Best F1": test_f1_opt
})

Changed data metrics: {'AUC': 0.8422325102880658, 'LogLoss': 0.16061327593965388, 'Best F1': 0.3401360544217687}


### Step 9: Quick monitoring hooks you can show

In [41]:
import pandas as pd, datetime as dt

def summarize_perf(perf_obj, tag):
    test_f1_opt = perf_obj.F1()
    best_f1_row = max(test_f1_opt, key=lambda x: x[1]) 
    test_f1_opt = best_f1_row[1]
    return {
        "when": dt.datetime.utcnow().isoformat(),
        "tag": tag,
        "auc": perf_obj.auc(),
        "logloss": perf_obj.logloss(),
        "f1_best": test_f1_opt
    }

logs = []
logs.append(summarize_perf(perf, "baseline_test"))
logs.append(summarize_perf(perf_changed, "changed_numeric"))  # if you ran Option A
pd.DataFrame(logs).to_csv("automl_scoring_log.csv", index=False)


In [42]:
log = pd.read_csv("automl_scoring_log.csv")
log

Unnamed: 0,when,tag,auc,logloss,f1_best
0,2025-08-17T01:05:35.821397,baseline_test,0.837356,0.161674,0.319249
1,2025-08-17T01:05:35.821468,changed_numeric,0.842233,0.160613,0.340136
