## Using Automated Machine Learning With H2O

### Setting Up The H2O Environment

In [1]:
# Importing necessary tools
import os
import tempfile

import h2o
from h2o.automl import H2OAutoML
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [2]:
# Start (or attach to) the H2O cluster named "h2ocluster",
# limiting it to 6 worker threads
h2o.init(
    nthreads=6,
    name="h2ocluster",
)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.301-b09, mixed mode)
  Starting server from C:\Users\tigra\AppData\Local\Programs\Python\Python39\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\tigra\AppData\Local\Temp\tmpsk53oka1
  JVM stdout: C:\Users\tigra\AppData\Local\Temp\tmpsk53oka1\h2o_tigra_started_from_python.out
  JVM stderr: C:\Users\tigra\AppData\Local\Temp\tmpsk53oka1\h2o_tigra_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Asia/Yerevan
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.7
H2O_cluster_version_age:,1 month and 5 days
H2O_cluster_name:,h2ocluster
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.545 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,6


### Preparing Data For Training

#### Loading data

In [3]:
# Loading a dataset for training
# (scikit-learn's bundled breast cancer dataset; the returned object is
# indexed by key below, e.g. data["data"], data["target"])
data = datasets.load_breast_cancer()

In [4]:
# Viewing the keys in the dataset dict
# ("data", "target" and "feature_names" are the ones used in later cells)
print(data.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])


#### Splitting data into train and test sets

In [6]:
# Splitting our data into train and test sets
# - test_size=0.2 holds out 20% of the rows for testing
# - stratify keeps the class proportions of the target in both splits
# - random_state pins the shuffle so the same rows land in train and test
#   on every run (1 is the same value used as the AutoML seed)
X_train, X_test, y_train, y_test = train_test_split(data["data"],
                                                    data["target"],
                                                    test_size=0.2,
                                                    stratify=data["target"],
                                                    random_state=1)

#### Converting data into H2OFrame objects

In [7]:
# Column names shared by the train and test frames
feature_names = list(data["feature_names"])
label_names = ["target"]

# Train split: one H2OFrame for the features, one single-column frame for the labels
train_features = h2o.H2OFrame(X_train, column_names=feature_names)
train_labels = h2o.H2OFrame(y_train, column_names=label_names)

# Test split: same layout as the train frames
test_features = h2o.H2OFrame(X_test, column_names=feature_names)
test_labels = h2o.H2OFrame(y_test, column_names=label_names)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [8]:
# Column-bind the labels onto the features so each split is a single frame
# whose last column is "target"
train_frame = train_features.cbind(train_labels)
test_frame = test_features.cbind(test_labels)

In [9]:
# Viewing our training frame
# (feature columns followed by the "target" label column)
print(train_frame)

mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
19.73,19.82,130.7,1206.0,0.1062,0.1849,0.2417,0.0974,0.1733,0.06697,0.7661,0.78,4.115,92.81,0.008482,0.05057,0.068,0.01971,0.01467,0.007259,25.28,25.59,159.8,1933.0,0.171,0.5955,0.8489,0.2507,0.2749,0.1297,0
14.27,22.55,93.77,629.8,0.1038,0.1154,0.1463,0.06139,0.1926,0.05982,0.2027,1.851,1.895,18.54,0.006113,0.02583,0.04645,0.01276,0.01451,0.003756,15.29,34.27,104.3,728.3,0.138,0.2733,0.4234,0.1362,0.2698,0.08351,0
13.43,19.63,85.84,565.4,0.09048,0.06288,0.05858,0.03438,0.1598,0.05671,0.4697,1.147,3.142,43.4,0.006003,0.01063,0.02151,0.009443,0.0152,0.001868,17.98,29.87,116.6,993.6,0.1401,0.1546,0.2644,0.116,0.2884,0.07371,0
12.16,18.03,78.29,455.3,0.09087,0.07838,0.02916,0.01527,0.1464,0.06284,0.2194,1.19,1.678,16.26,0.004911,0.01666,0.01397,0.005161,0.01454,0.001858,13.34,27.87,88.83,547.4,0.1208,0.2279,0.162,0.0569,0.2406,0.07729,1
11.29,13.04,72.23,388.0,0.09834,0.07608,0.03265,0.02755,0.1769,0.0627,0.1904,0.5293,1.164,13.17,0.006472,0.01122,0.01282,0.008849,0.01692,0.002817,12.32,16.18,78.27,457.5,0.1358,0.1507,0.1275,0.0875,0.2733,0.08022,1
12.21,14.09,78.78,462.0,0.08108,0.07823,0.06839,0.02534,0.1646,0.06154,0.2666,0.8309,2.097,19.96,0.004405,0.03026,0.04344,0.01087,0.01921,0.004622,13.13,19.29,87.65,529.9,0.1026,0.2431,0.3076,0.0914,0.2677,0.08824,1
10.05,17.53,64.41,310.8,0.1007,0.07326,0.02511,0.01775,0.189,0.06331,0.2619,2.015,1.778,16.85,0.007803,0.01449,0.0169,0.008043,0.021,0.002778,11.16,26.84,71.98,384.0,0.1402,0.1402,0.1055,0.06499,0.2894,0.07664,1
9.683,19.34,61.05,285.7,0.08491,0.0503,0.02337,0.009615,0.158,0.06235,0.2957,1.363,2.054,18.24,0.00744,0.01123,0.02337,0.009615,0.02203,0.004154,10.93,25.59,69.1,364.2,0.1199,0.09546,0.0935,0.03846,0.2552,0.0792,1
20.16,19.66,131.1,1274.0,0.0802,0.08564,0.1155,0.07726,0.1928,0.05096,0.5925,0.6863,3.868,74.85,0.004536,0.01376,0.02645,0.01247,0.02193,0.001589,23.06,23.03,150.2,1657.0,0.1054,0.1537,0.2606,0.1425,0.3055,0.05933,0
12.56,19.07,81.92,485.8,0.0876,0.1038,0.103,0.04391,0.1533,0.06184,0.3602,1.478,3.212,27.49,0.009853,0.04235,0.06271,0.01966,0.02639,0.004205,13.37,22.43,89.02,547.4,0.1096,0.2002,0.2388,0.09265,0.2121,0.07188,1





#### Identifying features and labels

In [10]:
# y is the name of the label column; x is every other column of the
# training frame (i.e. the feature names)
y = "target"
x = train_frame.columns
x.remove(y)

#### Converting integer labels to categoricals

In [11]:
# Turn the integer label column into a categorical (factor) column
# in both the train and the test frame
for frame in (train_frame, test_frame):
    frame[y] = frame[y].asfactor()

### Training Our AutoML Model

#### Setting the model up

In [12]:
# Configure the AutoML run: fixed seed and a 600-second time budget
# NOTE(review): with a time-based budget the set of models trained may depend
# on the available hardware, so results can differ between runs despite the
# seed — confirm against the H2O AutoML docs if exact reproducibility matters.
aml = H2OAutoML(
    max_runtime_secs=600,
    seed=1,
)

#### Training the model

In [13]:
# Training the AutoML object
# x = list of feature column names, y = label column name; no separate
# validation frame is passed, only the training frame
aml.train(x=x, y=y, training_frame=train_frame)

AutoML progress: |
14:53:30.993: AutoML: XGBoost is not available; skipping it.

████████████████████████████████████████████████████████| 100%


#### Obtaining the best model

In [14]:
# Fetch the leaderboard (one row per trained model with its metrics)
# and print it
lb = aml.leaderboard
print(lb)

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
StackedEnsemble_BestOfFamily_AutoML_20211008_145330,0.998225,0.0511393,0.998923,0.0140867,0.121112,0.0146682
DeepLearning_grid__1_AutoML_20211008_145330_model_2,0.998204,0.0860982,0.998914,0.0194014,0.128415,0.0164904
DeepLearning_grid__3_AutoML_20211008_145330_model_5,0.998039,0.202314,0.998859,0.0217234,0.14378,0.0206726
GLM_1_AutoML_20211008_145330,0.997936,0.06819,0.998758,0.0182147,0.135505,0.0183616
StackedEnsemble_AllModels_AutoML_20211008_145330,0.997833,0.110866,0.998703,0.0223426,0.141886,0.0201316
DeepLearning_grid__1_AutoML_20211008_145330_model_7,0.99773,0.08553,0.998635,0.0223426,0.13293,0.0176704
DeepLearning_grid__1_AutoML_20211008_145330_model_8,0.997668,0.0705946,0.998616,0.0258514,0.136865,0.0187322
DeepLearning_grid__1_AutoML_20211008_145330_model_12,0.997626,0.106878,0.998574,0.019969,0.131534,0.0173012
DeepLearning_grid__2_AutoML_20211008_145330_model_3,0.997523,0.113439,0.998522,0.0299794,0.150921,0.0227771
GBM_grid__1_AutoML_20211008_145330_model_5,0.997482,0.0631751,0.998483,0.024097,0.134737,0.018154





In [15]:
# Obtaining the best model
# (best_model is reused below for inspection, inference and saving)
best_model = aml.leader # Equivalent to aml.get_best_model()

In [16]:
# Selection settings reused by the lookups below
metric = "auc"
family = "DeepLearning"

# Best model according to a chosen performance metric
best_model_auc = aml.get_best_model(criterion=metric)

# Best model within a chosen algorithm family
best_model_dl = aml.get_best_model(algorithm=family)

# Best model within a chosen algorithm family, according to a chosen metric
best_model_dl_auc = aml.get_best_model(algorithm=family, criterion=metric)

#### Inspecting the best model

In [17]:
# Inspecting the model
# (prints the model details followed by its training and
# cross-validation metrics)
print(best_model)

Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_BestOfFamily_AutoML_20211008_145330

No model summary for this model

ModelMetricsBinomialGLM: stackedensemble
** Reported on train data. **

MSE: 0.00024973344538977067
RMSE: 0.015802956855910565
LogLoss: 0.0023149158320803468
Null degrees of freedom: 454
Residual degrees of freedom: 452
Null deviance: 601.3803498951446
Residual deviance: 2.106573407193115
AIC: 8.106573407193114
AUC: 1.0
AUCPR: 1.0
Gini: 1.0

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.9705015773163431: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,170.0,0.0,0.0,(0.0/170.0)
1,1,0.0,285.0,0.0,(0.0/285.0)
2,Total,170.0,285.0,0.0,(0.0/455.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.970502,1.0,250.0
1,max f2,0.970502,1.0,250.0
2,max f0point5,0.970502,1.0,250.0
3,max accuracy,0.970502,1.0,250.0
4,max precision,0.999812,1.0,0.0
5,max recall,0.970502,1.0,250.0
6,max specificity,0.999812,1.0,0.0
7,max absolute_mcc,0.970502,1.0,250.0
8,max min_per_class_accuracy,0.970502,1.0,250.0
9,max mean_per_class_accuracy,0.970502,1.0,250.0



Gains/Lift Table: Avg response rate: 62.64 %, avg score: 62.70 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010989,0.999777,1.596491,1.596491,1.0,0.999791,1.0,0.999791,0.017544,0.017544,59.649123,59.649123,0.017544
1,2,0.021978,0.999766,1.596491,1.596491,1.0,0.99977,1.0,0.999781,0.017544,0.035088,59.649123,59.649123,0.035088
2,3,0.030769,0.999763,1.596491,1.596491,1.0,0.999764,1.0,0.999776,0.014035,0.049123,59.649123,59.649123,0.049123
3,4,0.041758,0.999755,1.596491,1.596491,1.0,0.999758,1.0,0.999771,0.017544,0.066667,59.649123,59.649123,0.066667
4,5,0.050549,0.999747,1.596491,1.596491,1.0,0.999751,1.0,0.999768,0.014035,0.080702,59.649123,59.649123,0.080702
5,6,0.101099,0.999721,1.596491,1.596491,1.0,0.999733,1.0,0.99975,0.080702,0.161404,59.649123,59.649123,0.161404
6,7,0.151648,0.999699,1.596491,1.596491,1.0,0.999711,1.0,0.999737,0.080702,0.242105,59.649123,59.649123,0.242105
7,8,0.2,0.999672,1.596491,1.596491,1.0,0.999688,1.0,0.999725,0.077193,0.319298,59.649123,59.649123,0.319298
8,9,0.301099,0.999594,1.596491,1.596491,1.0,0.999635,1.0,0.999695,0.161404,0.480702,59.649123,59.649123,0.480702
9,10,0.4,0.999441,1.596491,1.596491,1.0,0.999519,1.0,0.999651,0.157895,0.638596,59.649123,59.649123,0.638596




ModelMetricsBinomialGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 0.014668164562887349
RMSE: 0.12111219824149567
LogLoss: 0.05113934087108624
Null degrees of freedom: 454
Residual degrees of freedom: 452
Null deviance: 605.388807931112
Residual deviance: 46.53680019268848
AIC: 52.53680019268848
AUC: 0.9982249742002065
AUCPR: 0.9989231171861452
Gini: 0.996449948400413

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5547333780959276: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,167.0,3.0,0.0176,(3.0/170.0)
1,1,3.0,282.0,0.0105,(3.0/285.0)
2,Total,170.0,285.0,0.0132,(6.0/455.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.554733,0.989474,249.0
1,max f2,0.287465,0.990928,257.0
2,max f0point5,0.554733,0.989474,249.0
3,max accuracy,0.554733,0.986813,249.0
4,max precision,0.999927,1.0,0.0
5,max recall,0.086097,1.0,263.0
6,max specificity,0.999927,1.0,0.0
7,max absolute_mcc,0.554733,0.971827,249.0
8,max min_per_class_accuracy,0.604893,0.982353,247.0
9,max mean_per_class_accuracy,0.554733,0.985913,249.0



Gains/Lift Table: Avg response rate: 62.64 %, avg score: 62.59 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010989,0.999886,1.596491,1.596491,1.0,0.999908,1.0,0.999908,0.017544,0.017544,59.649123,59.649123,0.017544
1,2,0.021978,0.99985,1.596491,1.596491,1.0,0.999862,1.0,0.999885,0.017544,0.035088,59.649123,59.649123,0.035088
2,3,0.030769,0.99982,1.596491,1.596491,1.0,0.999826,1.0,0.999868,0.014035,0.049123,59.649123,59.649123,0.049123
3,4,0.041758,0.999796,1.596491,1.596491,1.0,0.999808,1.0,0.999852,0.017544,0.066667,59.649123,59.649123,0.066667
4,5,0.050549,0.999787,1.596491,1.596491,1.0,0.99979,1.0,0.999842,0.014035,0.080702,59.649123,59.649123,0.080702
5,6,0.101099,0.999684,1.596491,1.596491,1.0,0.999733,1.0,0.999787,0.080702,0.161404,59.649123,59.649123,0.161404
6,7,0.151648,0.999604,1.596491,1.596491,1.0,0.999644,1.0,0.999739,0.080702,0.242105,59.649123,59.649123,0.242105
7,8,0.2,0.999499,1.596491,1.596491,1.0,0.999572,1.0,0.999699,0.077193,0.319298,59.649123,59.649123,0.319298
8,9,0.301099,0.999278,1.596491,1.596491,1.0,0.999389,1.0,0.999595,0.161404,0.480702,59.649123,59.649123,0.480702
9,10,0.4,0.998471,1.596491,1.596491,1.0,0.998992,1.0,0.999446,0.157895,0.638596,59.649123,59.649123,0.638596






#### Inference with the best model

In [18]:
# Doing inference with the best model
# (test_frame still contains the "target" column alongside the features)
predictions = best_model.predict(test_frame)

# Viewing predictions
# (columns: predicted class, then p0 / p1 per-class probabilities)
print(predictions)

stackedensemble prediction progress: |████████████████████████████████████| 100%


predict,p0,p1
1,0.000238515,0.999761
1,0.000487733,0.999512
1,0.000243753,0.999756
0,0.99926,0.000739764
1,0.000295035,0.999705
1,0.00184455,0.998155
1,0.000331569,0.999668
1,0.000508383,0.999492
1,0.00356408,0.996436
1,0.0102853,0.989715





#### Saving and loading the best model

In [19]:
# Saving our model
# Build the target directory from the platform's temp directory instead of a
# hard-coded POSIX "/tmp", so the cell behaves the same on Windows, macOS and
# Linux. The model is written by the H2O server; this notebook starts a local
# server, so that is the same machine the kernel runs on.
model_dir = os.path.join(tempfile.gettempdir(), "leader_model")

# force=True overwrites an existing file of the same name;
# the returned value is the full path of the saved model file.
model_path = h2o.save_model(model=best_model,
                            path=model_dir,
                            force=True)

In [20]:
# Loading our model
# (model_path is the path returned by h2o.save_model in the previous cell)
model = h2o.load_model(model_path)