In [42]:
import pandas as pd

import h2o
from h2o.automl import H2OAutoML, get_leaderboard

from v_time import timeit_out
from sklearn import metrics

from tqdm import tqdm_notebook as tqdm

# 1. Init h2o session

There are 2 ways of working with h2o:

* native
* using spark (with pysparkling)

Both use MapReduce so they can scale to multiple machines.

In [4]:
mode = "h2o"
# mode = "spark"

if mode == "h2o":
    h2o.init()    

else:
    from pyspark.sql import SparkSession
    from pysparkling import H2OContext

    spark = SparkSession.builder.appName("h2o_auto_ml").getOrCreate()
    hc = H2OContext.getOrCreate(spark)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) Client VM (build 25.151-b12, mixed mode, sharing)



  You have a 32-bit version of Java. H2O works best with 64-bit Java.
  Please download the latest 64-bit Java SE JDK from Oracle.




  Starting server from c:\miniconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Villoro\AppData\Local\Temp\tmp5xzbitjz
  JVM stdout: C:\Users\Villoro\AppData\Local\Temp\tmp5xzbitjz\h2o_Villoro_started_from_python.out
  JVM stderr: C:\Users\Villoro\AppData\Local\Temp\tmp5xzbitjz\h2o_Villoro_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,Europe/Paris
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.1
H2O cluster version age:,5 days
H2O cluster name:,H2O_from_python_Villoro_bp712g
H2O cluster total nodes:,1
H2O cluster free memory:,247.5 Mb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


# 2. Get data

In [5]:
# Import a sample binary outcome train/test set into H2O
train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")

# Identify predictors and response
x = train.columns
y = "response"
x.remove(y)

# For binary classification, response should be a factor
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


# 3. Train

In [6]:
training_minutes = 2

# Run AutoML for 20 base models (limited to 1 hour max runtime by default)
aml = H2OAutoML(max_models=20, seed=1, max_runtime_secs=training_minutes*60)
aml.train(x=x, y=y, training_frame=train)

AutoML progress: |
09:21:33.51: AutoML: XGBoost is not available; skipping it.

████████████████████████████████████████████████████████| 100%


# 4. Check results

In [7]:
# Optionally add extra model information to the leaderboard
lb = get_leaderboard(aml, extra_columns='ALL')

# Print all rows (instead of default 10 rows)
lb.head(rows=lb.nrows)

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse,training_time_ms,predict_time_per_row_ms
StackedEnsemble_AllModels_AutoML_20191222_092133,0.786416,0.555428,0.803856,0.324173,0.434064,0.188411,1376,0.112101
StackedEnsemble_BestOfFamily_AutoML_20191222_092133,0.783793,0.557936,0.800862,0.330988,0.435146,0.189352,793,0.047164
GBM_5_AutoML_20191222_092133,0.780862,0.559708,0.79783,0.325399,0.436083,0.190168,1070,0.012534
GBM_1_AutoML_20191222_092133,0.778997,0.56159,0.796523,0.326697,0.437003,0.190971,873,0.011205
GBM_2_AutoML_20191222_092133,0.778338,0.561527,0.79632,0.329805,0.437199,0.191143,844,0.011891
GBM_grid__1_AutoML_20191222_092133_model_2,0.777556,0.600409,0.797979,0.353497,0.453296,0.205477,2550,0.021524
GBM_3_AutoML_20191222_092133,0.776389,0.563906,0.793284,0.328065,0.438274,0.192084,958,0.011582
GBM_4_AutoML_20191222_092133,0.770758,0.570912,0.790371,0.353743,0.441681,0.195082,1153,0.012144
DRF_1_AutoML_20191222_092133,0.765151,0.580246,0.783285,0.340491,0.445292,0.198285,1786,0.016763
XRT_1_AutoML_20191222_092133,0.765134,0.582172,0.783059,0.349171,0.446013,0.198928,2626,0.017842




# 5. Test models

In [37]:
y_test = test.as_data_frame()["response"]

In [41]:
out = {}
for name in tqdm(lb.as_data_frame()["model_id"].tolist()):
    
    out[name] = {}
    
    # Predict time
    model = h2o.get_model(name)
    predictions, out[name]["predict_time"] = timeit_out(model.predict)(test)
    
    # AUC
    y_pred = predictions.as_data_frame()["predict"]
    out[name]["auc"] = metrics.roc_auc_score(y_test, y_pred)

HBox(children=(IntProgress(value=0, max=13), HTML(value='')))

stackedensemble prediction progress: |████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%
drf prediction progress: |████████████████████████████████████████████████| 100%
drf prediction progress: |████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%
deeplearning prediction progress: |███████████████████████████████████████| 100%
glm prediction progress: |██

In [50]:
df = pd.DataFrame(out).T
df["train_time"] = lb.as_data_frame().set_index("model_id")["training_time_ms"]/1000
df["origin"] = f"AutoML_{training_minutes}min"
df

Unnamed: 0,predict_time,auc,train_time,origin
StackedEnsemble_AllModels_AutoML_20191222_092133,0.706728,0.678378,1.376,AutoML_2min
StackedEnsemble_BestOfFamily_AutoML_20191222_092133,0.482212,0.674103,0.793,AutoML_2min
GBM_5_AutoML_20191222_092133,0.086991,0.703021,1.07,AutoML_2min
GBM_1_AutoML_20191222_092133,0.077191,0.706582,0.873,AutoML_2min
GBM_2_AutoML_20191222_092133,0.081018,0.707154,0.844,AutoML_2min
GBM_grid__1_AutoML_20191222_092133_model_2,0.109668,0.696259,2.55,AutoML_2min
GBM_3_AutoML_20191222_092133,0.081856,0.700383,0.958,AutoML_2min
GBM_4_AutoML_20191222_092133,0.089506,0.713829,1.153,AutoML_2min
DRF_1_AutoML_20191222_092133,0.100762,0.634871,1.786,AutoML_2min
XRT_1_AutoML_20191222_092133,0.101667,0.632838,2.626,AutoML_2min


In [53]:
df.to_csv(f"results/AutoML_{training_minutes}min.csv")