In [1]:
import h2o
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Connect to an H2O instance at the default local address; if none is running,
# a local server is started (see the startup log printed below).
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_144"; Java(TM) SE Runtime Environment (build 1.8.0_144-b01); Java HotSpot(TM) 64-Bit Server VM (build 25.144-b01, mixed mode)
  Starting server from /Users/yananli/anaconda2/lib/python2.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/zs/20dq6j0n6m59m3yz07h5lp6h0000gn/T/tmpwqDi2N
  JVM stdout: /var/folders/zs/20dq6j0n6m59m3yz07h5lp6h0000gn/T/tmpwqDi2N/h2o_yananli_started_from_python.out
  JVM stderr: /var/folders/zs/20dq6j0n6m59m3yz07h5lp6h0000gn/T/tmpwqDi2N/h2o_yananli_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,America/Los_Angeles
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.1
H2O cluster version age:,27 days
H2O cluster name:,H2O_from_python_yananli_zgp09i
H2O cluster total nodes:,1
H2O cluster free memory:,1.778 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [4]:
# Zipped airlines sample hosted in H2O's public test-data bucket.
url = "http://h2o-public-test-data.s3.amazonaws.com/smalldata/airlines/allyears2k_headers.zip"

# Let the H2O cluster fetch and parse the file into a frame.
data = h2o.import_file(path=url)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [5]:
# Two ratios produce three frames: roughly 80% train, 10% validation, and the
# remaining rows as test. The fixed seed keeps the split repeatable.
train, valid, test = data.split_frame(ratios=[0.8, 0.1], seed=123)

In [6]:
# Response column to predict.
y = "IsArrDelayed"

# Columns kept out of the predictor set: the response itself plus the fields
# listed here (presumably excluded because they reveal the outcome or act as
# identifiers -- confirm against the dataset description).
ignoreFields = [
    "ArrDelay",
    "DepDelay",
    "CarrierDelay",
    "WeatherDelay",
    "NASDelay",
    "SecurityDelay",
    "LateAircraftDelay",
    "IsDepDelayed",
    "IsArrDelayed",
    "ActualElapsedTime",
    "ArrTime",
    "TailNum",
]

# Predictors are every remaining column of the training frame.
x = [col for col in train.names if col not in ignoreFields]

In [7]:
# Number of cross-validation folds shared by all base models below.
nfolds = 5
# Append the validation split back onto the training rows: the models below are
# trained with k-fold cross-validation and no separate validation frame.
train2 = train.rbind(valid)

In [8]:
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

In [9]:
# Base learner 1: binomial GLM.
# It is cross-validated with the Modulo fold assignment and keeps its holdout
# predictions, matching the other base learners that feed the stacked ensemble.
m_GLM = H2OGeneralizedLinearEstimator(
    model_id="glm_def",
    family="binomial",
    nfolds=nfolds,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
)
m_GLM.train(x=x, y=y, training_frame=train2)

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [10]:
# Base learner 2: gradient boosting machine with default hyper-parameters.
# It is cross-validated with the Modulo fold assignment and keeps its holdout
# predictions, matching the other base learners that feed the stacked ensemble.
m_GBM = H2OGradientBoostingEstimator(
    model_id="gbm_def",
    nfolds=nfolds,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
)
m_GBM.train(x=x, y=y, training_frame=train2)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [11]:
# Base learner 3: random forest with default hyper-parameters.
# It is cross-validated with the Modulo fold assignment and keeps its holdout
# predictions, matching the other base learners that feed the stacked ensemble.
# A random forest samples rows and columns at random, so the seed is fixed (same
# value as the data split) to make repeated runs comparable. Exact
# reproducibility may still depend on the cluster configuration.
m_RF = H2ORandomForestEstimator(
    model_id="rf_def",
    nfolds=nfolds,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
    seed=123,
)
m_RF.train(x=x, y=y, training_frame=train2)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [12]:
# IDs of the three cross-validated base learners passed to the stacked ensemble.
models = [
    m_GLM.model_id,
    m_GBM.model_id,
    m_RF.model_id,
]

In [13]:
# Stacked ensemble built on top of the GLM, GBM and RF base learners listed in
# `models`; it is trained on the same frame as they were.
m_SE = H2OStackedEnsembleEstimator(
    base_models=models,
    model_id="SE_glm_gbm_rf",
)
m_SE.train(x=x, y=y, training_frame=train2)

stackedensemble Model Build progress: |███████████████████████████████████| 100%


### Analyzing the performance

In [14]:
# Models to compare; the order must match the labels in `names`.
all_models = [m_GLM, m_GBM, m_RF, m_SE]

In [15]:
# Row labels for the comparison tables, in the same order as `all_models`.
names = ["GLM", "GBM", "RF", "SE"]

In [16]:
# Log loss per model. Called without arguments, H2O reports the metric on the
# training data, so these figures are optimistic (compare the test-set numbers
# further down).
# A list comprehension is used instead of a bare `map`, which is a lazy iterator
# on Python 3; the values are materialised before pandas sees them.
pd.Series([model.logloss() for model in all_models], index=names)

GLM    0.573459
GBM    0.508888
RF     0.515060
SE     0.243823
dtype: float64

In [17]:
# AUC per model. Called without arguments, H2O reports the metric on the
# training data, so these figures are optimistic (compare the test-set numbers
# further down).
# A list comprehension is used instead of a bare `map`, which is a lazy iterator
# on Python 3; the values are materialised before pandas sees them.
pd.Series([model.auc() for model in all_models], index=names)

GLM    0.768770
GBM    0.850308
RF     0.833800
SE     0.990153
dtype: float64

In [18]:
# Cross-validated AUC per model.
# NOTE(review): the stacked ensemble shows NaN here, presumably because the
# ensemble itself was not trained with cross-validation -- confirm.
# A list comprehension is used instead of a bare `map`, which is a lazy iterator
# on Python 3; the values are materialised before pandas sees them.
pd.Series([model.auc(xval=True) for model in all_models], index=names)

GLM    0.761623
GBM    0.803619
RF     0.835628
SE          NaN
dtype: float64

In [19]:
# Score every model on the held-out test frame; results keep the order of `all_models`.
test_perf = [model.model_performance(test) for model in all_models]

In [20]:
# Test-set log loss per model, taken from the performance objects in `test_perf`.
# A list comprehension is used instead of a bare `map`, which is a lazy iterator
# on Python 3; the values are materialised before pandas sees them.
pd.Series([perf.logloss() for perf in test_perf], index=names)

GLM    0.577728
GBM    0.534036
RF     0.478401
SE     0.471865
dtype: float64

In [21]:
# Test-set AUC per model, taken from the performance objects in `test_perf`.
# A list comprehension is used instead of a bare `map`, which is a lazy iterator
# on Python 3; the values are materialised before pandas sees them.
pd.Series([perf.auc() for perf in test_perf], index=names)

GLM    0.769849
GBM    0.815807
RF     0.851701
SE     0.854022
dtype: float64

## Saving models: MOJO, POJO, and binary formats

In [22]:
# Export the GBM as a MOJO.
# `save_mojo` writes on the machine running the H2O server, while `download_mojo`
# fetches a copy to the machine running this notebook; with the local server
# used here both end up in the same directory.
# force=True lets this cell be re-run when the file from an earlier run exists.
m_GBM.save_mojo("/tmp/models/", force=True)
m_GBM.download_mojo("/tmp/models/")

In [None]:
# Download the GBM as a POJO (generated Java source) into the models directory.
m_GBM.download_pojo(path="/tmp/models/")

In [None]:
# Write the GBM's model details to the models directory.
# force=True overwrites output left by an earlier run so the cell can be re-run.
m_GBM.save_model_details("/tmp/models/", force=True)

In [None]:
# Save the GBM in H2O's binary model format.
# force=True overwrites the file from an earlier run; the default (force=False)
# raises an error once the destination already exists.
h2o.save_model(m_GBM, "/tmp/models/", force=True)

In [None]:
# Save the stacked ensemble in H2O's binary model format.
# force=True overwrites the file from an earlier run; the default (force=False)
# raises an error once the destination already exists.
h2o.save_model(m_SE, "/tmp/models/", force=True)