In [None]:
# Reference: https://www.kdnuggets.com/2020/01/h2o-framework-machine-learning.html

In [None]:
import pandas as pd
import numpy as np
import h2o
import featuretools as ft
# Raise pandas' display width so wide tables are not wrapped when printed.
pd.options.display.width = 5000

In [None]:
# Initialise the H2O session; every later cell talks to H2O through it.
# NOTE(review): h2o.init() presumably connects to a local H2O server and
# starts one if none is running — confirm against the H2O documentation.
h2o.init()

In [None]:
# Upload the bank marketing CSV from the local data directory to H2O.
bank_df = h2o.upload_file(path="../data/bank-additional-full.csv")

In [None]:
# Preview the first rows of the uploaded frame.
bank_df.head()

In [None]:
# Show the sixth row (zero-based row index 5) with all of its columns.
sixth_row = bank_df[5, :]
print(sixth_row)

In [None]:
# Y holds the name of the target column.
Y = "y"

# x holds the predictor names: all column names of the frame with the
# target column removed.
x = bank_df.names
x.remove(Y)
print(x)

In [None]:
# Split the frame in two with a 0.7 ratio: the first part is used for
# training, the remainder for testing. The seed keeps the split repeatable.
train, test = bank_df.split_frame(ratios=[0.7], seed=42)

In [None]:
from h2o.estimators import H2ORandomForestEstimator

In [None]:
# Random forest with 200 trees, trained on the training split and scored
# against the test split as validation frame. The seed makes the run
# repeatable, in line with the seeded train/test split.
rf = H2ORandomForestEstimator(ntrees=200, seed=42)
rf.train(x=x,
         y=Y,
         training_frame=train,
         validation_frame=test)

In [None]:
# Print the text summary of the trained random forest model.
print(rf)

In [None]:
# Compute the test-set accuracy by hand: compare each predicted label with
# the actual label and take the mean of the matches.
predictions_rf = rf.predict(test)
rf_matches = predictions_rf["predict"] == test["y"]
rf_matches.mean()

In [None]:
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

# Neural network with three hidden layers (100, 10 and 4 units) and Tanh
# activations. The seed fixes the random number stream, in line with the
# seeded train/test split.
# NOTE(review): H2O deep learning is presumably only fully reproducible when
# it is also run with reproducible=True — confirm in the H2O documentation.
dl = H2ODeepLearningEstimator(hidden=[100, 10, 4], activation='Tanh', seed=42)
dl.train(x=x, y=Y, training_frame=train, validation_frame=test)

# Test-set accuracy: mean of the matches between predicted and actual labels.
predictions_dl = dl.predict(test)
print((predictions_dl["predict"] == test["y"]).mean())

In [None]:
# Print the text summary of the trained deep learning model.
print(dl)

In [None]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator

# Gradient boosting machine with default hyper-parameters, trained on the
# training split and scored against the test split as validation frame.
# The seed makes the run repeatable, in line with the seeded split.
gb = H2OGradientBoostingEstimator(seed=42)
gb.train(x=x,
         y=Y,
         training_frame=train,
         validation_frame=test)

In [None]:
# Print the text summary of the trained gradient boosting model.
print(gb)

In [None]:
# XGBoost with 10-fold cross-validation and a fixed seed
from h2o.estimators.xgboost import H2OXGBoostEstimator

# A parameter set that was commented out here and is NOT applied below:
#   ntrees=400, max_depth=4, learn_rate=0.01, sample_rate=0.4,
#   col_sample_rate_per_tree=0.8, min_rows=5, seed=4241,
#   score_tree_interval=100
xgb = H2OXGBoostEstimator(seed=1, nfolds=10)
xgb.train(
    x=x,
    y=Y,
    training_frame=train,
    validation_frame=test,
)

In [None]:
# Test-set accuracy of the XGBoost model: mean of the matches between the
# predicted and the actual labels.
predictions_xgb = xgb.predict(test)
xgb_matches = predictions_xgb["predict"] == test["y"]
xgb_matches.mean()

In [None]:
# Print the text summary of the trained XGBoost model.
print(xgb)

In [None]:
# Cross Validation
# Random forest with 200 trees evaluated by 3-fold cross-validation on the
# whole frame, so no separate validation frame is passed. The seed keeps the
# run repeatable, in line with the other seeded steps of the notebook.
rf_cv = H2ORandomForestEstimator(ntrees=200, nfolds=3, seed=42)
rf_cv.train(x=x, y=Y, training_frame=bank_df)
print(rf_cv)

In [None]:
# Grid Search over XGBoost hyper-parameters
from h2o.grid.grid_search import H2OGridSearch

# Candidate values for each hyper-parameter explored by the grid.
xgb_parameters = dict(
    max_depth=[3, 6],
    sample_rate=[0.4, 0.7],
    col_sample_rate=[0.8, 1.0],
    ntrees=[200, 300],
)

xgb_grid_search = H2OGridSearch(
    model=H2OXGBoostEstimator,
    grid_id='example_grid',
    hyper_params=xgb_parameters,
)

# learn_rate and seed are passed once here and so are shared by the models
# of the grid.
# NOTE(review): the test split doubles as the validation frame, so the
# ranking below is presumably based on the same data later used to report
# accuracy — consider a dedicated validation split.
xgb_grid_search.train(
    x=x,
    y=Y,
    training_frame=train,
    validation_frame=test,
    learn_rate=0.3,
    seed=42,
)

# Rank the trained models by accuracy, best first.
grid_results = xgb_grid_search.get_grid(sort_by='accuracy', decreasing=True)
print(grid_results)

In [None]:
# Auto ML

from h2o.automl import H2OAutoML

# Run AutoML on the training split with a budget of 240 seconds.
# The seed reduces run-to-run variation, in line with the seeded split.
# NOTE(review): with a time budget (max_runtime_secs) the set of models that
# gets trained presumably still depends on machine speed, so results may not
# be exactly repeatable — confirm in the H2O AutoML documentation.
autoML = H2OAutoML(max_runtime_secs=240, seed=42)
autoML.train(x=x,
             y=Y,
             training_frame=train)

# Ranking of the models AutoML trained.
leaderboard = autoML.leaderboard
print(leaderboard)

In [None]:
# Display the leader model of the AutoML run.
autoML.leader

In [None]:
# Predict on the test split with the AutoML object and print the result.
predictionAML = autoML.predict(test_data=test)
print(predictionAML)