In [None]:
import h2o

In [None]:
# Initialise h2o before any other h2o call in this notebook
# (presumably connects to or starts a local H2O cluster — confirm in the h2o docs).
h2o.init()

# Chapter 1

In [None]:
# Public iris CSV; the file name suggests it ships with a header row.
url = "http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv"
# Load the CSV into an H2OFrame that the following cells split and model.
iris = h2o.import_file(path=url)

In [None]:
# Split iris into roughly 80% train / 20% test.
# A fixed seed keeps the row assignment stable across kernel restarts, so the
# row counts and every metric computed on `test` are reproducible.
train, test = iris.split_frame(ratios=[0.8], seed=123)

In [None]:
# Show h2o's summary of the training split.
train.summary()

In [None]:
# Number of rows that landed in the training split.
train.nrows

In [None]:
# Number of rows that landed in the test split.
test.nrows

In [None]:
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

In [None]:
# Deep-learning model with all estimator defaults, fitted on the four
# iris measurement columns to predict the "class" column.
mDL = H2ODeepLearningEstimator()
mDL.train(
    x=["sepal_len", "sepal_wid", "petal_len", "petal_wid"],
    y="class",
    training_frame=train,
)

In [None]:
# Display the fitted deep-learning model (bare expression -> cell output).
mDL

In [None]:
# Score the held-out test split with the deep-learning model.
p = mDL.predict(test_data=test)

In [None]:
# Display the predictions returned by mDL.predict.
p

In [None]:
# Performance metrics of the deep-learning model on the test split.
mDL.model_performance(test_data=test)

## AutoML

In [None]:
from h2o.automl import H2OAutoML

In [None]:
# AutoML run limited by max_runtime_secs=30, using the same predictors
# and target as the deep-learning model.
mA = H2OAutoML(max_runtime_secs=30)
mA.train(
    x=["sepal_len", "sepal_wid", "petal_len", "petal_wid"],
    y="class",
    training_frame=train,
)

In [None]:
# Display the AutoML object after training (bare expression -> cell output).
mA

In [None]:
# Score the test split with the AutoML leader model.
mA.leader.predict(test_data=test)

In [None]:
# Performance metrics of the AutoML leader model on the test split.
mA.leader.model_performance(test_data=test)

# Chapter 2
## Random Forest

In [None]:
from h2o.estimators.random_forest import H2ORandomForestEstimator

In [None]:
# Random forest with all estimator defaults, fitted on the four
# iris measurement columns to predict the "class" column.
mRF = H2ORandomForestEstimator()
mRF.train(
    x=["sepal_len", "sepal_wid", "petal_len", "petal_wid"],
    y="class",
    training_frame=train,
)

In [None]:
# Display the fitted random-forest model (bare expression -> cell output).
mRF

In [None]:
# Performance metrics of the random forest on the test split.
mRF.model_performance(test_data=test)

In [None]:
# Print the estimator's built-in documentation (parameter reference).
# The class is already in scope from the random_forest import cell.
help(H2ORandomForestEstimator)

## Gradient Boosting Machines (GBM)

In [None]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [None]:
# Gradient boosting machine with all estimator defaults, fitted on the
# four iris measurement columns to predict the "class" column.
mGBM = H2OGradientBoostingEstimator()
mGBM.train(
    x=["sepal_len", "sepal_wid", "petal_len", "petal_wid"],
    y="class",
    training_frame=train,
)

In [None]:
# Display the fitted GBM model (bare expression -> cell output).
mGBM

In [None]:
# Performance metrics of the GBM on the test split.
mGBM.model_performance(test_data=test)

In [None]:
# Print the estimator's built-in documentation (parameter reference).
# The class is already in scope from the gbm import cell.
help(H2OGradientBoostingEstimator)

## Importing data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Evenly spaced grid of 1001 points: 0.00, 0.01, ... up to 1000 * 0.01.
x = [
    step * 0.01
    for step in range(1001)
]

In [None]:
# Peek at both ends of the grid: the first five entries, then indices 995-1000.
print(x[:5])
print(x[995:1001])

In [None]:
# Noisy sine samples over the x grid: sin(x) plus Gaussian noise (mean 0, sd 0.1).
# The noise is drawn from a generator with a fixed seed so that y, and anything
# derived from it, is identical after a kernel restart.
noise_rng = np.random.default_rng(123)
y = np.sin(x) + noise_rng.normal(0, 0.1, len(x))

In [None]:
# Plot the noisy sine samples. Explicit fig/ax plus a title and axis labels
# so the figure is readable when the notebook is skimmed.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set(title="Noisy sine wave", xlabel="x", ylabel="y")
plt.show()

In [None]:
# Two-column pandas frame: 'a' holds the x grid, 'b' the matching y values.
sine_wave = pd.DataFrame(data={'a': x, 'b': y})

In [None]:
# Convert the pandas frame into an H2OFrame (no destination name given).
sine_wave_h2o = h2o.H2OFrame(python_obj=sine_wave)

In [None]:
# Show h2o's summary of the uploaded sine-wave frame.
sine_wave_h2o.summary()

In [None]:
# Same conversion again, this time passing destination_frame="sine_wave".
# The result rebinds sine_wave_h2o.
sine_wave_h2o = h2o.H2OFrame(python_obj=sine_wave, destination_frame="sine_wave")

In [None]:
# Convert the H2OFrame back via as_data_frame() and show only the head,
# rather than dumping every row.
sine_wave_h2o.as_data_frame().head()

## Artificial Datasets

## Overfitting Train-Valid-Test

In [None]:
# people = h2o.get_frame("people")
# train, valid, test = people.split_frame(
#    ratios = [0.8, 0.1],
#    destination_frames = ["people_train", "people_valid","people_test"],
#    seed = 123
# )
# print("%d/%d/%d" % (train.nrows, valid.nrows, test.nrows))

## GBM with valid

In [None]:
# y = "income"
# ignoreFields = [y, "id"]
# x = [i for i in train.names if i not in ignoreFields]

# m1 = H2OGradientBoostingEstimator(model_id = "defaults")
# m1.train(x, y, train, validation_frame = valid)

# m1.mae(train = True)
# m1.mae(valid = True)

# perf = m1.model_performance(test)
# perf.mae()

## GBM Overfit!

In [None]:
# m2 = H2OGradientBoostingEstimator(
#    model_id = "overfit",
#    ntrees = 1000,
#    max_depth = 10
# )

# m2.train(x, y, train, validation_frame = valid)

# print("Train: %d --> %d" % (m1.mae(train = True), m2.mae(train = True)))
# print("Valid: %d --> %d" % (m1.mae(valid = True), m2.mae(valid = True)))
# print(" Test: %d --> %d" % (perf.mae(), m2.model_performance(test).mae()))

## Cross-validation

In [None]:
# train, test = people.split_frame(
#    ratios = [0.897],
#    destination_frames = ["people_train", "people_test"],
#    seed = 123
# )

# print("%d/%d" % (train.nrows, test.nrows))

## Overfitting CV

In [None]:
# y = "income"
# ignoreFields = [y, "id"]
# x = [i for i in train.names if i not in ignoreFields]

# m4 = H2OGradientBoostingEstimator(
#     model_id = "overfit9folds", 
#     ntrees = 1000,
#     max_depth = 10,
#     nfolds = 9 
# )
# m4.train(x, y, train)

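# NOTE(review): m3 is not defined anywhere in the visible notebook — presumably an
# earlier cross-validated GBM; define it before uncommenting the lines below.
# m3.mae(train = True)
# m3.mae(xval = True)

# print("Train: %d --> %d" % (m3.mae(train = True), m4.mae(train = True)))
# print("Valid: %d --> %d" % (m3.mae(xval = True), m4.mae(xval = True)))
# print(" Test: %d --> %d" % (perf.mae(), m4.model_performance(test).mae()))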