## Chapter 4 - Deep learning

### Binding and merging data

In [None]:
import h2o
import pandas as pd
import numpy as np
# Connect to an H2O cluster before any frames or models are created.
# NOTE(review): per the H2O docs, init() starts a local cluster when none is reachable.
h2o.init()

In [None]:
# Sample dataset used throughout this chapter; more public datasets are listed
# at http://data.h2o.ai/
url = "https://s3.amazonaws.com/h2o-training/sparkling-water/allyears2k_headers.csv.gz"
# Load the gzipped CSV at `url` into an H2O frame.
data = h2o.import_file(url)

In [None]:
# Partition the rows into three frames: roughly 80% train, 10% valid and the
# remainder as test. The fixed seed makes the partition repeatable.
train, valid, test = data.split_frame(ratios=[0.8, 0.1], seed=69)

In [None]:
# Report the row counts of the three partitions as train/valid/test.
print("%d/%d/%d" % tuple(frame.nrows for frame in (train, valid, test)))

In [None]:
# Take the first 35255 rows of `data` (all columns). Python slices are
# zero-based and exclude the stop index, so the slice must start at 0;
# starting at 1 would skip the first row and yield only 35254 rows.
train2 = data[0:35255, :]
# Give the slice an explicit key on the cluster so it is easy to find in h2o.ls().
train2 = h2o.assign(train2, "first35255")

In [None]:
# List the keys of the objects currently stored on the H2O cluster.
h2o.ls()

In [None]:
# Number of columns in the full frame.
data.ncol

In [None]:
# All rows, first four columns (indices 0-3), as a separate frame.
dates = data[:,0:4]

In [None]:
# All rows, two columns selected by name.
airports = data[:,["Origin", "Dest"]]

In [None]:
# Column count of the `dates` slice (four columns were selected).
dates.ncol

In [None]:
# Column count of the `airports` slice (two columns were selected).
airports.ncol

#### Use cbind to join ("bind") columns

In [None]:
# Column-bind: place the columns of `dates` alongside those of `airports`.
# Both frames were sliced from `data` with all rows, so their row counts match.
a_and_d = airports.cbind(dates)

In [None]:
# Dimensions of the combined frame; it should have 2 + 4 = 6 columns.
a_and_d.dim

#### Use rbind to join rows

In [None]:
# Row-bind: stack the rows of `valid` and `test` underneath `train`,
# re-assembling the three partitions into one frame.
restored_data = train.rbind([valid, test])

In [None]:
# Dimensions of the re-assembled frame, to compare with data.dim.
restored_data.dim

In [None]:
# Dimensions of the original frame, to compare with restored_data.dim.
data.dim

In [None]:
# First rows (first four columns only) of the re-assembled frame.
# NOTE(review): train was bound first, so these presumably match train's head rather than data's.
restored_data[:,0:4].head()

In [None]:
# First rows (first four columns only) of the original frame, for comparison.
data[:,0:4].head()

In [None]:
# First rows (first four columns only) of the training partition, for comparison.
train[:,0:4].head()

#### Use h2o.merge() to join tables together when they have one or more columns in common

##### Unlike cbind() they can have a different number of rows, and
##### unlike rbind() they can have a different number of columns

### Deep Learning - Part 1

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Reload the sample dataset and redo the train/valid/test partition so this
# section can be run without the cells above it.
url = "https://s3.amazonaws.com/h2o-training/sparkling-water/allyears2k_headers.csv.gz"
data = h2o.import_file(url)

# Roughly 80% / 10% / 10%; the fixed seed makes the partition repeatable.
train, valid, test = data.split_frame(ratios=[0.8, 0.1], seed=69)

# Report the row counts as train/valid/test.
print("%d/%d/%d" % tuple(frame.nrows for frame in (train, valid, test)))

In [None]:
# Name of the response column the models are trained to predict.
y = "IsArrDelayed"

# Columns that must never be offered as predictors: the response itself and
# the delay/actual-time fields (presumably only known once the flight has
# happened, so they would leak the answer).
#
# Every entry needs its own trailing comma. Without one, Python silently
# concatenates adjacent string literals ("ActualElapsedTime" "ArrTime" becomes
# "ActualElapsedTimeArrTime"), and neither column would be excluded.
ignoreFields = [
    "ArrDelay", "DepDelay",
    "CarrierDelay", "WeatherDelay",
    "NASDelay", "SecurityDelay",
    "LateAircraftDelay",
    "IsDepDelayed", "IsArrDelayed",
    "ActualElapsedTime",  # But CRSElapsedTime is fine
    "ArrTime",  # But CRSArrTime is fine
]
# Predictors: every training column that is not on the ignore list,
# kept in the frame's own column order.
xAll = [
    column
    for column in train.names
    if column not in ignoreFields
]

In [None]:
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

In [None]:
m_def = H2ODeepLearningEstimator()
%time m_def.train(xAll, y, train, validation_frame = valid) # e.g. 61s

In [None]:
# Score the baseline model on the held-out test frame.
m_def.model_performance(test) # e.g. MSE of 0.132, error of 0.2002

In [None]:
# Display the baseline model's description and metrics.
m_def

In [None]:
# Chart the model's scoring history (needs matplotlib, imported above).
m_def.plot()

In [None]:
m_200_epochs = H2ODeepLearningEstimator(epochs = 200,
                                       stopping_rounds = 5, #Default
                                       stopping_tolerance = 0, #Default
                                       stopping_metric = "logloss" #Indirectly the default
                                       )
%time m_200_epochs.train(xAll, y, train, validation_frame = valid) 

In [None]:
# Score the 200-epoch model on the held-out test frame.
m_200_epochs.model_performance(test)

In [None]:
# Chart the 200-epoch model's scoring history.
m_200_epochs.plot()

In [None]:
# Display the 200-epoch model's description and metrics.
m_200_epochs

In [None]:
m_200x200x200 = H2ODeepLearningEstimator(epochs = 200,
                # same early stopping (default)
                hidden = [200,200,200]
                )
%time m_200x200x200.train(xAll, y, train, validation_frame = valid) 

In [None]:
# Score the 200x200x200 model on the held-out test frame.
m_200x200x200.model_performance(test)

In [None]:
# Chart the 200x200x200 model's scoring history.
m_200x200x200.plot()

In [None]:
m_400x400 = = H2ODeepLearningEstimator(epochs = 200,
                # same early stopping (default)
                hidden = [400,400]
                )
%time m_400x400.train(xAll, y, train, validation_frame = valid) 

# Score the 400x400 model on the held-out test frame.
m_400x400.model_performance(test)

# Chart the 400x400 model's scoring history.
m_400x400.plot()

### Why did 400x400 take longer than 200x200x200?

In [None]:
# Layer-by-layer summary table of the baseline network.
m_def.summary()

In [None]:
# Just the "units" column of the summary (presumably neurons per layer, input layer first).
m_def.summary()["units"]

In [None]:
# Weight count for a network with layers 3801 -> 200 -> 200 -> 2: one weight
# per connection between consecutive layers.
# NOTE(review): 3801 is presumably the input-layer size that summary() reported
# for one run; re-check it if the predictor list changes.
(3801 * 200) + (200 * 200) + (200 * 2) # Plus 200 + 200 + 2 biases

In [None]:
# Layer-by-layer summary table of the 200x200x200 network.
m_200x200x200.summary()

In [None]:
# Just the "units" column of the 200x200x200 summary.
m_200x200x200.summary()["units"]

In [None]:
# Layer-by-layer summary table of the 400x400 network.
m_400x400.summary()

In [None]:
# Just the "units" column of the 400x400 summary.
m_400x400.summary()["units"]

In [None]:
# Number of levels in each column of the training frame; used here to spot
# high-cardinality categorical columns.
train.nlevels() # enum cardinality

In [None]:
# Print a compact description of the training frame's columns.
train.structure()

#### Models without that high-cardinality column

In [None]:
# Same predictors as xAll, minus the TailNum column.
x2 = [column for column in xAll if column != 'TailNum']

In [None]:
m2_def = H2ODeepLearningEstimator()
%time m2_def.train(x2, y, train, validation_frame = valid) # e.g. 13s

In [None]:
m2_200_epochs = H2ODeepLearningEstimator(epochs = 200,
                                       stopping_rounds = 5, #Default
                                       stopping_tolerance = 0, #Default
                                       stopping_metric = "logloss" #Indirectly the default
                                       )
%time m2_200_epochs.train(x2, y, train, validation_frame = valid) 

In [None]:
m2_200x200x200 = H2ODeepLearningEstimator(epochs = 200,
                # same early stopping (default)
                hidden = [200,200,200]
                )
%time m2_200x200x200.train(x2, y, train, validation_frame = valid) 

In [None]:
m2_400x400 = H2ODeepLearningEstimator(epochs = 200,
                # same early stopping (default)
                hidden = [400,400]
                )
%time m2_400x400.train(x2, y, train, validation_frame = valid) 

In [None]:
# Models in (all predictors, reduced predictors) pairs, one pair per
# configuration, in the order the report template below expects.
all_models = [m_def, m2_def, m_200_epochs, m2_200_epochs,
              m_200x200x200, m2_200x200x200, m_400x400, m2_400x400]

# One "before --> after" line per configuration: eight values in total.
report_template = "    defaults: %.4f --> %.4f\n  200 epochs: %.4f --> %.4f\n 200x3: %.4f --> %.4f\n  400x2: %.4f --> %.4f\n"

# The % operator only unpacks multiple values from a tuple. A map object
# (Python 3) or a list (Python 2) is treated as a single argument and raises
# TypeError, so the metrics are collected into tuples.
# NOTE(review): logloss()/mse() are called without arguments; per the H2O docs
# that returns the training-set metric -- confirm this is the intended one.
loglosses = tuple(model.logloss() for model in all_models)
print(report_template % loglosses)

mse = tuple(model.mse() for model in all_models)
print(report_template % mse)

In [None]:
# Layer-by-layer summary table of the 400x400 network trained on all predictors.
m_400x400.summary()

## Deep Learning with Grids

## Deep Learning (Regression)