## Chapter 3 - Linear Models and more

In [None]:
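# h2o.import_file(path)  --> the H2O server reads the file itself, so the path must be
#                            visible to the server (the local file system when H2O runs locally)
# h2o.upload_file(path)  --> push a file from the Python client's machine up to the H2O server
# h2o.export_file(frame_name, destination)  --> write a frame out to the given destination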

In [None]:
import h2o
# Set up the H2O connection; every h2o.* call in the cells below relies on
# this having been run first.
h2o.init()

## Exploring with GLMs

In [None]:
# Data source: http://data.princeton.edu/wws509/datasets/#smoking
# Load the smoking data and store it under the frame id "smoking".
smoking = h2o.import_file(
    path="data/smoking.dat",
    destination_frame="smoking",
)

In [None]:
# Inspect the frame right after import to check how its columns were parsed.
smoking.summary()

In [None]:
import math  # kept from the original cell; nothing in this cell uses it

# Add a derived column "myval": dead / pop scaled by 1000, rounded to a
# whole number, written to the frame in a single assignment.
smoking["myval"] = ((smoking["dead"] / smoking["pop"]) * 1000).round()


In [None]:
# Summarise again so the newly added "myval" column shows up alongside the rest.
smoking.summary()

In [None]:
# Sum of the "pop" column over all rows of the frame.
smoking[:, "pop"].sum()

In [None]:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

In [None]:
# Predictor (x) and response (y) selections for the Poisson GLMs below,
# given as column positions rather than names.
# NOTE(review): position 5 is presumably the derived "myval" column -
# confirm against smoking.names before relying on it.
x, y = [1, 2], 5

In [None]:
# Poisson GLM on the smoking frame, using the x / y selections defined above.
# Optional cross-validation arguments, left switched off:
#     nfolds = 12,
#     fold_assignment = "Modulo"
m = H2OGeneralizedLinearEstimator(family="poisson", model_id="smoking_p")
m.train(x=x, y=y, training_frame=smoking)

In [None]:
# Performance metrics of the first Poisson model (no frame passed in).
m.model_performance()

In [None]:
# Fitted coefficients of the first Poisson model.
m.coef()

In [None]:
# Second Poisson GLM: same response as before, but "smoke" is the only predictor.
m2 = H2OGeneralizedLinearEstimator(family="poisson", model_id="smoking_p2")
m2.train(x="smoke", y=y, training_frame=smoking)

In [None]:
# Performance metrics of the "smoke"-only model, for comparison with m.
m2.model_performance()

In [None]:
# Fitted coefficients of the "smoke"-only model.
m2.coef()

## Naive Bayes 
(classification, not regression)

In [None]:
# Iris CSV (the file name indicates it carries a header row), fetched by URL.
url = "http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv"
iris = h2o.import_file(path=url)

In [None]:
# Split iris roughly 80/20 into train and test.
# The seed pins the random row assignment so the Naive Bayes metrics computed
# from these frames are repeatable from run to run; the airline split later in
# this notebook seeds its split_frame call the same way.
train, test = iris.split_frame(ratios=[0.8], seed=69)

In [None]:
# Summarise the training portion of the iris split.
train.summary()

In [None]:
# Number of rows that landed in the training split.
train.nrows

In [None]:
# Number of rows that landed in the test split.
test.nrows

In [None]:
from h2o.estimators.naive_bayes import H2ONaiveBayesEstimator

In [None]:
# Naive Bayes with default settings: the four measurement columns predict "class".
mNB = H2ONaiveBayesEstimator()
mNB.train(
    x=["sepal_len", "sepal_wid", "petal_len", "petal_wid"],
    y="class",
    training_frame=train,
)

In [None]:
# Display the trained default Naive Bayes model.
mNB

In [None]:
# Score the held-out test rows with the default Naive Bayes model.
p = mNB.predict(test_data=test)

In [None]:
# Display the predictions returned by mNB.predict.
p

In [None]:
# Evaluate the default Naive Bayes model on the held-out test frame.
mNB.model_performance(test_data=test)

In [None]:
# Same predictors and response as mNB, but with laplace set to 2.
mNB_2 = H2ONaiveBayesEstimator(laplace=2)
mNB_2.train(
    x=["sepal_len", "sepal_wid", "petal_len", "petal_wid"],
    y="class",
    training_frame=train,
)

In [None]:
# Evaluate the laplace = 2 model on the same test frame, for comparison with mNB.
mNB_2.model_performance(test_data=test)

## Data Manipulation

## Grid Search

In [None]:
import h2o.grid

In [None]:
# More public data sets: http://data.h2o.ai/
# Gzipped CSV with headers; it supplies the flight-delay columns (ArrDelay,
# UniqueCarrier, ...) used in the cells below.
url = "https://s3.amazonaws.com/h2o-training/sparkling-water/allyears2k_headers.csv.gz"
data = h2o.import_file(path=url)

In [None]:
# List the column names, to help pick the response and predictors below.
data.columns

In [None]:
# Three-way split with ratios 0.8 / 0.1 (the remainder becomes `test`),
# seeded so that the split is repeatable.
train, valid, test = data.split_frame(ratios=[0.8, 0.1], seed=69)

In [None]:
# Row counts of the three splits, printed as train/valid/test.
print("%d/%d/%d" % tuple(part.nrows for part in (train, valid, test)))

In [None]:
# Response column for the binomial models below.
y = "IsArrDelayed"

# Columns kept out of the predictor list (the response itself is among them).
# NOTE(review): the delay columns are presumably excluded because they give
# away the outcome - confirm with the data dictionary.
ignoreFields = [
    "ArrDelay",
    "DepDelay",
    "CarrierDelay",
    "WeatherDelay",
    "NASDelay",
    "SecurityDelay",
    "LateAircraftDelay",
    "IsDepDelayed",
    "IsArrDelayed",
    "ActualElapsedTime",  # But CRSElapsedTime is fine
]
# Predictors: every training-frame column that is not listed in ignoreFields.
xAll = [
    column
    for column in train.names
    if column not in ignoreFields
]
# Hand-picked shortlist of likely predictors.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated, and a missing comma after "Distance" used to fuse it
# with "Cancelled" into the non-existent column name "DistanceCancelled".
xLikely = [
    "Month",
    "DayOfWeek",
    "UniqueCarrier",
    "Origin",
    "Dest",
    "Distance",
    "Cancelled",
    "Diverted",
]

In [None]:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

In [None]:
# Baseline binomial GLM with otherwise default settings, scored against the
# validation split while training.
m_def = H2OGeneralizedLinearEstimator(family="binomial")
m_def.train(x=xAll, y=y, training_frame=train, validation_frame=valid)

In [None]:
# Validation-frame logloss of the baseline model, to compare the grid results against.
m_def.logloss(valid = True) # 0.623 (value noted by the author from an earlier run)

In [None]:
# Random search over alpha for a binomial GLM with lambda search enabled.
# Candidates are 0.00 .. 0.99 in steps of 0.01; the search is capped at
# 8 models and 30 seconds.
g = h2o.grid.H2OGridSearch(
    H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True),
    hyper_params={"alpha": [step * 0.01 for step in range(100)]},
    search_criteria={
        "strategy": "RandomDiscrete",
        "max_models": 8,
        "max_runtime_secs": 30,
    },
)
g.train(x=xAll, y=y, training_frame=train, validation_frame=valid)

In [None]:
# Display the results of the random alpha search.
g

In [None]:
# Exhaustive (Cartesian) search over seven hand-picked alpha values, using the
# same binomial GLM with lambda search as the base model.
g2 = h2o.grid.H2OGridSearch(
    H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True),
    hyper_params={"alpha": [0, 0.2, 0.4, 0.5, 0.6, 0.8, 0.99]},
    search_criteria={"strategy": "Cartesian"},
)
g2.train(x=xAll, y=y, training_frame=train, validation_frame=valid)

In [None]:
# Display the results of the Cartesian alpha search.
g2