In [1]:
import pandas as pd
import numpy as np
import h2o
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
import time 
import datetime
from datetime import datetime

In [2]:
h2o.init() # start h2o
# 2 cluster 1.759Gb

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.181-b13, mixed mode)
  Starting server from C:\Users\gubingjing\AppData\Local\Continuum\anaconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\GUBING~1\AppData\Local\Temp\tmp825w9w_y
  JVM stdout: C:\Users\GUBING~1\AppData\Local\Temp\tmp825w9w_y\h2o_gubingjing_started_from_python.out
  JVM stderr: C:\Users\GUBING~1\AppData\Local\Temp\tmp825w9w_y\h2o_gubingjing_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,Asia/Shanghai
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.0.2
H2O cluster version age:,1 month and 5 days
H2O cluster name:,H2O_from_python_gubingjing_r3ompf
H2O cluster total nodes:,1
H2O cluster free memory:,1.759 Gb
H2O cluster total cores:,2
H2O cluster allowed cores:,2


In [3]:
dataPath = "http://coursera.h2o.ai/house_data.3487.csv"
data = h2o.import_file(path=dataPath)  # import dataset to h2oframe

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [4]:
# generate date feature to year and month
data['year'] = data['date'].substring(0,4)
data['month'] = data['date'].substring(4,6)
# transform some features to categorical features
data['waterfront'] =  data['waterfront'].asfactor()
data['view'] =  data['view'].asfactor()
data['condition'] =  data['condition'].asfactor()
data['grade'] =  data['grade'].asfactor()
data['zipcode'] =  data['zipcode'].asfactor()
data['year'] =  data['year'].asfactor()
data['month'] =  data['month'].asfactor()
# split 90% for train and 10% for test
data_split = data.split_frame(ratios=[0.9],seed=123)  
train = data_split[0]
test = data_split[1]
X = ['bedrooms','bathrooms','sqft_living',
     'sqft_lot','floors','waterfront',
     'view','condition','grade',
     'sqft_above','sqft_basement','yr_built',
     'yr_renovated','zipcode','lat',
     'long','sqft_living15','sqft_lot15']
y = 'price'

In [5]:
# train a linear regression model in 10-fold cross validation with lasso regularization 
# set seed to make sure reproducible
model_v1 = H2OGeneralizedLinearEstimator(family= "gaussian", lambda_= 0.1, alpha = 1,
                                         nfolds = 10, seed = 123,keep_cross_validation_predictions=True)
model_v1.train(X, y, training_frame= train)
print('rmse for penalized regression model:{}'.format(model_v1.rmse(xval=True)))
h2o.save_model(model=model_v1, path="/mymodel", force=True) # save model

glm Model Build progress: |███████████████████████████████████████████████| 100%
rmse for penalized regression model:151138.85161107982


'C:\\mymodel\\GLM_model_python_1545899671381_1'

In [6]:
# train a gbm model in 10-fold cross validation 
# set seed to make sure reproducible
model_v2 = H2OGradientBoostingEstimator(
    ntrees=400,
    learn_rate=0.2,
    max_depth=8,
    stopping_tolerance=0.01, 
    stopping_rounds=2,
    score_each_iteration=True,
    seed=123,
    nfolds = 10,
    keep_cross_validation_predictions=True
)
model_v2.train(X, y, training_frame= train)
print('rmse for GBM model:{}'.format(model_v2.rmse(xval=True)))
h2o.save_model(model=model_v2, path="/mymodel", force=True) # save model

gbm Model Build progress: |███████████████████████████████████████████████| 100%
rmse for GBM model:122640.99558214223


'C:\\mymodel\\GBM_model_python_1545899671381_2'

In [7]:
# train a random forest  model in 10-fold cross validation 
# set seed to make sure reproducible
model_v3 = H2ORandomForestEstimator(
    ntrees=50,
    max_depth=10,
    stopping_tolerance=0.01, 
    stopping_rounds=2,
    score_each_iteration=True,
    seed=123,
    nfolds = 10,
    keep_cross_validation_predictions=True
)
model_v3.train(X, y, training_frame= train)
print('rmse for RF model:{}'.format(model_v3.rmse(xval=True)))
h2o.save_model(model=model_v3, path="/mymodel", force=True) # save model

drf Model Build progress: |███████████████████████████████████████████████| 100%
rmse for RF model:137898.55625461036


'C:\\mymodel\\DRF_model_python_1545899671381_3'

In [8]:
# train a deep learning model in 10-fold cross validation 
# set seed to make sure reproducible
model_v4 = H2ODeepLearningEstimator(epochs=10,
                                    nfolds=10,
                                    seed=123,
                                    l1=1e-5,
                                    l2=1e-5,
                                    activation="RectifierWithDropout",
                                    hidden=[256,512,64],
                                    input_dropout_ratio = 0.1,
                                    hidden_dropout_ratios = [0.15,0.2,0.05],
                                    keep_cross_validation_predictions=True)
model_v4.train(X, y, training_frame= train)
print('rmse for DL model:{}'.format(model_v4.rmse(xval=True)))
h2o.save_model(model=model_v4, path="/mymodel", force=True) # save model

deeplearning Model Build progress: |██████████████████████████████████████| 100%
rmse for DL model:125834.89760008856


'C:\\mymodel\\DeepLearning_model_python_1545899671381_4'

In [9]:
# train a ensemble model based on four models above
# set seed to make sure reproducible
ensemble = H2OStackedEnsembleEstimator(base_models=[model_v4,model_v3, model_v2,model_v1],
                                       metalearner_algorithm = 'deeplearning',seed=123, 
                                       metalearner_params = {'hidden':[128,384,512],'activation':"RectifierWithDropout",
                                                             'input_dropout_ratio': 0.1}
                                       )
ensemble.train(X, y, training_frame=train)
print('rmse for ensemble model:{}'.format(ensemble.rmse()))
h2o.save_model(model=ensemble, path="/mymodel", force=True) # save model

stackedensemble Model Build progress: |███████████████████████████████████| 100%
rmse for ensemble model:98181.47855753504


'C:\\mymodel\\StackedEnsemble_model_python_1545899671381_5'

In [10]:
# get the model performance on test set
ensemble.model_performance(test)

## the RMSE is 121,523 which is an RMSE below 123,000.


ModelMetricsRegression: stackedensemble
** Reported on test data. **

MSE: 14768032388.45377
RMSE: 121523.79350750112
MAE: 72606.6977994553
RMSLE: 0.19035198217892466
Mean Residual Deviance: 14768032388.45377




In [None]:
h2o.cluster().shutdown()