In [1]:
import random

import h2o
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator

In [2]:
# Print floats in fixed-point notation rather than scientific notation, so that
# prices of around a million stay readable in the tables below.
# The fully qualified option name is used: abbreviated keys are resolved by
# pattern matching and can become ambiguous if pandas adds similar options.
pd.set_option('display.float_format', '{:f}'.format)

# Seed NumPy's global generator so the synthetic data set is identical on every run.
np.random.seed(0)

In [3]:
# Attach this session to an H2O cluster; every frame and model below lives on it.
# (In the recorded run it connected to an instance at http://localhost:54321.)
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,1 day 9 hours 38 mins
H2O cluster timezone:,America/Los_Angeles
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.1
H2O cluster version age:,19 days
H2O cluster name:,H2O_from_python_yananli_xj2flq
H2O cluster total nodes:,1
H2O cluster free memory:,1.500 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


## Create a data set

In [4]:
# Number of synthetic houses; the cells that follow size their columns from it.
N = 1000

# House area in square metres: normal draw (mean 100, sd 10), with values
# outside [70, 130] clamped to the nearest bound.
area = np.clip(np.random.normal(100, 10, N), 70, 130)

df = pd.DataFrame({'area': area})

In [5]:
# House location. The price formula later in the notebook does not use the city,
# so this column is a pure-noise feature for the models.
# The list is cycled row by row; its repeated entries make the cities appear in
# a 3 : 1 : 1 : 2 ratio (San Francisco : San Diego : San Jose : Los Angeles).
location = ["San Francisco", "San Francisco", "San Francisco", "San Diego", "San Jose", "Los Angeles", "Los Angeles"]

df['location'] = [location[i % len(location)] for i in range(N)]

In [6]:
# Number of bedrooms: a uniform draw on [1, 5) truncated to an integer,
# which yields whole numbers from 1 to 4.
bedroom_draws = np.random.uniform(1, 5, N)
df['numBedroom'] = bedroom_draws.astype(int)

In [7]:
# Year built: a uniform draw on [1900, 2019) truncated to a whole year.
# Age is measured relative to 2019, so it is always at least 1.
REFERENCE_YEAR = 2019

year = np.random.uniform(1900, REFERENCE_YEAR, N).astype(int)
df['age'] = REFERENCE_YEAR - year

In [8]:
# House price (the synthetic ground truth the models should recover):
#   price = 10000 * area + 20000 + 10000 * numBedroom - 1000 * age + noise
# `location` is intentionally not part of the formula.
price = df['area'] * 10000 + 20000
price += df['numBedroom'] * 10000
price -= df['age'] * 1000

# One uniform noise term in [0, 20000) per row. The size follows the frame
# rather than a hard-coded 1000, so changing the number of houses cannot cause
# a length mismatch here.
price += np.random.uniform(0, 20000, len(df))

df['price'] = price

In [25]:
# Preview the first rows of the synthetic data set.
df.head()

Unnamed: 0,area,location,numBedroom,age,price
0,117.640523,San Francisco,4,3,1252998.522406
1,104.001572,San Francisco,3,90,1003807.198154
2,109.78738,San Francisco,4,35,1132549.269337
3,122.408932,San Diego,4,59,1228406.392994
4,118.67558,San Jose,4,63,1183959.415463


In [10]:
# Summary statistics for the numeric columns (sanity check on the generated ranges).
df.describe()

Unnamed: 0,area,numBedroom,age,price
count,1000.0,1000.0,1000.0,1000.0
mean,99.547894,2.558,59.683,991044.36683
std,9.873878,1.118878,34.423531,107151.725272
min,70.0,1.0,1.0,704951.667264
25%,93.015799,2.0,29.0,919441.626183
50%,99.41972,3.0,61.0,990188.415309
75%,106.069506,4.0,88.25,1062624.139039
max,127.593551,4.0,119.0,1267343.515918


## Save data to flow

In [11]:
# Upload the pandas frame to the H2O cluster under the key "house"
# (presumably so it can also be found by that name in H2O Flow).
house = h2o.H2OFrame(df, destination_frame = "house")

Parse progress: |█████████████████████████████████████████████████████████| 100%


## Split data into training and test sets

In [12]:
# Hold out roughly 10% of the rows as a test set and keep the rest for training.
# Both parts are stored on the cluster under the given keys; the fixed seed
# keeps the split repeatable.
train, test = house.split_frame(
    ratios = [0.9],
    destination_frames = ["house_train", "house_test"],
    seed = 123)

## Train a random forest model

In [13]:
# Baseline random forest: default hyper-parameters, evaluated with 10-fold
# cross-validation on the training frame.
# The explicit seed fixes fold assignment and the forest's random sampling, so
# a re-run on the same cluster setup can reproduce the metrics.
mRF = H2ORandomForestEstimator(model_id = "mRF", nfolds = 10, seed = 123)
mRF.train(
    x = ["area", "location", "numBedroom", "age"],
    y = "price",
    training_frame = train)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [14]:
# Show the fitted model: summary, training and cross-validation metrics,
# scoring history and variable importances.
mRF

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  mRF


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,50.0,50.0,169448.0,12.0,18.0,14.22,87.0,456.0,265.5




ModelMetricsRegression: drf
** Reported on train data. **

MSE: 1244595733.09
RMSE: 35278.8283974
MAE: 24969.6760707
RMSLE: 0.0366346804058
Mean Residual Deviance: 1244595733.09

ModelMetricsRegression: drf
** Reported on cross-validation data. **

MSE: 1173786143.89
RMSE: 34260.5625156
MAE: 24051.4711714
RMSLE: 0.0354530321086
Mean Residual Deviance: 1173786143.89

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid,cv_10_valid
0,mae,24273.75,3797.0574,26441.562,21264.363,21293.24,27237.115,33046.035,22222.633,21160.092,21554.97,24045.637,24471.844
1,mean_residual_deviance,1203563010.0,432224544.0,1489628160.0,818077700.0,966329600.0,1682860030.0,2084808960.0,998063940.0,750383740.0,872370690.0,1063139330.0,1309968380.0
2,mse,1203563010.0,432224544.0,1489628160.0,818077700.0,966329600.0,1682860030.0,2084808960.0,998063940.0,750383740.0,872370690.0,1063139330.0,1309968380.0
3,r2,0.8967049,0.0258996,0.88856816,0.9191855,0.9175463,0.845549,0.8589574,0.9103065,0.91177404,0.9207013,0.89789844,0.89656264
4,residual_deviance,1203563010.0,432224544.0,1489628160.0,818077700.0,966329600.0,1682860030.0,2084808960.0,998063940.0,750383740.0,872370690.0,1063139330.0,1309968380.0
5,rmse,34228.65,5959.3633,38595.7,28602.059,31085.842,41022.676,45659.707,31592.15,27393.133,29535.922,32605.818,36193.484
6,rmsle,0.035388786,0.006011203,0.037697714,0.027826628,0.033571247,0.04639943,0.043949563,0.033255596,0.028744396,0.0323938,0.03316254,0.03688696



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2020-01-04 23:13:18,3.063 sec,0.0,,,
1,,2020-01-04 23:13:18,3.067 sec,1.0,40333.153601,28178.345265,1626763279.371555
2,,2020-01-04 23:13:18,3.071 sec,2.0,34016.204087,24682.890596,1157102140.504331
3,,2020-01-04 23:13:18,3.074 sec,3.0,46490.262909,32665.686975,2161344545.329551
4,,2020-01-04 23:13:18,3.076 sec,4.0,48459.971069,34849.33001,2348368795.993856
5,,2020-01-04 23:13:18,3.079 sec,5.0,50203.440872,35959.636294,2520385475.411994
6,,2020-01-04 23:13:18,3.083 sec,6.0,49521.675221,34703.82075,2452396316.739819
7,,2020-01-04 23:13:18,3.085 sec,7.0,50497.331833,35787.825613,2549980522.2329
8,,2020-01-04 23:13:18,3.089 sec,8.0,48061.43267,34194.846301,2309901310.326316
9,,2020-01-04 23:13:18,3.092 sec,9.0,47967.188362,34092.110144,2300851159.363013



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,area,245686071197696.0,1.0,0.80553
1,age,42560944865280.0,0.173233,0.139544
2,numBedroom,11938142617600.0,0.048591,0.039142
3,location,4814308638720.0,0.019595,0.015785




In [15]:
# Re-score mRF on every row of the training frame (in-sample error).
# Note the difference from mRF.model_performance(train = True), which reports
# the training metrics stored with the model instead of scoring the frame again.
mRF.model_performance(train)


ModelMetricsRegression: drf
** Reported on test data. **

MSE: 688617118.297
RMSE: 26241.5151677
MAE: 18745.9853946
RMSLE: 0.0271783987811
Mean Residual Deviance: 688617118.297




In [16]:
# Score mRF on the held-out test frame (rows it was not trained on).
mRF.model_performance(test)


ModelMetricsRegression: drf
** Reported on test data. **

MSE: 1602251729.17
RMSE: 40028.1367186
MAE: 25223.1561037
RMSLE: 0.0454520218522
Mean Residual Deviance: 1602251729.17




## Train an overfitted random forest model

In [17]:
# Second forest: 1000 trees instead of the 50 used by mRF, with tree depth
# capped at 10; same predictors, target and 10-fold cross-validation.
# NOTE(review): in the recorded run mRF grew trees of depth 12-18, so a cap of
# 10 makes these trees shallower; this setup may regularize rather than
# overfit - confirm the intended hyper-parameters.
# The explicit seed lets a re-run on the same cluster setup reproduce the metrics.
mRFoverfit = H2ORandomForestEstimator(
    model_id = "mRFoverfit",
    nfolds = 10,
    ntrees = 1000,
    max_depth = 10,
    seed = 123)
mRFoverfit.train(
    x = ["area", "location", "numBedroom", "age"],
    y = "price",
    training_frame = train)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [18]:
# Show the fitted model: summary, training and cross-validation metrics,
# scoring history and variable importances.
mRFoverfit

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  mRFoverfit


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,1000.0,1000.0,2769895.0,9.0,10.0,9.999,22.0,358.0,216.071




ModelMetricsRegression: drf
** Reported on train data. **

MSE: 1133871353.02
RMSE: 33673.0062962
MAE: 23254.3103815
RMSLE: 0.0349128300898
Mean Residual Deviance: 1133871353.02

ModelMetricsRegression: drf
** Reported on cross-validation data. **

MSE: 1163743753.62
RMSE: 34113.6886546
MAE: 23753.6828627
RMSLE: 0.0354900300501
Mean Residual Deviance: 1163743753.62

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid,cv_10_valid
0,mae,23799.127,2493.3762,20848.416,25152.201,22728.316,21840.795,20681.975,27047.885,26820.965,21957.174,24421.832,26491.71
1,mean_residual_deviance,1165295230.0,263016000.0,840969920.0,1202550660.0,1057546180.0,1098422270.0,808165310.0,1568964860.0,1364970880.0,1012155840.0,1151881220.0,1547325570.0
2,mse,1165295230.0,263016000.0,840969920.0,1202550660.0,1057546180.0,1098422270.0,808165310.0,1568964860.0,1364970880.0,1012155840.0,1151881220.0,1547325570.0
3,r2,0.89825886,0.015310324,0.9165272,0.9019194,0.9153894,0.8841248,0.91714185,0.8766255,0.88718146,0.9018067,0.9009905,0.88088167
4,residual_deviance,1165295230.0,263016000.0,840969920.0,1202550660.0,1057546180.0,1098422270.0,808165310.0,1568964860.0,1364970880.0,1012155840.0,1151881220.0,1547325570.0
5,rmse,33941.344,3841.3625,28999.482,34677.812,32519.936,33142.453,28428.248,39610.16,36945.51,31814.396,33939.375,39336.06
6,rmsle,0.03532536,0.004515743,0.030864313,0.03571988,0.032535758,0.03459331,0.02987681,0.043213286,0.04206972,0.033413935,0.03283215,0.038134437



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2020-01-04 23:13:40,20.984 sec,0.0,,,
1,,2020-01-04 23:13:40,20.988 sec,1.0,63508.471311,42849.498881,4033325928.271548
2,,2020-01-04 23:13:40,20.990 sec,2.0,50845.688087,34770.546535,2585283997.014684
3,,2020-01-04 23:13:40,20.992 sec,3.0,48769.76133,33484.982407,2378489620.203253
4,,2020-01-04 23:13:40,20.994 sec,4.0,50051.900777,34367.438475,2505192771.372954
5,,2020-01-04 23:13:40,20.996 sec,5.0,52585.162857,37092.001185,2765199352.692716
6,,2020-01-04 23:13:40,20.998 sec,6.0,48951.351067,34415.90846,2396234771.269279
7,,2020-01-04 23:13:40,21.000 sec,7.0,48295.425777,33830.051957,2332448150.997348
8,,2020-01-04 23:13:40,21.002 sec,8.0,45173.266793,31609.532327,2040624032.787124
9,,2020-01-04 23:13:40,21.004 sec,9.0,43063.697119,30322.498677,1854482009.558304



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,area,4892487978057728.0,1.0,0.796068
1,age,902672404185088.0,0.184502,0.146876
2,numBedroom,246424151261184.0,0.050368,0.040096
3,location,104229100322816.0,0.021304,0.016959




In [19]:
# Re-score mRFoverfit on every row of the training frame (in-sample error).
mRFoverfit.model_performance(train)


ModelMetricsRegression: drf
** Reported on test data. **

MSE: 644543291.101
RMSE: 25387.8571585
MAE: 17983.4621947
RMSLE: 0.0264296033368
Mean Residual Deviance: 644543291.101




In [20]:
# Score mRFoverfit on the held-out test frame (rows it was not trained on).
mRFoverfit.model_performance(test)


ModelMetricsRegression: drf
** Reported on test data. **

MSE: 1354342611.27
RMSE: 36801.3941485
MAE: 23517.4983976
RMSLE: 0.0422635134493
Mean Residual Deviance: 1354342611.27




## Train a gradient boosting model

In [21]:
# Gradient boosting machine with default hyper-parameters, evaluated with
# 10-fold cross-validation on the same predictors and target as the forests.
# The explicit seed lets a re-run on the same cluster setup reproduce the metrics.
mGBM = H2OGradientBoostingEstimator(model_id = "mGBM", nfolds = 10, seed = 123)
mGBM.train(
    x = ["area", "location", "numBedroom", "age"],
    y = "price",
    training_frame = train)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [22]:
# Show the fitted model: summary, training and cross-validation metrics,
# scoring history and variable importances.
mGBM

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  mGBM


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,50.0,50.0,19565.0,5.0,5.0,5.0,20.0,31.0,26.46




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 28157708.6364
RMSE: 5306.38376264
MAE: 4248.9629553
RMSLE: 0.00547734423961
Mean Residual Deviance: 28157708.6364

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 89932404.5477
RMSE: 9483.26971818
MAE: 7308.25103139
RMSLE: 0.00999851009499
Mean Residual Deviance: 89932404.5477

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid,cv_10_valid
0,mae,7292.871,717.64435,7287.792,7143.1675,6677.086,6482.5103,6825.91,6746.269,7210.684,7762.7485,7950.4854,8842.06
1,mean_residual_deviance,89657568.0,23816212.0,98302504.0,75576360.0,71431672.0,65013028.0,69883256.0,78180040.0,79470952.0,106966448.0,112785152.0,138966272.0
2,mse,89657568.0,23816212.0,98302504.0,75576360.0,71431672.0,65013028.0,69883256.0,78180040.0,79470952.0,106966448.0,112785152.0,138966272.0
3,r2,0.9918708,0.0030697512,0.99031156,0.99392515,0.99485016,0.9942865,0.9944733,0.993558,0.99291146,0.9895044,0.98943776,0.98545015
4,residual_deviance,89657568.0,23816212.0,98302504.0,75576360.0,71431672.0,65013028.0,69883256.0,78180040.0,79470952.0,106966448.0,112785152.0,138966272.0
5,rmse,9399.013,1209.2838,9914.762,8693.467,8451.726,8063.0654,8359.621,8841.948,8914.648,10342.458,10620.035,11788.3955
6,rmsle,0.00987275,0.0016520125,0.011039735,0.008971052,0.008756217,0.008348158,0.008050079,0.009582531,0.009147773,0.010877082,0.010393719,0.013561154



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2020-01-04 23:13:47,1.092 sec,0.0,107099.905848,86105.396805,11470389832.614931
1,,2020-01-04 23:13:47,1.096 sec,1.0,96784.989188,77732.401283,9367334132.048187
2,,2020-01-04 23:13:47,1.097 sec,2.0,87517.868789,70185.415218,7659377357.286299
3,,2020-01-04 23:13:47,1.099 sec,3.0,79187.931542,63431.874586,6270728501.823227
4,,2020-01-04 23:13:47,1.102 sec,4.0,71632.851293,57369.411079,5131265384.365243
5,,2020-01-04 23:13:47,1.104 sec,5.0,64862.028487,51879.898179,4207082739.430878
6,,2020-01-04 23:13:47,1.106 sec,6.0,58766.646815,46934.40832,3453518777.825171
7,,2020-01-04 23:13:47,1.108 sec,7.0,53278.089782,42511.586714,2838554850.793396
8,,2020-01-04 23:13:47,1.111 sec,8.0,48323.398111,38543.901007,2335150805.005023
9,,2020-01-04 23:13:47,1.113 sec,9.0,43870.413449,34965.517867,1924613176.152253



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,area,48558132690944.0,1.0,0.889972
1,age,5542165086208.0,0.114135,0.101577
2,numBedroom,454045990912.0,0.009351,0.008322
3,location,7054465536.0,0.000145,0.000129




In [23]:
# Re-score mGBM on every row of the training frame (in-sample error).
mGBM.model_performance(train)


ModelMetricsRegression: gbm
** Reported on test data. **

MSE: 28157695.8193
RMSE: 5306.38255493
MAE: 4248.96235956
RMSLE: 0.00547734457316
Mean Residual Deviance: 28157695.8193




In [24]:
# Score mGBM on the held-out test frame (rows it was not trained on).
mGBM.model_performance(test)


ModelMetricsRegression: gbm
** Reported on test data. **

MSE: 104515205.444
RMSE: 10223.2678457
MAE: 7721.20863413
RMSLE: 0.0119889810985
Mean Residual Deviance: 104515205.444


