<a href="https://colab.research.google.com/github/ykato27/AutoML-h2o/blob/main/AutoML_h2o.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install h2o



In [2]:
import h2o
from h2o.automl import H2OAutoML

import sys, os, os.path
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

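# Boston housing-price dataset (NOTE: load_boston was removed in scikit-learn 1.2, so this import needs an older release)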
from sklearn.datasets import load_boston

# 前処理
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# 評価指標
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Attach to an H2O instance on localhost, starting a new local server if none
# is running (see the connection log printed below this cell).
h2o.init(
    max_mem_size=12,  # memory for the new server, in gigabytes
    nthreads=-1,      # thread count used when launching a new H2O server
)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.11" 2021-04-20; OpenJDK Runtime Environment (build 11.0.11+9-Ubuntu-0ubuntu2.18.04); OpenJDK 64-Bit Server VM (build 11.0.11+9-Ubuntu-0ubuntu2.18.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.7/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpn4loe1dy
  JVM stdout: /tmp/tmpn4loe1dy/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpn4loe1dy/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.3
H2O_cluster_version_age:,1 month and 6 days
H2O_cluster_name:,H2O_from_python_unknownUser_g023k1
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,12 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [3]:
# Load the dataset.
# NOTE(review): load_boston was removed in scikit-learn 1.2 — confirm the
# environment pins an older release before re-running.
boston = load_boston()

# Explanatory variables become the columns; the target is appended as "MEDV".
df = pd.DataFrame(boston.data, columns=boston.feature_names).assign(
    MEDV=boston.target
)

# Inspect the first rows.
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [4]:
# Random seed (reused by the AutoML run further down).
RANDOM_STATE = 10

# Fraction of rows held out as evaluation data.
TEST_SIZE = 0.2

# Split into training and evaluation sets: the last column is the target,
# every column before it is an explanatory variable.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1],
    df.iloc[:, -1],
    random_state=RANDOM_STATE,
    test_size=TEST_SIZE,
)

In [5]:
# Re-attach each target Series to its feature frame, matching rows on the
# shared index (inner join, same as merging on both indexes).
train = x_train.join(y_train, how="inner")
test = x_test.join(y_test, how="inner")

# Upload both pandas frames to the H2O cluster.
h2o_train = h2o.H2OFrame(train)
h2o_test = h2o.H2OFrame(test)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [6]:
# Specify the target column and the explanatory (feature) columns.
target_cols = "MEDV"

# Take the names straight from the training frame. Wrapping the name list in
# a new H2OFrame would upload the names as *data*, so that frame's own
# `.columns` would not be the feature names.
feature_cols = [col for col in h2o_train.columns if col != target_cols]

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [7]:
%%time
# max_runtime_secsは最大実行時間を指定
aml = H2OAutoML(
    max_runtime_secs=30,  # 30 sec
    max_models=None,  # no limit
    stopping_metric ='rmse',
    sort_metric ='rmse',
    seed = RANDOM_STATE,
    )

aml.train(
    y=target_cols,
    training_frame=h2o_train,
    )

AutoML progress: |████████████████████████████████████████████████████████| 100%
CPU times: user 4.84 s, sys: 182 ms, total: 5.02 s
Wall time: 33 s


In [8]:
# Show the full leaderboard (every row, not just the default head).
# The AutoML run above was configured with sort_metric='rmse'.
lb = aml.leaderboard
lb.head(rows=lb.shape[0])

model_id,rmse,mean_residual_deviance,mse,mae,rmsle
XGBoost_grid__1_AutoML_20210626_015741_model_2,3.19587,10.2136,10.2136,2.20228,0.151833
StackedEnsemble_BestOfFamily_AutoML_20210626_015741,3.20552,10.2754,10.2754,2.16786,0.150956
GBM_grid__1_AutoML_20210626_015741_model_2,3.31107,10.9632,10.9632,2.22769,0.157146
XGBoost_grid__1_AutoML_20210626_015741_model_1,3.34327,11.1775,11.1775,2.21535,0.157374
StackedEnsemble_AllModels_AutoML_20210626_015741,3.35611,11.2635,11.2635,2.24106,0.156368
XGBoost_grid__1_AutoML_20210626_015741_model_4,3.40357,11.5843,11.5843,2.37371,0.166969
XGBoost_grid__1_AutoML_20210626_015741_model_6,3.41653,11.6727,11.6727,2.27024,0.147409
XGBoost_grid__1_AutoML_20210626_015741_model_5,3.469,12.034,12.034,2.42595,0.165229
XGBoost_3_AutoML_20210626_015741,3.48113,12.1183,12.1183,2.39609,0.166242
XGBoost_2_AutoML_20210626_015741,3.54577,12.5725,12.5725,2.39016,0.16461




In [9]:
# Display the top model of the leaderboard (details, metrics and variable
# importances are rendered as the cell output).
aml.leader

Model Details
H2OXGBoostEstimator :  XGBoost
Model Key:  XGBoost_grid__1_AutoML_20210626_015741_model_2


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees
0,,51.0




ModelMetricsRegression: xgboost
** Reported on train data. **

MSE: 0.838590800098565
RMSE: 0.9157460347162661
MAE: 0.6549287715760788
RMSLE: 0.04671028674980109
Mean Residual Deviance: 0.838590800098565

ModelMetricsRegression: xgboost
** Reported on cross-validation data. **

MSE: 10.213601193133211
RMSE: 3.1958725245436828
MAE: 2.2022834638557813
RMSLE: 0.1518326052905409
Mean Residual Deviance: 10.213601193133211

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,2.2017617,0.13352625,2.3332224,2.1583037,2.272711,2.253587,1.9909848
1,mean_residual_deviance,10.205462,2.5901244,13.705225,8.704645,10.261615,11.438369,6.917458
2,mse,10.205462,2.5901244,13.705225,8.704645,10.261615,11.438369,6.917458
3,r2,0.863989,0.041528296,0.8075872,0.87932706,0.8513178,0.8602157,0.92149717
4,residual_deviance,10.205462,2.5901244,13.705225,8.704645,10.261615,11.438369,6.917458
5,rmse,3.1735935,0.40891057,3.702057,2.9503636,3.2033756,3.3820658,2.6301062
6,rmsle,0.15101969,0.017219953,0.16159286,0.12684318,0.16780958,0.15949704,0.13935581



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2021-06-26 01:57:59,2.177 sec,0.0,23.074545,21.341832,532.434631
1,,2021-06-26 01:57:59,2.191 sec,5.0,6.615638,4.65527,43.76666
2,,2021-06-26 01:57:59,2.201 sec,10.0,3.713856,2.187642,13.792726
3,,2021-06-26 01:57:59,2.212 sec,15.0,2.677808,1.628269,7.170656
4,,2021-06-26 01:57:59,2.230 sec,20.0,2.148995,1.380848,4.61818
5,,2021-06-26 01:57:59,2.244 sec,25.0,1.779495,1.174594,3.166601
6,,2021-06-26 01:57:59,2.259 sec,30.0,1.541492,1.048642,2.376199
7,,2021-06-26 01:57:59,2.277 sec,35.0,1.335966,0.924262,1.784806
8,,2021-06-26 01:57:59,2.295 sec,40.0,1.156918,0.811169,1.33846
9,,2021-06-26 01:57:59,2.314 sec,45.0,1.040091,0.735423,1.08179



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,LSTAT,15953.510742,1.0,0.465742
1,RM,8911.679688,0.558603,0.260165
2,DIS,1796.557373,0.112612,0.052448
3,CRIM,1773.405396,0.111161,0.051772
4,PTRATIO,1283.730103,0.080467,0.037477
5,NOX,1184.868164,0.07427,0.034591
6,B,1013.11084,0.063504,0.029576
7,INDUS,800.35675,0.050168,0.023365
8,AGE,691.031982,0.043315,0.020174
9,TAX,592.707947,0.037152,0.017303




In [10]:
# Predict on the held-out test frame with the leader model, then display
# the resulting predictions.
y_pred = aml.leader.predict(h2o_test)
y_pred

xgboost prediction progress: |████████████████████████████████████████████| 100%


predict
29.041
28.0348
25.7835
28.0571
19.73
14.18
47.8597
17.3832
21.3282
46.4187




In [11]:
# Score the leader model on the held-out test frame and display the
# regression metrics.
perf = aml.leader.model_performance(h2o_test)
perf


ModelMetricsRegression: xgboost
** Reported on test data. **

MSE: 22.087954294343596
RMSE: 4.69978236669993
MAE: 3.0897762120938768
RMSLE: 0.1595180418199213
Mean Residual Deviance: 22.087954294343596


