<a href="https://colab.research.google.com/github/ykato27/AutoML-AutoGluon/blob/main/AutoGluon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --upgrade pip
!pip install --upgrade setuptools
!pip install --upgrade "mxnet<2.0.0"
!pip install --pre autogluon



In [2]:
# ライブラリーのインポート
import autogluon as ag
from autogluon.tabular import TabularDataset, TabularPredictor

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ボストンの住宅価格データ
from sklearn.datasets import load_boston

# 評価指標
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [3]:
# データセットの読込み
boston = load_boston()

# 説明変数の格納
df = pd.DataFrame(boston.data, columns=boston.feature_names)
# 目的変数の追加
df["MEDV"] = boston.target

# データの中身を確認
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [4]:
# ランダムシード値
RANDOM_STATE = 10

# 学習データと評価データの割合
TEST_SIZE = 0.2

# 学習データと評価データを作成
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, 0 : df.shape[1] - 1],
    df.iloc[:, df.shape[1] - 1],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# trainのデータセットの2割をモデル学習時のバリデーションデータとして利用する
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

df_train = pd.concat([x_train, y_train], axis=1)
df_valid = pd.concat([x_valid, y_valid], axis=1)

In [5]:
predictor = TabularPredictor(
    label='MEDV',
    problem_type='regression', 
    eval_metric='root_mean_squared_error',
)

No path specified. Models will be saved in: "AutogluonModels/ag-20210615_035127/"


In [6]:
predictor.fit(
            train_data=df_train,
            tuning_data=df_valid,
            time_limit=None,
)

Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20210615_035127/"
AutoGluon Version:  0.2.1b20210614
Train Data Rows:    323
Train Data Columns: 13
Tuning Data Rows:    81
Tuning Data Columns: 13
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    12731.1 MB
	Train Data (Original)  Memory Usage: 0.04 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', ...]
	Types of features in proce

[1000]	train_set's rmse: 0.367521	valid_set's rmse: 2.89849


	-2.8857	 = Validation root_mean_squared_error score
	0.75s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-3.0262	 = Validation root_mean_squared_error score
	0.85s	 = Training runtime
	0.1s	 = Validation runtime
Fitting model: CatBoost ...
	-2.5772	 = Validation root_mean_squared_error score
	3.32s	 = Training runtime
	0.0s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-2.8088	 = Validation root_mean_squared_error score
	0.63s	 = Training runtime
	0.1s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-3.1452	 = Validation root_mean_squared_error score
	1.18s	 = Training runtime
	0.02s	 = Validation runtime
Fitting model: XGBoost ...
	-2.765	 = Validation root_mean_squared_error score
	0.54s	 = Training runtime
	0.0s	 = Validation runtime
Fitting model: NeuralNetMXNet ...
	-2.8305	 = Validation root_mean_squared_error score
	4.92s	 = Training runtime
	0.13s	 = Validation runtime
Fitting model: LightGBMLarge ...
	-2.634	 = Valida

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f3a16cf6c90>

In [7]:
y_pred = predictor.predict(x_test)

In [8]:
# 評価
def calculate_scores(true, pred):
    """全ての評価指標を計算する

    Parameters
    ----------
    true (np.array)       : 実測値
    pred (np.array)       : 予測値

    Returns
    -------
    scores (pd.DataFrame) : 各評価指標を纏めた結果

    """
    scores = {}
    scores = pd.DataFrame(
        {
            "R2": r2_score(true, pred),
            "MAE": mean_absolute_error(true, pred),
            "MSE": mean_squared_error(true, pred),
            "RMSE": np.sqrt(mean_squared_error(true, pred)),
        },
        index=["scores"],
    )
    return scores

In [9]:
scores = calculate_scores(y_test, y_pred)
print(scores)

              R2      MAE        MSE     RMSE
scores  0.884956  2.43223  12.031392  3.46863


In [10]:
predictor.leaderboard(df_valid, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-2.43954,-2.43954,0.397564,0.25865,10.936155,0.006678,0.000638,0.452443,2,True,12
1,CatBoost,-2.577247,-2.577247,0.006343,0.002005,3.32135,0.006343,0.002005,3.32135,1,True,6
2,LightGBMLarge,-2.634021,-2.634021,0.030162,0.007841,0.938447,0.030162,0.007841,0.938447,1,True,11
3,XGBoost,-2.765037,-2.765037,0.068874,0.004887,0.543379,0.068874,0.004887,0.543379,1,True,9
4,ExtraTreesMSE,-2.808843,-2.808843,0.115854,0.103029,0.629762,0.115854,0.103029,0.629762,1,True,7
5,NeuralNetMXNet,-2.830514,-2.830514,0.151103,0.126151,4.916997,0.151103,0.126151,4.916997,1,True,10
6,LightGBM,-2.885714,-2.885714,0.029679,0.014084,0.753322,0.029679,0.014084,0.753322,1,True,4
7,RandomForestMSE,-3.026227,-3.026227,0.116973,0.102855,0.854194,0.116973,0.102855,0.854194,1,True,5
8,NeuralNetFastAI,-3.145224,-3.145224,0.020258,0.015513,1.177875,0.020258,0.015513,1.177875,1,True,8
9,LightGBMXT,-3.40425,-3.40425,0.005322,0.003379,0.444052,0.005322,0.003379,0.444052,1,True,3


In [11]:
results = predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                  model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -2.439540       0.258650  10.936155                0.000638           0.452443            2       True         12
1              CatBoost  -2.577247       0.002005   3.321350                0.002005           3.321350            1       True          6
2         LightGBMLarge  -2.634021       0.007841   0.938447                0.007841           0.938447            1       True         11
3               XGBoost  -2.765037       0.004887   0.543379                0.004887           0.543379            1       True          9
4         ExtraTreesMSE  -2.808843       0.103029   0.629762                0.103029           0.629762            1       True          7
5        NeuralNetMXNet  -2.830514       0.126151   4.916997                0.126151           4.916997 