## AutoFeatのサンプルコード

- AutoFeatとは自動特徴量エンジニアリングを実施してくれるライブラリー
- AutoFeatを使って特徴量エンジニアリングを実施した後にXGBoostで回帰モデルを構築(Optunaでハイパラチューニング）

In [1]:
%load_ext lab_black

In [2]:
# ライブラリーのインポート
import os

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# ボストンの住宅価格データ
from sklearn.datasets import load_boston

# 前処理
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# AutoFeat
import autofeat
from autofeat import AutoFeatRegressor

# XGBoost
import xgboost as xgb

# Optuna
import optuna
from optuna.samplers import TPESampler

# 評価指標
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [3]:
print(autofeat.__version__)

2.0.9


In [4]:
print(xgb.__version__)

1.3.3


In [5]:
print(optuna.__version__)

2.9.1


In [6]:
# データセットの読込み
boston = load_boston()

# 説明変数の格納
df = pd.DataFrame(boston.data, columns=boston.feature_names)
# 目的変数の追加
df["MEDV"] = boston.target

# データの中身を確認
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


#### 前処理

In [7]:
# ランダムシード値
RANDOM_STATE = 10

# 学習データと評価データの割合
TEST_SIZE = 0.2

# 学習データと評価データを作成
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, 0 : df.shape[1] - 1],
    df.iloc[:, df.shape[1] - 1],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# trainのデータセットの2割をモデル学習時のバリデーションデータとして利用する
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

In [8]:
# 特徴量生成モデル定義
model = AutoFeatRegressor(verbose=1)

In [9]:
# 特徴量生成（学習データ利用）
x_train_feature_creation = model.fit_transform(x_train, y_train)

[AutoFeat] The 2 step feature engineering process could generate up to 4186 features.
[AutoFeat] With 323 data points this new feature matrix would use about 0.01 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 60 transformed features from 13 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 2583 feature combinations from 2628 original feature tuples - done.
[feateng] Generated altogether 2645 new features in 2 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 1202 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 43 features after 5 feature selection runs
[featsel] 34 features after correlation filtering
[featsel] 22 features after noise filtering


In [10]:
# 自動生成した特徴量の確認
x_train_feature_creation

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,...,CHAS*exp(RM),RAD**3/LSTAT,INDUS**3*LSTAT,LSTAT**3*exp(RM),B*RM**2,LSTAT**5,LSTAT*TAX,AGE**3*B**3,TAX**3*ZN**3,INDUS**3*exp(RM)
0,0.06860,0.0,2.89,0.0,0.445,7.416,62.5,3.4952,2.0,276.0,...,0.00000,1.292407,149.411552,3.942755e+05,21828.331526,9.087685e+03,1708.44,1.526453e+13,0.000000e+00,4.012559e+04
1,4.42228,0.0,18.10,0.0,0.584,6.003,94.5,2.5403,24.0,666.0,...,0.00000,648.405253,126422.078120,3.921312e+06,11938.369422,4.404899e+06,14199.12,3.068460e+13,0.000000e+00,2.399416e+06
2,8.98296,0.0,18.10,1.0,0.770,6.212,97.4,2.1222,24.0,666.0,...,498.69765,785.454545,104363.441600,2.718788e+06,14576.201817,1.688742e+06,11721.60,4.979908e+13,0.000000e+00,2.957148e+06
3,0.03961,0.0,5.19,0.0,0.515,6.037,34.5,5.9853,5.0,224.0,...,0.00000,15.605493,1119.784856,2.151460e+05,14465.166956,3.297331e+04,1794.24,2.567442e+12,0.000000e+00,5.852452e+04
4,8.05579,0.0,18.10,0.0,0.584,5.427,95.4,2.4298,24.0,666.0,...,0.00000,762.072767,107565.501740,1.357776e+06,10384.302159,1.964203e+06,12081.24,3.805556e+13,0.000000e+00,1.348813e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,5.73116,0.0,18.10,0.0,0.532,7.061,77.0,3.4106,24.0,666.0,...,0.00000,1972.039943,41567.484410,4.015202e+05,19707.759957,1.692739e+04,4668.66,2.819595e+13,0.000000e+00,6.911767e+06
319,14.42080,0.0,18.10,0.0,0.740,6.461,93.3,2.0026,24.0,666.0,...,0.00000,765.872576,107031.825050,3.761909e+06,1147.556882,1.915958e+06,12021.30,1.687210e+10,0.000000e+00,3.793258e+06
320,0.13554,12.5,6.07,0.0,0.409,5.594,36.8,6.4980,4.0,345.0,...,0.00000,4.889228,2927.559428,6.029236e+05,12420.126608,3.843246e+05,4516.05,3.115923e+12,8.020239e+10,6.011868e+04
321,9.51363,0.0,18.10,0.0,0.713,6.728,94.1,2.4961,24.0,666.0,...,0.00000,738.856227,110945.454110,5.472108e+06,302.376773,2.292815e+06,12460.86,2.483695e+08,0.000000e+00,4.954148e+06


In [11]:
x_train_feature_creation.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'exp(RM)/TAX', 'log(NOX)/DIS', 'RM**2/PTRATIO',
       '1/(DIS*LSTAT)', 'sqrt(B)/PTRATIO', 'CRIM*exp(RM)', 'TAX**2/LSTAT',
       'sqrt(CRIM)*LSTAT', 'RM**3/TAX', 'TAX**2/RAD', 'CHAS*DIS**2',
       'exp(RM)/CRIM', 'CHAS*exp(RM)', 'RAD**3/LSTAT', 'INDUS**3*LSTAT',
       'LSTAT**3*exp(RM)', 'B*RM**2', 'LSTAT**5', 'LSTAT*TAX', 'AGE**3*B**3',
       'TAX**3*ZN**3', 'INDUS**3*exp(RM)'],
      dtype='object')

In [12]:
# valid, testデータの特徴量生成
x_valid_feature_creation = model.transform(x_valid)
x_test_feature_creation = model.transform(x_test)

[AutoFeat] Computing 22 new features.
[AutoFeat]    22/   22 new features ...done.
[AutoFeat] Computing 22 new features.
[AutoFeat]    22/   22 new features ...done.


In [13]:
def objective(trial):

    param = {
        "eta": trial.suggest_loguniform("eta", 1e-8, 1.0),
        "gamma": trial.suggest_loguniform("gamma", 1e-8, 1.0),
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 1, 40),
        "max_delta_step": trial.suggest_loguniform("max_delta_step", 1e-8, 1.0),
        "subsample": trial.suggest_uniform("subsample", 0.0, 1.0),
        "reg_lambda": trial.suggest_uniform("reg_lambda", 0.0, 1000.0),
        "reg_alpha": trial.suggest_uniform("reg_alpha", 0.0, 1000.0),
    }

    model = xgb.XGBRegressor(**param)

    model.fit(
        x_train_feature_creation,
        y_train,
        eval_set=[(x_valid_feature_creation, y_valid)],
        early_stopping_rounds=50,
        verbose=False,
    )

    preds = model.predict(x_valid_feature_creation)
    mae = mean_absolute_error(y_valid, preds)

    return mae

In [14]:
%%time
# optunaで最適値を見つける
# create_studyメソッドの引数"sampler"にサンプラーと乱数シードを指定
study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=RANDOM_STATE))
study.optimize(objective, n_trials=500)

[32m[I 2021-09-17 00:57:34,253][0m A new study created in memory with name: no-name-9831b0d3-092e-45c2-8627-8fead9cad08b[0m
[32m[I 2021-09-17 00:57:34,323][0m Trial 0 finished with value: 20.835659417988342 and parameters: {'eta': 0.014810344004555135, 'gamma': 1.4656004675652718e-08, 'max_depth': 6, 'min_child_weight': 15.83538918896983, 'max_delta_step': 9.728728830009641e-05, 'subsample': 0.22479664553084766, 'reg_lambda': 198.06286475962398, 'reg_alpha': 760.5307121989588}. Best is trial 0 with value: 20.835659417988342.[0m
[32m[I 2021-09-17 00:57:34,357][0m Trial 1 finished with value: 20.835802469135803 and parameters: {'eta': 2.2536511574969237e-07, 'gamma': 5.090008568091192e-08, 'max_depth': 7, 'min_child_weight': 33.681648241632445, 'max_delta_step': 1.075439863800637e-08, 'subsample': 0.5121922633857766, 'reg_lambda': 812.6209616521135, 'reg_alpha': 612.5260668293881}. Best is trial 0 with value: 20.835659417988342.[0m
[32m[I 2021-09-17 00:57:34,408][0m Trial 2 fi

CPU times: user 3min 25s, sys: 5.37 s, total: 3min 30s
Wall time: 53.3 s


In [15]:
best_params = study.best_params
best_params["random_state"] = RANDOM_STATE
best_params

{'eta': 0.6118624705864463,
 'gamma': 0.21031856425698406,
 'max_depth': 8,
 'min_child_weight': 13.37137922141418,
 'max_delta_step': 0.5646694352719399,
 'subsample': 0.9828064834886048,
 'reg_lambda': 103.3274777173904,
 'reg_alpha': 329.4857657337786,
 'random_state': 10}

In [16]:
# チューニングしたハイパーパラメーターをフィット
optimised_model = xgb.XGBRegressor(**(best_params))

optimised_model.fit(x_train_feature_creation, y_train)

# XGBoost推論
y_pred = optimised_model.predict(x_test_feature_creation)

In [17]:
# 評価
def calculate_scores(true, pred):
    """全ての評価指標を計算する

    Parameters
    ----------
    true (np.array)       : 実測値
    pred (np.array)       : 予測値

    Returns
    -------
    scores (pd.DataFrame) : 各評価指標を纏めた結果

    """
    scores = {}
    scores = pd.DataFrame(
        {
            "R2": r2_score(true, pred),
            "MAE": mean_absolute_error(true, pred),
            "MSE": mean_squared_error(true, pred),
            "RMSE": np.sqrt(mean_squared_error(true, pred)),
        },
        index=["scores"],
    )
    return scores

In [18]:
scores = calculate_scores(y_test, y_pred)
print(scores)

              R2       MAE       MSE      RMSE
scores  0.568785  4.314875  45.09681  6.715416
