## modeler.pyのexample

In [1]:
%load_ext lab_black

In [2]:
import os
import sys

sys.path.insert(0, os.path.abspath("../"))

# 複数の回帰モデルを学習するクラス
from src.model.modeler import Modeler

In [3]:
import numpy as np

# 必要な追加のライブラリー
import pandas as pd

# ボストンの住宅価格データ
from sklearn.datasets import load_boston
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# 回帰モデル
from sklearn.linear_model import ElasticNet, Lasso, Ridge

# 評価指標
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# 前処理
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [4]:
# Load the Boston housing dataset.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs as written on scikit-learn < 1.2.
boston = load_boston()

# Build a single frame: the explanatory variables first, then the target
# variable appended as the last column, "MEDV".
df = pd.DataFrame(boston.data, columns=boston.feature_names).assign(
    MEDV=boston.target
)

# Show the first rows to check the contents.
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [5]:
# Random seed, reused by the train/test split and by every estimator
# parameter set that takes a `random_state`.
RANDOM_STATE = 10

# Fraction of the data held out as evaluation data.
TEST_SIZE = 0.2

# Output directory handed to Modeler.train / Modeler.predict.
# A plain string literal is used: there is nothing to interpolate.
output_dir = "../models/"
# Create the output directory; an already existing directory is not an error.
os.makedirs(output_dir, exist_ok=True)

# Names of the models to evaluate; the same strings are the keys of
# `param_dict` below.
models = [
    "Ridge",
    "Lasso",
    "ElasticNet",
    "GradientBoostingRegressor",
    "RandomForestRegressor",
    "SVR",
]

# Keyword arguments for each estimator.
param_ridge = dict(alpha=0.1, random_state=RANDOM_STATE)

param_lasso = dict(alpha=0.1, random_state=RANDOM_STATE)

param_elasticnet = dict(
    alpha=0.1,
    l1_ratio=0.5,
    random_state=RANDOM_STATE,
)

param_gbdt = dict(
    learning_rate=0.1,
    n_estimators=50,
    subsample=1.0,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_depth=3,
    alpha=0.9,
    random_state=RANDOM_STATE,
)

param_randomforest = dict(
    n_estimators=50,
    max_depth=5,
    random_state=RANDOM_STATE,
)

# SVR is the only parameter set here without a `random_state` entry.
param_svr = dict(
    kernel="rbf",
    degree=3,
    gamma="scale",
    C=10,
    epsilon=0.1,
)

# Model name -> parameter set, passed to Modeler.train.
param_dict = dict(
    Ridge=param_ridge,
    Lasso=param_lasso,
    ElasticNet=param_elasticnet,
    RandomForestRegressor=param_randomforest,
    GradientBoostingRegressor=param_gbdt,
    SVR=param_svr,
)

# Split into training and evaluation data: every column but the last is an
# explanatory variable, the last column is the target.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1],
    df.iloc[:, -1],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Standardize the features. The scaler is fitted on the training data only
# and then applied unchanged to the evaluation data.
sc = StandardScaler()
x_train_std = sc.fit(x_train).transform(x_train)
x_test_std = sc.transform(x_test)

In [6]:
# Register each estimator class (the classes themselves, not instances)
# with the Modeler, in the same order as before.
modeler = Modeler()
for estimator_cls in (
    Ridge,
    Lasso,
    ElasticNet,
    GradientBoostingRegressor,
    RandomForestRegressor,
    SVR,
):
    modeler.add(estimator_cls)

In [7]:
# Train the registered models on the standardized training data, using the
# per-model parameter sets in param_dict.
# NOTE(review): presumably the fitted models are written under output_dir —
# confirm in Modeler.train.
modeler.train(output_dir, x_train_std, y_train, param_dict)

学習完了


In [8]:
# Predict on the standardized evaluation data. The result is indexed by model
# name later on (see calculate_scores).
# NOTE(review): presumably the trained models are loaded back from output_dir —
# confirm in Modeler.predict.
pred_dict = modeler.predict(output_dir, x_test_std)

In [9]:
def _mape(true, pred):
    """Compute the mean absolute percentage error (MAPE), in percent.

    Both inputs are converted to float NumPy arrays before any arithmetic, so
    values are always compared position by position. Without the conversion,
    two pandas Series with different indexes would be aligned on their index
    labels and produce NaN, and plain Python lists would not be accepted.

    Args:
        true (array-like) : observed values
        pred (array-like) : predicted values, same length as ``true``

    Returns:
        float             : ``mean(|true - pred| / |true|) * 100``; the result
                            is ``inf`` or ``nan`` if ``true`` contains zeros
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    return np.mean(np.abs((true - pred) / true)) * 100


# SMAPE
def _smape(true, pred):
    """Compute the symmetric mean absolute percentage error (SMAPE), in percent.

    Both inputs are converted to float NumPy arrays before any arithmetic, so
    values are always compared position by position. Without the conversion,
    two pandas Series with different indexes would be aligned on their index
    labels, every term would become NaN, and the NaN-skipping sum would
    silently report 0. Plain Python lists would not be accepted either.

    Args:
        true (array-like) : observed values
        pred (array-like) : predicted values, same length as ``true``

    Returns:
        float             : ``100 / n * sum(2 * |pred - true| / (|pred| + |true|))``;
                            a position where both values are zero yields ``nan``
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    return (
        100
        / len(true)
        * np.sum(2 * np.abs(pred - true) / (np.abs(pred) + np.abs(true)))
    )


def calculate_scores(true, pred_dict, models):
    """Compute every evaluation metric for each model.

    Args:
        true (np.array)  : observed values
        pred_dict (dict) : predicted values, keyed by model name
        models (list)    : names of the models to score; each name must be a
                           key of ``pred_dict``

    Returns:
        scores (dict)    : maps each model name to a one-row ``pd.DataFrame``
                           (indexed by that name) with the columns
                           R2, MAE, MSE, RMSE, MAPE and SMAPE

    Raises:
        KeyError         : if a name in ``models`` is missing from ``pred_dict``
    """
    scores = {}
    for model in models:
        pred = pred_dict[model]
        # Compute the MSE once and derive the RMSE from it.
        mse = mean_squared_error(true, pred)
        scores[model] = pd.DataFrame(
            {
                "R2": r2_score(true, pred),
                "MAE": mean_absolute_error(true, pred),
                "MSE": mse,
                "RMSE": np.sqrt(mse),
                "MAPE": _mape(true, pred),
                "SMAPE": _smape(true, pred),
            },
            index=[model],
        )

    return scores

In [10]:
# Score each model's predictions for the evaluation data against y_test.
scores = calculate_scores(true=y_test, pred_dict=pred_dict, models=models)
# Bare expression so the notebook displays the resulting dict.
scores

{'Ridge':              R2       MAE        MSE      RMSE       MAPE      SMAPE
 Ridge  0.670955  4.061084  34.411785  5.866156  17.914521  19.314554,
 'Lasso':              R2       MAE        MSE      RMSE       MAPE      SMAPE
 Lasso  0.672716  4.024912  34.227569  5.850433  17.574116  19.155856,
 'ElasticNet':                   R2       MAE        MSE      RMSE       MAPE    SMAPE
 ElasticNet  0.670972  4.026788  34.410006  5.866004  17.480982  18.9879,
 'GradientBoostingRegressor':                                  R2       MAE        MSE      RMSE       MAPE  \
 GradientBoostingRegressor  0.876543  2.593033  12.911244  3.593222  10.711113   
 
                                SMAPE  
 GradientBoostingRegressor  10.534208  ,
 'RandomForestRegressor':                             R2       MAE        MSE      RMSE       MAPE  \
 RandomForestRegressor  0.83249  2.959699  17.518357  4.185494  12.428927   
 
                            SMAPE  
 RandomForestRegressor  12.018696  ,
 'SVR':  