## 回帰モデル全般のサンプルコード

In [1]:
# ライブラリーのインポート
import os
from os.path import join
import pickle
import shutil

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# ボストンの住宅価格データ
from sklearn.datasets import load_boston

# 前処理
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# 回帰モデル
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR

# 評価指標
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [2]:
# Load the Boston housing dataset
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the pinned scikit-learn version before running this cell
boston = load_boston()

# Explanatory variables become the columns; the target (MEDV) is appended
# as the last column
df = pd.DataFrame(data=boston.data, columns=boston.feature_names).assign(
    MEDV=boston.target
)

# Preview the first rows
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [3]:
# Number of columns (features plus the target)
len(df.columns)

14

### パラメータ設定

In [4]:
# Seed reused by the train/test split and by every estimator that accepts one
RANDOM_STATE = 10

# Fraction of the rows held out as evaluation data
TEST_SIZE = 0.2

# Hyper-parameters, keyed by model name
param_dict = {
    'LinearRegression': {
        # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and
        # removed in 1.2 — confirm the pinned scikit-learn version
        'normalize': False,
    },
    'Ridge': {
        'alpha': 0.1,
        'random_state': RANDOM_STATE,
    },
    'Lasso': {
        'alpha': 0.1,
        'random_state': RANDOM_STATE,
    },
    'ElasticNet': {
        'alpha': 0.1,
        'l1_ratio': 0.5,
        'random_state': RANDOM_STATE,
    },
    'RandomForest': {
        'n_estimators': 50,
        'max_depth': 5,
        'random_state': RANDOM_STATE,
    },
    'GBDT': {
        'learning_rate': 0.1,
        'n_estimators': 50,
        'subsample': 1.0,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'min_weight_fraction_leaf': 0.0,
        'max_depth': 3,
        'alpha': 0.9,
        'random_state': RANDOM_STATE,
    },
    'AdaBoost': {
        'n_estimators': 50,
        'learning_rate': 1,
        'random_state': RANDOM_STATE,
    },
    'SVR': {
        'kernel': 'rbf',
        'degree': 3,
        'gamma': 'scale',
        'C': 10,
        'epsilon': 0.1,
    },
}

# Per-model names bound to the same dict objects held in param_dict
param_linearregression = param_dict['LinearRegression']
param_ridge = param_dict['Ridge']
param_lasso = param_dict['Lasso']
param_elasticnet = param_dict['ElasticNet']
param_randomforest = param_dict['RandomForest']
param_gbdt = param_dict['GBDT']
param_adaboost = param_dict['AdaBoost']
param_svr = param_dict['SVR']

# Models to train, in training order
models = [
    "LinearRegression",
    "Ridge",
    "Lasso",
    "ElasticNet",
    "GBDT",
    "AdaBoost",
    "RandomForest",
    "SVR",
]

# Output directory
output_dir = 'models/'

### 前処理

In [5]:
# Split into training and evaluation data: the last column is the target,
# every column before it is an explanatory variable
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1],
    df.iloc[:, -1],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Standardize using statistics fitted on the training data only, then apply
# the same transformation to both splits
sc = StandardScaler().fit(x_train)
x_train_std, x_test_std = sc.transform(x_train), sc.transform(x_test)

### モデルの学習

In [6]:
def _get_model(model_name):
    """Return a new estimator, with default parameters, for a model name.

    Names are mapped to estimator classes and only the requested class is
    instantiated, so a lookup does not construct the other estimators.

    Args:
        model_name (str): name of the model; one of the keys of the mapping
            defined below.

    Returns:
        model: a freshly constructed estimator for ``model_name``.

    Raises:
        ValueError: if ``model_name`` is not a known model name.
    """
    model_classes = {"LinearRegression": LinearRegression,
                     "Ridge": Ridge,
                     "Lasso": Lasso,
                     "ElasticNet": ElasticNet,
                     "GBDT": GradientBoostingRegressor,
                     "AdaBoost": AdaBoostRegressor,
                     "RandomForest": RandomForestRegressor,
                     "SVR": SVR}

    model_class = model_classes.get(model_name)

    if model_class is None:
        raise ValueError(f'model_nameが違います: {model_name}')

    return model_class()

In [7]:
def models_train(output_dir_path, train, target, models, param_dict):
    """Fit every requested model and pickle each one to disk.

    Each fitted model is written to
    ``<output_dir_path>/trained_model/<model_name>.pickle``. The directory is
    created when it does not exist yet.

    Args:
        output_dir_path (str): path of the output directory.
        train (nd.array)     : explanatory variables of the training data.
        target (Series)      : target variable of the training data.
        models (list)        : names of the models to train.
        param_dict (dict)    : key: model name, value: dict of parameters
                               passed to the model's ``set_params``.

    Returns:
        None. A completion message is printed once all models are saved.

    Raises:
        ValueError: propagated from ``_get_model`` for an unknown model name.
        KeyError: if ``param_dict`` has no entry for a model name (assuming a
            plain dict is passed).
    """
    path_train_target = join(output_dir_path, 'trained_model/')
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(path_train_target, exist_ok=True)

    for model_name in models:
        model = _get_model(model_name)
        model.set_params(**param_dict[model_name])
        model.fit(train, target)

        filename = f'{model_name}.pickle'
        with open(join(path_train_target, filename), 'wb') as f:
            pickle.dump(model, f)

    print('学習完了')

In [8]:
# Train every configured model on the standardized training data
models_train(
    output_dir_path=output_dir,
    train=x_train_std,
    target=y_train,
    models=models,
    param_dict=param_dict,
)

学習完了


### モデルの予測

In [9]:
def models_predicts(output_dir_path, test):
    """Predict the evaluation data with every pickled model on disk.

    Every ``*.pickle`` file found in ``<output_dir_path>/trained_model/`` is
    loaded and its ``predict`` method is called on ``test``. Entries with any
    other extension (for example notebook checkpoint folders) are skipped.

    Args:
        output_dir_path (str): path of the output directory.
        test (nd.array)      : explanatory variables of the evaluation data.

    Returns:
        pred_dict (dict)     : key: model name (file name without the
                               ``.pickle`` extension), value: the result of
                               that model's ``predict``.
    """
    trained_dir = join(output_dir_path, 'trained_model/')
    pred_dict = {}

    # Sorted so the resulting dict has a deterministic key order
    for trained_file in sorted(os.listdir(trained_dir)):
        model_name, extension = os.path.splitext(trained_file)
        if extension != '.pickle':
            continue

        # NOTE: unpickling can execute arbitrary code; only load model files
        # produced by a trusted training run.
        with open(join(trained_dir, trained_file), 'rb') as f:
            model = pickle.load(f)
        pred_dict[model_name] = model.predict(test)

    return pred_dict

In [10]:
# Predict the standardized evaluation data with every saved model
pred_dict = models_predicts(output_dir_path=output_dir, test=x_test_std)

### 評価指標算出

In [11]:
def _mape(true, pred):  
    """MAPEを計算する
    Args:
        true (np.array) : 実測値
        pred (np.array) : 予測値

    Returns:
        np.array        : mapeの計算結果
    """
    return np.mean(np.abs((true - pred) / true)) * 100

# SMAPEの計算
def _smape(true, pred):
    """SMAPEを計算する
    Args:
        true (np.array) : 実測値
        pred (np.array) : 予測値

    Returns:
        np.array        : smapeの計算結果
    """
    return 100/len(true) * np.sum(2 * np.abs(pred - true) / (np.abs(pred) + np.abs(true)))

def calculate_scores(true, pred_dict, models):
    """Compute every evaluation metric for each model.

    Args:
        true (np.array)  : observed values
        pred_dict (dict) : key: model name, value: predicted values
        models (list)    : names of the models to score

    Returns:
        scores (dict)    : key: model name, value: one-row DataFrame indexed
                           by the model name with the columns R2, MAE, MSE,
                           RMSE, MAPE and SMAPE

    Raises:
        KeyError: if ``pred_dict`` has no entry for a model name (assuming a
            plain dict is passed).
    """
    scores = {}
    for model in models:
        pred = pred_dict[model]
        # Computed once and reused for both the MSE and the RMSE columns
        mse = mean_squared_error(true, pred)

        scores[model] = pd.DataFrame({'R2': r2_score(true, pred),
                                      'MAE': mean_absolute_error(true, pred),
                                      'MSE': mse,
                                      'RMSE': np.sqrt(mse),
                                      'MAPE': _mape(true, pred),
                                      'SMAPE': _smape(true, pred)},
                                     index=[model])

    return scores

In [12]:
# Score every model's predictions against the held-out target values
scores = calculate_scores(true=y_test, pred_dict=pred_dict, models=models)