In [1]:
import sys, os
sys.path.insert(0, os.path.dirname(os.getcwd()))

In [2]:
from models import EnsembleRegressor, Objective
from kennard_stone import train_test_split, KFold

import pandas as pd
from sklearn.metrics import mean_squared_error
import optuna
from lightgbm import LGBMRegressor

Using TensorFlow backend.


In [3]:
SEED = 334
kf = KFold(n_splits = 5)

In [4]:
from sklearn.datasets import load_boston
data = load_boston()
X = pd.DataFrame(data.data, columns = data.feature_names)
y = pd.Series(data.target, name = 'PRICE')

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [6]:
# rf = RandomForestRegressor(random_state = SEED, n_jobs = -1)
lgbt = LGBMRegressor(random_state = SEED, n_jobs = -1)

In [7]:
objective = Objective(lgbt, X_train, y_train, scoring = 'neg_root_mean_squared_error', cv = kf)
sampler = optuna.samplers.TPESampler(seed = SEED)
study = optuna.create_study(sampler = sampler, direction = 'maximize')
study.optimize(objective, n_trials = 100, n_jobs = -1)

[32m[I 2021-02-07 17:19:40,995][0m A new study created in memory with name: no-name-521805c7-30de-45f0-93c2-8acba0db982b[0m
[32m[I 2021-02-07 17:19:48,068][0m Trial 6 finished with value: -3.567279269006864 and parameters: {'n_estimators': 95, 'min_child_weight': 2.558862944027569, 'colsample_bytree': 0.7606326525490008, 'subsample': 0.8962686806585287, 'num_leaves': 20}. Best is trial 6 with value: -3.567279269006864.[0m
[32m[I 2021-02-07 17:19:49,586][0m Trial 5 finished with value: -3.5372776701075646 and parameters: {'n_estimators': 127, 'min_child_weight': 5.125183076186909, 'colsample_bytree': 0.7854826377750047, 'subsample': 0.7813464753756487, 'num_leaves': 171}. Best is trial 5 with value: -3.5372776701075646.[0m
[32m[I 2021-02-07 17:19:50,468][0m Trial 8 finished with value: -4.295417174169678 and parameters: {'n_estimators': 19, 'min_child_weight': 0.05139603452316137, 'colsample_bytree': 0.6568578507067135, 'subsample': 0.8815713236474993, 'num_leaves': 16}. Best

In [8]:
best_estimator = objective.model(**objective.fixed_params_, **study.best_params).fit(X_train, y_train)

In [9]:
mean_squared_error(best_estimator.predict(X_test), y_test, squared = False)

2.225033939760566

同じ条件にするために```boruta```を```False```に．

In [10]:
er = EnsembleRegressor([lgbt], random_state = SEED, n_jobs = -1, boruta = False, scoring = 'neg_root_mean_squared_error', verbose = 0, cv = kf)

In [11]:
er.fit(X_train, y_train)

EnsembleRegressor(boruta=False, cv=KFold(n_splits=5),
                  estimators=[LGBMRegressor(random_state=334)],
                  random_state=334, scoring='neg_root_mean_squared_error')

In [12]:
mean_squared_error(er.predict(X_test), y_test, squared = False)

2.189592488626531

In [13]:
for estimators in er.results_.estimators:
    print(mean_squared_error(estimators[0].predict(X_test), y_test, squared = False))

2.2958052728000737
2.3917277246043858
2.471371097501595
2.5692955116163643
2.66576030132261
