In [2]:
from yikit.models import EnsembleRegressor, Objective
from kennard_stone import train_test_split, KFold

import pandas as pd
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
import optuna
from lightgbm import LGBMRegressor

In [3]:
# Shared random seed; passed as random_state to LGBMRegressor and EnsembleRegressor below.
SEED = 334
# 5-split cross-validator. This KFold comes from kennard_stone (see imports), not
# scikit-learn, and is reused as `cv` for both Objective and EnsembleRegressor.
kf = KFold(n_splits = 5)

In [4]:
# Load the Boston housing data into pandas objects.
# `load_boston` is imported in the imports cell at the top of the notebook so that
# every dependency is visible in one place.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the scikit-learn version this notebook is pinned to.
data = load_boston()
# Feature matrix with the dataset's own column names.
X = pd.DataFrame(data.data, columns = data.feature_names)
# Regression target, labelled 'PRICE'.
y = pd.Series(data.target, name = 'PRICE')

In [5]:
# Hold out 20% of the samples as a test set.
# NOTE: this train_test_split is the one imported from kennard_stone, not
# scikit-learn's; no random_state is passed here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [6]:
# Base learner used throughout the notebook: a LightGBM regressor seeded with SEED.
# (A RandomForestRegressor with the same random_state / n_jobs settings was the
# author's earlier, now disabled, alternative.)
lgbt = LGBMRegressor(random_state = SEED, n_jobs = -1)

In [7]:
# Hyperparameter search for the LightGBM base learner with Optuna.
# The score is the *negated* RMSE evaluated with the `kf` cross-validator,
# so larger is better and the study has to maximize it.
objective = Objective(
    lgbt,
    X_train,
    y_train,
    scoring='neg_root_mean_squared_error',
    cv=kf,
)
# Use the sampler exposed by the objective itself.
study = optuna.create_study(
    direction='maximize',
    sampler=objective.sampler,
)
# Run 100 trials; Optuna logs each trial's value and parameters.
study.optimize(objective, n_trials=100)

'colsample_bytree': 0.8652996355660826, 'subsample': 0.8088596171510801, 'num_leaves': 173}. Best is trial 35 with value: -3.5042669200709513.[0m
[32m[I 2021-04-27 13:15:14,702][0m Trial 36 finished with value: -3.5121524987547934 and parameters: {'n_estimators': 220, 'min_child_weight': 5.495743287971056, 'colsample_bytree': 0.8468975152669369, 'subsample': 0.8555245210356347, 'num_leaves': 500}. Best is trial 35 with value: -3.5042669200709513.[0m
[32m[I 2021-04-27 13:15:23,059][0m Trial 37 finished with value: -3.5175459215960414 and parameters: {'n_estimators': 307, 'min_child_weight': 2.55504196814523, 'colsample_bytree': 0.9222276860922412, 'subsample': 0.9194484093327067, 'num_leaves': 172}. Best is trial 35 with value: -3.5042669200709513.[0m
[32m[I 2021-04-27 13:15:30,763][0m Trial 38 finished with value: -3.510641389382589 and parameters: {'n_estimators': 432, 'min_child_weight': 0.7068705187514247, 'colsample_bytree': 0.8720536209002473, 'subsample': 0.7808042263978

In [8]:
# Rebuild the model from the objective's fixed parameters plus the best
# parameters found by the study, then fit it on the full training split.
best_estimator = objective.model(
    **objective.fixed_params_,
    **study.best_params,
).fit(X_train, y_train)

In [9]:
# Test-set RMSE of the Optuna-tuned model (squared = False returns the root of the MSE).
# Arguments follow the documented (y_true, y_pred) order.
# NOTE(review): newer scikit-learn releases deprecate `squared` in favor of
# root_mean_squared_error — confirm against the pinned version.
mean_squared_error(y_test, best_estimator.predict(X_test), squared = False)

2.2117068343094943

同じ条件で比較するために，`boruta` を `False` に設定する．

In [10]:
# Ensemble built on the same LightGBM base learner, seed, scoring and
# cross-validator as the Optuna run above; boruta is switched off.
er = EnsembleRegressor(
    [lgbt],
    random_state=SEED,
    n_jobs=-1,
    boruta=False,
    scoring='neg_root_mean_squared_error',
    verbose=0,
    cv=kf,
)

In [11]:
# Fit the ensemble on the training split. As the last expression of the cell,
# the value returned by fit() is displayed as the cell output.
er.fit(X_train, y_train)

EnsembleRegressor(boruta=False, cv=KFold(n_splits=5),
                  estimators=[LGBMRegressor(random_state=334)],
                  random_state=334, scoring='neg_root_mean_squared_error')

In [12]:
# Test-set RMSE of the ensemble's prediction (squared = False returns the root of the MSE).
# Arguments follow the documented (y_true, y_pred) order.
mean_squared_error(y_test, er.predict(X_test), squared = False)

2.2069190623240944

In [13]:
# Test-set RMSE of each fitted member stored in the ensemble's results.
# Each entry of er.results_.estimators is indexable; index 0 presumably corresponds
# to the single base learner passed in `[lgbt]` — TODO confirm against yikit.
for estimators in er.results_.estimators:
    # Arguments follow the documented (y_true, y_pred) order.
    print(mean_squared_error(y_test, estimators[0].predict(X_test), squared = False))

2.2709687427038596
2.4313735019283906
2.4736068439802765
2.570243958202424
2.6610604053071283
