In [1]:
from yikit.models import EnsembleRegressor, Objective
from kennard_stone import train_test_split, KFold

import pandas as pd
from sklearn.metrics import mean_squared_error
import optuna
from lightgbm import LGBMRegressor

In [2]:
# Shared settings reused by every experiment in this notebook.
SEED = 334  # passed as random_state to the models built in later cells

# 5-fold splitter from kennard_stone (see the import cell); the same instance
# is handed to both the Optuna objective and the EnsembleRegressor.
kf = KFold(n_splits=5)

In [3]:
# Load the Boston housing data into pandas objects: X holds the features
# (one named column per feature) and y holds the target as a Series.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn to run — confirm the pinned
# version before re-executing the notebook.
from sklearn.datasets import load_boston

data = load_boston()
X = pd.DataFrame(
    data.data,
    columns=data.feature_names,
)
y = pd.Series(
    data.target,
    name='PRICE',
)

In [4]:
# Hold out 20% of the samples as the test set.
# NOTE(review): this is kennard_stone's train_test_split (see the import
# cell), not scikit-learn's; no random_state is passed here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

In [5]:
# Base estimator shared by the Optuna tuning run and the EnsembleRegressor.
# (A RandomForestRegressor alternative used to be sketched here as a
# commented-out line; it is not imported in the notebook's import cell.)
lgbt = LGBMRegressor(
    random_state=SEED,
    n_jobs=-1,
)

In [6]:
# Tune the LightGBM model with Optuna.
# The score is the negative RMSE under the shared CV splitter, hence the
# study direction is 'maximize'.
objective = Objective(
    lgbt,
    X_train,
    y_train,
    scoring='neg_root_mean_squared_error',
    cv=kf,
)

# The study uses the sampler exposed by the Objective instance.
study = optuna.create_study(
    sampler=objective.sampler,
    direction='maximize',
)

# NOTE(review): in the recorded log the first trial finished roughly 12
# minutes after the study was created, so 100 trials may take a long time.
study.optimize(objective, n_trials=100)

[32m[I 2021-08-06 17:13:23,833][0m A new study created in memory with name: no-name-9a7435bc-8470-4998-a50b-a990af8f9b7b[0m
[32m[I 2021-08-06 17:25:33,716][0m Trial 0 finished with value: -3.5747749861853073 and parameters: {'n_estimators': 149, 'min_child_weight': 4.73466341291026, 'colsample_bytree': 0.7692344530914099, 'subsample': 0.747430109551545, 'num_leaves': 20}. Best is trial 0 with value: -3.5747749861853073.[0m
[32m[I 2021-08-06 17:29:26,746][0m Trial 1 finished with value: -3.5664334932591215 and parameters: {'n_estimators': 303, 'min_child_weight': 0.024010365232285977, 'colsample_bytree': 0.7952766305226454, 'subsample': 0.6875360485342359, 'num_leaves': 16}. Best is trial 1 with value: -3.5664334932591215.[0m
[32m[I 2021-08-06 17:29:39,804][0m Trial 2 finished with value: -3.937805111716314 and parameters: {'n_estimators': 29, 'min_child_weight': 0.001153830380700728, 'colsample_bytree': 0.7403753198055126, 'subsample': 0.9106266950327273, 'num_leaves': 49}. 

In [7]:
# Rebuild the model from the objective's fixed parameters plus the best
# parameters found by the study, then fit it on the full training split.
# best_estimator is whatever .fit() returns.
best_estimator = (
    objective.model(**objective.fixed_params_, **study.best_params)
    .fit(X_train, y_train)
)

In [8]:
# Test-set RMSE of the Optuna-tuned model (squared=False returns the root).
# scikit-learn's signature is (y_true, y_pred), so the ground truth goes
# first; RMSE is symmetric in its two arguments, so the value is unchanged.
# NOTE(review): newer scikit-learn deprecates `squared=` in favour of
# root_mean_squared_error — revisit if the scikit-learn version is bumped.
mean_squared_error(y_test, best_estimator.predict(X_test), squared=False)

2.2107845218078226

同じ条件で比較するために，`boruta` を `False` に設定する．

In [9]:
# Build an EnsembleRegressor around the same LightGBM base estimator, reusing
# the seed, CV splitter and scoring metric from the Optuna run.
# boruta is switched off so that the conditions match the plain tuning run.
er = EnsembleRegressor(
    [lgbt],
    random_state=SEED,
    n_jobs=-1,
    boruta=False,
    scoring='neg_root_mean_squared_error',
    verbose=0,
    cv=kf,
)

In [10]:
# Fit the ensemble on the training split. As the last expression in the cell,
# the value returned by fit() is displayed (the EnsembleRegressor repr).
er.fit(X_train, y_train)

EnsembleRegressor(boruta=False, cv=KFold(n_splits=5),
                  estimators=[LGBMRegressor(random_state=334)],
                  random_state=334, scoring='neg_root_mean_squared_error')

In [11]:
# Test-set RMSE of the fitted ensemble (squared=False returns the root).
# scikit-learn's signature is (y_true, y_pred), so the ground truth goes
# first; RMSE is symmetric in its two arguments, so the value is unchanged.
mean_squared_error(y_test, er.predict(X_test), squared=False)

2.201786106532741

In [12]:
# Test-set RMSE of each entry in er.results_.estimators, printed one per line
# for comparison with the ensemble's own score.
for fitted_models in er.results_.estimators:
    # Each entry is indexed with [0]; presumably it holds one fitted model per
    # base estimator passed to EnsembleRegressor (only lgbt here) — TODO confirm.
    member_pred = fitted_models[0].predict(X_test)
    # Ground truth first, matching scikit-learn's (y_true, y_pred) signature.
    print(mean_squared_error(y_test, member_pred, squared=False))

2.293632129832008
2.4323930765702557
2.449809279179175
2.578067528470058
2.66576030132261
