In [1]:
from yikit.models import EnsembleRegressor, Objective

import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_diabetes
import optuna
from lightgbm import LGBMRegressor

In [2]:
# Single seed reused by the CV splitter, the train/test split and the models.
SEED = 334
# 5-fold cross-validation; shuffling is seeded so the folds repeat on every run.
kf = KFold(n_splits=5, random_state=SEED, shuffle=True)

In [3]:
# Load the scikit-learn diabetes dataset and wrap it in labelled pandas objects.
data = load_diabetes()
X = pd.DataFrame(data.data, columns=data.feature_names)
# The target is a disease-progression measure, not a price; use a neutral label.
y = pd.Series(data.target, name='target')

In [4]:
# Hold out 20% of the rows for the final evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=SEED,
    test_size=0.2,
)

In [5]:
# LightGBM base learner used by both the Optuna search and the ensemble below.
lgbt = LGBMRegressor(n_jobs=-1, random_state=SEED)

In [6]:
# Hyperparameter search for lgbt, scored by cross-validation on the training split.
# The score is a negated RMSE, hence the study maximizes it.
objective = Objective(
    lgbt,
    X_train,
    y_train,
    cv=kf,
    scoring='neg_root_mean_squared_error',
)
# Use the sampler exposed by the objective itself.
study = optuna.create_study(direction='maximize', sampler=objective.sampler)
study.optimize(objective, n_trials=100)

[32m[I 2022-06-27 21:06:40,363][0m A new study created in memory with name: no-name-8597b58b-01f3-49ed-b8d5-d4b6e50c5f37[0m
[32m[I 2022-06-27 21:08:52,973][0m Trial 0 finished with value: -63.891728772288765 and parameters: {'n_estimators': 285, 'min_child_weight': 0.0032162822843555513, 'colsample_bytree': 0.9416332990063434, 'subsample': 0.6960045690722916, 'num_leaves': 241}. Best is trial 0 with value: -63.891728772288765.[0m
[32m[I 2022-06-27 21:08:57,310][0m Trial 1 finished with value: -57.30781496850794 and parameters: {'n_estimators': 18, 'min_child_weight': 2.3886939058278256, 'colsample_bytree': 0.94602736228968, 'subsample': 0.8213569936576139, 'num_leaves': 38}. Best is trial 1 with value: -57.30781496850794.[0m
[32m[I 2022-06-27 21:09:19,603][0m Trial 2 finished with value: -65.05323586360112 and parameters: {'n_estimators': 871, 'min_child_weight': 0.3154694608147225, 'colsample_bytree': 0.8647883796553751, 'subsample': 0.8072018624816005, 'num_leaves': 159}. 

In [None]:
# Rebuild the model from the objective's fixed parameters plus the best trial's
# parameters, then fit it on the whole training split.
# NOTE(review): objective.model is assumed to be the estimator class/factory — confirm in yikit.
best_estimator = objective.model(
    **objective.fixed_params_,
    **study.best_params,
).fit(X_train, y_train)

In [None]:
# Hold-out RMSE of the tuned single model.
# y_true is passed first, matching the documented (y_true, y_pred) signature.
# The root is taken explicitly because recent scikit-learn releases no longer
# accept the `squared` keyword; this form works on old and new versions alike.
mean_squared_error(y_test, best_estimator.predict(X_test)) ** 0.5

2.2107845218078226

同じ条件で比較するために，`boruta` を `False` に設定する．

In [None]:
# Ensemble built from the same base learner, CV splitter and scoring as the
# Optuna search; boruta is switched off.
er = EnsembleRegressor(
    [lgbt],
    boruta=False,
    cv=kf,
    n_jobs=-1,
    random_state=SEED,
    scoring='neg_root_mean_squared_error',
    verbose=0,
)

In [None]:
# Fit the ensemble on the same training split used for the tuned single model.
er.fit(X_train, y_train)

EnsembleRegressor(boruta=False, cv=KFold(n_splits=5),
                  estimators=[LGBMRegressor(random_state=334)],
                  random_state=334, scoring='neg_root_mean_squared_error')

In [None]:
# Hold-out RMSE of the ensemble prediction.
# y_true is passed first, matching the documented (y_true, y_pred) signature.
# The root is taken explicitly because recent scikit-learn releases no longer
# accept the `squared` keyword; this form works on old and new versions alike.
mean_squared_error(y_test, er.predict(X_test)) ** 0.5

2.201786106532741

In [None]:
# Hold-out RMSE of the first estimator in each entry of er.results_.estimators.
# NOTE(review): presumably one entry per CV fold — confirm against yikit.
for estimators in er.results_.estimators:
    # y_true first (documented order); explicit root instead of the `squared`
    # keyword, which recent scikit-learn releases no longer accept.
    print(mean_squared_error(y_test, estimators[0].predict(X_test)) ** 0.5)

2.293632129832008
2.4323930765702557
2.449809279179175
2.578067528470058
2.66576030132261
