In [None]:
!pip install --upgrade optuna --quiet
!pip install --upgrade shap --quiet
!pip install --upgrade --force-reinstall scikit-learn==1.0.0 --quiet

[K     |████████████████████████████████| 302 kB 5.4 MB/s 
[K     |████████████████████████████████| 208 kB 55.8 MB/s 
[K     |████████████████████████████████| 80 kB 9.0 MB/s 
[K     |████████████████████████████████| 75 kB 4.6 MB/s 
[K     |████████████████████████████████| 111 kB 55.9 MB/s 
[K     |████████████████████████████████| 49 kB 6.4 MB/s 
[K     |████████████████████████████████| 144 kB 59.5 MB/s 
[?25h  Building wheel for pyperclip (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 356 kB 5.4 MB/s 
[?25h  Building wheel for shap (setup.py) ... [?25l[?25hdone


In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_boston, load_diabetes
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_validate, cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor

import optuna
import shap

from tqdm.auto import tqdm

In [None]:
class Config:
    SEED = 3655

In [1]:
# X, y = load_boston(return_X_y=True)
X, y = load_diabetes(return_X_y=True, as_frame=False)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=.8, test_size=.2, random_state=Config.SEED)

NameError: ignored

## RandomForestRegressor

In [None]:
def objective(trial:int):
    # n_estimators = trial.suggest_int("n_estimators", 100, 1_000)
    max_depth = trial.suggest_int('max_depth', 4, 32)
    max_samples = trial.suggest_loguniform("max_samples", 0.5, .8)
    max_features = trial.suggest_loguniform("max_features", 0.5, .8)
    ccp_alpha = trial.suggest_loguniform("ccp_alpha", 0.01, .1)
    random_state = trial.suggest_int("random_state", 0, 100)

    model = RandomForestRegressor(
        n_estimators=100,
        max_depth=max_depth,
        max_samples=max_samples,
        max_features=max_features,
        ccp_alpha=ccp_alpha,
        n_jobs=-1,
        random_state=random_state
    )

    cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=Config.SEED).split(X_train)

    oof_pred = cross_val_score(
        estimator=model, 
        X=X_train, 
        y=y_train, 
        scoring="neg_root_mean_squared_error", 
        cv=cv,
        n_jobs=-1)
    return np.mean(-oof_pred)

In [None]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

trial = study.best_trial

print(trial.value)
print(trial.params)

[32m[I 2021-09-28 13:54:42,237][0m A new study created in memory with name: no-name-37851d83-e8f3-4c34-b294-3e64f8b74a0d[0m
[32m[I 2021-09-28 13:54:46,806][0m Trial 0 finished with value: 57.25945640916635 and parameters: {'max_depth': 31, 'max_samples': 0.5439667453926186, 'max_features': 0.6638509576346638, 'ccp_alpha': 0.047106226412457645, 'random_state': 70}. Best is trial 0 with value: 57.25945640916635.[0m
[32m[I 2021-09-28 13:54:50,059][0m Trial 1 finished with value: 57.44409284549775 and parameters: {'max_depth': 14, 'max_samples': 0.6185496752979981, 'max_features': 0.7544241203451847, 'ccp_alpha': 0.023821901269438377, 'random_state': 17}. Best is trial 0 with value: 57.25945640916635.[0m
[32m[I 2021-09-28 13:54:53,298][0m Trial 2 finished with value: 57.2706614644076 and parameters: {'max_depth': 12, 'max_samples': 0.5892365372468387, 'max_features': 0.7779195469012937, 'ccp_alpha': 0.09008454197113923, 'random_state': 97}. Best is trial 0 with value: 57.2594564

57.046312697048045
{'max_depth': 25, 'max_samples': 0.5390955120021056, 'max_features': 0.5420829243715305, 'ccp_alpha': 0.04057052844168037, 'random_state': 33}


In [None]:
tuned_model = RandomForestRegressor(**trial.params)
tuned_model.fit(X_train, y_train)

RandomForestRegressor(ccp_alpha=0.04057052844168037, max_depth=25,
                      max_features=0.5420829243715305,
                      max_samples=0.5390955120021056, random_state=33)

In [None]:
train_rmse = mean_squared_error(y_train, tuned_model.predict(X_train), squared=False)
valid_rmse = mean_squared_error(y_valid, tuned_model.predict(X_valid), squared=False)

print(f'train rmse: {train_rmse:.3f}')
print(f'valid rmse: {valid_rmse:.3f}')

train_r2 = r2_score(y_train, tuned_model.predict(X_train))
valid_r2 = r2_score(y_valid, tuned_model.predict(X_valid))

print(f'train r2: {train_r2:.3f}')
print(f'valid r2: {valid_r2:.3f}')

train rmse: 34.475
valid rmse: 54.756
train r2: 0.794
valid r2: 0.539


## BaggingRegressor

In [None]:
def objective(trial:int):
    # n_estimators = trial.suggest_int("n_estimators", 100, 1_000)
    max_depth = trial.suggest_int('max_depth', 4, 32)
    ccp_alpha = trial.suggest_loguniform("ccp_alpha", 0.01, .1)
    max_samples = trial.suggest_loguniform("max_samples", 0.5, .8)
    max_features = trial.suggest_loguniform("max_features", 0.5, .8)
    random_state = trial.suggest_int("random_state", 0, 100)

    base_estimator = DecisionTreeRegressor(
        criterion="friedman_mse",
        max_depth=max_depth,
        random_state=random_state,
        ccp_alpha=ccp_alpha,

    )
    model = BaggingRegressor(
        base_estimator=base_estimator,
        n_estimators=100,
        max_samples=max_samples,
        max_features=max_features,
        n_jobs=-1,
        random_state=random_state
    )

    cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=Config.SEED).split(X_train)

    oof_pred = cross_val_score(
        estimator=model, 
        X=X_train, 
        y=y_train, 
        scoring="neg_root_mean_squared_error", 
        cv=cv,
        n_jobs=-1)
    return np.mean(-oof_pred)

In [None]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

trial = study.best_trial

print(trial.value)
print(trial.params)

In [None]:
base_estimator = DecisionTreeRegressor(
    criterion="friedman_mse",
    max_depth=trial.params["max_depth"],
    random_state=trial.params["random_state"],
    ccp_alpha=trial.params["ccp_alpha"],

)
tuned_model = BaggingRegressor(
    base_estimator=base_estimator,
    n_estimators=trial.params["n_estimators"],
    max_samples=trial.params["max_samples"],
    max_features=trial.params["max_features"],
    n_jobs=-1,
    random_state=trial.params["random_state"]
)

tuned_model.fit(X_train, y_train)

BaggingRegressor(base_estimator=DecisionTreeRegressor(ccp_alpha=0.07347418682414515,
                                                      criterion='friedman_mse',
                                                      max_depth=8,
                                                      random_state=44),
                 max_features=0.8509069737987963,
                 max_samples=0.6234893800546413, n_estimators=347, n_jobs=-1,
                 random_state=44)

In [None]:
train_rmse = mean_squared_error(y_train, tuned_model.predict(X_train), squared=False)
valid_rmse = mean_squared_error(y_valid, tuned_model.predict(X_valid), squared=False)

print(f'train rmse: {train_rmse:.3f}')
print(f'valid rmse: {valid_rmse:.3f}')

train rmse: 33.459
valid rmse: 54.364


## GradientBoostingRegressor

In [None]:
def objective(trial:int):
    max_depth = trial.suggest_int('max_depth', 4, 32)
    ccp_alpha = trial.suggest_loguniform("ccp_alpha", 0.01, .1)
    subsample = trial.suggest_loguniform("subsample", 0.5, .8)
    max_features = trial.suggest_loguniform("max_features", 0.5, .8)
    random_state = trial.suggest_int("random_state", 0, 100)

    EARLY_STOPPING_ROUNDS = 20
    MAX_EPOCHS = 1_000
    N_SPLITS = 5
    N_REPEATS = 2

    cv = RepeatedKFold(n_splits=N_SPLITS, n_repeats=N_REPEATS, random_state=random_state).split(X_train)
    val_scores = []
    for (train_idx, valid_idx) in cv:
        model = GradientBoostingRegressor(
            n_estimators=0,
            subsample=subsample,
            max_features=max_features,
            max_depth=max_depth,
            random_state=random_state,
            validation_fraction=0,
            ccp_alpha=ccp_alpha
        )

        best_valid_score = float('inf')
        cnt = 0
        for epoch in range(MAX_EPOCHS):
            model.n_estimators += 1
            model.fit(X_train[train_idx], y_train[train_idx])

            valid_rmse = mean_squared_error(y_train[valid_idx], model.predict(X_train[valid_idx]), squared=False)

            if valid_rmse < best_valid_score:
                # print(f'\nBest validation score is improved!! epoch {epoch}: {best_valid_score:.4f} -> {valid_rmse:.4f}')
                best_valid_score = valid_rmse
                cnt = 0
            else:
                cnt += 1

            if cnt > EARLY_STOPPING_ROUNDS:
                break
            
        val_scores.append(best_valid_score)
    
    return np.mean(val_scores)

In [None]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

trial = study.best_trial

print(trial.value)
print(trial.params)

[32m[I 2021-09-28 14:18:23,622][0m A new study created in memory with name: no-name-a6006d0e-47e4-4454-b9b7-8c46095dd528[0m
[32m[I 2021-09-28 14:18:33,081][0m Trial 0 finished with value: 58.983718476475666 and parameters: {'max_depth': 25, 'subsample': 0.5186021346756398, 'max_features': 0.6576633546701358, 'random_state': 10}. Best is trial 0 with value: 58.983718476475666.[0m
[32m[I 2021-09-28 14:18:45,131][0m Trial 1 finished with value: 60.26489551830291 and parameters: {'max_depth': 15, 'subsample': 0.6676340208303934, 'max_features': 0.5084830554213372, 'random_state': 92}. Best is trial 0 with value: 58.983718476475666.[0m
[32m[I 2021-09-28 14:18:58,064][0m Trial 2 finished with value: 59.517438809928265 and parameters: {'max_depth': 24, 'subsample': 0.680127051436786, 'max_features': 0.6121228514125281, 'random_state': 0}. Best is trial 0 with value: 58.983718476475666.[0m
[32m[I 2021-09-28 14:19:11,188][0m Trial 3 finished with value: 59.632839033297884 and para

57.95497039217629
{'max_depth': 25, 'subsample': 0.6598143896012909, 'max_features': 0.7836812860773404, 'random_state': 45}


In [None]:
tuned_model = GradientBoostingRegressor(**trial.params)
tuned_model.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=25, max_features=0.7836812860773404,
                          random_state=45, subsample=0.6598143896012909)

In [None]:
train_rmse = mean_squared_error(y_train, tuned_model.predict(X_train), squared=False)
valid_rmse = mean_squared_error(y_valid, tuned_model.predict(X_valid), squared=False)

print(f'train rmse: {train_rmse:.3f}')
print(f'valid rmse: {valid_rmse:.3f}')

train rmse: 0.143
valid rmse: 61.423
