In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_boston, load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso, Ridge, BayesianRidge

from tqdm.auto import tqdm

In [28]:
class Config:
    SEED = 42
    N_ESTIMATORS = 50

In [16]:
X, y = load_boston(True)
# X, y = load_diabetes(True)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=.8, test_size=.2, random_state=Config.SEED)

In [17]:
RF = RandomForestRegressor(
    n_estimators=Config.N_ESTIMATORS, 
    random_state=Config.SEED,
    oob_score=True,
    ccp_alpha=0.01
    )

RF.fit(X_train, y_train)

train_rmse = mean_squared_error(y_train, RF.predict(X_train), squared=False)
valid_rmse = mean_squared_error(y_valid, RF.predict(X_valid), squared=False)

print(f'train rmse: {train_rmse:.3f}')
print(f'valid rmse: {valid_rmse:.3f}')

train rmse: 1.475
valid rmse: 2.834


* overfitting 발생 -> oof_pred로 학습할 필요있음

In [None]:
X_meta_train = np.zeros((len(X_train), Config.N_ESTIMATORS))
X_meta_valid = np.zeros((len(X_valid), Config.N_ESTIMATORS))

for i, estimator in tqdm(enumerate(RF.estimators_)):
    X_meta_train[:, i] = estimator.predict(X_train)
    X_meta_valid[:, i] = estimator.predict(X_valid)

meta_model = Lasso(max_iter=10_000)
meta_model.fit(X_meta_train, y_train)

train_rmse = mean_squared_error(y_train, meta_model.predict(X_meta_train), squared=False)
valid_rmse = mean_squared_error(y_valid, meta_model.predict(X_meta_valid), squared=False)

print(f'train rmse: {train_rmse:.3f}')
print(f'valid rmse: {valid_rmse:.3f}')

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


train rmse: 17.187
valid rmse: 57.847


In [None]:
GBR = GradientBoostingRegressor(
    n_estimators=Config.N_ESTIMATORS, 
    random_state=Config.SEED
    )

GBR.fit(X_train, y_train)

train_rmse = mean_squared_error(y_train, GBR.predict(X_train), squared=False)
valid_rmse = mean_squared_error(y_valid, GBR.predict(X_valid), squared=False)

print(f'train rmse: {train_rmse:.3f}')
print(f'valid rmse: {valid_rmse:.3f}')

train rmse: 37.943
valid rmse: 53.466


In [None]:
X_meta_train = np.zeros((len(X_train), Config.N_ESTIMATORS))
X_meta_valid = np.zeros((len(X_valid), Config.N_ESTIMATORS))

for i, [estimator] in tqdm(enumerate(GBR.estimators_)):
    X_meta_train[:, i] = estimator.predict(X_train)
    X_meta_valid[:, i] = estimator.predict(X_valid)

meta_model = Lasso(max_iter=10_000)
meta_model.fit(X_meta_train, y_train)

train_rmse = mean_squared_error(y_train, meta_model.predict(X_meta_train), squared=False)
valid_rmse = mean_squared_error(y_valid, meta_model.predict(X_meta_valid), squared=False)

print(f'train rmse: {train_rmse:.3f}')
print(f'valid rmse: {valid_rmse:.3f}')

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


train rmse: 29.970
valid rmse: 57.774


In [29]:
DT = DecisionTreeRegressor(max_depth=5)
estimators = []

X_meta_train = np.zeros((len(X_train), Config.N_ESTIMATORS))
X_meta_valid = np.zeros((len(X_valid), Config.N_ESTIMATORS))

for i in tqdm(range(Config.N_ESTIMATORS)):
    BR = BaggingRegressor(
        base_estimator=DT, 
        n_estimators=10, 
        max_samples=.5, 
        bootstrap=True,
        oob_score=True, 
        n_jobs=-1, 
        random_state=Config.SEED)

    BR.fit(X_train, y_train)
    
    estimators.append(BR)

    X_meta_train[:, i] = BR.oob_prediction_
    X_meta_valid[:, i] = BR.predict(X_valid)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [30]:
meta_model = Lasso()
meta_model.fit(X_meta_train, y_train)

train_rmse = mean_squared_error(y_train, meta_model.predict(X_meta_train), squared=False)
valid_rmse = mean_squared_error(y_valid, meta_model.predict(X_meta_valid), squared=False)

print(f'train rmse: {train_rmse:.3f}')
print(f'valid rmse: {valid_rmse:.3f}')

train rmse: 3.926
valid rmse: 3.164
