In [1]:
import pandas as pd
import numpy as np

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
from contextlib import contextmanager
from time import time

@contextmanager
def timer(logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None):
    """Context manager that reports the wall-clock time spent in its body.

    Args:
        logger: Object with an ``info(str)`` method. When falsy, the message
            is written with ``print`` instead.
        format_str: ``str.format`` template that receives the elapsed seconds
            as its only positional argument.
        prefix: Optional value converted with ``str`` and placed before the
            formatted duration.
        suffix: Optional value converted with ``str`` and placed after the
            formatted duration.

    The message is emitted even when the body raises; the exception then
    propagates unchanged.
    """
    start = time()
    try:
        yield
    finally:
        # Format the duration first and attach prefix/suffix afterwards, so
        # that braces inside prefix/suffix are never parsed as format fields.
        out_str = format_str.format(time() - start)
        if prefix:
            out_str = str(prefix) + out_str
        if suffix:
            out_str = out_str + str(suffix)

        if logger:
            logger.info(out_str)
        else:
            print(out_str)

In [3]:
from abc import ABC, abstractmethod

class BaseEstimator(ABC):
    """Abstract interface shared by the model wrappers in this notebook.

    Inheriting from ``ABC`` is what makes ``@abstractmethod`` effective: a
    subclass that does not override both ``fit`` and ``predict`` cannot be
    instantiated.
    """

    @abstractmethod
    def fit(self, train_dataset, valid_dataset=None, fit_params=None):
        """Train the wrapped model.

        Args:
            train_dataset: ``(X_train, y_train)`` pair.
            valid_dataset: Optional ``(X_valid, y_valid)`` pair.
            fit_params: Optional dict of extra keyword arguments for the
                underlying model's ``fit``.
        """

    @abstractmethod
    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""

In [5]:
class GBDTRegressor(BaseEstimator):
    """Wrapper around ``LGBMRegressor`` taking (X, y) pairs as datasets."""

    def __init__(
        self,
        model_params=None
    ):
        """Build the underlying ``LGBMRegressor``.

        Args:
            model_params: Optional dict of keyword arguments forwarded to the
                ``LGBMRegressor`` constructor. ``None`` means no arguments.
        """
        self.model_params = {} if model_params is None else model_params
        self.model = LGBMRegressor(**self.model_params)

    def fit(
        self,
        train_dataset,
        valid_dataset=None,
        fit_params=None
    ):
        """Fit the model, optionally monitoring a validation set.

        Args:
            train_dataset: ``(X_train, y_train)`` pair.
            valid_dataset: Optional ``(X_valid, y_valid)`` pair. When given,
                both the training pair and this pair are passed as
                ``eval_set`` (training pair first).
            fit_params: Optional dict of extra keyword arguments forwarded to
                ``LGBMRegressor.fit``.
                NOTE(review): keywords such as ``early_stopping_rounds`` /
                ``verbose`` depend on the installed LightGBM version - confirm.

        Returns:
            ``self``, so calls can be chained (``model.fit(...).predict(X)``).
        """
        X_train, y_train = train_dataset

        if fit_params is None:
            fit_params = {}

        eval_set = None
        if valid_dataset is not None:
            X_valid, y_valid = valid_dataset
            eval_set = [(X_train, y_train), (X_valid, y_valid)]

        self.model.fit(
            X_train,
            y_train,
            eval_set=eval_set,
            **fit_params,
        )
        return self

    def predict(
        self,
        X
    ):
        """Return the fitted model's predictions for ``X``."""
        return self.model.predict(X)

In [6]:
from sklearn.linear_model import LinearRegression

class LinearRegressor(BaseEstimator):
    """Wrapper around ``LinearRegression`` taking (X, y) pairs as datasets."""

    def __init__(
        self,
        model_params=None
    ):
        """Build the underlying ``LinearRegression``.

        Args:
            model_params: Optional dict of keyword arguments forwarded to the
                ``LinearRegression`` constructor. ``None`` means no arguments.
        """
        self.model_params = {} if model_params is None else model_params
        self.model = LinearRegression(**self.model_params)

    def fit(
        self,
        train_dataset,
        valid_dataset=None,
        fit_params=None
    ):
        """Fit the model on the training pair.

        Args:
            train_dataset: ``(X_train, y_train)`` pair.
            valid_dataset: Accepted so that all estimators share one call
                signature; it is not used by this model.
            fit_params: Optional dict of extra keyword arguments forwarded to
                ``LinearRegression.fit``.

        Returns:
            ``self``, so calls can be chained (``model.fit(...).predict(X)``).
        """
        X_train, y_train = train_dataset

        if fit_params is None:
            fit_params = {}

        self.model.fit(
            X=X_train,
            y=y_train,
            **fit_params,
        )
        return self

    def predict(
        self,
        X
    ):
        """Return the fitted model's predictions for ``X``."""
        return self.model.predict(X)

In [7]:
from sklearn.linear_model import Ridge

class RidgeRegressor(BaseEstimator):
    """Wrapper around ``Ridge`` taking (X, y) pairs as datasets."""

    def __init__(
        self,
        model_params=None
    ):
        """Build the underlying ``Ridge`` model.

        Args:
            model_params: Optional dict of keyword arguments forwarded to the
                ``Ridge`` constructor. ``None`` means no arguments.
        """
        self.model_params = {} if model_params is None else model_params
        self.model = Ridge(**self.model_params)

    def fit(
        self,
        train_dataset,
        valid_dataset=None,
        fit_params=None
    ):
        """Fit the model on the training pair.

        Args:
            train_dataset: ``(X_train, y_train)`` pair.
            valid_dataset: Accepted so that all estimators share one call
                signature; it is not used by this model.
            fit_params: Optional dict of extra keyword arguments forwarded to
                ``Ridge.fit``.

        Returns:
            ``self``, so calls can be chained (``model.fit(...).predict(X)``).
        """
        X_train, y_train = train_dataset

        if fit_params is None:
            fit_params = {}

        self.model.fit(
            X=X_train,
            y=y_train,
            **fit_params
        )
        return self

    def predict(
        self,
        X
    ):
        """Return the fitted model's predictions for ``X``."""
        return self.model.predict(X)

In [8]:
from dataclasses import dataclass

@dataclass(frozen=True)
class FoldScore:
    """Immutable record of the train and validation score of one CV fold."""

    fold: int            # fold number as reported to the user
    train_score: float   # score measured on the fold's training part
    valid_score: float   # score measured on the fold's validation part

    def show(self):
        """Print the fold number and both scores (three decimals each)."""
        report = f"fold :{self.fold}\ntrain_score: {self.train_score:.3f}\nvalid_score: {self.valid_score:.3f}"
        print(report)

In [9]:
def cross_validate(
    model_cls,
    X,
    y,
    cv,
    eval_function,
    model_params=None,
    fit_params=None
):
    """Fit one model per fold and collect out-of-fold predictions and scores.

    Args:
        model_cls: Callable invoked as ``model_cls(model_params=...)`` that
            returns an object with ``fit`` and ``predict``.
        X: Features; indexed positionally through ``.iloc``.
        y: Target; indexed positionally through ``.iloc``.
        cv: Iterable of ``(train_idx, valid_idx)`` index pairs.
        eval_function: Callable ``(y_true, y_pred) -> score``.
        model_params: Passed to ``model_cls`` unchanged.
        fit_params: Passed to each model's ``fit`` unchanged.

    Returns:
        Tuple ``(models, oof_pred, scores)``: the fitted models in fold order,
        a float64 array shaped like ``y`` filled with each fold's validation
        predictions, and a DataFrame built from the per-fold ``FoldScore``s.
    """
    fitted_models = []
    fold_scores = []
    # Positions never covered by a validation index keep the initial 0.0.
    oof_pred = np.zeros_like(y, dtype=np.float64)

    for fold_no, (train_idx, valid_idx) in enumerate(cv, start=1):
        train_pair = (X.iloc[train_idx], y.iloc[train_idx])
        valid_pair = (X.iloc[valid_idx], y.iloc[valid_idx])

        estimator = model_cls(model_params=model_params)

        # Only the fit call is timed; prediction and scoring are outside.
        with timer(prefix=f"fit fold={fold_no}\t", suffix="\n"):
            estimator.fit(
                train_dataset=train_pair,
                valid_dataset=valid_pair,
                fit_params=fit_params,
            )

        train_pred = estimator.predict(train_pair[0])
        valid_pred = estimator.predict(valid_pair[0])

        score = FoldScore(
            fold=fold_no,
            train_score=eval_function(train_pair[1], train_pred),
            valid_score=eval_function(valid_pair[1], valid_pred),
        )
        score.show()

        fitted_models.append(estimator)
        oof_pred[valid_idx] = valid_pred
        fold_scores.append(score)

        print("=" * 40 + "\n")

    oof_score = eval_function(y, oof_pred)

    print(f"FINISHED\nout of fold score: {oof_score:.2f}")
    print("\n" + "=" * 40 + "\n")

    return fitted_models, oof_pred, pd.DataFrame(fold_scores)

In [10]:
def root_mean_squared_error(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    The square root is taken here instead of passing ``squared=False`` to
    ``mean_squared_error``, because that keyword is deprecated and later
    removed in newer scikit-learn releases.

    For multi-output targets the RMSE is computed per output and the
    per-output values are averaged uniformly.
    """
    per_output_mse = mean_squared_error(
        y_true=y_true, y_pred=y_pred, multioutput="raw_values"
    )
    return float(np.mean(np.sqrt(per_output_mse)))

In [11]:
# Load the California housing data as pandas objects (as_frame=True):
# X holds the features, y the target.
X, y = fetch_california_housing(return_X_y=True, as_frame=True)

# Shuffled 5-fold split with a fixed seed. The splits are materialised into a
# list so the same (train_idx, valid_idx) pairs are reused by every
# cross_validate / run_fit call below.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv = list(kfold.split(X, y))

In [17]:
# Cross-validate the LightGBM wrapper on the shared folds, scored with RMSE.
# fit_params is handed to GBDTRegressor.fit, which forwards it as keyword
# arguments to the underlying LGBMRegressor.fit.
# NOTE(review): whether fit() accepts `early_stopping_rounds` / `verbose`
# depends on the installed LightGBM version - confirm before re-running.
models, oof_pred, scores = cross_validate(
    model_cls=GBDTRegressor,
    X=X,
    y=y,
    cv=cv,
    model_params={
        'boosting_type': 'gbdt',
        'n_estimators':1_000
    },
    fit_params={
        'early_stopping_rounds': 20, 
        'verbose': 50
        },
    eval_function=root_mean_squared_error
)

Training until validation scores don't improve for 20 rounds.
[50]	training's l2: 0.200996	valid_1's l2: 0.240556
[100]	training's l2: 0.156226	valid_1's l2: 0.214848
[150]	training's l2: 0.133031	valid_1's l2: 0.205682
[200]	training's l2: 0.116577	valid_1's l2: 0.200996
[250]	training's l2: 0.104528	valid_1's l2: 0.197457
[300]	training's l2: 0.0939211	valid_1's l2: 0.193949
[350]	training's l2: 0.0852599	valid_1's l2: 0.192967
[400]	training's l2: 0.0778506	valid_1's l2: 0.191766
Early stopping, best iteration is:
[400]	training's l2: 0.0778506	valid_1's l2: 0.191766
fit fold=1	3.372[s]

fold :1
train_score: 0.279
valid_score: 0.438

Training until validation scores don't improve for 20 rounds.
[50]	training's l2: 0.200601	valid_1's l2: 0.240271
[100]	training's l2: 0.155862	valid_1's l2: 0.217915
[150]	training's l2: 0.132399	valid_1's l2: 0.209588
[200]	training's l2: 0.116098	valid_1's l2: 0.206284
[250]	training's l2: 0.103562	valid_1's l2: 0.203132
[300]	training's l2: 0.093018

In [18]:
# Same protocol with the Ridge wrapper. model_params and fit_params are
# omitted, so the wrapper builds Ridge() and calls fit with no extra arguments.
# This rebinds models / oof_pred / scores from the previous cell.
models, oof_pred, scores = cross_validate(
    model_cls=RidgeRegressor,
    X=X,
    y=y,
    cv=cv,
    eval_function=root_mean_squared_error
)

fit fold=1	0.025[s]

fold :1
train_score: 0.720
valid_score: 0.746

fit fold=2	0.004[s]

fold :2
train_score: 0.724
valid_score: 0.726

fit fold=3	0.005[s]

fold :3
train_score: 0.727
valid_score: 0.714

fit fold=4	0.005[s]

fold :4
train_score: 0.730
valid_score: 0.711

fit fold=5	0.005[s]

fold :5
train_score: 0.719
valid_score: 0.745

FINISHED
out of fold score: 0.73




In [None]:
from dataclasses import dataclass
from typing import List, Optional

@dataclass(frozen=True)
class BaseBlock:
    """Immutable description of one node in a stacking graph.

    A block names a model class together with its constructor and fit
    parameters. ``parents`` lists the blocks whose outputs become this
    block's input features; ``None`` means the block has no parents.
    """

    # Label for the block; used as the column prefix of the block's output.
    name: str
    # Estimator class (not an instance); intended to be a BaseEstimator
    # subclass that is called as model_cls(model_params=...).
    model_cls: type
    # Keyword arguments for the model constructor, or None for defaults.
    model_params: Optional[dict] = None
    # Keyword arguments for the model's fit call, or None for none.
    fit_params: Optional[dict] = None
    # Upstream blocks feeding this one, or None for a first-stage block.
    parents: Optional[List['BaseBlock']] = None

@dataclass(frozen=True)
class Result:
    """Immutable record of what one block produced in one phase.

    A result that carries fitted ``models`` comes from the fit phase; a
    result whose ``models`` is ``None`` is treated as a test-phase result.
    """

    # The block that produced this result.
    block: BaseBlock
    # Predictions produced by the block.
    output: np.ndarray
    # Column names of the input frame the block was run on.
    input_column_names: List[str]
    # Fitted models for a fit-phase result; None for a test-phase result.
    models: Optional[List[BaseEstimator]] = None

    @property
    def is_test_phase(self):
        """True when no fitted models are attached (``models is None``)."""
        return self.models is None

In [None]:
class CacheProvider:
    """In-memory list of ``Result`` objects, searched by block and phase."""

    def __init__(self):
        # Results are kept in registration order.
        self.results = []

    def register(self, new_result: Result):
        """Add ``new_result`` to the end of the stored results."""
        self.results.append(new_result)

    def find(self, block: BaseBlock, test=False) -> Optional[Result]:
        """Return the first stored result for ``block`` in the given phase.

        Args:
            block: Block to look up; compared with ``==``.
            test: Phase to match against ``Result.is_test_phase``.

        Returns:
            The earliest registered matching ``Result``, or ``None``.
        """
        matching = (
            stored
            for stored in self.results
            if stored.block == block and stored.is_test_phase == test
        )
        return next(matching, None)

In [None]:
def _to_output(result: Result):
    """Wrap a result's output in a DataFrame with block-prefixed columns.

    Every column label is prefixed with ``"<block name>_"``.
    """
    frame = pd.DataFrame(result.output)
    return frame.add_prefix(result.block.name + "_")

In [None]:
def _build_input_data(block: BaseBlock, X, y=None, cv=None, generator=None) -> pd.DataFrame:
    """Assemble the feature frame that ``block`` should be run on.

    Args:
        block: Block whose input is being built.
        X: Raw features. Returned unchanged when the block has no parents.
        y: Target, passed on to ``run_fit`` for each parent (fit phase).
        cv: Fold indices, passed on to ``run_fit`` for each parent (fit phase).
        generator: Optional callable invoked as ``generator(parent, X)`` to
            produce each parent's output frame. When ``None``, each parent is
            run with ``run_fit(block=parent, X=X, y=y, cv=cv)``.

    Returns:
        ``X`` itself for a block without parents, otherwise the parents'
        output frames concatenated column-wise in ``block.parents`` order.
    """
    # An empty parents list is treated like None: pd.concat refuses an empty
    # sequence, and a block with no parents simply consumes X.
    if not block.parents:
        return X

    if generator is None:
        parent_frames = [run_fit(block=parent, X=X, y=y, cv=cv) for parent in block.parents]
    else:
        parent_frames = [generator(parent, X) for parent in block.parents]

    return pd.concat(parent_frames, axis=1)

In [None]:
def run_fit(block: BaseBlock, X, y, cv) -> pd.DataFrame:
    """Fit ``block`` by cross-validation and return its out-of-fold output.

    A fit-phase result already registered for ``block`` in the module-level
    ``cache_provider`` is reused instead of refitting. Otherwise the block's
    input is built with ``_build_input_data``, the block is cross-validated
    with RMSE as the score, and the new result is registered.

    Returns:
        The block's output as a DataFrame with block-prefixed columns.
    """
    cached = cache_provider.find(block, test=False)
    if cached:
        print("use cache file...")
        return _to_output(cached)

    features = _build_input_data(block, X=X, y=y, cv=cv)

    print(f"run {block.name} ...")

    cv_outcome = cross_validate(
        model_cls=block.model_cls,
        X=features,
        y=y,
        cv=cv,
        model_params=block.model_params,
        fit_params=block.fit_params,
        eval_function=root_mean_squared_error,
    )
    # The third element (per-fold score table) is not stored.
    fitted_models, oof_output = cv_outcome[0], cv_outcome[1]

    fresh = Result(
        block=block,
        models=fitted_models,
        output=oof_output,
        input_column_names=features.columns.tolist(),
    )
    cache_provider.register(fresh)

    return _to_output(fresh)

In [None]:
def run_predict(block: BaseBlock, input_df: pd.DataFrame):
    """Predict with a fitted block, averaging over its fold models.

    A test-phase result already registered for ``block`` in the module-level
    ``cache_provider`` is returned as is; the lookup does not take
    ``input_df`` into account.

    Args:
        block: Block to predict with; it must have been fitted first.
        input_df: Raw feature frame. Parent blocks are predicted recursively
            on the same frame and their outputs become this block's input.

    Returns:
        The block's prediction as a DataFrame with block-prefixed columns.

    Raises:
        ValueError: If no fit-phase result is registered for ``block``.
    """
    result = cache_provider.find(block, test=True)
    if result:
        print("use cache file...")
        return _to_output(result)

    train_result = cache_provider.find(block, test=False)
    if train_result is None:
        raise ValueError("must fit before predict")

    # Build the input here rather than through _build_input_data, whose
    # (block, X, y, cv) signature is specific to the fit phase.
    if block.parents:
        x_df = pd.concat(
            [run_predict(parent, input_df) for parent in block.parents],
            axis=1,
        )
    else:
        x_df = input_df

    print(f"run {block.name} ...")
    # The estimator wrappers expose `predict` only (no `predict_proba`).
    # The frame is passed as is, matching how the models were fitted.
    predicts = np.array([model.predict(x_df) for model in train_result.models])
    predict = np.mean(predicts, axis=0)

    test_result = Result(block=block, output=predict, input_column_names=x_df.columns.tolist())
    cache_provider.register(test_result)
    return _to_output(test_result)

In [None]:
from sklearn.linear_model import LinearRegression

# Module-level cache used by run_fit / run_predict to store block results.
cache_provider = CacheProvider()

# First-stage models. They have no parents, so they use the raw features as-is.
# The two LightGBM blocks differ only in name and in reg_lambda / reg_alpha.
first = [
    BaseBlock(
        name="lgbm_1", 
        model_cls=GBDTRegressor, 
        model_params={
            "learning_rate": .1,
            "reg_lambda": 1.,
            "reg_alpha": .1,
            "max_depth": 5, 
            "n_estimators": 10_000, 
            "colsample_bytree": .5, 
            "min_child_samples": 10,
            "subsample_freq": 3,
            "subsample": .9,
            "importance_type": "gain", 
            "random_state": 71,
        },
        fit_params={
            'early_stopping_rounds': 20,
            'verbose': 50
        }
    ),
    BaseBlock(
        name="lgbm_2", 
        model_cls=GBDTRegressor, 
        model_params={
            "learning_rate": .1,
            "reg_lambda": 100.,
            "reg_alpha": 100.,
            "max_depth": 5, 
            "n_estimators": 10_000, 
            "colsample_bytree": .5, 
            "min_child_samples": 10,
            "subsample_freq": 3,
            "subsample": .9,
            "importance_type": "gain", 
            "random_state": 71,
        },
        fit_params={
            'early_stopping_rounds': 20,
            'verbose': 50
        }
    )
]

# Final (second-stage) model. It takes the first-stage models as its
# inputs (parents), i.e. it is trained on their predictions.
final_model = BaseBlock(
    name="ridge", 
    model_cls=RidgeRegressor,
    model_params={
        'fit_intercept': False
    },
    parents=first,
)

In [None]:
# Fit the whole stack: run_fit first fits final_model's parents (via
# _build_input_data), then cross-validates the ridge block on their outputs.
# The returned DataFrame holds the ridge block's out-of-fold predictions; note
# that this rebinds oof_pred, which earlier cells bound to a NumPy array.
oof_pred = run_fit(
    block=final_model,
    X=X,
    y=y,
    cv=cv
)

run lgbm_1 ...
Training until validation scores don't improve for 20 rounds.
[50]	training's l2: 0.271589	valid_1's l2: 0.313855
[100]	training's l2: 0.210435	valid_1's l2: 0.263964
[150]	training's l2: 0.182226	valid_1's l2: 0.245027
[200]	training's l2: 0.164764	valid_1's l2: 0.234974
[250]	training's l2: 0.150983	valid_1's l2: 0.227903
[300]	training's l2: 0.139782	valid_1's l2: 0.222224
[350]	training's l2: 0.130697	valid_1's l2: 0.218956
[400]	training's l2: 0.123325	valid_1's l2: 0.21513
[450]	training's l2: 0.116812	valid_1's l2: 0.212324
Early stopping, best iteration is:
[465]	training's l2: 0.114582	valid_1's l2: 0.211263
fit fold=1	1.207[s]

fold :1
train_score: 0.339
valid_score: 0.460

Training until validation scores don't improve for 20 rounds.
[50]	training's l2: 0.274958	valid_1's l2: 0.308332
[100]	training's l2: 0.213502	valid_1's l2: 0.25732
[150]	training's l2: 0.18172	valid_1's l2: 0.236025
[200]	training's l2: 0.162374	valid_1's l2: 0.224882
[250]	training's l2: 