In [1]:
%load_ext lab_black

In [2]:
%config Completer.use_jedi = False

In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import time
from contextlib import contextmanager
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.mixture import GaussianMixture

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from ngboost import NGBRegressor

In [4]:
@contextmanager
def timer(logger=None, format_str="{:.3f}[s]", prefix=None, suffix=None):
    """Time the enclosed ``with`` block and report the elapsed seconds.

    Parameters
    ----------
    logger : object with an ``info(str)`` method, optional
        When given (and truthy) the message is sent to ``logger.info``;
        otherwise it is printed.
    format_str : str
        ``str.format`` template that receives the elapsed seconds as its
        single positional argument.
    prefix, suffix : optional
        Values converted with ``str`` and attached verbatim before / after
        the formatted duration. Falsy values are ignored.

    Notes
    -----
    The duration is reported even when the body raises (including
    ``KeyboardInterrupt``); the exception is then re-raised unchanged.
    """
    start = time.time()
    try:
        yield
    finally:
        # Only the duration template goes through str.format; prefix and
        # suffix are concatenated afterwards so braces inside them cannot
        # be mistaken for replacement fields.
        out_str = format_str.format(time.time() - start)
        if prefix:
            out_str = str(prefix) + out_str
        if suffix:
            out_str = out_str + str(suffix)

        if logger:
            logger.info(out_str)
        else:
            print(out_str)

In [5]:
# Directory (relative to the notebook's working directory) that the
# train / test / sample-submission CSV files are read from.
path = "input"

In [35]:
# Read the three competition tables from `path`, each indexed by its "id" column.
train = pd.read_csv(f"{path}/train.csv", index_col=["id"])
test = pd.read_csv(f"{path}/test.csv", index_col=["id"])
submission = pd.read_csv(f"{path}/sample_submission.csv", index_col=["id"])

In [36]:
# Names of the 14 continuous input columns: "cont1" through "cont14".
features = [f"cont{idx}" for idx in range(1, 15)]

In [37]:
def rmse_score(y_true, y_pred):
    """Return the root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [38]:
# Feature matrix (the `features` columns) and the "target" column of the training table.
X, y = train[features], train["target"]

In [39]:
# Fixed seed so the shuffled folds (and every score derived from them)
# are identical after Restart Kernel -> Run All.
SEED = 42
kfold = KFold(n_splits=5, shuffle=True, random_state=SEED)

# Materialise the (train_idx, valid_idx) pairs once so the same folds can be
# reused by every call that receives `cv`.
cv = list(kfold.split(X))

## NGBoost

In [40]:
def fit_ngb(
    X,
    y,
    cv,
    params: dict = None,
    verbose: int = 50,
    early_stopping_rounds: int = None,
):
    """Fit one ``NGBRegressor`` per fold and collect out-of-fold predictions.

    Parameters
    ----------
    X, y : array-like
        Features and target. Both are converted with ``np.asarray`` so the
        fold indices are always applied positionally.
    cv : sized iterable of (train_idx, valid_idx)
        Positional index arrays for each fold; ``len(cv)`` must work.
    params : dict, optional
        Keyword arguments passed to ``NGBRegressor``. Defaults to ``{}``.
    verbose : int
        Kept for backward compatibility: when ``early_stopping_rounds`` is
        not given, this value is used as the early-stopping patience.
    early_stopping_rounds : int, optional
        Value passed to ``NGBRegressor.fit`` as ``early_stopping_rounds``;
        falls back to ``verbose`` when ``None``.

    Returns
    -------
    oof_pred : np.ndarray
        Float array shaped like ``y`` holding each row's prediction from the
        model of the fold in which that row was in the validation split.
    models : list
        The fitted models, in fold order.
    """
    if params is None:
        params = {}
    if early_stopping_rounds is None:
        # Historical behaviour: `verbose` doubled as the patience value.
        early_stopping_rounds = verbose

    # Fold indices are positional; plain arrays avoid label-based lookups
    # when a DataFrame / Series is passed in.
    X = np.asarray(X)
    y = np.asarray(y)

    models = []
    # `np.float` was removed from NumPy; the builtin `float` is the same dtype.
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (train_idx, valid_idx) in tqdm(enumerate(cv), total=len(cv)):
        X_train, y_train = X[train_idx], y[train_idx]
        X_valid, y_valid = X[valid_idx], y[valid_idx]

        model = NGBRegressor(**params)

        # Trailing space keeps the fold number from running into the seconds.
        with timer(prefix=f"fit fold={i+1} "):
            model.fit(
                X=X_train,
                Y=y_train,
                X_val=X_valid,
                Y_val=y_valid,
                early_stopping_rounds=early_stopping_rounds,
            )

        y_pred_i = model.predict(X_valid)
        oof_pred[valid_idx] = y_pred_i
        models.append(model)

        print(f"Fold {i+1} RMSE: {rmse_score(y_valid, y_pred_i):.4f}")

    score = rmse_score(y, oof_pred)
    print(f"FINISHED | Whole RMSE: {score:.4f}")

    return oof_pred, models

In [41]:
# Keyword arguments handed to NGBRegressor inside fit_ngb.
ngb_params = dict(learning_rate=0.05, n_estimators=500, verbose_eval=100)

# fit_ngb indexes its inputs positionally, so pass the underlying numpy arrays.
oof_pred, models = fit_ngb(X=X.values, y=y.values, cv=cv, params=ngb_params)

  0%|          | 0/5 [00:00<?, ?it/s]

[iter 0] loss=1.1078 val_loss=1.1094 scale=1.0000 norm=0.8041
[iter 100] loss=1.0714 val_loss=1.0761 scale=1.0000 norm=0.7855
[iter 200] loss=1.0632 val_loss=1.0698 scale=1.0000 norm=0.7826
[iter 300] loss=1.0574 val_loss=1.0659 scale=1.0000 norm=0.7803
[iter 400] loss=1.0530 val_loss=1.0635 scale=1.0000 norm=0.7786


KeyboardInterrupt: 

In [None]:
def visualize_importance(models, X, importance_type="gain"):
    """Plot the spread of per-fold feature importances as a boxen plot.

    Parameters
    ----------
    models : sequence
        Fitted models exposing ``feature_importances_``; row ``0`` of that
        attribute is used for every model.
        NOTE(review): for NGBoost this row presumably belongs to the first
        distribution parameter - confirm against the NGBoost docs.
    X : pandas.DataFrame
        Frame whose ``columns`` name the features, in training order.
    importance_type : str
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    fig, ax
        The matplotlib figure and axes that were drawn on.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError("visualize_importance needs at least one fitted model")

    # Build one small frame per fold and concatenate once, instead of
    # re-copying a growing frame on every iteration.
    fold_frames = [
        pd.DataFrame(
            {
                "feature_importance": model.feature_importances_[0],
                "column": list(X.columns),
                "fold": fold,
            }
        )
        for fold, model in enumerate(models, start=1)
    ]
    feature_importance_df = pd.concat(fold_frames, axis=0, ignore_index=True)

    # Show at most the 50 features with the largest importance summed over folds.
    order = (
        feature_importance_df.groupby("column")["feature_importance"]
        .sum()
        .sort_values(ascending=False)
        .index[:50]
    )

    fig, ax = plt.subplots(figsize=(max(6, len(order) * 0.4), 7))
    sns.boxenplot(
        data=feature_importance_df,
        x="column",
        y="feature_importance",
        order=order,
        ax=ax,
        palette="viridis",
    )
    ax.tick_params(axis="x", rotation=90)
    ax.grid()
    fig.tight_layout()
    return fig, ax

In [None]:
# Draw the per-fold importance plot for the fitted models, then render it.
visualize_importance(models=models, X=X)

plt.show()

In [295]:
# The models were fitted on `X.values` (the `features` columns as a numpy
# array), so feed the test rows in exactly the same column selection/order.
fold_preds = np.array([model.predict(test[features].values) for model in models])

# Average the per-fold predictions into one prediction per test row.
pred = np.mean(fold_preds, axis=0)

In [None]:
# Overlay the test-set and out-of-fold prediction histograms on one explicit
# Axes so the figure does not depend on pyplot's implicit "current axes".
fig, ax = plt.subplots()
sns.histplot(pred, label="pred", color="skyblue", ax=ax)
sns.histplot(oof_pred, label="oof_pred", color="orange", ax=ax)

ax.set(title="Test vs. out-of-fold predictions", xlabel="predicted target")
ax.legend()
ax.grid(True)
plt.show()

In [112]:
# Fill the sample submission with the averaged predictions and write it out.
submission["target"] = pred
# The separator was missing, which produced "inputsubmission.csv" in the
# working directory; write into `path` the same way the inputs are read.
submission.to_csv(path + "/submission.csv")