In [1]:
!pip install --upgrade wandb --quiet

[K     |████████████████████████████████| 1.7 MB 4.6 MB/s 
[K     |████████████████████████████████| 181 kB 33.5 MB/s 
[K     |████████████████████████████████| 144 kB 37.4 MB/s 
[K     |████████████████████████████████| 63 kB 997 kB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [2]:
import wandb

wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [34]:
from dataclasses import dataclass

@dataclass(frozen=True)
class job_type:
    split_data: str="split_data"
    train_model: str="train_model"

In [38]:
class global_config:
    entity = "sunyeul"
    project_name = "test_project"
    group_name = "experiment_name"
    seed = 42

In [24]:
from sklearn.datasets import fetch_california_housing

X, y = fetch_california_housing(return_X_y=True, as_frame=True)

In [44]:
import pandas as pd

from sklearn.model_selection import KFold

wandb.init(
    project=global_config.project_name,
    group=global_config.group_name,
    name=f"cv",
    job_type=job_type.split_data,
    config={
        "n_splits": 5,
        "shuffle": True,
        "random_state": 42
    }
)

config = wandb.config

cv = KFold(
    n_splits=config.n_splits,
    shuffle=config.shuffle,
    random_state=config.random_state
)

fold_df = pd.DataFrame(
    data={"fold": np.arange(len(X))}
)

for i, (train_idx, valid_idx) in enumerate(cv.split(X, y), start=1):
    fold_df.iloc[valid_idx] = i

artifact = wandb.Artifact(
    name="fold",
    type="dataframe",
    description=""
)
table = wandb.Table(dataframe=fold_df)
artifact.add(obj=table, name="fold")

wandb.log_artifact(artifact)
wandb.finish()




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [40]:
run = wandb.init()
artifact = run.use_artifact(f'{global_config.entity}/{global_config.project_name}/fold:latest', type='dataframe')
artifact_dir = artifact.download()




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [21]:
import numpy as np

from sklearn.metrics import mean_squared_error

from wandb.lightgbm import wandb_callback, log_summary
import lightgbm as lgb


y_oof = np.zeros_like(y)

for i, (train_idx, valid_idx) in enumerate(cv.split(X, y), start=1):
    wandb.init(
        project=global_config.project_name,
        group=global_config.group_name,
        name=f"fold_{i}",
        job_type=job_type.train_model
    )

    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    train_dataset = lgb.Dataset(
        data=X_train,
        label=y_train
    )
    valid_dataset = lgb.Dataset(
        data=X_valid,
        label=y_valid,
        reference=train_dataset
    )

    model_params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': ['rmse', 'l2', 'l1', 'huber'],
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbosity': 0
    }

    fit_params = {
        "num_boost_round": 1_000,
        "early_stopping_rounds" : 20,
        "verbose_eval": 50
    }

    wandb.config.update(model_params)
    wandb.config.update(fit_params)

    model = lgb.train(
        params=model_params,
        train_set=train_dataset,
        valid_sets=[train_dataset, valid_dataset],
        valid_names=["train", "valid"],
        callbacks=[wandb_callback()],
        **fit_params
    )

    y_pred = model.predict(X_valid, num_iteration=model.best_iteration)

    fold_score = mean_squared_error(y_valid, y_pred, squared=False)
    print(f'fold_{i}_rmse: {fold_score:.3f}')
    # wandb.log({f'rmse': fold_score})

    wandb.finish()



Training until validation scores don't improve for 20 rounds.
[50]	train's huber: 0.119648	train's l1: 0.372013	train's l2: 0.269138	train's rmse: 0.518785	valid's huber: 0.203525	valid's l1: 0.537415	valid's l2: 0.439354	valid's rmse: 0.662838
[100]	train's huber: 0.0918449	train's l1: 0.309657	train's l2: 0.203276	train's rmse: 0.450862	valid's huber: 0.188272	valid's l1: 0.491649	valid's l2: 0.411779	valid's rmse: 0.6417
Early stopping, best iteration is:
[115]	train's huber: 0.0885576	train's l1: 0.302918	train's l2: 0.195123	train's rmse: 0.441727	valid's huber: 0.18237	valid's l1: 0.48093	valid's l2: 0.398441	valid's rmse: 0.631222
fold_1_rmse: 0.631



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
iteration,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_huber,█▇▆▅▄▄▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_l1,█▇▆▅▅▄▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_l2,█▇▆▄▄▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_rmse,█▇▆▅▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_huber,█▇▆▄▄▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_l1,█▇▆▅▅▄▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_l2,█▇▅▄▄▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_rmse,█▇▆▄▄▃▃▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
iteration,134




Training until validation scores don't improve for 20 rounds.
[50]	train's huber: 0.112821	train's l1: 0.359381	train's l2: 0.254251	train's rmse: 0.504233	valid's huber: 0.153034	valid's l1: 0.418799	valid's l2: 0.364518	valid's rmse: 0.603753
[100]	train's huber: 0.0844504	train's l1: 0.295522	train's l2: 0.186458	train's rmse: 0.431808	valid's huber: 0.141018	valid's l1: 0.395532	valid's l2: 0.333529	valid's rmse: 0.57752
[150]	train's huber: 0.0752936	train's l1: 0.276856	train's l2: 0.163704	train's rmse: 0.404603	valid's huber: 0.139729	valid's l1: 0.392743	valid's l2: 0.329742	valid's rmse: 0.574231
Early stopping, best iteration is:
[143]	train's huber: 0.0763737	train's l1: 0.27915	train's l2: 0.166341	train's rmse: 0.407849	valid's huber: 0.139485	valid's l1: 0.391658	valid's l2: 0.329362	valid's rmse: 0.573901
fold_2_rmse: 0.574



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
iteration,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_huber,█▇▅▄▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_l1,█▇▆▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_l2,█▆▅▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_rmse,█▇▆▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_huber,█▇▅▄▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_l1,█▇▆▅▄▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_l2,█▆▅▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_rmse,█▇▅▄▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
iteration,162




Training until validation scores don't improve for 20 rounds.
[50]	train's huber: 0.110082	train's l1: 0.352609	train's l2: 0.247777	train's rmse: 0.497772	valid's huber: 0.17673	valid's l1: 0.463501	valid's l2: 0.410798	valid's rmse: 0.640935
[100]	train's huber: 0.0819191	train's l1: 0.288465	train's l2: 0.181282	train's rmse: 0.425772	valid's huber: 0.158366	valid's l1: 0.436959	valid's l2: 0.360041	valid's rmse: 0.600034
[150]	train's huber: 0.0729997	train's l1: 0.270202	train's l2: 0.158676	train's rmse: 0.398342	valid's huber: 0.155826	valid's l1: 0.433757	valid's l2: 0.350871	valid's rmse: 0.592344
Early stopping, best iteration is:
[159]	train's huber: 0.0715788	train's l1: 0.267262	train's l2: 0.155222	train's rmse: 0.393982	valid's huber: 0.15513	valid's l1: 0.43286	valid's l2: 0.348803	valid's rmse: 0.590595
fold_3_rmse: 0.591



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
iteration,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train_huber,█▇▅▅▄▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_l1,█▇▆▅▄▄▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_l2,█▆▅▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_rmse,█▇▆▅▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_huber,█▆▅▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_l1,█▇▅▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_l2,█▆▅▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_rmse,█▇▅▅▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
iteration,178




Training until validation scores don't improve for 20 rounds.
[50]	train's huber: 0.116189	train's l1: 0.365607	train's l2: 0.261883	train's rmse: 0.511745	valid's huber: 0.194118	valid's l1: 0.501092	valid's l2: 0.468604	valid's rmse: 0.684546
[100]	train's huber: 0.0861511	train's l1: 0.298425	train's l2: 0.190785	train's rmse: 0.436789	valid's huber: 0.157816	valid's l1: 0.429555	valid's l2: 0.379817	valid's rmse: 0.616293
[150]	train's huber: 0.0766671	train's l1: 0.278986	train's l2: 0.166711	train's rmse: 0.408303	valid's huber: 0.150017	valid's l1: 0.416819	valid's l2: 0.358125	valid's rmse: 0.598436
Early stopping, best iteration is:
[141]	train's huber: 0.0779503	train's l1: 0.281321	train's l2: 0.170037	train's rmse: 0.412355	valid's huber: 0.150131	valid's l1: 0.415787	valid's l2: 0.359573	valid's rmse: 0.599644
fold_4_rmse: 0.600



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
iteration,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train_huber,█▇▅▄▄▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_l1,█▇▆▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_l2,█▆▅▄▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_rmse,█▇▆▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_huber,█▇▆▅▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_l1,█▇▆▅▅▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_l2,█▇▆▅▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_rmse,█▇▆▅▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
iteration,160




Training until validation scores don't improve for 20 rounds.
[50]	train's huber: 0.108704	train's l1: 0.349446	train's l2: 0.245214	train's rmse: 0.495191	valid's huber: 0.220082	valid's l1: 0.525238	valid's l2: 0.531549	valid's rmse: 0.729074
[100]	train's huber: 0.080237	train's l1: 0.284927	train's l2: 0.177562	train's rmse: 0.421381	valid's huber: 0.20706	valid's l1: 0.497277	valid's l2: 0.49928	valid's rmse: 0.706597
Early stopping, best iteration is:
[105]	train's huber: 0.0790397	train's l1: 0.282472	train's l2: 0.174446	train's rmse: 0.417667	valid's huber: 0.206143	valid's l1: 0.495646	valid's l2: 0.496985	valid's rmse: 0.704972
fold_5_rmse: 0.705



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
iteration,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_huber,█▇▆▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_l1,█▇▆▆▅▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_l2,█▇▆▅▄▄▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_rmse,█▇▆▅▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_huber,█▇▆▅▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_l1,█▇▆▅▄▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_l2,█▆▅▄▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_rmse,█▇▆▅▄▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
iteration,124
