In [2]:
import wandb
from dotenv import load_dotenv

# .env ファイルから環境変数を読み込む
load_dotenv()
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mairspace_nobo[0m ([33mairspace_nobo8[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
import os
import random
import numpy as np
import pandas as pd
import wandb
from wandb.integration.lightgbm import log_summary, wandb_callback
import lightgbm as lgb
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [4]:
print("wandb version:", wandb.__version__)
print("lightgbm version:", lgb.__version__)

wandb version: 0.21.0
lightgbm version: 4.6.0


In [7]:
# Configクラスを定義
class CFG:
    exp_name = 'example_lightgbm'
    test_size = 0.2
    random_state = 42
    learning_rate = 0.1
    num_leaves = 31
    n_estimators = 10000
    feature_fraction = 0.9
    stopping_rounds = 50
    log_evaluation = 100
    objective = 'regression'
    metric = 'rmse',
    features = ["MedInc", "HouseAge", "AveRooms","AveBedrms", "Population", "AveOccup", "Latitude", "Longitude"]

# CFGクラスのインスタンスを作成
config = CFG()

# シード固定
def seed_everything(seed: int):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)

seed_everything(config.random_state)

In [8]:
# クラスの属性を辞書に変換する関数
def class_to_dict(obj):
    return {k: getattr(obj, k) for k in dir(obj) if not k.startswith('__') and not callable(getattr(obj, k))}

class_to_dict(config)

{'exp_name': 'example_lightgbm',
 'feature_fraction': 0.9,
 'features': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'learning_rate': 0.1,
 'log_evaluation': 100,
 'metric': ('rmse',),
 'n_estimators': 10000,
 'num_leaves': 31,
 'objective': 'regression',
 'random_state': 42,
 'stopping_rounds': 50,
 'test_size': 0.2}

In [9]:
# WandBの初期化
wandb.init(
    project="example_lightgbm",
    config=class_to_dict(config),
    name = config.exp_name,
)

In [10]:
# データセットを取得
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.DataFrame(data.target, columns=data.target_names)
X_train, X_test, y_train, y_test = train_test_split(
    df[config.features],
    y,
    test_size=config.test_size,
    random_state=config.random_state
)

In [11]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16512, 8), (4128, 8), (16512, 1), (4128, 1))

In [13]:
# LightGBM用のデータセットに変換
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [14]:
# モデルのパラメータを設定
params = {
	'learning_rate': config.learning_rate,
	'num_leaves': config.num_leaves,
	'objective': config.objective,
	'metric': config.metric[0] if isinstance(config.metric, tuple) else config.metric,
	'feature_fraction': config.feature_fraction
}

# モデルを学習
model = lgb.train(
	params,
	train_data,
	num_boost_round=config.n_estimators,
	callbacks=[
		lgb.early_stopping(stopping_rounds=config.stopping_rounds, verbose=True),
		lgb.log_evaluation(config.log_evaluation),
		wandb_callback()
	],
	valid_sets=[train_data, test_data],
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000371 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 8
[LightGBM] [Info] Start training from score 2.071947
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 0.39261	valid_1's rmse: 0.461229
[200]	training's rmse: 0.339785	valid_1's rmse: 0.447667
[300]	training's rmse: 0.305127	valid_1's rmse: 0.442757
[400]	training's rmse: 0.278112	valid_1's rmse: 0.439342
[500]	training's rmse: 0.255133	valid_1's rmse: 0.438417
[600]	training's rmse: 0.235924	valid_1's rmse: 0.437065
Early stopping, best iteration is:
[622]	training's rmse: 0.231847	valid_1's rmse: 0.436778


In [20]:
# テストデータで予測
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
# モデルの評価
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

RMSE: 0.4367780679358385


In [21]:
# 学習のサマリーをWandBに記録
log_summary(model, save_model_checkpoint=True)

In [22]:
# 終了
wandb.finish()

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
iteration,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇█
training_rmse,█▅▄▄▄▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
valid_1_rmse,█▇▆▅▅▄▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_iteration,622
iteration,671
