In [None]:
import optuna
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score

# 데이터 불러오기
X_train = pd.read_csv("./data/X_train.csv")
y_train = pd.read_csv("./data/y_train_log.csv").values.ravel()


# Optuna objective 함수
def objective(trial):
    params = {
        "objective": "regression",
        "n_estimators": 1000,
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 128, step=16),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        "random_state": 42,
        "verbosity": -1,
    }

    model = lgb.LGBMRegressor(**params)
    score = cross_val_score(
        model, X_train, y_train, scoring="neg_root_mean_squared_error", cv=5
    )
    return -score.mean()  # Optuna는 최소화 기준이므로 음수 제거

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)  # 원하는 탐색 횟수 설정

print("Best RMSE:", study.best_value)
print("Best hyperparameters:")
print(study.best_params)

[I 2025-04-15 21:58:00,659] A new study created in memory with name: no-name-d5852ed1-608d-436f-9796-1054adafc68d
[I 2025-04-15 21:58:03,432] Trial 0 finished with value: 0.12790838856918305 and parameters: {'learning_rate': 0.01976619263266054, 'num_leaves': 96, 'max_depth': 3, 'min_child_samples': 40, 'subsample': 0.8850185218589728, 'colsample_bytree': 0.707291217204945, 'reg_alpha': 0.3828826192028628, 'reg_lambda': 0.4922883804409326}. Best is trial 0 with value: 0.12790838856918305.
[I 2025-04-15 21:58:08,822] Trial 1 finished with value: 0.13155257793113723 and parameters: {'learning_rate': 0.007971987247010362, 'num_leaves': 48, 'max_depth': 6, 'min_child_samples': 50, 'subsample': 0.8662871738266789, 'colsample_bytree': 0.6890013749560474, 'reg_alpha': 0.08401643203434972, 'reg_lambda': 0.22467587801049915}. Best is trial 0 with value: 0.12790838856918305.
[I 2025-04-15 21:58:12,497] Trial 2 finished with value: 0.13302949735514322 and parameters: {'learning_rate': 0.060949579

Best RMSE: 0.12505373625874902
Best hyperparameters:
{'learning_rate': 0.032433171433384124, 'num_leaves': 16, 'max_depth': 3, 'min_child_samples': 33, 'subsample': 0.9537683573812108, 'colsample_bytree': 0.693495406469989, 'reg_alpha': 0.002063068144913055, 'reg_lambda': 0.7137439599801031}


In [16]:
# 최적 파라미터에 기본 값 추가
best_params = study.best_params
best_params.update(
    {"objective": "regression", "n_estimators": 1000, "random_state": 42}
)

# 모델 학습
model = lgb.LGBMRegressor(**best_params)
model.fit(X_train, y_train)

# 예측
X_test = pd.read_csv("./data/X_test.csv")
y_pred_log = model.predict(X_test)
y_pred = np.expm1(y_pred_log)  # 로그 복원

In [17]:
test_origin = pd.read_csv("./data/test.csv")
submission = pd.DataFrame({"Id": test_origin["Id"], "SalePrice": y_pred})

submission.to_csv("./submission/lgbm_optuna_submission.csv", index=False)
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,119606.924077
1,1462,154401.429156
2,1463,185703.587944
3,1464,195583.10173
4,1465,185596.161177
