In [13]:
import pandas as pd
import numpy as np
import optuna 

from sklearn.metrics import mean_squared_error
import lightgbm as lgbm

from scripts.preprocess import preprocess

import warnings
warnings.filterwarnings('ignore')

## Data Preprocessing

In [11]:
X_train, X_val, y_train, y_val = preprocess(
    "data/Housing_dataset_train.csv",
    "data/Housing_dataset_test.csv",
    "state_to_region.json",
)

print(X_train.shape, X_val.shape)
print(y_train.shape, y_val.shape)

(10500, 7) (3500, 7)
(10500,) (3500,)


In [14]:
def objective(trial):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "n_estimators": 1000,
        "verbosity": -1,
        "bagging_freq": 1,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }

    model = lgbm.LGBMRegressor(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[lgbm.callback.early_stopping(stopping_rounds=100)],
        verbose=False,
    )

    preds = model.predict(X_val)
    rmse = mean_squared_error(y_val, preds, squared=False)

    return rmse


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

[I 2023-08-08 23:01:29,598] A new study created in memory with name: no-name-06449c2f-ef1f-4a79-b0b1-82b2510c784a


Training until validation scores don't improve for 100 rounds


[I 2023-08-08 23:01:30,944] Trial 0 finished with value: 853307.7085793996 and parameters: {'learning_rate': 0.001393985510556157, 'num_leaves': 416, 'subsample': 0.08448618483187993, 'colsample_bytree': 0.44060011208511013, 'min_data_in_leaf': 75}. Best is trial 0 with value: 853307.7085793996.


Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 2.382e+06
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 2.382e+06


[I 2023-08-08 23:01:35,498] Trial 1 finished with value: 665129.160374735 and parameters: {'learning_rate': 0.0038350743889421883, 'num_leaves': 347, 'subsample': 0.6738327145150824, 'colsample_bytree': 0.2441937291335825, 'min_data_in_leaf': 36}. Best is trial 1 with value: 665129.160374735.


Training until validation scores don't improve for 100 rounds


[I 2023-08-08 23:01:37,899] Trial 2 finished with value: 553814.9216273047 and parameters: {'learning_rate': 0.0293077981731368, 'num_leaves': 902, 'subsample': 0.20266707544440327, 'colsample_bytree': 0.12853705727890677, 'min_data_in_leaf': 45}. Best is trial 2 with value: 553814.9216273047.


Did not meet early stopping. Best iteration is:
[970]	valid_0's rmse: 2.382e+06
Training until validation scores don't improve for 100 rounds


[I 2023-08-08 23:01:39,597] Trial 3 finished with value: 540013.538552373 and parameters: {'learning_rate': 0.08028218624421499, 'num_leaves': 612, 'subsample': 0.1723567495230023, 'colsample_bytree': 0.7265436107343063, 'min_data_in_leaf': 47}. Best is trial 3 with value: 540013.538552373.


Early stopping, best iteration is:
[196]	valid_0's rmse: 2.382e+06
Training until validation scores don't improve for 100 rounds


[I 2023-08-08 23:01:42,901] Trial 4 finished with value: 705161.7697628671 and parameters: {'learning_rate': 0.0032038718871657524, 'num_leaves': 932, 'subsample': 0.5952971638929704, 'colsample_bytree': 0.27869306054916454, 'min_data_in_leaf': 65}. Best is trial 3 with value: 540013.538552373.


Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 2.382e+06
