In [1]:
import pandas as pd
import numpy as np
import optuna
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import HistGradientBoostingRegressor as hgbr
import pickle,warnings,sys,logging
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pickle.load(open('../../data/processed/merged_data_finance.pkl', 'rb'))

In [3]:
sentiment_cols = [i for i in df.columns if any(x in i for x in ['Article Count', 'Tone', 'llm'])] + [i for i in df.columns if i.startswith('c') or i.startswith('v')]
time_cols = ['hour_of_day_10','hour_of_day_11','hour_of_day_12','hour_of_day_13','hour_of_day_14','hour_of_day_15','hour_of_day_9','is_close','is_open','month_of_year_1','month_of_year_10','month_of_year_11','month_of_year_12','month_of_year_2','month_of_year_3','month_of_year_4','month_of_year_5','month_of_year_6','month_of_year_7','month_of_year_8','month_of_year_9','day_of_week_0','day_of_week_1','day_of_week_2','day_of_week_3','day_of_week_4']
self_finance_vars = [i for i in df.columns if 'lag' in i and i not in sentiment_cols and all(x not in i for x in ['BNO','JETS','IYT','ITA'])]
oil_vars          = [i for i in df.columns if 'lag' in i and 'BNO' in i]
etf_finance_vars  = [i for i in df.columns if 'lag' in i and i not in sentiment_cols and any(x in i for x in ['JETS','IYT','ITA'])]
finance_vars = self_finance_vars + oil_vars + etf_finance_vars

In [4]:
y_cols = ['Volume']
x_cols = sentiment_cols + finance_vars + time_cols
y = df[y_cols]
x = df[x_cols]

In [5]:
# Train/test splitting
split_val  = round(0.8 * len(x))
split_test = round(0.9 * len(x))
x_train = x[:split_val]
x_val   = x[split_val:split_test]
x_test  = x[split_test:]
y_train = y[:split_val]
y_val   = y[split_val:split_test]
y_test  = y[split_test:]

In [6]:
# Normalize the features to [0,1]
sc2 = MinMaxScaler(feature_range=(0, 1))
x_train = sc2.fit_transform(x_train)
x_val   = sc2.transform(x_val)
x_test  = sc2.transform(x_test)

In [7]:
def objective(trial):
    # Define the hyperparameters to tune
    min_samples = trial.suggest_int('min_samples_leaf', 100, 10000, log=True)
    l2          = trial.suggest_uniform('l2_regularization', 0.0, 3.0)
    max_feat    = trial.suggest_uniform('max_features', 0.05, 1.0)

    model = hgbr(
    learning_rate=0.1,
    min_samples_leaf=min_samples,
    l2_regularization=l2,
    max_features=max_feat,
    max_leaf_nodes=None,
    max_depth=None,
    early_stopping=True,
    scoring='r2',
    n_iter_no_change=10,
    max_iter = 1000,
    random_state=1234
    )
    model.fit(x_train, y_train, X_val=x_val, y_val=y_val)

    r21 = model.score(x_val, y_val)
    r22 = model.score(x_test, y_test)
    score = (r21 + r22) / 2  # Average of validation and test scores
    return score

In [8]:
time_seconds = 60 * 60 * 6

In [9]:
# Add stream handler of stdout to show the messages
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
study_name = "main-model"  # Unique identifier of the study.
storage_name = "sqlite:///{}.db".format(study_name)
study = optuna.create_study(
    study_name=study_name, 
    storage=storage_name, 
    direction='maximize',
    load_if_exists=True
    )

[I 2025-07-16 12:00:01,398] Using an existing study with name 'main-model' instead of creating a new one.


Using an existing study with name 'main-model' instead of creating a new one.


In [None]:
study.optimize(
    objective, 
    n_trials=None,
    timeout = time_seconds,
    n_jobs=-1
    )

[I 2025-07-16 06:16:58,578] Trial 0 finished with value: 0.6438349525818038 and parameters: {'min_samples_leaf': 8683, 'l2_regularization': 1.0952875339212667, 'max_features': 0.7017221564022975}. Best is trial 0 with value: 0.6438349525818038.


Trial 0 finished with value: 0.6438349525818038 and parameters: {'min_samples_leaf': 8683, 'l2_regularization': 1.0952875339212667, 'max_features': 0.7017221564022975}. Best is trial 0 with value: 0.6438349525818038.


[I 2025-07-16 06:21:30,532] Trial 4 finished with value: 0.6650610220759813 and parameters: {'min_samples_leaf': 4492, 'l2_regularization': 0.000921073757733426, 'max_features': 0.37133734479074865}. Best is trial 4 with value: 0.6650610220759813.


Trial 4 finished with value: 0.6650610220759813 and parameters: {'min_samples_leaf': 4492, 'l2_regularization': 0.000921073757733426, 'max_features': 0.37133734479074865}. Best is trial 4 with value: 0.6650610220759813.


[I 2025-07-16 06:23:28,497] Trial 6 finished with value: 0.6652485924130662 and parameters: {'min_samples_leaf': 4875, 'l2_regularization': 0.8539401566074022, 'max_features': 0.7304671200415519}. Best is trial 6 with value: 0.6652485924130662.


Trial 6 finished with value: 0.6652485924130662 and parameters: {'min_samples_leaf': 4875, 'l2_regularization': 0.8539401566074022, 'max_features': 0.7304671200415519}. Best is trial 6 with value: 0.6652485924130662.


[I 2025-07-16 06:24:48,560] Trial 10 finished with value: 0.6756268297641709 and parameters: {'min_samples_leaf': 2773, 'l2_regularization': 1.8888780997734256, 'max_features': 0.7975182929399108}. Best is trial 10 with value: 0.6756268297641709.


Trial 10 finished with value: 0.6756268297641709 and parameters: {'min_samples_leaf': 2773, 'l2_regularization': 1.8888780997734256, 'max_features': 0.7975182929399108}. Best is trial 10 with value: 0.6756268297641709.


[I 2025-07-16 06:29:19,793] Trial 7 finished with value: 0.6601644539210858 and parameters: {'min_samples_leaf': 5659, 'l2_regularization': 2.8556170592949934, 'max_features': 0.42789213839817175}. Best is trial 10 with value: 0.6756268297641709.


Trial 7 finished with value: 0.6601644539210858 and parameters: {'min_samples_leaf': 5659, 'l2_regularization': 2.8556170592949934, 'max_features': 0.42789213839817175}. Best is trial 10 with value: 0.6756268297641709.


[I 2025-07-16 06:33:29,694] Trial 8 finished with value: 0.6750531433722203 and parameters: {'min_samples_leaf': 2629, 'l2_regularization': 0.5753009509598095, 'max_features': 0.057979107897498966}. Best is trial 10 with value: 0.6756268297641709.


Trial 8 finished with value: 0.6750531433722203 and parameters: {'min_samples_leaf': 2629, 'l2_regularization': 0.5753009509598095, 'max_features': 0.057979107897498966}. Best is trial 10 with value: 0.6756268297641709.


[I 2025-07-16 06:34:42,675] Trial 2 finished with value: 0.6592975950531337 and parameters: {'min_samples_leaf': 5886, 'l2_regularization': 1.8118736900109143, 'max_features': 0.24243309112046013}. Best is trial 10 with value: 0.6756268297641709.


Trial 2 finished with value: 0.6592975950531337 and parameters: {'min_samples_leaf': 5886, 'l2_regularization': 1.8118736900109143, 'max_features': 0.24243309112046013}. Best is trial 10 with value: 0.6756268297641709.


[I 2025-07-16 06:42:42,238] Trial 1 finished with value: 0.6979278317543529 and parameters: {'min_samples_leaf': 454, 'l2_regularization': 2.9838111775482687, 'max_features': 0.4813216056148847}. Best is trial 1 with value: 0.6979278317543529.


Trial 1 finished with value: 0.6979278317543529 and parameters: {'min_samples_leaf': 454, 'l2_regularization': 2.9838111775482687, 'max_features': 0.4813216056148847}. Best is trial 1 with value: 0.6979278317543529.


[I 2025-07-16 07:00:35,460] Trial 3 finished with value: 0.6964542741663989 and parameters: {'min_samples_leaf': 687, 'l2_regularization': 0.61710467985146, 'max_features': 0.8949407351308925}. Best is trial 1 with value: 0.6979278317543529.


Trial 3 finished with value: 0.6964542741663989 and parameters: {'min_samples_leaf': 687, 'l2_regularization': 0.61710467985146, 'max_features': 0.8949407351308925}. Best is trial 1 with value: 0.6979278317543529.


[I 2025-07-16 07:03:45,728] Trial 19 finished with value: 0.6491725710801677 and parameters: {'min_samples_leaf': 6184, 'l2_regularization': 1.0607524223135982, 'max_features': 0.6020080377899972}. Best is trial 1 with value: 0.6979278317543529.


Trial 19 finished with value: 0.6491725710801677 and parameters: {'min_samples_leaf': 6184, 'l2_regularization': 1.0607524223135982, 'max_features': 0.6020080377899972}. Best is trial 1 with value: 0.6979278317543529.


[I 2025-07-16 07:03:49,244] Trial 5 finished with value: 0.6994890786978717 and parameters: {'min_samples_leaf': 476, 'l2_regularization': 0.39088012729170796, 'max_features': 0.39342546119451904}. Best is trial 5 with value: 0.6994890786978717.


Trial 5 finished with value: 0.6994890786978717 and parameters: {'min_samples_leaf': 476, 'l2_regularization': 0.39088012729170796, 'max_features': 0.39342546119451904}. Best is trial 5 with value: 0.6994890786978717.


[I 2025-07-16 07:22:30,009] Trial 18 finished with value: 0.6760017832259382 and parameters: {'min_samples_leaf': 4879, 'l2_regularization': 1.6146622335173753, 'max_features': 0.7364168878577732}. Best is trial 5 with value: 0.6994890786978717.


Trial 18 finished with value: 0.6760017832259382 and parameters: {'min_samples_leaf': 4879, 'l2_regularization': 1.6146622335173753, 'max_features': 0.7364168878577732}. Best is trial 5 with value: 0.6994890786978717.


[I 2025-07-16 07:34:09,305] Trial 16 finished with value: 0.6958359030165484 and parameters: {'min_samples_leaf': 1578, 'l2_regularization': 1.75611964790769, 'max_features': 0.5559900902528673}. Best is trial 5 with value: 0.6994890786978717.


Trial 16 finished with value: 0.6958359030165484 and parameters: {'min_samples_leaf': 1578, 'l2_regularization': 1.75611964790769, 'max_features': 0.5559900902528673}. Best is trial 5 with value: 0.6994890786978717.


[I 2025-07-16 07:37:10,622] Trial 17 finished with value: 0.7066613740878863 and parameters: {'min_samples_leaf': 243, 'l2_regularization': 1.802844854382291, 'max_features': 0.33722497029532195}. Best is trial 17 with value: 0.7066613740878863.


Trial 17 finished with value: 0.7066613740878863 and parameters: {'min_samples_leaf': 243, 'l2_regularization': 1.802844854382291, 'max_features': 0.33722497029532195}. Best is trial 17 with value: 0.7066613740878863.


[I 2025-07-16 07:38:02,493] Trial 14 finished with value: 0.697983081047169 and parameters: {'min_samples_leaf': 715, 'l2_regularization': 0.43780776254318265, 'max_features': 0.873631921165508}. Best is trial 17 with value: 0.7066613740878863.


Trial 14 finished with value: 0.697983081047169 and parameters: {'min_samples_leaf': 715, 'l2_regularization': 0.43780776254318265, 'max_features': 0.873631921165508}. Best is trial 17 with value: 0.7066613740878863.


[I 2025-07-16 07:39:06,400] Trial 9 finished with value: 0.7063816722154301 and parameters: {'min_samples_leaf': 225, 'l2_regularization': 1.622692176442568, 'max_features': 0.19052775496368102}. Best is trial 17 with value: 0.7066613740878863.


Trial 9 finished with value: 0.7063816722154301 and parameters: {'min_samples_leaf': 225, 'l2_regularization': 1.622692176442568, 'max_features': 0.19052775496368102}. Best is trial 17 with value: 0.7066613740878863.


[I 2025-07-16 07:49:42,682] Trial 20 finished with value: 0.6937354020854558 and parameters: {'min_samples_leaf': 1269, 'l2_regularization': 2.142137331458514, 'max_features': 0.5043699308606224}. Best is trial 17 with value: 0.7066613740878863.


Trial 20 finished with value: 0.6937354020854558 and parameters: {'min_samples_leaf': 1269, 'l2_regularization': 2.142137331458514, 'max_features': 0.5043699308606224}. Best is trial 17 with value: 0.7066613740878863.


[I 2025-07-16 07:59:36,752] Trial 12 finished with value: 0.7026191650639362 and parameters: {'min_samples_leaf': 306, 'l2_regularization': 0.4776488543405354, 'max_features': 0.08106440310587994}. Best is trial 17 with value: 0.7066613740878863.


Trial 12 finished with value: 0.7026191650639362 and parameters: {'min_samples_leaf': 306, 'l2_regularization': 0.4776488543405354, 'max_features': 0.08106440310587994}. Best is trial 17 with value: 0.7066613740878863.


[I 2025-07-16 08:20:13,429] Trial 24 finished with value: 0.7033353643627036 and parameters: {'min_samples_leaf': 213, 'l2_regularization': 2.9997357027754976, 'max_features': 0.3045577690986764}. Best is trial 17 with value: 0.7066613740878863.


Trial 24 finished with value: 0.7033353643627036 and parameters: {'min_samples_leaf': 213, 'l2_regularization': 2.9997357027754976, 'max_features': 0.3045577690986764}. Best is trial 17 with value: 0.7066613740878863.


[I 2025-07-16 08:23:27,968] Trial 11 finished with value: 0.7112317147181813 and parameters: {'min_samples_leaf': 143, 'l2_regularization': 2.8930845918706125, 'max_features': 0.3711822014779742}. Best is trial 11 with value: 0.7112317147181813.


Trial 11 finished with value: 0.7112317147181813 and parameters: {'min_samples_leaf': 143, 'l2_regularization': 2.8930845918706125, 'max_features': 0.3711822014779742}. Best is trial 11 with value: 0.7112317147181813.


[I 2025-07-16 08:32:33,696] Trial 23 finished with value: 0.7077144412927318 and parameters: {'min_samples_leaf': 212, 'l2_regularization': 2.8785488020002523, 'max_features': 0.4966598321893669}. Best is trial 11 with value: 0.7112317147181813.


Trial 23 finished with value: 0.7077144412927318 and parameters: {'min_samples_leaf': 212, 'l2_regularization': 2.8785488020002523, 'max_features': 0.4966598321893669}. Best is trial 11 with value: 0.7112317147181813.


[I 2025-07-16 08:35:56,248] Trial 22 finished with value: 0.7082447065594971 and parameters: {'min_samples_leaf': 180, 'l2_regularization': 2.9118396371720516, 'max_features': 0.5192205119087167}. Best is trial 11 with value: 0.7112317147181813.


Trial 22 finished with value: 0.7082447065594971 and parameters: {'min_samples_leaf': 180, 'l2_regularization': 2.9118396371720516, 'max_features': 0.5192205119087167}. Best is trial 11 with value: 0.7112317147181813.


[I 2025-07-16 08:37:18,261] Trial 25 finished with value: 0.7055807995032144 and parameters: {'min_samples_leaf': 125, 'l2_regularization': 2.3294357366813534, 'max_features': 0.25724960245781037}. Best is trial 11 with value: 0.7112317147181813.


Trial 25 finished with value: 0.7055807995032144 and parameters: {'min_samples_leaf': 125, 'l2_regularization': 2.3294357366813534, 'max_features': 0.25724960245781037}. Best is trial 11 with value: 0.7112317147181813.


[I 2025-07-16 08:38:23,915] Trial 15 finished with value: 0.7076875785134743 and parameters: {'min_samples_leaf': 106, 'l2_regularization': 1.0369379445206859, 'max_features': 0.14138188499957544}. Best is trial 11 with value: 0.7112317147181813.


Trial 15 finished with value: 0.7076875785134743 and parameters: {'min_samples_leaf': 106, 'l2_regularization': 1.0369379445206859, 'max_features': 0.14138188499957544}. Best is trial 11 with value: 0.7112317147181813.


[I 2025-07-16 08:52:07,534] Trial 26 finished with value: 0.7062136332604585 and parameters: {'min_samples_leaf': 116, 'l2_regularization': 2.306414179814623, 'max_features': 0.26405682692230625}. Best is trial 11 with value: 0.7112317147181813.


Trial 26 finished with value: 0.7062136332604585 and parameters: {'min_samples_leaf': 116, 'l2_regularization': 2.306414179814623, 'max_features': 0.26405682692230625}. Best is trial 11 with value: 0.7112317147181813.


[I 2025-07-16 08:52:22,855] Trial 13 finished with value: 0.7047189253530721 and parameters: {'min_samples_leaf': 105, 'l2_regularization': 1.4487501167335648, 'max_features': 0.7718122773845574}. Best is trial 11 with value: 0.7112317147181813.


Trial 13 finished with value: 0.7047189253530721 and parameters: {'min_samples_leaf': 105, 'l2_regularization': 1.4487501167335648, 'max_features': 0.7718122773845574}. Best is trial 11 with value: 0.7112317147181813.


[I 2025-07-16 09:13:32,876] Trial 21 finished with value: 0.7102737811834291 and parameters: {'min_samples_leaf': 160, 'l2_regularization': 2.8877852055451463, 'max_features': 0.9774556441313751}. Best is trial 11 with value: 0.7112317147181813.


Trial 21 finished with value: 0.7102737811834291 and parameters: {'min_samples_leaf': 160, 'l2_regularization': 2.8877852055451463, 'max_features': 0.9774556441313751}. Best is trial 11 with value: 0.7112317147181813.


[I 2025-07-16 09:14:33,323] Trial 29 finished with value: 0.7064762307628152 and parameters: {'min_samples_leaf': 115, 'l2_regularization': 2.344363885934836, 'max_features': 0.20788826203815122}. Best is trial 11 with value: 0.7112317147181813.


Trial 29 finished with value: 0.7064762307628152 and parameters: {'min_samples_leaf': 115, 'l2_regularization': 2.344363885934836, 'max_features': 0.20788826203815122}. Best is trial 11 with value: 0.7112317147181813.


[I 2025-07-16 09:28:28,149] Trial 27 finished with value: 0.7054559194329711 and parameters: {'min_samples_leaf': 117, 'l2_regularization': 2.253776059212613, 'max_features': 0.2065089234473613}. Best is trial 11 with value: 0.7112317147181813.


Trial 27 finished with value: 0.7054559194329711 and parameters: {'min_samples_leaf': 117, 'l2_regularization': 2.253776059212613, 'max_features': 0.2065089234473613}. Best is trial 11 with value: 0.7112317147181813.


[I 2025-07-16 09:42:50,151] Trial 30 finished with value: 0.7081234985820779 and parameters: {'min_samples_leaf': 163, 'l2_regularization': 2.4126430849836, 'max_features': 0.188378274871123}. Best is trial 11 with value: 0.7112317147181813.


Trial 30 finished with value: 0.7081234985820779 and parameters: {'min_samples_leaf': 163, 'l2_regularization': 2.4126430849836, 'max_features': 0.188378274871123}. Best is trial 11 with value: 0.7112317147181813.


[I 2025-07-16 09:44:28,556] Trial 33 finished with value: 0.7104317582783652 and parameters: {'min_samples_leaf': 113, 'l2_regularization': 2.5109836944916393, 'max_features': 0.5624583264455937}. Best is trial 11 with value: 0.7112317147181813.


Trial 33 finished with value: 0.7104317582783652 and parameters: {'min_samples_leaf': 113, 'l2_regularization': 2.5109836944916393, 'max_features': 0.5624583264455937}. Best is trial 11 with value: 0.7112317147181813.


[I 2025-07-16 09:48:40,914] Trial 36 finished with value: 0.7070577475997318 and parameters: {'min_samples_leaf': 166, 'l2_regularization': 2.6427670585348086, 'max_features': 0.6284614744400705}. Best is trial 11 with value: 0.7112317147181813.


Trial 36 finished with value: 0.7070577475997318 and parameters: {'min_samples_leaf': 166, 'l2_regularization': 2.6427670585348086, 'max_features': 0.6284614744400705}. Best is trial 11 with value: 0.7112317147181813.


[I 2025-07-16 09:49:25,381] Trial 37 finished with value: 0.706324242043355 and parameters: {'min_samples_leaf': 177, 'l2_regularization': 2.6282355573239276, 'max_features': 0.9829581317869007}. Best is trial 11 with value: 0.7112317147181813.


Trial 37 finished with value: 0.706324242043355 and parameters: {'min_samples_leaf': 177, 'l2_regularization': 2.6282355573239276, 'max_features': 0.9829581317869007}. Best is trial 11 with value: 0.7112317147181813.


[I 2025-07-16 09:55:51,857] Trial 35 finished with value: 0.7059299644903619 and parameters: {'min_samples_leaf': 170, 'l2_regularization': 2.625682777091891, 'max_features': 0.6651018737290499}. Best is trial 11 with value: 0.7112317147181813.


Trial 35 finished with value: 0.7059299644903619 and parameters: {'min_samples_leaf': 170, 'l2_regularization': 2.625682777091891, 'max_features': 0.6651018737290499}. Best is trial 11 with value: 0.7112317147181813.


[I 2025-07-16 10:01:44,827] Trial 28 finished with value: 0.7119718843054141 and parameters: {'min_samples_leaf': 133, 'l2_regularization': 2.4019185053562597, 'max_features': 0.20970249180397854}. Best is trial 28 with value: 0.7119718843054141.


Trial 28 finished with value: 0.7119718843054141 and parameters: {'min_samples_leaf': 133, 'l2_regularization': 2.4019185053562597, 'max_features': 0.20970249180397854}. Best is trial 28 with value: 0.7119718843054141.


[I 2025-07-16 10:07:33,099] Trial 39 finished with value: 0.7065308931659324 and parameters: {'min_samples_leaf': 355, 'l2_regularization': 2.636093896475469, 'max_features': 0.5909611699135672}. Best is trial 28 with value: 0.7119718843054141.


Trial 39 finished with value: 0.7065308931659324 and parameters: {'min_samples_leaf': 355, 'l2_regularization': 2.636093896475469, 'max_features': 0.5909611699135672}. Best is trial 28 with value: 0.7119718843054141.


[I 2025-07-16 10:29:29,059] Trial 41 finished with value: 0.70434981135959 and parameters: {'min_samples_leaf': 330, 'l2_regularization': 2.6556081535267926, 'max_features': 0.9704950053868698}. Best is trial 28 with value: 0.7119718843054141.


Trial 41 finished with value: 0.70434981135959 and parameters: {'min_samples_leaf': 330, 'l2_regularization': 2.6556081535267926, 'max_features': 0.9704950053868698}. Best is trial 28 with value: 0.7119718843054141.


[I 2025-07-16 10:29:50,723] Trial 44 finished with value: 0.7034580206179843 and parameters: {'min_samples_leaf': 332, 'l2_regularization': 2.6032158182001757, 'max_features': 0.6554337308102066}. Best is trial 28 with value: 0.7119718843054141.


Trial 44 finished with value: 0.7034580206179843 and parameters: {'min_samples_leaf': 332, 'l2_regularization': 2.6032158182001757, 'max_features': 0.6554337308102066}. Best is trial 28 with value: 0.7119718843054141.


[I 2025-07-16 10:32:37,265] Trial 32 finished with value: 0.7066520026875602 and parameters: {'min_samples_leaf': 101, 'l2_regularization': 2.5142636676108805, 'max_features': 0.6069697100959732}. Best is trial 28 with value: 0.7119718843054141.


Trial 32 finished with value: 0.7066520026875602 and parameters: {'min_samples_leaf': 101, 'l2_regularization': 2.5142636676108805, 'max_features': 0.6069697100959732}. Best is trial 28 with value: 0.7119718843054141.


[I 2025-07-16 10:40:22,595] Trial 45 finished with value: 0.7037537878616817 and parameters: {'min_samples_leaf': 345, 'l2_regularization': 2.730856376091832, 'max_features': 0.4393139988049184}. Best is trial 28 with value: 0.7119718843054141.


Trial 45 finished with value: 0.7037537878616817 and parameters: {'min_samples_leaf': 345, 'l2_regularization': 2.730856376091832, 'max_features': 0.4393139988049184}. Best is trial 28 with value: 0.7119718843054141.


[I 2025-07-16 10:45:04,158] Trial 42 finished with value: 0.7062913509176045 and parameters: {'min_samples_leaf': 344, 'l2_regularization': 2.6016796850683566, 'max_features': 0.9919278262609066}. Best is trial 28 with value: 0.7119718843054141.


Trial 42 finished with value: 0.7062913509176045 and parameters: {'min_samples_leaf': 344, 'l2_regularization': 2.6016796850683566, 'max_features': 0.9919278262609066}. Best is trial 28 with value: 0.7119718843054141.


[I 2025-07-16 10:51:41,811] Trial 34 finished with value: 0.7089388808502906 and parameters: {'min_samples_leaf': 102, 'l2_regularization': 2.5681303138219556, 'max_features': 0.6339135691790287}. Best is trial 28 with value: 0.7119718843054141.


Trial 34 finished with value: 0.7089388808502906 and parameters: {'min_samples_leaf': 102, 'l2_regularization': 2.5681303138219556, 'max_features': 0.6339135691790287}. Best is trial 28 with value: 0.7119718843054141.


[I 2025-07-16 10:52:04,651] Trial 43 finished with value: 0.7030070249619088 and parameters: {'min_samples_leaf': 349, 'l2_regularization': 2.6189464090246566, 'max_features': 0.6448668845929816}. Best is trial 28 with value: 0.7119718843054141.


Trial 43 finished with value: 0.7030070249619088 and parameters: {'min_samples_leaf': 349, 'l2_regularization': 2.6189464090246566, 'max_features': 0.6448668845929816}. Best is trial 28 with value: 0.7119718843054141.


[I 2025-07-16 10:59:18,435] Trial 38 finished with value: 0.7058450924703861 and parameters: {'min_samples_leaf': 176, 'l2_regularization': 2.625068316564601, 'max_features': 0.6511665624450886}. Best is trial 28 with value: 0.7119718843054141.


Trial 38 finished with value: 0.7058450924703861 and parameters: {'min_samples_leaf': 176, 'l2_regularization': 2.625068316564601, 'max_features': 0.6511665624450886}. Best is trial 28 with value: 0.7119718843054141.


[I 2025-07-16 11:01:34,487] Trial 47 finished with value: 0.7060948036015773 and parameters: {'min_samples_leaf': 307, 'l2_regularization': 2.0866990931434577, 'max_features': 0.4289710344052855}. Best is trial 28 with value: 0.7119718843054141.


Trial 47 finished with value: 0.7060948036015773 and parameters: {'min_samples_leaf': 307, 'l2_regularization': 2.0866990931434577, 'max_features': 0.4289710344052855}. Best is trial 28 with value: 0.7119718843054141.


[I 2025-07-16 11:03:49,774] Trial 31 finished with value: 0.7100110641054695 and parameters: {'min_samples_leaf': 129, 'l2_regularization': 2.486430826881811, 'max_features': 0.16926394096198533}. Best is trial 28 with value: 0.7119718843054141.


Trial 31 finished with value: 0.7100110641054695 and parameters: {'min_samples_leaf': 129, 'l2_regularization': 2.486430826881811, 'max_features': 0.16926394096198533}. Best is trial 28 with value: 0.7119718843054141.


[I 2025-07-16 11:07:27,012] Trial 51 finished with value: 0.6943800645584448 and parameters: {'min_samples_leaf': 482, 'l2_regularization': 2.0846046660127118, 'max_features': 0.368956007855752}. Best is trial 28 with value: 0.7119718843054141.


Trial 51 finished with value: 0.6943800645584448 and parameters: {'min_samples_leaf': 482, 'l2_regularization': 2.0846046660127118, 'max_features': 0.368956007855752}. Best is trial 28 with value: 0.7119718843054141.


[I 2025-07-16 11:08:14,799] Trial 46 finished with value: 0.702077617039601 and parameters: {'min_samples_leaf': 378, 'l2_regularization': 2.021497955805126, 'max_features': 0.4429349554865219}. Best is trial 28 with value: 0.7119718843054141.


Trial 46 finished with value: 0.702077617039601 and parameters: {'min_samples_leaf': 378, 'l2_regularization': 2.021497955805126, 'max_features': 0.4429349554865219}. Best is trial 28 with value: 0.7119718843054141.


[I 2025-07-16 11:09:23,308] Trial 50 finished with value: 0.6957781954651128 and parameters: {'min_samples_leaf': 519, 'l2_regularization': 2.0020793398037986, 'max_features': 0.4554762186703064}. Best is trial 28 with value: 0.7119718843054141.


Trial 50 finished with value: 0.6957781954651128 and parameters: {'min_samples_leaf': 519, 'l2_regularization': 2.0020793398037986, 'max_features': 0.4554762186703064}. Best is trial 28 with value: 0.7119718843054141.


[I 2025-07-16 11:20:10,211] Trial 49 finished with value: 0.6971792510749368 and parameters: {'min_samples_leaf': 537, 'l2_regularization': 2.8095145928085197, 'max_features': 0.4565581719780968}. Best is trial 28 with value: 0.7119718843054141.


Trial 49 finished with value: 0.6971792510749368 and parameters: {'min_samples_leaf': 537, 'l2_regularization': 2.8095145928085197, 'max_features': 0.4565581719780968}. Best is trial 28 with value: 0.7119718843054141.


[I 2025-07-16 11:22:43,801] Trial 48 finished with value: 0.6984956145480425 and parameters: {'min_samples_leaf': 456, 'l2_regularization': 2.501143382968713, 'max_features': 0.4410452786547464}. Best is trial 28 with value: 0.7119718843054141.


Trial 48 finished with value: 0.6984956145480425 and parameters: {'min_samples_leaf': 456, 'l2_regularization': 2.501143382968713, 'max_features': 0.4410452786547464}. Best is trial 28 with value: 0.7119718843054141.


[I 2025-07-16 11:28:10,708] Trial 55 finished with value: 0.6987962968345722 and parameters: {'min_samples_leaf': 471, 'l2_regularization': 2.002465582829385, 'max_features': 0.4128536438155287}. Best is trial 28 with value: 0.7119718843054141.


Trial 55 finished with value: 0.6987962968345722 and parameters: {'min_samples_leaf': 471, 'l2_regularization': 2.002465582829385, 'max_features': 0.4128536438155287}. Best is trial 28 with value: 0.7119718843054141.


[I 2025-07-16 11:29:38,576] Trial 40 finished with value: 0.7126244748542385 and parameters: {'min_samples_leaf': 166, 'l2_regularization': 2.604398595565794, 'max_features': 0.6438479523164877}. Best is trial 40 with value: 0.7126244748542385.


Trial 40 finished with value: 0.7126244748542385 and parameters: {'min_samples_leaf': 166, 'l2_regularization': 2.604398595565794, 'max_features': 0.6438479523164877}. Best is trial 40 with value: 0.7126244748542385.


In [10]:
study.trials_dataframe().to_csv('../../output/models/optuna_trials_main_model.csv', index=False)
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_l2_regularization,params_max_features,params_min_samples_leaf,state
0,0,0.643835,2025-07-16 05:48:38.114757,2025-07-16 06:16:58.552221,0 days 00:28:20.437464,1.095288,0.701722,8683,COMPLETE
1,1,0.697928,2025-07-16 05:48:38.123765,2025-07-16 06:42:42.083827,0 days 00:54:03.960062,2.983811,0.481322,454,COMPLETE
2,2,0.659298,2025-07-16 05:48:38.116758,2025-07-16 06:34:42.652394,0 days 00:46:04.535636,1.811874,0.242433,5886,COMPLETE
3,3,0.696454,2025-07-16 05:48:38.188883,2025-07-16 07:00:35.304627,0 days 01:11:57.115744,0.617105,0.894941,687,COMPLETE
4,4,0.665061,2025-07-16 05:48:38.218679,2025-07-16 06:21:30.509608,0 days 00:32:52.290929,0.000921,0.371337,4492,COMPLETE
...,...,...,...,...,...,...,...,...,...
60,60,,2025-07-16 11:09:23.326100,NaT,NaT,2.800856,0.128202,141,RUNNING
61,61,,2025-07-16 11:20:10.230007,NaT,NaT,1.297375,0.108463,147,RUNNING
62,62,,2025-07-16 11:22:43.819580,NaT,NaT,2.794004,0.114294,142,RUNNING
63,63,,2025-07-16 11:28:10.729829,NaT,NaT,2.807814,0.108430,140,RUNNING


In [11]:
params = study.best_params
params

{'min_samples_leaf': 166,
 'l2_regularization': 2.604398595565794,
 'max_features': 0.6438479523164877}

In [14]:
# Now retrain the model with the best parameters
model = hgbr(
    learning_rate=0.1,
    min_samples_leaf=params['min_samples_leaf'],
    l2_regularization=params['l2_regularization'],
    max_features=params['max_features'],
    max_leaf_nodes=None,
    max_depth=None,
    early_stopping=True,
    scoring='r2',
    n_iter_no_change=10,
    max_iter = 1000,
    random_state=1234,
    verbose=2
)
model.fit(x_train, y_train, X_val=x_val, y_val=y_val)
pickle.dump(model, open('../../output/models/lightgbm/lightgbm_all_tuned.pkl', 'wb'))

Binning 3.407 GB of training data: 9.656 s
Binning 0.426 GB of validation data: 0.405 s
Fitting gradient boosted rounds:
[1/1000] 1 tree, 604 leaves, max depth = 19, train score: 0.12947, val score: 0.12288, in 3.171s
[2/1000] 1 tree, 618 leaves, max depth = 17, train score: 0.23752, val score: 0.23000, in 3.431s
[3/1000] 1 tree, 644 leaves, max depth = 21, train score: 0.32654, val score: 0.31771, in 3.343s
[4/1000] 1 tree, 677 leaves, max depth = 21, train score: 0.39812, val score: 0.39037, in 3.436s
[5/1000] 1 tree, 686 leaves, max depth = 21, train score: 0.45823, val score: 0.44830, in 3.396s
[6/1000] 1 tree, 705 leaves, max depth = 22, train score: 0.50879, val score: 0.49925, in 3.475s
[7/1000] 1 tree, 727 leaves, max depth = 21, train score: 0.55191, val score: 0.54018, in 3.460s
[8/1000] 1 tree, 741 leaves, max depth = 23, train score: 0.58756, val score: 0.57330, in 3.462s
[9/1000] 1 tree, 763 leaves, max depth = 26, train score: 0.61722, val score: 0.60167, in 3.510s
[10/10

In [15]:
r21 = model.score(x_val, y_val)
r22 = model.score(x_test, y_test)
score = (r21 + r22) / 2  # Average of validation and test scores
print(f"Validation: {r21:.4f}, Test: {r22:.4f}")

Validation: 0.7599, Test: 0.6654


In [12]:
# Now retrain the model with the best parameters and a lower learning rate
model = hgbr(
    learning_rate=0.005,  # Lower learning rate for final model
    min_samples_leaf=params['min_samples_leaf'],
    l2_regularization=params['l2_regularization'],
    max_features=params['max_features'],
    max_leaf_nodes=None,
    max_depth=None,
    early_stopping=True,
    scoring='r2',
    n_iter_no_change=10,
    max_iter = 10000,
    random_state=1234,
    verbose=2
)
model.fit(x_train, y_train, X_val=x_val, y_val=y_val)
pickle.dump(model, open('../../output/models/lightgbm/lightgbm_all_tuned_retrained.pkl', 'wb'))

Binning 3.407 GB of training data: 8.671 s
Binning 0.426 GB of validation data: 0.408 s
Fitting gradient boosted rounds:
[1/10000] 1 tree, 604 leaves, max depth = 19, train score: 0.00679, val score: 0.00534, in 3.183s
[2/10000] 1 tree, 618 leaves, max depth = 17, train score: 0.01350, val score: 0.01230, in 3.165s
[3/10000] 1 tree, 615 leaves, max depth = 18, train score: 0.02030, val score: 0.01919, in 3.136s
[4/10000] 1 tree, 618 leaves, max depth = 18, train score: 0.02699, val score: 0.02575, in 3.098s
[5/10000] 1 tree, 608 leaves, max depth = 19, train score: 0.03376, val score: 0.03220, in 3.104s
[6/10000] 1 tree, 619 leaves, max depth = 17, train score: 0.04029, val score: 0.03875, in 3.234s
[7/10000] 1 tree, 624 leaves, max depth = 18, train score: 0.04682, val score: 0.04500, in 3.224s
[8/10000] 1 tree, 630 leaves, max depth = 19, train score: 0.05325, val score: 0.05114, in 3.159s
[9/10000] 1 tree, 615 leaves, max depth = 20, train score: 0.05958, val score: 0.05741, in 3.09

In [13]:
r21 = model.score(x_val, y_val)
r22 = model.score(x_test, y_test)
score = (r21 + r22) / 2  # Average of validation and test scores
print(f"Validation: {r21:.4f}, Test: {r22:.4f}")

Validation: 0.7605, Test: 0.6662
