In [1]:
import optuna
import json
import gc
import numpy as np
import pandas as pd
from feature_settings_ import *
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import root_mean_squared_error


In [2]:
# ✅ 1. Load your data
# The dataset is read into a pandas DataFrame. The prediction target is the
# 'option_return' column; every other column is used as a feature.
data_path = r"C:\Users\55479\PycharmProjects\MS_thesis\datasets\whole_df_all_features_simple_option_return.csv"
target_name = 'option_return'

df = pd.read_csv(data_path) # open the time series data

# Select the target by name and build the features by dropping that same
# column, so the target can never leak into X even if the column order of the
# CSV changes (the previous positional slice assumed the target was last).
y = df[target_name]                 # your target
X = df.drop(columns=[target_name])  # your features

print(f"--- Your prediction target is [{y.name}] ---") # print the target to check

# ✅ 2. Split into train and test sets (e.g., 80/20)
# The last 20% of rows are held out as the test set; the first 80% are used
# for hyperparameter tuning with cross validation.
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False  # shuffle=False to preserve time order
)


--- Your prediction target is [option_return] ---


In [3]:
# ✅ 3. Set is_tuning_done switch
# While this flag is False, the tuning cell runs the Optuna study and writes
# the best parameters to JSON; set it to True to skip that step.
is_tuning_done = False

In [4]:
# ✅ 4. Tune hyperparameters
def _make_scaler(name):
    """Return a fresh, unfitted scaler for an Optuna choice name, or None for "none".

    Parameters
    ----------
    name : str
        One of "standard", "minmax", "maxabs", "robust" or "none".

    Returns
    -------
    A new scaler instance, or None when no scaling is requested.

    Raises
    ------
    KeyError
        If `name` is not one of the supported choices.
    """
    if name == "none":
        return None
    scaler_classes = {
        "standard": StandardScaler,
        "minmax": MinMaxScaler,
        "maxabs": MaxAbsScaler,
        "robust": RobustScaler,
    }
    return scaler_classes[name]()


def objective(trial):
    """Optuna objective: mean validation RMSE of a Lasso model over time-ordered CV folds.

    For the given trial this samples a feature scaler, a target scaler and the
    Lasso hyperparameters, then evaluates them with a 5-split `TimeSeriesSplit`
    over `X_full_train` / `y_full_train`. Within each fold the scalers are fit
    on the training part only, and predictions are transformed back to the
    original target scale before the RMSE is computed.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the search space.

    Returns
    -------
    float
        Average RMSE across the cross-validation folds (lower is better).
    """
    # --------- Choose scaler for data ------------
    # Suggestion names and choices must stay stable so trials remain
    # comparable with those already stored in the study.
    scaler_choices = ["standard", "minmax", "maxabs", "robust", "none"]
    scaler = _make_scaler(trial.suggest_categorical("scaler", scaler_choices))
    target_scaler = _make_scaler(trial.suggest_categorical("target_scaler", scaler_choices))

    # --------- Tune Lasso Hyperparameters ----------
    parameters = {
        # Most important: regularization strength
        "alpha": trial.suggest_float("alpha", 1e-6, 10.0, log=True),

        # Important: algorithm behavior
        "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
        "positive": trial.suggest_categorical("positive", [True, False]),
        "selection": trial.suggest_categorical("selection", ["cyclic", "random"]),

        # Convergence parameters (reasonable ranges)
        "max_iter": trial.suggest_int("max_iter", 1000, 5000, step=500),
        "tol": trial.suggest_float("tol", 1e-6, 1e-3, log=True),

        # Fixed parameters (don't tune these)
        "random_state": 42,
        "copy_X": True,        # Always True for safety
        "warm_start": False,   # Not useful in CV context
        "precompute": False    # Do not use a precomputed Gram matrix
    }

    # --------- Time Series Cross Validation ----------
    tscv = TimeSeriesSplit(n_splits=5)
    rmses = []

    for train_idx, val_idx in tscv.split(X_full_train):
        X_train, X_val = X_full_train.iloc[train_idx], X_full_train.iloc[val_idx]
        y_train, y_val = y_full_train.iloc[train_idx], y_full_train.iloc[val_idx]

        # --------- Apply scaling if needed ----------
        # Scalers are fit on the training fold only and then applied to the
        # validation fold, so no validation statistics leak into training.
        if scaler is not None:
            ct = ColumnTransformer(
                transformers=[('scale', scaler, FEATURES_WHOLE_SCALED)],
                remainder='passthrough'  # leave all other columns untouched
            )
            X_train_scaled = ct.fit_transform(X_train)
            X_val_scaled = ct.transform(X_val)
        else:
            X_train_scaled = X_train
            X_val_scaled = X_val

        if target_scaler is not None:
            # Scalers expect 2-D input; flatten back to 1-D for the model.
            y_train_scaled = target_scaler.fit_transform(y_train.to_numpy().reshape(-1, 1)).ravel()
        else:
            y_train_scaled = y_train

        # --------- Train model ----------
        model = Lasso(**parameters)
        model.fit(X_train_scaled, y_train_scaled)

        # --------- Predict & evaluate ----------
        y_pred_scaled = model.predict(X_val_scaled)

        if target_scaler is not None:
            # Undo the target scaling so the RMSE is in original target units
            # and comparable across trials with different target scalers.
            y_pred = target_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
        else:
            y_pred = y_pred_scaled

        rmse = root_mean_squared_error(y_val, y_pred)
        rmses.append(rmse)

        # Drop per-fold objects before the next fold to limit peak memory.
        del model, X_train_scaled, X_val_scaled, y_train_scaled, y_pred_scaled, y_pred

    gc.collect()

    return np.mean(rmses)  # return average cross-validated rmse

# --------- Set the switch ----------
# The whole tuning run is skipped once `is_tuning_done` is True.
if not is_tuning_done:

    # --------- Run Optuna Study ----------
    # NOTE(review): the sampler is seeded, but the study runs with n_jobs=3 and
    # resumes an existing study (load_if_exists=True); Optuna presumably cannot
    # guarantee identical results across runs in that setup - confirm.
    tpe_sampler = optuna.samplers.TPESampler(seed=42)  # seed for reproduce

    study = optuna.create_study(
        study_name='Lasso_whole_v2',
        storage='sqlite:///hyper_tuning.db',
        direction='minimize',
        sampler=tpe_sampler,
        load_if_exists=True,
    )

    study.optimize(objective, n_trials=25, n_jobs=3, show_progress_bar=True, gc_after_trial=True)

    # --------- Best Result ----------
    best_params = study.best_trial.params
    print("Best params:", best_params)
    print("Best rmse:", study.best_value)

    # --------- Save Best Parameters ----------
    # Persist the winning configuration as JSON next to the notebook.
    with open("Lasso_whole_tuning_best_parameters.json", "w") as f:
        json.dump(best_params, f, indent=4)


[I 2025-07-14 23:58:46,271] A new study created in RDB with name: Lasso_whole_v2


  0%|          | 0/25 [00:00<?, ?it/s]

[I 2025-07-14 23:59:01,197] Trial 0 finished with value: 1.2307383861907069 and parameters: {'scaler': 'standard', 'target_scaler': 'robust', 'alpha': 3.2823506798492446, 'fit_intercept': True, 'positive': False, 'selection': 'cyclic', 'max_iter': 2500, 'tol': 0.00010764952033794947}. Best is trial 0 with value: 1.2307383861907069.
[I 2025-07-14 23:59:16,054] Trial 3 finished with value: 1.2307383861907069 and parameters: {'scaler': 'standard', 'target_scaler': 'maxabs', 'alpha': 0.09025408721825014, 'fit_intercept': True, 'positive': False, 'selection': 'cyclic', 'max_iter': 1000, 'tol': 0.0009372007967993426}. Best is trial 0 with value: 1.2307383861907069.
[I 2025-07-14 23:59:35,951] Trial 4 finished with value: 1.3212338334192864 and parameters: {'scaler': 'maxabs', 'target_scaler': 'maxabs', 'alpha': 2.1627703080897674e-06, 'fit_intercept': False, 'positive': True, 'selection': 'random', 'max_iter': 1000, 'tol': 5.705871927080755e-05}. Best is trial 0 with value: 1.230738386190706

  model = cd_fast.enet_coordinate_descent(


[I 2025-07-15 00:00:05,896] Trial 7 finished with value: 1.1777129982545391 and parameters: {'scaler': 'robust', 'target_scaler': 'maxabs', 'alpha': 0.00023836640501884016, 'fit_intercept': True, 'positive': True, 'selection': 'random', 'max_iter': 3500, 'tol': 8.00534625257519e-05}. Best is trial 1 with value: 1.131702007144939.
[I 2025-07-15 00:00:26,333] Trial 8 finished with value: 1.2141345392135972 and parameters: {'scaler': 'robust', 'target_scaler': 'minmax', 'alpha': 0.0030153808936957266, 'fit_intercept': False, 'positive': False, 'selection': 'random', 'max_iter': 4000, 'tol': 3.081814990528114e-06}. Best is trial 1 with value: 1.131702007144939.
[I 2025-07-15 00:00:53,389] Trial 9 finished with value: 1.1938093737676456 and parameters: {'scaler': 'standard', 'target_scaler': 'standard', 'alpha': 4.7057019767997596e-05, 'fit_intercept': False, 'positive': True, 'selection': 'random', 'max_iter': 1500, 'tol': 0.00024102905952513997}. Best is trial 1 with value: 1.131702007144

  model = cd_fast.enet_coordinate_descent(


[I 2025-07-15 00:01:28,919] Trial 11 finished with value: 1.1801330859452777 and parameters: {'scaler': 'robust', 'target_scaler': 'minmax', 'alpha': 2.8138791175967447e-05, 'fit_intercept': True, 'positive': True, 'selection': 'random', 'max_iter': 4000, 'tol': 7.541962455293902e-05}. Best is trial 1 with value: 1.131702007144939.
[I 2025-07-15 00:01:52,508] Trial 12 finished with value: 1.1348839796228982 and parameters: {'scaler': 'minmax', 'target_scaler': 'standard', 'alpha': 0.016520757478247397, 'fit_intercept': False, 'positive': False, 'selection': 'cyclic', 'max_iter': 2000, 'tol': 9.95385136878162e-06}. Best is trial 1 with value: 1.131702007144939.
[I 2025-07-15 00:02:16,843] Trial 13 finished with value: 1.1351657218599585 and parameters: {'scaler': 'minmax', 'target_scaler': 'standard', 'alpha': 0.01787177536310821, 'fit_intercept': False, 'positive': False, 'selection': 'cyclic', 'max_iter': 2000, 'tol': 7.433818262818531e-06}. Best is trial 1 with value: 1.1317020071449

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[I 2025-07-15 00:05:33,332] Trial 15 finished with value: 1.1316371627921233 and parameters: {'scaler': 'maxabs', 'target_scaler': 'none', 'alpha': 0.0009615386534827813, 'fit_intercept': False, 'positive': False, 'selection': 'cyclic', 'max_iter': 2000, 'tol': 1.9797492435803774e-05}. Best is trial 15 with value: 1.1316371627921233.


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[I 2025-07-15 00:13:08,557] Trial 16 finished with value: 1.1317811727605396 and parameters: {'scaler': 'maxabs', 'target_scaler': 'none', 'alpha': 0.0004474737965282124, 'fit_intercept': False, 'positive': False, 'selection': 'cyclic', 'max_iter': 1500, 'tol': 1.3204482052003408e-06}. Best is trial 15 with value: 1.1316371627921233.
[I 2025-07-15 00:13:48,874] Trial 17 finished with value: 1.2307426095057772 and parameters: {'scaler': 'maxabs', 'target_scaler': 'none', 'alpha': 0.22423795455469286, 'fit_intercept': False, 'positive': False, 'selection': 'cyclic', 'max_iter': 2500, 'tol': 3.334122305706759e-05}. Best is trial 15 with value: 1.1316371627921233.


  model = cd_fast.enet_coordinate_descent(


[I 2025-07-15 00:19:47,318] Trial 18 finished with value: 1.1317120627881057 and parameters: {'scaler': 'maxabs', 'target_scaler': 'standard', 'alpha': 0.000501368383371332, 'fit_intercept': False, 'positive': False, 'selection': 'cyclic', 'max_iter': 1500, 'tol': 2.3629494603001266e-05}. Best is trial 15 with value: 1.1316371627921233.


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[I 2025-07-15 00:20:57,062] Trial 2 finished with value: 1.1382330836129555 and parameters: {'scaler': 'robust', 'target_scaler': 'robust', 'alpha': 0.0001077522519311029, 'fit_intercept': True, 'positive': False, 'selection': 'random', 'max_iter': 4500, 'tol': 1.8857699823355324e-06}. Best is trial 15 with value: 1.1316371627921233.


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[I 2025-07-15 00:28:06,514] Trial 6 finished with value: 1.138164391784613 and parameters: {'scaler': 'none', 'target_scaler': 'none', 'alpha': 3.429511120576089e-06, 'fit_intercept': False, 'positive': False, 'selection': 'random', 'max_iter': 4500, 'tol': 4.554571563642678e-05}. Best is trial 15 with value: 1.1316371627921233.
[I 2025-07-15 00:28:48,741] Trial 21 finished with value: 1.1319939079683055 and parameters: {'scaler': 'maxabs', 'target_scaler': 'standard', 'alpha': 0.002597533106748593, 'fit_intercept': False, 'positive': False, 'selection': 'cyclic', 'max_iter': 3000, 'tol': 0.0002533499186656308}. Best is trial 15 with value: 1.1316371627921233.


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[I 2025-07-15 00:39:51,681] Trial 20 finished with value: 1.132445647620899 and parameters: {'scaler': 'none', 'target_scaler': 'none', 'alpha': 0.0016184068477496589, 'fit_intercept': False, 'positive': False, 'selection': 'cyclic', 'max_iter': 3000, 'tol': 0.0001753760912203571}. Best is trial 15 with value: 1.1316371627921233.


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[I 2025-07-15 00:40:30,480] Trial 19 finished with value: 1.1370741170124083 and parameters: {'scaler': 'none', 'target_scaler': 'none', 'alpha': 1.3758821617251278e-05, 'fit_intercept': False, 'positive': False, 'selection': 'cyclic', 'max_iter': 2500, 'tol': 0.000205315684966594}. Best is trial 15 with value: 1.1316371627921233.


  model = cd_fast.enet_coordinate_descent(


[I 2025-07-15 00:42:54,793] Trial 23 finished with value: 1.1316429589977801 and parameters: {'scaler': 'maxabs', 'target_scaler': 'standard', 'alpha': 0.0006274689216437179, 'fit_intercept': False, 'positive': False, 'selection': 'cyclic', 'max_iter': 1500, 'tol': 1.9145363993959486e-05}. Best is trial 15 with value: 1.1316371627921233.


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[I 2025-07-15 00:44:10,822] Trial 24 finished with value: 1.131721505107852 and parameters: {'scaler': 'maxabs', 'target_scaler': 'standard', 'alpha': 0.0004872820627974898, 'fit_intercept': False, 'positive': False, 'selection': 'cyclic', 'max_iter': 1500, 'tol': 2.0118635642027816e-05}. Best is trial 15 with value: 1.1316371627921233.


  model = cd_fast.enet_coordinate_descent(


[I 2025-07-15 00:44:59,599] Trial 22 finished with value: 1.137205074559184 and parameters: {'scaler': 'maxabs', 'target_scaler': 'none', 'alpha': 2.1061565507610632e-05, 'fit_intercept': False, 'positive': False, 'selection': 'cyclic', 'max_iter': 2500, 'tol': 4.819452092467235e-06}. Best is trial 15 with value: 1.1316371627921233.
Best params: {'scaler': 'maxabs', 'target_scaler': 'none', 'alpha': 0.0009615386534827813, 'fit_intercept': False, 'positive': False, 'selection': 'cyclic', 'max_iter': 2000, 'tol': 1.9797492435803774e-05}
Best rmse: 1.1316371627921233
