In [1]:
import optuna
import json
import gc
import numpy as np
import pandas as pd
from feature_settings_ import *
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import root_mean_squared_error


In [2]:
# ✅ 1. Load your data
# Assume your dataset is a pandas DataFrame
# and the target column is named 'option_return'
# Example: df = pd.read_csv('your_data.csv')
df = pd.read_csv(r"C:\Users\55479\PycharmProjects\MS_thesis\datasets\whole_df_all_features_simple_option_return.csv") # open the time series data

X = df[df.columns[:-1]]     # your features
y = df['option_return']     # your target

print(f"--- Your prediction target is [{df.columns[-1]}] ---") # print the target to check

# ✅ 2. Split into train and test sets (e.g., 80/20)
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False  # shuffle=False to preserve time order
)


--- Your prediction target is [option_return] ---


In [None]:
# ✅ 3. Set is_tuning_done switch
is_tuning_done = True

In [4]:
# ✅ 4. Tune hyperparameters
def objective(trial):
    # --------- Choose scaler for data ------------
    scaler_name = trial.suggest_categorical("scaler", ["standard", "minmax", "maxabs", "robust", "none"])
    scaler = {
        "standard": StandardScaler(),
        "minmax": MinMaxScaler(),
        "maxabs": MaxAbsScaler(),
        "robust": RobustScaler(),
        "none": None
    }[scaler_name]

    target_scaler_name = trial.suggest_categorical("target_scaler", ["standard", "minmax", "maxabs", "robust", "none"])
    target_scaler = {
        "standard": StandardScaler(),
        "minmax": MinMaxScaler(),
        "maxabs": MaxAbsScaler(),
        "robust": RobustScaler(),
        "none": None
    }[target_scaler_name]

    # --------- Time Series Cross Validation ----------
    tscv = TimeSeriesSplit(n_splits=5)
    rmses = []

    for train_idx, val_idx in tscv.split(X_full_train):
        X_train, X_val = X_full_train.iloc[train_idx], X_full_train.iloc[val_idx]
        y_train, y_val = y_full_train.iloc[train_idx], y_full_train.iloc[val_idx]

        # --------- Apply scaling if needed ----------
        if scaler:
            ct = ColumnTransformer(
                transformers=[('scale', scaler, FEATURES_WHOLE_SCALED)],
                remainder='passthrough'  # leave all other columns untouched
            )
            X_train_scaled = ct.fit_transform(X_train)
            X_val_scaled = ct.transform(X_val)
        else:
            X_train_scaled = X_train
            X_val_scaled = X_val

        if target_scaler:
            y_train_scaled = target_scaler.fit_transform(y_train.to_numpy().reshape(-1, 1)).ravel()
        else:
            y_train_scaled = y_train

        # --------- Train model ----------
        model = LinearRegression()
        model.fit(X_train_scaled, y_train_scaled)

        # --------- Predict & evaluate ----------
        y_pred_scaled = model.predict(X_val_scaled)

        if target_scaler:
            y_pred = target_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
        else:
            y_pred = y_pred_scaled

        rmse = root_mean_squared_error(y_val, y_pred)
        rmses.append(rmse)

        del model, X_train_scaled, X_val_scaled, y_train_scaled, y_pred_scaled, y_pred
    
    gc.collect()

    return np.mean(rmses)  # return average cross-validated rmse

# --------- Set the switch ----------
if not is_tuning_done:

    # --------- Run Optuna Study ----------
    study = optuna.create_study(
    direction='minimize',
    storage='sqlite:///hyper_tuning.db',
    study_name='OLS_whole',
    load_if_exists=True,
    sampler=optuna.samplers.TPESampler(
        seed=42,                  # seed for reproduce
    ),
    )

    study.optimize(
        objective, 
        n_trials=25, 
        n_jobs=3, 
        show_progress_bar=True,
        gc_after_trial=True
    )

    # --------- Best Result ----------
    print("Best params:", study.best_trial.params)
    print("Best rmse:", study.best_value)

    # --------- Save Best Parameters ----------
    with open("OLS_whole_tuning_best_parameters.json", "w") as f:
        json.dump(study.best_trial.params, f, indent=4)


[I 2025-07-14 23:07:16,650] A new study created in RDB with name: OLS_whole


  0%|          | 0/25 [00:00<?, ?it/s]

[I 2025-07-14 23:07:40,841] Trial 1 finished with value: 1.1429679682922933 and parameters: {'scaler': 'minmax', 'target_scaler': 'maxabs'}. Best is trial 1 with value: 1.1429679682922933.
[I 2025-07-14 23:07:40,859] Trial 2 finished with value: 1.1429679682923115 and parameters: {'scaler': 'minmax', 'target_scaler': 'minmax'}. Best is trial 0 with value: 1.14296796829227.
[I 2025-07-14 23:07:40,859] Trial 0 finished with value: 1.14296796829227 and parameters: {'scaler': 'minmax', 'target_scaler': 'standard'}. Best is trial 0 with value: 1.14296796829227.
[I 2025-07-14 23:07:55,809] Trial 4 finished with value: 1.1418119598211511 and parameters: {'scaler': 'none', 'target_scaler': 'minmax'}. Best is trial 4 with value: 1.1418119598211511.
[I 2025-07-14 23:08:03,774] Trial 3 finished with value: 1.1429679682922933 and parameters: {'scaler': 'minmax', 'target_scaler': 'maxabs'}. Best is trial 4 with value: 1.1418119598211511.
[I 2025-07-14 23:08:06,648] Trial 5 finished with value: 1.14