In [None]:
import optuna
import numpy as np
import pandas as pd
from feature_settings_ import *
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import root_mean_squared_error


In [2]:

# ✅ 1. Load your data
# The dataset is read from CSV into a pandas DataFrame; the prediction target
# is the column named by `target_column`.
target_column = 'option_return'
df = pd.read_csv(r"C:\Users\55479\PycharmProjects\MS_thesis\datasets\whole_df_all_features_simple_option_return.csv")

# Select the features by dropping the target *by name* rather than by position,
# so the target can never end up inside X if the CSV column order changes.
# A missing target column raises KeyError here instead of failing silently.
X = df.drop(columns=[target_column])     # your features
y = df[target_column]                    # your target
print(f"--- Your prediction target is {target_column} ---") # print the target to check

# ✅ 2. Split into train and test sets (e.g., 80/20)
# shuffle=False keeps the original row order, so the last 20% of rows become
# the test set.
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False  # shuffle=False to preserve time order
)


--- Your prediction target is option_return ---


In [3]:
# ✅ 3. Tune hyperparameters
# Scaler names offered to Optuna for both the feature and the target search
# dimensions; "none" has no entry in the class table and means "do not scale".
_SCALER_CHOICES = ["standard", "minmax", "maxabs", "robust", "none"]
_SCALER_CLASSES = {
    "standard": StandardScaler,
    "minmax": MinMaxScaler,
    "maxabs": MaxAbsScaler,
    "robust": RobustScaler,
}


def _make_scaler(name):
    """Return a new scaler instance for ``name``, or ``None`` if the name has no scaler class."""
    scaler_cls = _SCALER_CLASSES.get(name)
    return None if scaler_cls is None else scaler_cls()


def objective(trial):
    """Optuna objective: mean cross-validated RMSE of an OLS model.

    The trial picks one scaler for the feature columns listed in
    ``FEATURES_WHOLE_SCALED`` and one scaler for the target. For each of the
    5 ``TimeSeriesSplit`` folds over ``X_full_train`` / ``y_full_train`` the
    scalers are fitted on the training part only, a ``LinearRegression`` is
    fitted, and the RMSE is measured on the validation part in the original
    (unscaled) target units.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the "scaler" and "target_scaler" categories.

    Returns
    -------
    float
        Average RMSE over the folds (lower is better).
    """
    # The order of the two suggest calls is kept ("scaler" first) so that a
    # seeded sampler draws the same sequence of parameters.
    scaler = _make_scaler(trial.suggest_categorical("scaler", _SCALER_CHOICES))
    target_scaler = _make_scaler(
        trial.suggest_categorical("target_scaler", _SCALER_CHOICES)
    )
    # NOTE(review): the logged trials report near-identical RMSE across target
    # scalers; the target dimension may be redundant for OLS -- confirm.

    # --------- Time Series Cross Validation ----------
    tscv = TimeSeriesSplit(n_splits=5)
    rmses = []

    for train_idx, val_idx in tscv.split(X_full_train):
        X_train, X_val = X_full_train.iloc[train_idx], X_full_train.iloc[val_idx]
        y_train, y_val = y_full_train.iloc[train_idx], y_full_train.iloc[val_idx]

        # --------- Apply feature scaling if requested ----------
        if scaler is not None:
            ct = ColumnTransformer(
                transformers=[('scale', scaler, FEATURES_WHOLE_SCALED)],
                remainder='passthrough'  # leave all other columns untouched
            )
            # No column names are passed, so both frames get positional
            # integer labels; train and validation share the same layout
            # because they come from the same fitted transformer.
            X_train_scaled = pd.DataFrame(
                ct.fit_transform(X_train),
                index=X_train.index
            )
            X_val_scaled = pd.DataFrame(
                ct.transform(X_val),
                index=X_val.index
            )
        else:
            X_train_scaled = X_train
            X_val_scaled = X_val

        # --------- Apply target scaling if requested ----------
        # Only the training target is transformed: validation is scored in the
        # original units after inverse-transforming the predictions.
        if target_scaler is not None:
            y_train_scaled = target_scaler.fit_transform(
                y_train.to_numpy().reshape(-1, 1)
            ).ravel()
        else:
            y_train_scaled = y_train

        # --------- Train model ----------
        model = LinearRegression()
        model.fit(X_train_scaled, y_train_scaled)

        # --------- Predict & evaluate ----------
        y_pred_scaled = model.predict(X_val_scaled)

        if target_scaler is not None:
            y_pred = target_scaler.inverse_transform(
                y_pred_scaled.reshape(-1, 1)
            ).ravel()
        else:
            y_pred = y_pred_scaled

        rmses.append(root_mean_squared_error(y_val, y_pred))

    return np.mean(rmses)  # return average cross-validated rmse

# --------- Run Optuna Study ----------
# Seeded TPE sampler so the sequence of sampled scaler combinations is repeatable.
tpe_sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=tpe_sampler)

# Evaluate 10 trials of `objective`, showing a progress bar while running.
study.optimize(objective, n_trials=10, show_progress_bar=True)

# --------- Best Result ----------
# Report the trial with the lowest mean cross-validated RMSE.
best_trial = study.best_trial
print("Best params:", best_trial.params)
print("Best rmse:", best_trial.value)


[I 2025-07-13 19:51:31,041] A new study created in memory with name: no-name-78f3a486-cd9f-4a34-9f6b-a0117d3734c3


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-07-13 19:51:37,829] Trial 0 finished with value: 1.1429679682922933 and parameters: {'scaler': 'minmax', 'target_scaler': 'maxabs'}. Best is trial 0 with value: 1.1429679682922933.
[I 2025-07-13 19:51:44,591] Trial 1 finished with value: 1.1429679682922933 and parameters: {'scaler': 'minmax', 'target_scaler': 'maxabs'}. Best is trial 0 with value: 1.1429679682922933.
[I 2025-07-13 19:51:52,281] Trial 2 finished with value: 1.1429679682923228 and parameters: {'scaler': 'standard', 'target_scaler': 'standard'}. Best is trial 0 with value: 1.1429679682922933.
[I 2025-07-13 19:51:58,452] Trial 3 finished with value: 1.1418119597139182 and parameters: {'scaler': 'none', 'target_scaler': 'standard'}. Best is trial 3 with value: 1.1418119597139182.
[I 2025-07-13 19:52:07,236] Trial 4 finished with value: 1.142967968292352 and parameters: {'scaler': 'robust', 'target_scaler': 'standard'}. Best is trial 3 with value: 1.1418119597139182.
[I 2025-07-13 19:52:15,222] Trial 5 finished with 

In [4]:
# # ✅ 4. Train an OLS Linear Regression model
# model = LinearRegression()
# model.fit(X_full_train, y_full_train)

# # ✅ 5. Make predictions on the test set
# y_pred = model.predict(X_test)

# # ✅ 6. Evaluate the model
# r2 = r2_score(y_test, y_pred)
# rmse = root_mean_squared_error(y_test, y_pred)
# mae = mean_absolute_error(y_test, y_pred)

# print(f"R² Score: {r2:.4f}")
# print(f"RMSE:     {rmse:.4f}")
# print(f"MAE:      {mae:.4f}")