# Train and Tune all Models for Comparison

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from src.config import Config
from src.runners import run_experiment
from src.utils import set_seed

In [2]:
# Build the project Config from the YAML file one directory above the notebook.
cfg = Config(Path("../config/config.yaml"))
# Seed with the configured value; the captured log below reports the global seed
# being set to 42. NOTE(review): `rng` is presumably a random generator returned
# by set_seed -- confirm in src.utils.
rng = set_seed(cfg.runtime.seed)

2025-08-22 19:51:30,072 - INFO - src.utils - Global random seed set to 42


In [3]:
def plot_price_overlay_multi(df_feat, X_test, preds_dict, path=None):
    """Overlay one-step-ahead price predictions of several models on the actual prices.

    Each label ``t`` in ``X_test.index`` is treated as an integer row label of
    ``df_feat``; rows for which ``t + 1`` is not smaller than ``len(df_feat)``
    are dropped. The predicted price for ``t + 1`` is
    ``adj_close[t] * exp(prediction[t])``.

    Args:
        df_feat: Frame providing the ``"date"`` and ``"adj_close"`` columns.
        X_test: Frame whose index supplies the anchor rows ``t``.
        preds_dict: Mapping ``label -> predictions`` aligned with ``X_test``.
            For a 2-D array with at least one column only column 0 is used.
        path: Optional destination handed to ``plt.savefig``.
    """
    anchor_rows = X_test.index.to_numpy()
    has_next = anchor_rows + 1 < len(df_feat)
    base_rows = anchor_rows[has_next]
    next_rows = base_rows + 1

    next_dates = pd.to_datetime(df_feat.loc[next_rows, "date"])
    next_prices = df_feat.loc[next_rows, "adj_close"].to_numpy(dtype=float)

    plt.figure(figsize=(12, 5))
    plt.plot(next_dates, next_prices, label="Actual (t+1)", linewidth=2)

    for model_name, raw_pred in preds_dict.items():
        pred_arr = np.asarray(raw_pred)
        # Multi-horizon output: keep only the first (t+1) column.
        if pred_arr.ndim == 2 and pred_arr.shape[1] >= 1:
            step_one = pred_arr[has_next, 0]
        else:
            step_one = pred_arr[has_next]
        base_prices = df_feat.loc[base_rows, "adj_close"].to_numpy(dtype=float)
        plt.plot(
            next_dates,
            base_prices * np.exp(step_one),
            label=f"{model_name} (t+1)",
            linestyle="--",
            linewidth=2,
        )

    plt.title("Actual vs Predicted Adj Close (H=1) — Multiple Models")
    plt.xlabel("Date")
    # An optional module-level ADJ_CLOSE_LABEL overrides the default axis label.
    plt.ylabel(globals().get("ADJ_CLOSE_LABEL", "Adj Close"))
    plt.legend()
    plt.grid(True, alpha=0.25)
    plt.tight_layout()
    if path is not None:
        plt.savefig(path)
    plt.show()
    plt.close()

def plot_price_overlay_next_30_multi(df_feat, test_df, preds_dict, *, horizon=30, hist_window=200, path=None):
    """Plot recent history, the realised future prices and each model's forecast path.

    The last index label of ``test_df`` is the anchor. Every forecast is rebuilt
    from the last row of the model's predictions as
    ``anchor_price * exp(cumsum(log_returns))``.

    NOTE(review): the anchor is used both as a label (``.loc``) and as a
    position (``.iloc``), so ``df_feat`` is expected to carry a default
    ``0..n-1`` index -- confirm against the caller.

    Args:
        df_feat: Frame providing the ``"date"`` and ``"adj_close"`` columns.
        test_df: Frame whose last index label is the forecast anchor.
        preds_dict: Mapping ``label -> predictions``; only the last row is used.
        horizon: Maximum number of future steps to draw.
        hist_window: Number of rows of history (ending at the anchor) to draw.
        path: Optional destination handed to ``plt.savefig``.
    """
    anchor_idx = int(test_df.index[-1])
    anchor_date = pd.to_datetime(df_feat.loc[anchor_idx, "date"])
    start_price = float(df_feat.loc[anchor_idx, "adj_close"])

    future = df_feat.iloc[anchor_idx + 1: anchor_idx + 1 + horizon]
    future_dates = pd.to_datetime(future["date"].to_numpy())
    actual_price_path = future["adj_close"].to_numpy(dtype=float)

    hist = df_feat.iloc[max(0, anchor_idx - hist_window + 1): anchor_idx + 1].copy()
    hist["date"] = pd.to_datetime(hist["date"])

    plt.figure(figsize=(12, 5))
    plt.plot(hist["date"], hist["adj_close"], label="History (adj_close)", alpha=0.8)
    plt.plot(future_dates, actual_price_path, label=f"Actual next {horizon}d", linewidth=2)

    for label, y_pred in preds_dict.items():
        lr_path = np.asarray(y_pred)[-1].reshape(-1)[:horizon]
        # Fewer than `horizon` future rows may exist after the anchor, and a
        # model may return a shorter path; plot only the steps that have both
        # a date and a prediction, otherwise plt.plot raises on the mismatch.
        n_steps = min(len(lr_path), len(future_dates))
        pred_price_path = start_price * np.exp(np.cumsum(lr_path[:n_steps]))
        plt.plot(
            future_dates[:n_steps],
            pred_price_path,
            label=f"{label} forecast",
            linestyle="--",
            linewidth=2,
        )

    # Dotted vertical line marks the anchor date.
    plt.axvline(anchor_date, linestyle=":", alpha=0.7)
    plt.xlabel("Date")
    plt.ylabel("Price")
    plt.title(f"{horizon}-Day Forecast vs Actuals from {anchor_date.date()} — Multiple Models")
    plt.legend()
    plt.grid(True, alpha=0.25)
    plt.tight_layout()
    if path is not None:
        plt.savefig(path)
    plt.show()
    plt.close()

def _cum_log_return(prices):
    """Return ``log(p_t / p_0)`` for every price; an empty input yields an empty array."""
    prices = np.asarray(prices, dtype=float)
    if prices.size == 0:
        return prices
    # log(p_t / p_0) already equals the sum of the step-wise log returns up to
    # t, so no further cumsum is applied.
    return np.log(prices / prices[0])


def plot_forecast_diagnostics_multi(future_dates, actual_price_path, pred_price_paths, path_prefix=None):
    """Draw three diagnostic figures comparing forecast price paths with actuals.

    The figures are, in order: residuals (actual - forecast), cumulative log
    return relative to each path's own first value, and the raw price paths.

    Args:
        future_dates: Dates for the x axis; converted with ``pd.to_datetime``.
        actual_price_path: Realised prices, one per date.
        pred_price_paths: Mapping ``label -> forecast prices``, one per date.
        path_prefix: Optional ``str`` or ``Path``. When given, each figure is
            saved next to it as ``<stem>_residuals.png``, ``<stem>_cumret.png``
            and ``<stem>_price.png``.
    """
    future_dates = pd.to_datetime(future_dates)
    actual = np.asarray(actual_price_path, dtype=float)
    forecasts = {label: np.asarray(pred, dtype=float) for label, pred in pred_price_paths.items()}
    prefix = None if path_prefix is None else Path(path_prefix)

    def _finish(title, ylabel, suffix):
        # Shared decoration, optional save, display and teardown for the current figure.
        plt.title(title)
        plt.xlabel("Date")
        plt.ylabel(ylabel)
        plt.grid(True, alpha=0.3)
        plt.legend()
        plt.tight_layout()
        if prefix is not None:
            plt.savefig(prefix.with_name(prefix.stem + suffix))
        plt.show()
        plt.close()

    # Residuals
    plt.figure(figsize=(10, 4))
    for label, pred in forecasts.items():
        plt.plot(future_dates, actual - pred, label=f"{label} residuals")
    plt.axhline(0, linestyle="--", color="gray", linewidth=1)
    _finish("Residuals over time (Actual - Forecast)", "Residual (Price)", "_residuals.png")

    # Cumulative returns
    plt.figure(figsize=(10, 4))
    plt.plot(future_dates, _cum_log_return(actual), label="Actual", linewidth=2)
    for label, pred in forecasts.items():
        plt.plot(future_dates, _cum_log_return(pred), label=label, linestyle="--")
    _finish("Cumulative log return (next horizon)", "Cumulative return", "_cumret.png")

    # Price comparison
    plt.figure(figsize=(10, 4))
    plt.plot(future_dates, actual, label="Actual price", linewidth=2)
    for label, pred in forecasts.items():
        plt.plot(future_dates, pred, label=label, linestyle="--")
    _finish("Forecast vs Actual Price", "Price", "_price.png")


In [None]:
# (model kind, include sentiment features) pairs to train and tune.
EXPERIMENTS = [
    ("linreg", True),
    ("xgboost", True),
]

# Number of rows after the last test row kept as the realised future path.
OVERLAY_HORIZON = 30

all_results = []
overlay_cache = {}  # scenario -> {df_full, test, X_test, future_dates, actual_path, last_price, preds{label: y_pred}}
last_ctx = None
last_trainer = None

df_full = pd.read_csv(Path(cfg.data.processed_dir) / cfg.data.name_features_full)

for kind, include_sent in EXPERIMENTS:
    print(f"\n=== Running: {kind} | sentiment={include_sent} ===")
    res_df, study, final_trainer, ctx = run_experiment(
        kind, df_full, include_sent, cfg.data.processed_dir, random_state=cfg.runtime.seed
    )
    all_results.append(res_df)
    last_ctx = ctx
    last_trainer = final_trainer

    # cache preds for overlays (by scenario)
    scenario_key = f"direct_{'with' if include_sent else 'wo'}_sent"
    # Keep the frame returned in ctx under its own name: rebinding `df_full`
    # here would pass it, instead of the CSV loaded above, to the next experiment.
    ctx_df = ctx["df_full"]
    test = ctx["test"]
    X_test = ctx["X_test"]
    y_pred = final_trainer.predict(X_test)

    # The first model seen for a scenario fixes the shared frames and future path;
    # later models of the same scenario only add their predictions.
    if scenario_key not in overlay_cache:
        anchor_idx = int(X_test.index[-1])
        last_price = float(ctx_df.loc[anchor_idx, "adj_close"])
        future = ctx_df.iloc[anchor_idx + 1: anchor_idx + 1 + OVERLAY_HORIZON]
        overlay_cache[scenario_key] = {
            "df_full": ctx_df,
            "test": test,
            "X_test": X_test,
            "future_dates": pd.to_datetime(future["date"].to_numpy()),
            "actual_path": future["adj_close"].to_numpy(dtype=float),
            "last_price": last_price,
            "preds": {},
        }
    overlay_cache[scenario_key]["preds"][kind] = y_pred

results_all = pd.concat(all_results, ignore_index=True)
# results_all.to_csv(BASE_PROCESSED / "results_all_models.csv", index=False)
results_all

2025-08-22 19:51:30,163 - INFO - ModelTrainer - Initialized ModelTrainer for model: linreg_h30
[I 2025-08-22 19:51:30,164] A new study created in memory with name: linreg_with_sent_h30
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2025-08-22 19:51:30,283] Trial 0 finished with value: 0.01056771325105004 and parameters: {'alpha': 0.0001329291894316216, 'l1_ratio': 0.9507143064099162, 'selection': 'cyclic', 'max_iter': 1500}. Best is trial 0 with value: 0.01056771325105004.



=== Running: linreg | sentiment=True ===


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2025-08-22 19:51:30,349] Trial 1 pruned. 
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2025-08-22 19:51:30,380] Trial 2 pruned. 
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2025-08-22 19:51:30,413] Trial 3 pruned. 
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  mod

Best params: {'alpha': 0.009311403260660715, 'l1_ratio': 0.8397646127079323, 'selection': 'random', 'max_iter': 5000, 'random_state': 42}
Saved model: ..\data\models\linreg_h30.pkl

=== Running: xgboost | sentiment=True ===


[I 2025-08-22 19:51:38,143] Trial 0 finished with value: 0.010348727731907792 and parameters: {'n_estimators': 1400, 'learning_rate': 0.08927180304353628, 'max_depth': 10, 'min_child_weight': 12.013303835521027, 'subsample': 0.6624074561769746, 'colsample_bytree': 0.662397808134481, 'reg_alpha': 0.05808361216819946, 'reg_lambda': 4.3977926559872085, 'gamma': 1.2022300234864176, 'max_bin': 384, 'grow_policy': 'lossguide', 'objective': 'reg:squarederror', 'max_leaves': 128}. Best is trial 0 with value: 0.010348727731907792.
[I 2025-08-22 19:51:50,445] Trial 1 finished with value: 0.009338306385273153 and parameters: {'n_estimators': 900, 'learning_rate': 0.02014847788415866, 'max_depth': 8, 'min_child_weight': 8.695705870978102, 'subsample': 0.7164916560792167, 'colsample_bytree': 0.8447411578889518, 'reg_alpha': 0.13949386065204183, 'reg_lambda': 1.8146509184084816, 'gamma': 0.7327236865873834, 'max_bin': 320, 'grow_policy': 'depthwise', 'objective': 'reg:absoluteerror'}. Best is trial 

In [None]:
# Render every overlay and diagnostic figure for each cached scenario.
for sc_key, bundle in overlay_cache.items():
    print(f"\n== Overlays for scenario: {sc_key} ==")
    df_full, test, X_test = (bundle[field] for field in ("df_full", "test", "X_test"))
    preds, last_px = bundle["preds"], bundle["last_price"]
    f_dates, act_path = bundle["future_dates"], bundle["actual_path"]

    # One-step-ahead overlay over the whole test span.
    # Add path=Path(cfg.data.fig_dir) / f"overlay_h1_{sc_key}.png" to save the figure.
    plot_price_overlay_multi(df_feat=df_full, X_test=X_test, preds_dict=preds)

    # 30-step overlay starting at the last test row.
    # Add path=Path(cfg.data.fig_dir) / f"overlay_next30_{sc_key}.png" to save the figure.
    plot_price_overlay_next_30_multi(
        df_feat=df_full, test_df=test, preds_dict=preds, horizon=30, hist_window=200
    )

    # Rebuild each model's price path from the last row of its predictions:
    # last price times the exponential of the running sum of the first 30 values.
    pred_price_paths = {}
    for label, y_pred in preds.items():
        lr_path = np.ravel(np.asarray(y_pred[-1]))[:30]
        pred_price_paths[label] = last_px * np.exp(lr_path.cumsum())

    # Add path_prefix=Path(cfg.data.fig_dir) / f"diagnostics_{sc_key}" to save the figures.
    plot_forecast_diagnostics_multi(
        future_dates=f_dates, actual_price_path=act_path, pred_price_paths=pred_price_paths
    )