In [27]:
# Load the features parquet file
import pandas as pd

features_df = pd.read_parquet('../artifacts/features_long.parquet')
print(f"✅ Loaded features: {features_df.shape}")
print(f"Columns: {list(features_df.columns)}")
print(f"Date range: {features_df['date'].min()} to {features_df['date'].max()}")
print(f"Unique symbols: {features_df['symbol'].nunique()}")
print(f"Sample data:")
print(features_df.tail())

✅ Loaded features: (2904927, 147)
Columns: ['volume', 'low', 'adjclose', 'high', 'open', 'close', 'ret', 'ma_10', 'pct_slope_ma_10', 'sign_ma_10', 'ma_20', 'pct_slope_ma_20', 'sign_ma_20', 'ma_30', 'pct_slope_ma_30', 'sign_ma_30', 'ma_50', 'pct_slope_ma_50', 'sign_ma_50', 'ma_75', 'pct_slope_ma_75', 'sign_ma_75', 'ma_100', 'pct_slope_ma_100', 'sign_ma_100', 'ma_150', 'pct_slope_ma_150', 'sign_ma_150', 'ma_200', 'pct_slope_ma_200', 'sign_ma_200', 'trend_score_granular', 'trend_score_sign', 'trend_score_slope', 'trend_persist_ema', 'trend_alignment', 'rv_10', 'rv_20', 'rv_60', 'rv_100', 'rv_ratio_10_60', 'rv_ratio_20_100', 'vol_regime', 'vol_regime_ema10', 'rv_z_60', 'vol_of_vol_20d', 'rv60_slope_norm', 'rv100_slope_norm', 'quiet_trend', 'hurst_ret_64', 'hurst_ret_128', 'hurst_ret_64_emaHL5', 'pct_dist_ma_20', 'pct_dist_ma_20_z', 'pct_dist_ma_50', 'pct_dist_ma_50_z', 'pct_dist_ma_100', 'pct_dist_ma_100_z', 'pct_dist_ma_200', 'pct_dist_ma_200_z', 'min_pct_dist_ma', 'relative_dist_20_50', 

In [28]:
from scipy.stats.mstats import winsorize
from typing import Iterator, Tuple
import numpy as np
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import r2_score, mean_squared_error, \
            mean_absolute_error, accuracy_score, precision_recall_fscore_support,roc_auc_score

def winsorize_series(s: pd.Series, limits=(0.01, 0.01)) -> pd.Series:
    mask = s.notna()
    if not mask.any():
        return s.copy()
    arr_w = winsorize(s[mask].astype(float).to_numpy(), limits=limits)
    out = pd.Series(np.nan, index=s.index)
    out.loc[mask] = arr_w
    return out
def walk_forward_splits(
    df: pd.DataFrame,
    date_col: str = "date",
    min_train_days: int = 252,
    test_days: int = 63,
    horizon: int = None,     # default to global H if available
    embargo_days: int = 1,
) -> Iterator[Tuple[np.ndarray, np.ndarray]]:
    """
    Time-based folds with purge (horizon-1) + embargo (embargo_days).
    Splits are by unique dates across the whole panel.
    """
    if horizon is None:
        horizon = globals().get("H", 10)
    dates = pd.to_datetime(df[date_col]).sort_values().unique()
    start = 0
    while True:
        train_end = start + min_train_days
        test_end  = train_end + test_days
        if test_end > len(dates):
            break

        train_dates = dates[:train_end]
        test_dates  = dates[train_end:test_end]

        test_start_date = test_dates[0]
        test_end_date   = test_dates[-1]

        # purge + embargo windows
        purge_start = test_start_date - np.timedelta64(horizon - 1, 'D')
        purge_end   = test_end_date
        embargo_end = test_end_date + np.timedelta64(embargo_days, 'D')

        d = pd.to_datetime(df[date_col]).values
        in_base_train = (d <= train_dates[-1])
        in_purge      = (d >= purge_start) & (d <= purge_end)
        in_embargo    = (d > test_end_date) & (d <= embargo_end)
        in_test       = (d >= test_start_date) & (d <= test_end_date)

        train_mask = in_base_train & (~in_purge) & (~in_embargo) & (~in_test)
        test_mask  = in_test

        tr_idx = np.flatnonzero(train_mask)
        te_idx = np.flatnonzero(test_mask)
        if tr_idx.size and te_idx.size:
            yield tr_idx, te_idx

        start += test_days

In [29]:
import numpy as np
import pandas as pd

# Only numeric columns for inf check
numeric_cols = features_df.select_dtypes(include=[np.number]).columns

nan_counts = features_df.isna().sum()
inf_counts = pd.Series(0, index=features_df.columns)

inf_counts[numeric_cols] = np.isinf(features_df[numeric_cols].to_numpy()).sum(axis=0)

nan_inf_summary = pd.DataFrame({
    "NaN_count": nan_counts,
    "Inf_count": inf_counts,
    "Total_rows": len(features_df)
})

nan_inf_summary["NaN_pct"] = nan_inf_summary["NaN_count"] / nan_inf_summary["Total_rows"] * 100
nan_inf_summary["Inf_pct"] = nan_inf_summary["Inf_count"] / nan_inf_summary["Total_rows"] * 100

nan_inf_summary.sort_values(["NaN_count", "Inf_count"], ascending=False)
# Filter to top offenders by max percentage of NaNs/Infs
top_nan_inf = (
    nan_inf_summary
    .assign(Max_pct=lambda df: df[["NaN_pct", "Inf_pct"]].max(axis=1))
    .sort_values("Max_pct", ascending=False)
)

# Show top 20 offenders
top_nan_inf.head(20)

Unnamed: 0,NaN_count,Inf_count,Total_rows,NaN_pct,Inf_pct,Max_pct
alpha_mom_combo_120_ema10,625110,0,2904927,21.518957,0.0,21.518957
alpha_mom_sector_120_ema10,625110,0,2904927,21.518957,0.0,21.518957
alpha_mom_sector_60_ema10,582965,0,2904927,20.068146,0.0,20.068146
alpha_mom_combo_60_ema10,582965,0,2904927,20.068146,0.0,20.068146
alpha_mom_combo_20_ema10,553425,0,2904927,19.051253,0.0,19.051253
alpha_mom_sector_20_ema10,553425,0,2904927,19.051253,0.0,19.051253
alpha_resid_sector,544993,0,2904927,18.760988,0.0,18.760988
alpha_mom_sector_ema10,542875,0,2904927,18.688077,0.0,18.688077
alpha_mom_combo_ema10,542875,0,2904927,18.688077,0.0,18.688077
pct_slope_ma_200,470991,0,2904927,16.213523,0.0,16.213523


In [30]:
from typing import Iterator, Tuple
import numpy as np
import pandas as pd

# ---- Fold controls ----
MAX_FOLDS   = 3        # <= your request
EXPANDING   = True     # expanding train window; set False for fixed-length rolling

def walk_forward_splits(
    df: pd.DataFrame,
    date_col: str,
    min_train_days: int,
    test_days: int,
    horizon: int,
    embargo_days: int = 1,
    max_folds: int = 3,
    expanding: bool = True,
) -> Iterator[Tuple[np.ndarray, np.ndarray]]:
    """
    Time-based folds with purge (horizon-1) + embargo.
    Splits are formed on UNIQUE DATES across the whole panel.
    If expanding=True, the training window grows each fold.
    """
    dates = pd.to_datetime(df[date_col]).sort_values().unique()
    n_dates = len(dates)
    k = 0
    start = 0

    while True:
        train_end = start + min_train_days              # index into `dates` (exclusive of test)
        test_end  = train_end + test_days
        if test_end > n_dates or (k >= max_folds):
            break

        # Expanding vs fixed train window
        if expanding:
            # train from 0 .. train_end-1
            train_dates = dates[:train_end]
        else:
            # fixed-length rolling window: start .. train_end-1
            train_dates = dates[start:train_end]

        test_dates = dates[train_end:test_end]

        test_start_date = test_dates[0]
        test_end_date   = test_dates[-1]

        # Purge window: anything whose target overlaps the test period
        purge_start = test_start_date - np.timedelta64(horizon - 1, "D")
        purge_end   = test_end_date
        embargo_end = test_end_date + np.timedelta64(embargo_days, "D")

        d = pd.to_datetime(df[date_col]).values
        in_base_train = (d <= train_dates[-1]) if expanding else ((d >= train_dates[0]) & (d <= train_dates[-1]))
        in_purge      = (d >= purge_start) & (d <= purge_end)
        in_embargo    = (d >  test_end_date) & (d <= embargo_end)
        in_test       = (d >= test_start_date) & (d <= test_end_date)

        train_mask = in_base_train & (~in_purge) & (~in_embargo) & (~in_test)
        test_mask  = in_test

        tr_idx = np.flatnonzero(train_mask)
        te_idx = np.flatnonzero(test_mask)

        if len(tr_idx) and len(te_idx):
            yield tr_idx, te_idx
            k += 1

        # advance the window by one test block
        start += test_days

# ---- Build just a few strong folds ----
folds = list(
    walk_forward_splits(
        model_df,
        date_col=DATE_COL,
        min_train_days=MIN_TRAIN_DAYS,
        test_days=TEST_DAYS,
        horizon=H,
        embargo_days=EMBARGO_DAYS,
        max_folds=MAX_FOLDS,
        expanding=EXPANDING,
    )
)

print(f"✅ Built {len(folds)} folds (max={MAX_FOLDS}, expanding={EXPANDING})")
for i, (tr, te) in enumerate(folds, 1):
    tr_dates = (model_df.iloc[tr][DATE_COL].min(), model_df.iloc[tr][DATE_COL].max())
    te_dates = (model_df.iloc[te][DATE_COL].min(), model_df.iloc[te][DATE_COL].max())
    print(f"  Fold {i}: train {tr_dates[0].date()} → {tr_dates[1].date()} "
          f"| test {te_dates[0].date()} → {te_dates[1].date()} "
          f"(n_train={len(tr):,}, n_test={len(te):,})")

✅ Built 3 folds (max=3, expanding=True)
  Fold 1: train 2021-03-01 → 2022-02-23 | test 2022-02-28 → 2022-05-26 (n_train=449,224, n_test=120,277)
  Fold 2: train 2021-03-01 → 2022-05-20 | test 2022-05-27 → 2022-08-26 (n_train=565,566, n_test=122,647)
  Fold 3: train 2021-03-01 → 2022-08-24 | test 2022-08-29 → 2022-11-25 (n_train=692,036, n_test=123,501)


In [None]:
import numpy as np
import pandas as pd
import itertools

from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import (
    root_mean_squared_error, mean_absolute_error, r2_score,
    accuracy_score, roc_auc_score, precision_recall_fscore_support
)

# ---- Config for limiting folds ----
MAX_FOLDS_TO_RUN = 3  # cap the number of folds we actually train/evaluate

# ---- Pre-allocate OOF containers ----
oof_reg_preds = np.full(len(model_df), np.nan, dtype=float)
oof_cls_proba = np.full(len(model_df), np.nan, dtype=float)
oof_cls_pred  = np.full(len(model_df), -1,   dtype=int)

fold_summaries = []

# ---- Build all possible splits first, then cap to MAX_FOLDS_TO_RUN ----
all_splits = list(
    walk_forward_splits(
        model_df, date_col=DATE_COL,
        min_train_days=MIN_TRAIN_DAYS,
        test_days=TEST_DAYS,
        horizon=H,
        embargo_days=EMBARGO_DAYS
    )
)
total_possible_folds = len(all_splits)
splits = all_splits[:MAX_FOLDS_TO_RUN]

print(f"Total possible folds: {total_possible_folds} | Running: {len(splits)} "
      f"(MIN_TRAIN_DAYS={MIN_TRAIN_DAYS}, TEST_DAYS={TEST_DAYS}, EMBARGO_DAYS={EMBARGO_DAYS}, H={H})")

# ---- Walk-forward training/eval ----
for fold_no, (tr_idx, te_idx) in enumerate(splits, start=1):
    X_tr = model_df.iloc[tr_idx][features].astype("float32")
    X_te = model_df.iloc[te_idx][features].astype("float32")

    # Train‑set medians only (no leakage)
    med = X_tr.median(numeric_only=True)
    X_tr = X_tr.fillna(med)
    X_te = X_te.fillna(med)

    y_reg_tr = model_df.iloc[tr_idx]["target_logret_H"].astype("float32").to_numpy()
    y_reg_te = model_df.iloc[te_idx]["target_logret_H"].astype("float32").to_numpy()

    y_cls_tr = model_df.iloc[tr_idx]["target_dir_H"].astype("int8").to_numpy()
    y_cls_te = model_df.iloc[te_idx]["target_dir_H"].astype("int8").to_numpy()

    # Models
    reg = XGBRegressor(**REG_PARAMS)
    reg.fit(X_tr, y_reg_tr)

    pos = int((y_cls_tr == 1).sum())
    neg = int((y_cls_tr == 0).sum())
    spw = (neg / max(pos, 1)) if pos > 0 else 1.0

    cls = XGBClassifier(**CLS_PARAMS, scale_pos_weight=spw)
    cls.fit(X_tr, y_cls_tr)

    # Predictions
    y_reg_hat = reg.predict(X_te)
    y_cls_pro = cls.predict_proba(X_te)[:, 1]
    y_cls_hat = (y_cls_pro >= 0.5).astype(int)

    # Store OOF
    oof_reg_preds[te_idx] = y_reg_hat
    oof_cls_proba[te_idx] = y_cls_pro
    oof_cls_pred[te_idx]  = y_cls_hat

    # Fold metrics (evaluated on RAW targets)
    rmse = root_mean_squared_error(y_reg_te, y_reg_hat)  # ✅ no FutureWarning
    mae  = mean_absolute_error(y_reg_te, y_reg_hat)
    r2   = r2_score(y_reg_te, y_reg_hat)
    corr = np.corrcoef(y_reg_te, y_reg_hat)[0, 1] if len(y_reg_te) > 1 else np.nan

    acc = accuracy_score(y_cls_te, y_cls_hat)
    auc = roc_auc_score(y_cls_te, y_cls_pro)
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_cls_te, y_cls_hat, average="binary", zero_division=0
    )

    print(
        f"Fold {fold_no:02d}  |  [REG] RMSE={rmse:.4f}  MAE={mae:.4f}  R^2={r2:.4f}  Corr={corr:.4f}  "
        f"||  [CLS] Acc={acc:.4f}  AUC={auc:.4f}  Prec={prec:.4f}  Rec={rec:.4f}  F1={f1:.4f}"
    )

    fold_summaries.append(dict(
        fold=fold_no, rmse=rmse, mae=mae, r2=r2, corr=corr,
        acc=acc, auc=auc, prec=prec, rec=rec, f1=f1
    ))

print(f"\nRan {len(splits)} folds (out of {total_possible_folds} possible).")

# ------- Aggregate OOF diagnostics (full backtest) -------
mask_reg = np.isfinite(oof_reg_preds) & model_df["target_logret_H"].notna().values
y_true_reg = model_df.loc[mask_reg, "target_logret_H"].to_numpy()
y_pred_reg = oof_reg_preds[mask_reg]

rmse = root_mean_squared_error(y_true_reg, y_pred_reg)  # ✅ new API
mae  = mean_absolute_error(y_true_reg, y_pred_reg)
r2   = r2_score(y_true_reg, y_pred_reg)
corr = np.corrcoef(y_true_reg, y_pred_reg)[0, 1] if len(y_true_reg) > 1 else np.nan

mask_cls = np.isfinite(oof_cls_proba) & model_df["target_dir_H"].notna().values
y_true_cls = model_df.loc[mask_cls, "target_dir_H"].to_numpy().astype(int)
y_prob_cls = oof_cls_proba[mask_cls]
y_pred_cls = (y_prob_cls >= 0.5).astype(int)

acc = accuracy_score(y_true_cls, y_pred_cls)
auc = roc_auc_score(y_true_cls, y_prob_cls)
prec, rec, f1, _ = precision_recall_fscore_support(y_true_cls, y_pred_cls, average="binary", zero_division=0)

print("\n==== OOF Summary (evaluated on RAW returns) ====")
print(f"[REG] H={H}d log-return  | RMSE={rmse:.4f}  MAE={mae:.4f}  R^2={r2:.4f}  Corr={corr:.4f}")
print(f"[CLS] H={H}d direction   | Acc={acc:.4f}  AUC={auc:.4f}  Prec={prec:.4f}  Rec={rec:.4f}  F1={f1:.4f}")

# Optional: confusion matrix for sign
cm = pd.crosstab(pd.Series(y_true_cls, name="Actual"),
                 pd.Series(y_pred_cls, name="Predicted"))
print("\nConfusion Matrix (OOF):")
print(cm)

Total possible folds: 3 | Running: 3 (MIN_TRAIN_DAYS=252, TEST_DAYS=63, EMBARGO_DAYS=1, H=5)


In [None]:
def predict_latest_per_symbol(latest_df: pd.DataFrame,
                              feature_cols: List[str],
                              reg_model: XGBRegressor,
                              cls_model: XGBClassifier,
                              train_medians: pd.Series) -> pd.DataFrame:
    last_idx = latest_df.groupby(SYMBOL_COL)[DATE_COL].idxmax()
    snap = latest_df.loc[last_idx, [SYMBOL_COL, DATE_COL] + feature_cols].copy()
    Xsnap = snap[feature_cols].astype("float32").fillna(train_medians)

    reg_hat = reg_model.predict(Xsnap)
    up_prob = cls_model.predict_proba(Xsnap)[:, 1]
    up_lab  = (up_prob >= 0.5).astype(int)

    out = snap[[SYMBOL_COL, DATE_COL]].copy()
    out[f"pred_logret_{H}d"]   = reg_hat
    out[f"pred_up_{H}d_prob"]  = up_prob
    out[f"pred_up_{H}d"]       = up_lab
    return out.sort_values(f"pred_logret_{H}d", ascending=False).reset_index(drop=True)

# Example usage with the last trained fold’s models:
# (If you want a production “latest” model, retrain on *all* data up to the latest date.)