In [1]:
# Load the features parquet file
import pandas as pd

features_df = pd.read_parquet('../artifacts/features_long.parquet')
print(f"✅ Loaded features: {features_df.shape}")
print(f"Columns: {list(features_df.columns)}")
print(f"Date range: {features_df['date'].min()} to {features_df['date'].max()}")
print(f"Unique symbols: {features_df['symbol'].nunique()}")
print(f"Sample data:")
print(features_df.tail())

✅ Loaded features: (2904927, 147)
Columns: ['low', 'volume', 'close', 'high', 'adjclose', 'open', 'ret', 'ma_10', 'pct_slope_ma_10', 'sign_ma_10', 'ma_20', 'pct_slope_ma_20', 'sign_ma_20', 'ma_30', 'pct_slope_ma_30', 'sign_ma_30', 'ma_50', 'pct_slope_ma_50', 'sign_ma_50', 'ma_75', 'pct_slope_ma_75', 'sign_ma_75', 'ma_100', 'pct_slope_ma_100', 'sign_ma_100', 'ma_150', 'pct_slope_ma_150', 'sign_ma_150', 'ma_200', 'pct_slope_ma_200', 'sign_ma_200', 'trend_score_granular', 'trend_score_sign', 'trend_score_slope', 'trend_persist_ema', 'trend_alignment', 'rv_10', 'rv_20', 'rv_60', 'rv_100', 'rv_ratio_10_60', 'rv_ratio_20_100', 'vol_regime', 'vol_regime_ema10', 'rv_z_60', 'vol_of_vol_20d', 'rv60_slope_norm', 'rv100_slope_norm', 'quiet_trend', 'hurst_ret_64', 'hurst_ret_128', 'hurst_ret_64_emaHL5', 'pct_dist_ma_20', 'pct_dist_ma_20_z', 'pct_dist_ma_50', 'pct_dist_ma_50_z', 'pct_dist_ma_100', 'pct_dist_ma_100_z', 'pct_dist_ma_200', 'pct_dist_ma_200_z', 'min_pct_dist_ma', 'relative_dist_20_50', 

In [2]:
import numpy as np
import pandas as pd

# Only numeric columns for inf check
numeric_cols = features_df.select_dtypes(include=[np.number]).columns

nan_counts = features_df.isna().sum()
inf_counts = pd.Series(0, index=features_df.columns)

inf_counts[numeric_cols] = np.isinf(features_df[numeric_cols].to_numpy()).sum(axis=0)

nan_inf_summary = pd.DataFrame({
    "NaN_count": nan_counts,
    "Inf_count": inf_counts,
    "Total_rows": len(features_df)
})

nan_inf_summary["NaN_pct"] = nan_inf_summary["NaN_count"] / nan_inf_summary["Total_rows"] * 100
nan_inf_summary["Inf_pct"] = nan_inf_summary["Inf_count"] / nan_inf_summary["Total_rows"] * 100

nan_inf_summary.sort_values(["NaN_count", "Inf_count"], ascending=False)
# Filter to top offenders by max percentage of NaNs/Infs
top_nan_inf = (
    nan_inf_summary
    .assign(Max_pct=lambda df: df[["NaN_pct", "Inf_pct"]].max(axis=1))
    .sort_values("Max_pct", ascending=False)
)

# Show top 20 offenders
display(top_nan_inf.head(20))

# Only numeric columns for inf check
numeric_cols = features_df.select_dtypes(include=[np.number]).columns

nan_counts = features_df.isna().sum()
inf_counts = pd.Series(0, index=features_df.columns)

inf_counts[numeric_cols] = np.isinf(features_df[numeric_cols].to_numpy()).sum(axis=0)

nan_inf_summary = pd.DataFrame({
    "NaN_count": nan_counts,
    "Inf_count": inf_counts,
    "Total_rows": len(features_df)
})

nan_inf_summary["NaN_pct"] = nan_inf_summary["NaN_count"] / nan_inf_summary["Total_rows"] * 100
nan_inf_summary["Inf_pct"] = nan_inf_summary["Inf_count"] / nan_inf_summary["Total_rows"] * 100

# Filter to columns with any inf values
inf_only = nan_inf_summary.query("Inf_count > 0")

print(f"Columns with Infs ({len(inf_only)} found):")
display(inf_only.sort_values("Inf_count", ascending=False))

Unnamed: 0,NaN_count,Inf_count,Total_rows,NaN_pct,Inf_pct,Max_pct
alpha_mom_combo_120_ema10,625110,0,2904927,21.518957,0.0,21.518957
alpha_mom_sector_120_ema10,625110,0,2904927,21.518957,0.0,21.518957
alpha_mom_sector_60_ema10,582965,0,2904927,20.068146,0.0,20.068146
alpha_mom_combo_60_ema10,582965,0,2904927,20.068146,0.0,20.068146
alpha_mom_combo_20_ema10,553425,0,2904927,19.051253,0.0,19.051253
alpha_mom_sector_20_ema10,553425,0,2904927,19.051253,0.0,19.051253
alpha_resid_sector,544993,0,2904927,18.760988,0.0,18.760988
alpha_mom_sector_ema10,542875,0,2904927,18.688077,0.0,18.688077
alpha_mom_combo_ema10,542875,0,2904927,18.688077,0.0,18.688077
pct_slope_ma_200,470991,0,2904927,16.213523,0.0,16.213523


Columns with Infs (4 found):


Unnamed: 0,NaN_count,Inf_count,Total_rows,NaN_pct,Inf_pct
range_expansion_5d,205421,462,2904927,7.071469,0.015904
range_x_rvol20,217948,240,2904927,7.502701,0.008262
range_expansion_10d,206814,118,2904927,7.119422,0.004062
range_expansion_20d,213236,39,2904927,7.340494,0.001343


In [23]:
from typing import Iterator, Tuple
import numpy as np
import pandas as pd

# ==== CONFIG ====
DATE_COL = "date"              # adjust to match features_df
MIN_TRAIN_DAYS = 252            # ~1 year of training
TEST_DAYS = 21                  # ~1 month test per fold
H = 5                           # horizon in days
EMBARGO_DAYS = 1
EXPANDING = True
MAX_FOLDS = 5
RECENT_ONLY_CUTOFF_DAYS = 365   # only use folds with test_start in last year

# Model hyperparameters
# Shared stability-focused GBM hyperparameters
GBM_COMMON_PARAMS = {
    "n_estimators": 500,         # More trees for stability
    "max_depth": 4,              # Keep shallow to avoid overfitting
    "learning_rate": 0.02,       # Lower LR for smoother convergence
    "subsample": 0.8,            # Row subsampling for robustness
    "colsample_bytree": 0.8,     # Feature subsampling
    "min_child_weight": 5,       # Avoid splits on very small leaf sizes
    "reg_lambda": 1.0,           # L2 regularization
    "reg_alpha": 0.1,            # L1 regularization
    "gamma": 0.1,                # Minimum loss reduction for split
    "random_state": 42           # Reproducibility
}

# Model-specific parameters
REG_PARAMS = {
    **GBM_COMMON_PARAMS,
    "objective": "reg:squarederror"
}

CLS_PARAMS = {
    **GBM_COMMON_PARAMS,
    "objective": "binary:logistic",  # Or 'multi:softprob' for multiclass
    "eval_metric": "logloss"         # Good for classification stability
}
# --- Symbol / Price / Winsorization config ---
SYMBOL_COL = "symbol"      # column in features_df with ticker or asset ID
PRICE_COL  = "adjclose"    # column to use for log-return calculations
PCT_WINSOR = (0.01, 0.01)  # winsorize limits (lower, upper) for returns


from hyperopt import hp

# Regression space
REG_SPACE = {
    "n_estimators": hp.quniform("n_estimators", 200, 800, 50),  # fewer than default to save time
    "max_depth": hp.quniform("max_depth", 3, 8, 1),
    "learning_rate": hp.loguniform("learning_rate", -3.0, -1.2),  # ~0.05 to 0.3
    "subsample": hp.uniform("subsample", 0.6, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.6, 1.0),
    "min_child_weight": hp.quniform("min_child_weight", 1, 10, 1),
    "reg_lambda": hp.loguniform("reg_lambda", -2.3, 1.0),  # ~0.1 to 2.7
}

# Classification space
CLS_SPACE = {
    "n_estimators": hp.quniform("cls_n_estimators", 200, 800, 50),
    "max_depth": hp.quniform("cls_max_depth", 3, 8, 1),
    "learning_rate": hp.loguniform("cls_learning_rate", -3.0, -1.2),
    "subsample": hp.uniform("cls_subsample", 0.6, 1.0),
    "colsample_bytree": hp.uniform("cls_colsample_bytree", 0.6, 1.0),
    "min_child_weight": hp.quniform("cls_min_child_weight", 1, 10, 1),
    "reg_lambda": hp.loguniform("cls_reg_lambda", -2.3, 1.0),
}

# ---------- Controls ----------

EVAL_FRAC          = 0.2                  # last 20% of each TRAIN fold used as validation
EARLY_STOP_ROUNDS  = 50
FIXED_RANDOM_STATE = 42

In [4]:
from scipy.stats.mstats import winsorize
from typing import Iterator, Tuple
import numpy as np
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import r2_score, mean_squared_error, \
            mean_absolute_error, accuracy_score, precision_recall_fscore_support,roc_auc_score

def winsorize_series(s: pd.Series, limits=(0.01, 0.01)) -> pd.Series:
    mask = s.notna()
    if not mask.any():
        return s.copy()
    arr_w = winsorize(s[mask].astype(float).to_numpy(), limits=limits)
    out = pd.Series(np.nan, index=s.index)
    out.loc[mask] = arr_w
    return out
def walk_forward_splits(
    df: pd.DataFrame,
    date_col: str,
    min_train_days: int,
    test_days: int,
    horizon: int,
    embargo_days: int = 1,
    max_folds: int = 10_000,
    expanding: bool = True,
) -> Iterator[Tuple[np.ndarray, np.ndarray]]:
    """
    Time-based folds with purge (horizon-1) + embargo.
    Splits are formed on UNIQUE DATES across the whole panel.
    If expanding=True, the training window grows each fold.
    """
    dates = pd.to_datetime(df[date_col]).sort_values().unique()
    n_dates = len(dates)
    k = 0
    start = 0

    while True:
        train_end = start + min_train_days
        test_end = train_end + test_days
        if test_end > n_dates or (k >= max_folds):
            break

        # Expanding vs fixed train window
        if expanding:
            train_dates = dates[:train_end]  # 0 .. train_end-1
        else:
            train_dates = dates[start:train_end]

        test_dates = dates[train_end:test_end]
        if len(test_dates) == 0:
            break

        test_start_date = test_dates[0]
        test_end_date = test_dates[-1]

        # Purge + embargo
        purge_start = test_start_date - np.timedelta64(horizon - 1, "D")
        purge_end = test_end_date
        embargo_end = test_end_date + np.timedelta64(embargo_days, "D")

        d = pd.to_datetime(df[date_col]).values
        in_base_train = (d <= train_dates[-1]) if expanding else ((d >= train_dates[0]) & (d <= train_dates[-1]))
        in_purge = (d >= purge_start) & (d <= purge_end)
        in_embargo = (d > test_end_date) & (d <= embargo_end)
        in_test = (d >= test_start_date) & (d <= test_end_date)

        train_mask = in_base_train & (~in_purge) & (~in_embargo) & (~in_test)
        test_mask = in_test

        tr_idx = np.flatnonzero(train_mask)
        te_idx = np.flatnonzero(test_mask)

        if len(tr_idx) and len(te_idx):
            yield tr_idx, te_idx
            k += 1

        # advance window by one test block
        start += test_days


In [5]:
import numpy as np
import pandas as pd

# --- sanity checks ---
req = ["features_df", "winsorize_series", "SYMBOL_COL", "DATE_COL", "PRICE_COL", "H", "PCT_WINSOR"]
missing = [r for r in req if r not in globals()]
if missing:
    raise NameError(f"Missing required names: {missing}")

# --- copy & sort ---
features_df = features_df.copy()
features_df[DATE_COL] = pd.to_datetime(features_df[DATE_COL])
features_df = features_df.sort_values([SYMBOL_COL, DATE_COL])

# --- raw daily log returns (for targets & eval) ---
if PRICE_COL not in features_df.columns:
    raise ValueError(f"Missing '{PRICE_COL}' in features_df.")
features_df["ret_raw"] = (
    features_df
      .groupby(SYMBOL_COL, group_keys=False)[PRICE_COL]
      .apply(lambda s: np.log(s.astype(float)).diff())
)

# --- winsorized returns (for model inputs only; NOT for targets) ---
features_df["ret_wins"] = (
    features_df
      .groupby(SYMBOL_COL, group_keys=False)["ret_raw"]
      .apply(lambda s: winsorize_series(s, limits=PCT_WINSOR))
)

# --- H‑day targets on RAW returns (leakage‑safe) ---
# H‑day log return ≈ sum of next H daily log returns = log(P_{t+H}/P_t)
features_df["target_logret_H"] = (
    features_df
      .groupby(SYMBOL_COL)["ret_raw"]
      .transform(lambda r: r.shift(-H).rolling(H, min_periods=H).sum())
)
features_df["target_dir_H"] = (features_df["target_logret_H"] > 0).astype("int8")

# --- feature list (drop IDs, targets, raw OHLC; keep adjclose-derived) ---
exclude_cols = {SYMBOL_COL, DATE_COL, "target_logret_H", "target_dir_H", "ret_raw"}
exclude_cols |= {"open", "high", "low", "close"}  # keep adjclose + derived features
features = [c for c in features_df.columns
            if (c not in exclude_cols) and (not str(c).lower().startswith("target"))]

# --- clean known ±inf offenders (set to NaN; impute inside each fold) ---
for col in ("range_expansion_5d", "range_expansion_10d", "range_x_rvol20", 'range_expansion_20d'):
    if col in features_df.columns:
        v = pd.to_numeric(features_df[col], errors="coerce")
        features_df.loc[~np.isfinite(v), col] =0

# --- final modeling frame: drop rows with missing target only ---
model_df = features_df.dropna(subset=["target_logret_H"]).copy()

print(
    f"✅ Targets built | H={H} | rows(all)={len(features_df):,} → model_df={len(model_df):,} | features={len(features)}"
)
print(
    f"Date span: {features_df[DATE_COL].min().date()} → {features_df[DATE_COL].max().date()} | "
    f"Symbols: {features_df[SYMBOL_COL].nunique()}"
)

✅ Targets built | H=5 | rows(all)=2,904,927 → model_df=2,685,476 | features=142
Date span: 2020-08-07 → 2025-08-08 | Symbols: 2311


In [15]:
# ==== BUILD & FILTER FOLDS ====
# Align fold base to the training base (rows with valid targets only)
base_df = features_df.dropna(subset=["target_logret_H"]).copy()

all_folds = list(
    walk_forward_splits(
        base_df,                     # <-- was features_df
        date_col=DATE_COL,
        min_train_days=MIN_TRAIN_DAYS,
        test_days=TEST_DAYS,
        horizon=H,
        embargo_days=EMBARGO_DAYS,
        max_folds=10_000,
        expanding=EXPANDING,
    )
)

meta = []
for tr_idx, te_idx in all_folds:
    tr_dates = base_df.iloc[tr_idx][DATE_COL]
    te_dates = base_df.iloc[te_idx][DATE_COL]
    meta.append(dict(
        tr_idx=tr_idx, te_idx=te_idx,
        train_start=tr_dates.min(), train_end=tr_dates.max(),
        test_start=te_dates.min(), test_end=te_dates.max(),
        n_train=len(tr_idx), n_test=len(te_idx),
    ))

print(f"✅ Total folds available: {len(meta)}")

cutoff_date = base_df[DATE_COL].max() - pd.Timedelta(days=RECENT_ONLY_CUTOFF_DAYS)
recent_meta = [m for m in meta if m['test_start'] >= cutoff_date]
if not recent_meta:
    recent_meta = meta[-MAX_FOLDS:]
else:
    recent_meta = sorted(recent_meta, key=lambda m: m['test_start'])[-MAX_FOLDS:]

print(f"✅ Using {len(recent_meta)} folds with test_start ≥ {cutoff_date.date()} (expanding={EXPANDING})")
for i, f in enumerate(recent_meta, 1):
    print(f"  Fold {i}: train {f['train_start'].date()} → {f['train_end'].date()} | "
          f"test {f['test_start'].date()} → {f['test_end'].date()} "
          f"(n_train={f['n_train']:,}, n_test={f['n_test']:,})")

folds = [(f['tr_idx'], f['te_idx']) for f in recent_meta]

✅ Total folds available: 47
✅ Using 3 folds with test_start ≥ 2024-08-01 (expanding=True)
  Fold 1: train 2020-08-13 → 2025-04-16 | test 2025-04-21 → 2025-05-19 (n_train=2,520,036, n_test=48,037)
  Fold 2: train 2020-08-13 → 2025-05-15 | test 2025-05-20 → 2025-06-18 (n_train=2,565,780, n_test=48,196)
  Fold 3: train 2020-08-13 → 2025-06-13 | test 2025-06-20 → 2025-07-21 (n_train=2,611,651, n_test=48,426)


In [16]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import (
    root_mean_squared_error, mean_absolute_error, r2_score,
    accuracy_score, roc_auc_score, precision_recall_fscore_support
)

# ---- How many folds to actually train/eval ----
MAX_FOLDS_TO_RUN = 3

# Same base as fold-building
model_df = features_df.dropna(subset=["target_logret_H"]).copy()
model_df[DATE_COL] = pd.to_datetime(model_df[DATE_COL])  # dtype only; no reordering

# ---- Reuse the prebuilt `folds` ----
assert 'folds' in globals(), "Expected `folds` from the fold-building cell."
n_rows = len(model_df)

# Defensive: keep only splits whose indices fit current model_df
safe_splits = []
for tr_idx, te_idx in folds:
    if len(tr_idx) and len(te_idx) and tr_idx.max() < n_rows and te_idx.max() < n_rows:
        safe_splits.append((tr_idx, te_idx))

if not safe_splits:
    raise RuntimeError("No valid folds after alignment. Make sure folds were built on the same frame/order.")

splits = safe_splits[:MAX_FOLDS_TO_RUN]
print(f"Total possible folds: {len(safe_splits)} | Running: {len(splits)} "
      f"(MIN_TRAIN_DAYS={MIN_TRAIN_DAYS}, TEST_DAYS={TEST_DAYS}, EMBARGO_DAYS={EMBARGO_DAYS}, H={H})")

# ---- Pre-allocate OOF arrays ----
oof_reg_preds = np.full(n_rows, np.nan, dtype=float)
oof_cls_proba = np.full(n_rows, np.nan, dtype=float)
oof_cls_pred  = np.full(n_rows, -1,   dtype=int)

fold_summaries = []

# ---- Train / evaluate ----
for fold_no, (tr_idx, te_idx) in enumerate(splits, start=1):
    tr_dates = (model_df.iloc[tr_idx][DATE_COL].min(), model_df.iloc[tr_idx][DATE_COL].max())
    te_dates = (model_df.iloc[te_idx][DATE_COL].min(), model_df.iloc[te_idx][DATE_COL].max())

    X_tr = model_df.iloc[tr_idx][features].astype("float32").replace([np.inf, -np.inf], np.nan)
    X_te = model_df.iloc[te_idx][features].astype("float32").replace([np.inf, -np.inf], np.nan)

    med = X_tr.median(numeric_only=True)
    X_tr = X_tr.fillna(med)
    X_te = X_te.fillna(med)

    y_reg_tr = model_df.iloc[tr_idx]["target_logret_H"].astype("float32").to_numpy()
    y_reg_te = model_df.iloc[te_idx]["target_logret_H"].astype("float32").to_numpy()
    y_cls_tr = model_df.iloc[tr_idx]["target_dir_H"].astype("int8").to_numpy()
    y_cls_te = model_df.iloc[te_idx]["target_dir_H"].astype("int8").to_numpy()

    reg = XGBRegressor(**REG_PARAMS).fit(X_tr, y_reg_tr)

    pos = int((y_cls_tr == 1).sum()); neg = int((y_cls_tr == 0).sum())
    spw = (neg / max(pos, 1)) if pos > 0 else 1.0
    cls = XGBClassifier(**CLS_PARAMS, scale_pos_weight=spw).fit(X_tr, y_cls_tr)

    y_reg_hat = reg.predict(X_te)
    y_cls_pro = cls.predict_proba(X_te)[:, 1]
    y_cls_hat = (y_cls_pro >= 0.5).astype(int)

    oof_reg_preds[te_idx] = y_reg_hat
    oof_cls_proba[te_idx] = y_cls_pro
    oof_cls_pred[te_idx]  = y_cls_hat

    rmse = root_mean_squared_error(y_reg_te, y_reg_hat)
    mae  = mean_absolute_error(y_reg_te, y_reg_hat)
    r2   = r2_score(y_reg_te, y_reg_hat)
    corr = np.corrcoef(y_reg_te, y_reg_hat)[0, 1] if len(y_reg_te) > 1 else np.nan

    acc = accuracy_score(y_cls_te, y_cls_hat)
    auc = roc_auc_score(y_cls_te, y_cls_pro) if np.unique(y_cls_te).size == 2 else np.nan
    prec, rec, f1, _ = precision_recall_fscore_support(y_cls_te, y_cls_hat, average="binary", zero_division=0)

    print(
        f"Fold {fold_no:02d} "
        f"[{tr_dates[0].date()}→{tr_dates[1].date()} | {te_dates[0].date()}→{te_dates[1].date()}]  "
        f"|  [REG] RMSE={rmse:.4f}  MAE={mae:.4f}  R^2={r2:.4f}  Corr={corr:.4f}  "
        f"||  [CLS] Acc={acc:.4f}  AUC={auc:.4f}  Prec={prec:.4f}  Rec={rec:.4f}  F1={f1:.4f}"
    )

    fold_summaries.append(dict(fold=fold_no, rmse=rmse, mae=mae, r2=r2, corr=corr,
                               acc=acc, auc=auc, prec=prec, rec=rec, f1=f1))

print(f"\nRan {len(splits)} folds (out of {len(safe_splits)} possible).")

# ---- OOF summary ----
mask_reg = np.isfinite(oof_reg_preds) & model_df["target_logret_H"].notna().values
y_true_reg = model_df.loc[mask_reg, "target_logret_H"].to_numpy()
y_pred_reg = oof_reg_preds[mask_reg]
rmse = root_mean_squared_error(y_true_reg, y_pred_reg)
mae  = mean_absolute_error(y_true_reg, y_pred_reg)
r2   = r2_score(y_true_reg, y_pred_reg)
corr = np.corrcoef(y_true_reg, y_pred_reg)[0, 1] if len(y_true_reg) > 1 else np.nan

mask_cls = np.isfinite(oof_cls_proba) & model_df["target_dir_H"].notna().values
y_true_cls = model_df.loc[mask_cls, "target_dir_H"].to_numpy().astype(int)
y_prob_cls = oof_cls_proba[mask_cls]
y_pred_cls = (y_prob_cls >= 0.5).astype(int)
acc = accuracy_score(y_true_cls, y_pred_cls)
auc = roc_auc_score(y_true_cls, y_prob_cls) if np.unique(y_true_cls).size == 2 else np.nan
prec, rec, f1, _ = precision_recall_fscore_support(y_true_cls, y_pred_cls, average="binary", zero_division=0)

print("\n==== OOF Summary (evaluated on RAW returns) ====")
print(f"[REG] H={H}d log-return  | RMSE={rmse:.4f}  MAE={mae:.4f}  R^2={r2:.4f}  Corr={corr:.4f}")
print(f"[CLS] H={H}d direction   | Acc={acc:.4f}  AUC={auc:.4f}  Prec={prec:.4f}  Rec={rec:.4f}  F1={f1:.4f}")

cm = pd.crosstab(pd.Series(y_true_cls, name="Actual"),
                 pd.Series(y_pred_cls, name="Predicted"))
print("\nConfusion Matrix (OOF):")
print(cm)

Total possible folds: 3 | Running: 3 (MIN_TRAIN_DAYS=252, TEST_DAYS=21, EMBARGO_DAYS=1, H=5)
Fold 01 [2020-08-13→2025-04-16 | 2025-04-21→2025-05-19]  |  [REG] RMSE=0.0906  MAE=0.0604  R^2=-0.1235  Corr=0.1813  ||  [CLS] Acc=0.4781  AUC=0.5926  Prec=0.8031  Rec=0.2325  F1=0.3606
Fold 02 [2020-08-13→2025-05-15 | 2025-05-20→2025-06-18]  |  [REG] RMSE=0.0859  MAE=0.0511  R^2=-0.1136  Corr=0.1143  ||  [CLS] Acc=0.4415  AUC=0.4247  Prec=0.0000  Rec=0.0000  F1=0.0000


KeyboardInterrupt: 

In [None]:
def predict_latest_per_symbol(latest_df: pd.DataFrame,
                              feature_cols: List[str],
                              reg_model: XGBRegressor,
                              cls_model: XGBClassifier,
                              train_medians: pd.Series) -> pd.DataFrame:
    last_idx = latest_df.groupby(SYMBOL_COL)[DATE_COL].idxmax()
    snap = latest_df.loc[last_idx, [SYMBOL_COL, DATE_COL] + feature_cols].copy()
    Xsnap = snap[feature_cols].astype("float32").fillna(train_medians)

    reg_hat = reg_model.predict(Xsnap)
    up_prob = cls_model.predict_proba(Xsnap)[:, 1]
    up_lab  = (up_prob >= 0.5).astype(int)

    out = snap[[SYMBOL_COL, DATE_COL]].copy()
    out[f"pred_logret_{H}d"]   = reg_hat
    out[f"pred_up_{H}d_prob"]  = up_prob
    out[f"pred_up_{H}d"]       = up_lab
    return out.sort_values(f"pred_logret_{H}d", ascending=False).reset_index(drop=True)

# Example usage with the last trained fold’s models:
# (If you want a production “latest” model, retrain on *all* data up to the latest date.)

In [None]:
features_df.columns

In [None]:
# === Hyperopt wrapper for XGB (classification OR regression) ===
import numpy as np
import pandas as pd
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll import scope
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import (
    root_mean_squared_error, mean_absolute_error, r2_score,
    accuracy_score, roc_auc_score, precision_recall_fscore_support
)

# ---- Reuse your existing base (no reordering!) ----
assert 'folds' in globals(), "Expected `folds` from the fold-building cell."
assert 'model_df' in globals(), "Expected `model_df` to be defined."
assert 'features' in globals(), "Expected `features` to be defined."
n_rows = len(model_df)

# ---- A small, safe subset of folds to keep tuning fast ----
MAX_FOLDS_TO_RUN = min(3, len(folds))  # tweak if you want more signal
splits = folds[:MAX_FOLDS_TO_RUN]

# ---------- CV runner (single evaluation of a param set) ----------
def run_cv_once(params, verbose=False):
    """
    Returns: dict with aggregate metrics over folds.
    - We always compute both reg + cls metrics so you can optimize either.
    - Early stopping uses a time-based split: last EVAL_FRAC of *train fold* as validation.
    """
    reg_metrics, cls_metrics = [], []

    # Cast a few here to avoid surprises
    n_estimators      = int(params["n_estimators"])
    max_depth         = int(params["max_depth"])
    min_child_weight  = float(params["min_child_weight"])
    learning_rate     = float(params["learning_rate"])
    subsample         = float(params["subsample"])
    colsample_bytree  = float(params["colsample_bytree"])
    reg_lambda        = float(params["reg_lambda"])
    gamma             = float(params["gamma"])
    early_stopping    = int(params.get("early_stopping_rounds", EARLY_STOP_ROUNDS))
    rng_state         = int(params.get("random_state", FIXED_RANDOM_STATE))

    for (tr_idx, te_idx) in splits:
        # --- time-based val split inside train fold ---
        tr_idx_sorted = np.sort(tr_idx)
        n_tr = len(tr_idx_sorted)
        n_val = max(1, int(n_tr * EVAL_FRAC))
        val_idx = tr_idx_sorted[-n_val:]
        tr_core_idx = tr_idx_sorted[:-n_val] if n_tr > n_val else tr_idx_sorted

        # --- X: sanitize inf -> NaN, then impute with train-core medians only ---
        X_tr_core = model_df.iloc[tr_core_idx][features].astype("float32").replace([np.inf, -np.inf], np.nan)
        X_val     = model_df.iloc[val_idx][features].astype("float32").replace([np.inf, -np.inf], np.nan)
        X_te      = model_df.iloc[te_idx][features].astype("float32").replace([np.inf, -np.inf], np.nan)

        med = X_tr_core.median(numeric_only=True)
        X_tr_core = X_tr_core.fillna(med)
        X_val     = X_val.fillna(med)
        X_te      = X_te.fillna(med)

        # --- y ---
        y_reg_tr_core = model_df.iloc[tr_core_idx]["target_logret_H"].astype("float32").to_numpy()
        y_reg_val     = model_df.iloc[val_idx]["target_logret_H"].astype("float32").to_numpy()
        y_reg_te      = model_df.iloc[te_idx]["target_logret_H"].astype("float32").to_numpy()

        y_cls_tr_core = model_df.iloc[tr_core_idx]["target_dir_H"].astype("int8").to_numpy()
        y_cls_val     = model_df.iloc[val_idx]["target_dir_H"].astype("int8").to_numpy()
        y_cls_te      = model_df.iloc[te_idx]["target_dir_H"].astype("int8").to_numpy()

        # --- models (no reg_alpha; tree_method='hist') ---
        common = dict(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            min_child_weight=min_child_weight,
            reg_lambda=reg_lambda,
            gamma=gamma,
            tree_method="hist",
            random_state=rng_state,
            n_jobs=-1,
        )

        reg = XGBRegressor(objective="reg:squarederror",eval_metric="rmse", **common)
        reg.fit(
            X_tr_core, y_reg_tr_core,
            eval_set=[(X_val, y_reg_val)],
            verbose=False,
        )

        pos = int((y_cls_tr_core == 1).sum()); neg = int((y_cls_tr_core == 0).sum())
        spw = (neg / max(pos, 1)) if pos > 0 else 1.0
        cls = XGBClassifier(eval_metric="logloss", scale_pos_weight=spw, **common)
        cls.fit(
            X_tr_core, y_cls_tr_core,
            eval_set=[(X_val, y_cls_val)],
            verbose=False,
        )

        # --- predictions & metrics ---
        y_reg_hat = reg.predict(X_te)
        reg_metrics.append({
            "rmse": root_mean_squared_error(y_reg_te, y_reg_hat),
            "mae":  mean_absolute_error(y_reg_te, y_reg_hat),
        })

        y_cls_pro = cls.predict_proba(X_te)[:, 1]
        y_cls_hat = (y_cls_pro >= 0.5).astype(int)
        auc = roc_auc_score(y_cls_te, y_cls_pro) if np.unique(y_cls_te).size == 2 else np.nan
        cls_metrics.append({
            "acc": accuracy_score(y_cls_te, y_cls_hat),
            "auc": auc,
        })

    # Aggregate
    mean_rmse = float(np.mean([m["rmse"] for m in reg_metrics]))
    mean_mae  = float(np.mean([m["mae"]  for m in reg_metrics]))
    auc_vals  = [m["auc"] for m in cls_metrics if np.isfinite(m["auc"])]
    mean_auc  = float(np.mean(auc_vals)) if auc_vals else np.nan
    mean_acc  = float(np.mean([m["acc"] for m in cls_metrics]))

    if verbose:
        print(f"[CV] rmse={mean_rmse:.5f}  mae={mean_mae:.5f}  auc={mean_auc:.5f}  acc={mean_acc:.5f}")

    return dict(mean_rmse=mean_rmse, mean_mae=mean_mae, mean_auc=mean_auc, mean_acc=mean_acc)

# ---------- Hyperopt objective ----------
def hyperopt_objective(params, model_type="cls"):
    metrics = run_cv_once(params, verbose=False)
    if model_type == "cls":
        # Maximize AUC -> minimize 1 - AUC; fallback to 1 - Acc if AUC NaN
        loss = (1.0 - metrics["mean_auc"]) if np.isfinite(metrics["mean_auc"]) else (1.0 - metrics["mean_acc"])
    elif model_type == "reg":
        # Minimize RMSE
        loss = metrics["mean_rmse"]
    else:
        # Joint objective (optional): rmse + (1 - auc)
        loss = metrics["mean_rmse"] + (1.0 - metrics["mean_auc"] if np.isfinite(metrics["mean_auc"]) else 0.5*(1.0 - metrics["mean_acc"]))
    return {"loss": float(loss), "status": STATUS_OK, "metrics": metrics}

# ---------- Search spaces (no reg_alpha) ----------
space_common = {
    "n_estimators":     scope.int(hp.quniform("n_estimators", 300, 1200, 50)),
    "max_depth":        scope.int(hp.quniform("max_depth", 4, 10, 1)),
    "learning_rate":    hp.loguniform("learning_rate", np.log(0.01), np.log(0.20)),
    "subsample":        hp.uniform("subsample", 0.5, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
    "min_child_weight": hp.loguniform("min_child_weight", np.log(1e-1), np.log(10.0)),
    "reg_lambda":       hp.loguniform("reg_lambda", np.log(1e-2), np.log(10.0)),
    "gamma":            hp.loguniform("gamma", np.log(1e-8), np.log(1.0)),
    "early_stopping_rounds": scope.int(hp.quniform("early_stopping_rounds", 30, 100, 10)),
    "random_state":     42,   # keep fixed for repeatability
}

# ---------- Convenience runners ----------
def tune_classifier(max_evals=25):
    trials = Trials()
    best = fmin(
        fn=lambda p: hyperopt_objective(p, model_type="cls"),
        space=space_common,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=trials,
        rstate=np.random.default_rng(123),   # ✅ avoids RandomState.integers error
        show_progressbar=True,
    )
    # Cast ints cleanly and attach fixed bits
    best = {
        "n_estimators": int(best["n_estimators"]),
        "max_depth": int(best["max_depth"]),
        "learning_rate": float(best["learning_rate"]),
        "subsample": float(best["subsample"]),
        "colsample_bytree": float(best["colsample_bytree"]),
        "min_child_weight": float(best["min_child_weight"]),
        "reg_lambda": float(best["reg_lambda"]),
        "gamma": float(best["gamma"]),
        "early_stopping_rounds": int(best["early_stopping_rounds"]),
        "tree_method": "hist",
        "n_jobs": -1,
        "random_state": 42,
        "eval_metric": "auc",
    }
    return best, trials

def tune_regressor(max_evals=25):
    trials = Trials()
    best = fmin(
        fn=lambda p: hyperopt_objective(p, model_type="reg"),
        space=space_common,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=trials,
        rstate=np.random.default_rng(321),   # ✅
        show_progressbar=True,
    )
    best = {
        "n_estimators": int(best["n_estimators"]),
        "max_depth": int(best["max_depth"]),
        "learning_rate": float(best["learning_rate"]),
        "subsample": float(best["subsample"]),
        "colsample_bytree": float(best["colsample_bytree"]),
        "min_child_weight": float(best["min_child_weight"]),
        "reg_lambda": float(best["reg_lambda"]),
        "gamma": float(best["gamma"]),
        "early_stopping_rounds": int(best["early_stopping_rounds"]),
        "tree_method": "hist",
        "n_jobs": -1,
        "random_state": 42,
        "objective": "reg:squarederror",
    }
    return best, trials

print("✔ Hyperopt is ready. Call:")
print("  best_cls, cls_trials = tune_classifier(max_evals=25)")
print("  best_reg, reg_trials = tune_regressor(max_evals=25)")
#---- Example usage (pick ONE to run) ----
best_cls_params, cls_trials = tune_classifier(max_evals=25)
print("Best CLS params:", best_cls_params)
# best_reg_params, reg_trials = tune_regressor(max_evals=25)
# print("Best REG params:", best_reg_params)

✔ Hyperopt is ready. Call:
  best_cls, cls_trials = tune_classifier(max_evals=25)
  best_reg, reg_trials = tune_regressor(max_evals=25)
  8%|██████████▏                                                                                                                    | 2/25 [1:28:14<18:45:17, 2935.56s/trial, best loss: 0.4757726980405964]

'2.1.1'