In [None]:
# --- NB9: imports & run config ---
import json, re
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Repro + IO
RNG = 42
np.random.seed(RNG)
DATA_DIR = Path("data")
ART_DIR  = Path("artifacts")
FIG_DIR  = Path("reports/figures")
for p in [DATA_DIR, ART_DIR, FIG_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# Costs & threshold (τ from artifacts/threshold.json if available)
FEE_BPS = 5.0
SLIPPAGE_BPS = 0.0
TAU = 0.59  # default, used when no usable threshold file is found


def load_tau(path, default):
    """Read the decision threshold from a JSON file, falling back to ``default``.

    Accepted layouts, checked in this order:
    ``{"threshold": x}``, ``{"LR": {"tau": x}}``, ``{"tau": x}``.

    Parameters
    ----------
    path : pathlib.Path
        Location of the threshold JSON file.
    default : float
        Value returned when the file is missing, unreadable, has no recognised
        key, or holds a value that is not a finite number.

    Returns
    -------
    float
        The stored threshold, or ``default``.

    Every fallback (other than a missing file) prints a warning so that a
    broken artifact is visible instead of being silently ignored.
    """
    if not path.exists():
        return default
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:  # JSON/Unicode decode errors are ValueErrors
        print(f"WARNING: could not read {path} ({exc!r}); using default threshold {default}")
        return default
    if not isinstance(payload, dict):
        print(f"WARNING: {path} does not contain a JSON object; using default threshold {default}")
        return default

    lr_section = payload.get("LR")
    if "threshold" in payload:
        raw = payload["threshold"]
    elif isinstance(lr_section, dict) and "tau" in lr_section:
        # Only descend into "LR" when it really is a mapping; otherwise keep looking.
        raw = lr_section["tau"]
    elif "tau" in payload:
        raw = payload["tau"]
    else:
        print(f"WARNING: no threshold key found in {path}; using default threshold {default}")
        return default

    try:
        value = float(raw)
    except (TypeError, ValueError):
        print(f"WARNING: threshold {raw!r} in {path} is not numeric; using default threshold {default}")
        return default
    if not np.isfinite(value):
        print(f"WARNING: threshold {raw!r} in {path} is not finite; using default threshold {default}")
        return default
    return value


thr_p = ART_DIR / "threshold.json"
TAU = load_tau(thr_p, TAU)

print(f"Using threshold τ = {TAU} | Fee bps = {FEE_BPS} | Slippage bps = {SLIPPAGE_BPS}")


In [None]:
# --- Load df_nb02 (robust) & ensure label 'y' ---
# Prefer the parquet export; fall back to the CSV of the same name.
parquet_path = DATA_DIR / "df_nb02.parquet"
df_path = parquet_path if parquet_path.exists() else DATA_DIR / "df_nb02.csv"
if df_path.suffix == ".parquet":
    df = pd.read_parquet(df_path)
else:
    df = pd.read_csv(df_path, parse_dates=["date"])

# Unparseable dates become NaT and are dropped; rows end up in chronological order.
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df = df.dropna(subset=["date"])
df = df.sort_values("date")
df = df.reset_index(drop=True)

# Use the first label-like column that exists; otherwise derive "next step is up".
label_col = None
for candidate in ("y", "target", "label", "y_next_up"):
    if candidate in df.columns:
        label_col = candidate
        break

if label_col is None:
    if "ret1" in df.columns:
        next_ret = df["ret1"].shift(-1)
    elif "close" in df.columns:
        next_ret = df["close"].pct_change().shift(-1)
    else:
        raise RuntimeError("No label & cannot derive from ret1/close.")
    # The final row has no next return; NaN > 0 is False, so that row is labelled 0.
    df["y"] = (next_ret > 0).astype(int)
elif label_col != "y":
    df["y"] = df[label_col].astype(int)

n_rows, n_cols = df.shape
print(f"Rows/Features: {len(df)} / {df.shape[1]-1}")
print(f"Span: {df['date'].min().date()} → {df['date'].max().date()}")
print("Label balance:", round(float(df['y'].mean()), 3))


In [None]:
# --- Leak-safe feature list from artifacts/feature_list.json ---
feature_list_path = ART_DIR / "feature_list.json"
listed_cols = json.loads(feature_list_path.read_text(encoding="utf-8"))

# NOTE(review): the pattern is anchored with `$`, so it only rejects names that END
# with one of these tokens (e.g. "ret_next"), not names that merely contain them.
LEAK_WORDS = re.compile(r"(next|lead|t\+|ahead|future|target|label)$", re.I)
BAD = {"y","ret_next","y_next_up","target","label"}

# Keep a listed column only if it exists in df, is not an explicit label/target name,
# does not end with a leak word, and holds numeric data.
feat_cols = []
for name in listed_cols:
    if name not in df.columns or name in BAD:
        continue
    if LEAK_WORDS.search(name):
        continue
    if pd.api.types.is_numeric_dtype(df[name]):
        feat_cols.append(name)

assert "y" not in feat_cols and "ret_next" not in feat_cols, "Leak columns slipped into features."

# quick FYI scan (safe: excludes y/ret_next)
def _safe_auc(y, x):
    """Best-effort ROC AUC of score ``x`` against binary label ``y``.

    Pairs in which either the label or the score is missing are dropped before
    scoring. ``roc_auc_score`` rejects NaN input, so without this step a single
    missing feature value would turn the whole AUC into NaN.

    Parameters
    ----------
    y : array-like
        Binary labels.
    x : array-like
        Numeric scores, same length as ``y``.

    Returns
    -------
    float
        The AUC, or ``np.nan`` when fewer than two classes remain after
        dropping missing pairs or when scoring fails for any reason (this is
        an FYI scan and must never abort the notebook).
    """
    try:
        labels = np.asarray(y)
        scores = np.asarray(x, dtype=float)
        keep = ~(pd.isna(labels) | np.isnan(scores))
        labels, scores = labels[keep], scores[keep]
        if len(np.unique(labels)) < 2:
            return np.nan
        return float(roc_auc_score(labels, scores))
    except Exception:
        # Deliberate best-effort: any scoring problem is reported as "no AUC".
        return np.nan

# Rank features by |corr| with the next-period return (when that column exists) and
# report each one's AUC against the label. Purely informational.
top = []
if "ret_next" in df.columns:
    corr_abs = (
        df[feat_cols + ["ret_next"]]
        .corr(numeric_only=True)["ret_next"]
        .abs()
        # Drop the self-correlation (always 1.0) BEFORE taking the top 20; otherwise it
        # occupies one of the 20 slots and only 19 features are reported.
        .drop(labels="ret_next", errors="ignore")
        .sort_values(ascending=False)
    )
    for c in corr_abs.index[:20]:
        top.append([c, float(corr_abs[c]), float(_safe_auc(df["y"], df[c]))])
else:
    # No return column to correlate against: report AUC only for the first 20 features.
    for c in feat_cols[:20]:
        top.append([c, np.nan, float(_safe_auc(df["y"], df[c]))])

leak_scan = pd.DataFrame(top, columns=["col","|corr(ret+1)|","AUC(dir ret+1)"])
print("Top-20 potential leaks (FYI only):")
print(leak_scan.to_string(index=False))


In [None]:
# --- Build monthly folds with 12-month warmup ---
# Each calendar month after the warm-up becomes one test fold.
df["month"] = df["date"].dt.to_period("M").astype(str)  # "YYYY-MM" strings sort chronologically
months = sorted(df["month"].unique())
WARMUP_MONTHS = 12
test_months = months[WARMUP_MONTHS:]  # start after warm-up
if not test_months:
    # Fail here with an actionable message instead of an IndexError on test_months[0].
    raise ValueError(
        f"Walk-forward needs more than {WARMUP_MONTHS} calendar months of data; "
        f"found {len(months)}."
    )
print(f"Folds: {len(test_months)} | first test month: {test_months[0]} | last: {test_months[-1]}")


In [None]:
# --- Helpers: model, returns, KPIs ---
def fit_lr(X, y):
    """Standardize ``X`` and fit a logistic regression on the scaled data.

    Returns the fitted ``(model, scaler)`` pair; the scaler must be reused to
    transform any data passed to the model later.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    model = LogisticRegression(solver="lbfgs", max_iter=200, random_state=RNG)
    model.fit(X_scaled, y)
    return model, scaler

def predict_lr(lr, scaler, X):
    """Return column 1 of ``lr.predict_proba`` evaluated on ``scaler.transform(X)``."""
    X_scaled = scaler.transform(X)
    class_probs = lr.predict_proba(X_scaled)
    return class_probs[:, 1]

def strat_returns_from_close(prob, close, tau, fee_bps=5.0, slippage_bps=0.0, prev_pos=0):
    """Long/flat strategy returns from predicted probabilities and closing prices.

    On bar ``t`` the strategy is long (position 1) when ``prob[t] >= tau`` and
    flat otherwise, and earns the close-to-close return from ``t`` to ``t+1``.
    Every change of position is charged ``fee_bps + slippage_bps`` basis points.

    Parameters
    ----------
    prob : array-like of float
        Predicted probability per bar.
    close : array-like of float
        Closing price per bar, same length as ``prob``.
    tau : float
        Probability threshold for going long.
    fee_bps : float, default 5.0
        Fee per position change, in basis points.
    slippage_bps : float, default 0.0
        Additional cost per position change, in basis points.
    prev_pos : int, default 0
        Position held immediately before the first bar. Pass the last position
        of the preceding segment when chaining segments, so that carrying a
        position over is not billed as a fresh entry.

    Returns
    -------
    r : numpy.ndarray
        Per-bar net return. The last element is NaN because no next close exists.
    eq : pandas.Series
        Cumulative equity, with NaN returns treated as 0.

    Raises
    ------
    ValueError
        If ``prob`` and ``close`` differ in length.
    """
    prob = np.asarray(prob)
    close = pd.Series(close)
    if len(prob) != len(close):
        raise ValueError(
            f"prob and close must have the same length (got {len(prob)} and {len(close)})."
        )
    ret_next = close.pct_change().shift(-1).to_numpy()
    pos = (prob >= tau).astype(int)
    # A flip is any bar whose position differs from the previous bar's position.
    flips = (np.diff(np.r_[prev_pos, pos]) != 0).astype(int)
    cost = (fee_bps + slippage_bps) / 10000.0
    r = pos * ret_next - flips * cost
    eq = (1 + pd.Series(r).fillna(0.0)).cumprod()
    return r, eq

def kpis_from_returns(r, freq=252):
    """Summarize a stream of periodic returns.

    Missing returns are treated as 0. ``freq`` is the number of periods per year
    used for annualization.

    Returns a dict with keys ``CAGR``, ``Sharpe``, ``vol_annual``,
    ``total_return``, ``max_drawdown`` and ``hit_rate``.

    NOTE(review): ``Sharpe`` here is CAGR divided by annualized volatility, not
    the annualized mean return over volatility — confirm this is intended.
    """
    rets = pd.Series(r).fillna(0.0)
    growth = 1 + rets
    equity = growth.cumprod()

    n_obs = max(len(rets), 1)  # avoid dividing by zero for an empty stream
    cagr = growth.prod() ** (freq / n_obs) - 1
    vol = rets.std() * np.sqrt(freq)
    if vol > 0:
        sharpe = cagr / vol
    else:
        # Zero or undefined (NaN) volatility: ratio is not meaningful.
        sharpe = np.nan

    drawdown = equity / equity.cummax() - 1
    total_return = (equity.iloc[-1] - 1) if len(equity) else 0.0

    return {
        "CAGR": cagr,
        "Sharpe": sharpe,
        "vol_annual": vol,
        "total_return": total_return,
        "max_drawdown": drawdown.min(),
        "hit_rate": (rets > 0).mean(),
    }


In [None]:
# --- Walk-forward training & inference by month ---
# For every test month: refit on all rows dated before the month, predict the month,
# and record per-month metrics plus the daily prediction/position stream.
all_month_rows = []
daily_rows = []
r_list = []
date_list = []

# Next-day return on the FULL price series. Computing it inside each month slice would
# leave the last bar of every month without a return (NaN -> 0), silently dropping the
# month-boundary moves from the strategy while buy & hold still earns them.
ret_next_all = df["close"].pct_change().shift(-1)

# Cost charged on every position change (fee + slippage, both in basis points).
cost_per_flip = (FEE_BPS + SLIPPAGE_BPS) / 10000.0

# Position held going into the current test month. Carrying it across months avoids
# charging a spurious entry fee at each month start when the position did not change.
prev_pos = 0.0

# overall buy&hold will be computed after we gather all test dates
for m in test_months:
    te_idx = df.index[df["month"] == m]
    if len(te_idx) == 0:
        continue
    start_date = df.loc[te_idx[0], "date"]
    tr_df = df[df["date"] < start_date]   # expanding window: everything before the test month
    te_df = df.loc[te_idx]

    X_tr = tr_df[feat_cols].fillna(0.0).to_numpy()
    y_tr = tr_df["y"].astype(int).to_numpy()
    X_te = te_df[feat_cols].fillna(0.0).to_numpy()
    y_te = te_df["y"].astype(int).to_numpy()

    lr, scaler = fit_lr(X_tr, y_tr)
    p_te = predict_lr(lr, scaler, X_te)

    # AUC is undefined when the month contains a single class.
    auc_m = roc_auc_score(y_te, p_te) if y_te.min() != y_te.max() else np.nan

    pos = (p_te >= TAU).astype(float)
    pos_prev = np.r_[prev_pos, pos[:-1]]
    flips = (pos != pos_prev).astype(float)
    r_m = pos * ret_next_all.loc[te_idx].to_numpy() - flips * cost_per_flip
    prev_pos = pos[-1]

    k_m = kpis_from_returns(r_m)

    all_month_rows.append(dict(month=m, AUC=auc_m, Sharpe=k_m["Sharpe"],
                               TotalReturn=k_m["total_return"], N=float(len(te_df))))

    daily_rows.append(pd.DataFrame({
        "date": te_df["date"].to_numpy(),
        "y": y_te,
        "prob": p_te,
        "month": m,
        "pos": pos,
        "pos_prev": pos_prev
    }))

    r_list.append(pd.Series(r_m))
    date_list.extend(te_df["date"].tolist())


In [None]:
# --- Consolidate daily stream for plotting/inspection ---
if daily_rows:
    daily = pd.concat(daily_rows, ignore_index=True)
else:
    # Typed, empty frame with the expected columns so the statements below (which
    # index daily["date"] / daily["pos"]) keep working when no fold produced output.
    daily = pd.DataFrame({
        "date": pd.Series(dtype="datetime64[ns]"),
        "y": pd.Series(dtype="int64"),
        "prob": pd.Series(dtype="float64"),
        "month": pd.Series(dtype="object"),
        "pos": pd.Series(dtype="float64"),
        "pos_prev": pd.Series(dtype="float64"),
    })
r_all = pd.concat(r_list, ignore_index=True) if r_list else pd.Series(dtype="float64")

# Buy & Hold on the same dates
test_mask = df["date"].isin(daily["date"])
close_test = df.loc[test_mask, "close"].reset_index(drop=True)
ret_mkt = close_test.pct_change().shift(-1)   # next-day return; last bar has none (NaN)
eq_mkt = (1 + ret_mkt.fillna(0.0)).cumprod()
eq_strat = (1 + r_all.fillna(0.0)).cumprod()

# Share of test days on which the position differs from the previous day's
# (the stream is assumed flat before its first day).
turnover = (np.abs(np.diff(np.r_[0, daily["pos"].to_numpy()])) > 0).mean() if len(daily) else 0.0
print(f"Turnover (avg): {turnover:.4f} | Final equity: {float(eq_strat.iloc[-1]) if len(eq_strat) else 1.0:.6f}")
print(daily[["date","y","prob","month","pos"]].head())


In [None]:
# --- Plot equity curve ---
import matplotlib.pyplot as plt

# Draw both equity curves on one axis, against the row position in the test stream.
fig, ax = plt.subplots(figsize=(9, 4.5))
x_axis = daily.index
ax.plot(x_axis, eq_mkt.values[:len(daily)], label="Buy & Hold")
ax.plot(x_axis, eq_strat.values, label="Walk-forward (LR, τ)")
ax.set_title("Equity Curve — Walk-Forward (Monthly Refit)")
ax.set_xlabel("Time (test)")
ax.set_ylabel("Equity (×)")
ax.legend()
fig.tight_layout()

# Persist the figure before showing it.
out_fig = FIG_DIR/"equity_curve_walkforward.png"
fig.savefig(out_fig, dpi=140)
plt.show()
print(f"Saved figure: {out_fig}")


In [None]:
# --- NB9 Sanity diagnostics (robust) ---
import numpy as np, pandas as pd

# The alignment check below needs the raw price series regardless of where 'wf' came from.
if "df" not in globals() or "close" not in df.columns or "date" not in df.columns:
    raise RuntimeError("Need 'df' with 'date' and 'close' to compute market returns.")

# If 'wf' isn't defined, build it from 'daily' + returns from df['close']
# NOTE(review): an existing 'wf' is reused as-is, so after re-running the walk-forward
# cells in the same kernel it may be stale — delete it (del wf) to force a rebuild.
if "wf" not in globals():
    if "daily" not in globals():
        raise RuntimeError("Run the walk-forward cells first (the ones that create 'daily').")
    tmp = df[["date","close"]].copy()
    tmp["ret_mkt"] = tmp["close"].pct_change().shift(-1)   # NEXT-day return (t -> t+1)
    wf = daily.merge(tmp[["date","ret_mkt"]], on="date", how="left")

# 1) Return distribution sanity
print("ret_mkt describe:\n", wf["ret_mkt"].describe().to_string(), "\n")
big20 = (wf["ret_mkt"].abs() > 0.20).sum()
big100 = (wf["ret_mkt"].abs() > 1.00).sum()
print(f"Large |ret_mkt| counts:\n> 20%: {big20} | > 100%: {big100}\n")

print("Sample of biggest magnitudes:")
cols = ["date","ret_mkt"] + [c for c in ["pos","pos_prev"] if c in wf.columns]
print(wf.loc[wf["ret_mkt"].abs().nlargest(5).index, cols].sort_values("date"))

# 2) Label alignment: SAME- vs NEXT-day
# Both returns are derived directly from the close series so their timing is explicit:
#   ret_same[t] = close[t] / close[t-1] - 1   (move that ENDS on day t)
#   ret_next[t] = close[t+1] / close[t] - 1   (move that STARTS on day t)
# Reusing wf["ret_mkt"] as the "same-day" return would be wrong: it is already shifted
# to the next day, which would make a correctly built next-day label look same-day.
df_tmp = df[["date","y","close"]].copy()
df_tmp["ret_same"] = df_tmp["close"].pct_change()
df_tmp["ret_next"] = df_tmp["ret_same"].shift(-1)
# Score only test-period rows where both returns exist; otherwise NaN > 0 evaluates
# to False and missing returns are counted as "down" days.
df_tmp = df_tmp[df_tmp["date"].isin(wf["date"])].dropna(subset=["ret_same","ret_next"])

same = (df_tmp["y"] == (df_tmp["ret_same"] > 0).astype(int)).mean()
nxt  = (df_tmp["y"] == (df_tmp["ret_next"] > 0).astype(int)).mean()
base = (df_tmp["ret_next"] > 0).mean()

print("\nLabel alignment:")
print("Match(y, SAME-day up):", round(float(same), 3))
print("Match(y, NEXT-day up):", round(float(nxt), 3))
print("Baseline up-day rate :", round(float(base), 3))


In [None]:
# --- KPIs + monthly table (preview) ---
# Overall KPIs over the concatenated walk-forward return stream.
overall = kpis_from_returns(r_all)
overall_s = pd.Series(overall).round(6)
print("\nOverall walk-forward:")
print(overall_s)

# Per-month table; the preview copy is rounded/formatted for display only,
# `mf` itself keeps the raw numbers.
mf = pd.DataFrame(all_month_rows)
mf_preview = mf.assign(
    Sharpe=mf["Sharpe"].round(2),
    AUC=mf["AUC"].round(3),
    TotalReturn=(mf["TotalReturn"]*100).map(lambda x: f"{x:.2f}%"),
)
print("\nHead of per-month metrics:")
print(mf_preview.head(10))


In [None]:
# --- Save monthly metrics & summary JSON ---
metrics_csv = DATA_DIR/"walkforward_metrics.csv"
summary_json = ART_DIR/"walkforward_summary.json"

# Per-month metrics, one row per test month.
mf.to_csv(metrics_csv, index=False)

# Run settings plus overall KPIs (recomputed here from r_all, cast to plain floats).
overall_kpis = {}
for kpi_name, kpi_value in kpis_from_returns(r_all).items():
    overall_kpis[kpi_name] = float(kpi_value)

summary = {}
summary["tau"] = TAU
summary["fee_bps"] = FEE_BPS
summary["slippage_bps"] = SLIPPAGE_BPS
summary["overall"] = overall_kpis
summary_json.write_text(json.dumps(summary, indent=2), encoding="utf-8")

print(f"Saved: {DATA_DIR/'walkforward_metrics.csv'} | {ART_DIR/'walkforward_summary.json'}")
