In [72]:
# --- Imports & params ---
import json, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
from sklearn.metrics import roc_auc_score

warnings.filterwarnings("ignore")

DATA_DIR = Path("data")
ART_DIR  = Path("artifacts")
FIG_DIR  = Path("reports/figures")
for p in [DATA_DIR, ART_DIR, FIG_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# Backtest knobs
FEE_BPS      = 5.0      # round-trip trading cost in bps (e.g., 5 = 0.05%)
SLIPPAGE_BPS = 0.0      # extra bps per trade (optional)
TAU_GRID     = np.linspace(0.50, 0.65, 16)  # probability threshold sweep
ALLOW_SHORT  = False     # True → 3-way long/short/flat; False → long/flat


In [73]:
# --- Split loader with Parquet-or-CSV fallback (drop-in replacement for Cell 2) ---
from pathlib import Path
import pandas as pd
import numpy as np

def _can_parquet():
    try:
        import pyarrow  # noqa: F401
        return True
    except Exception:
        try:
            import fastparquet  # noqa: F401
            return True
        except Exception:
            return False

def write_table(df: pd.DataFrame, base: Path):
    """
    Save to <base>.parquet if a parquet engine is installed, else <base>.csv
    Returns the actual Path written.
    """
    if _can_parquet():
        path = base.with_suffix(".parquet")
        df.to_parquet(path, index=False)
    else:
        path = base.with_suffix(".csv")
        df.to_csv(path, index=False)
    return path

def read_table(base: Path) -> pd.DataFrame:
    """
    Read from <base>.parquet if it exists, else <base>.csv.
    """
    p_parq = base.with_suffix(".parquet")
    p_csv  = base.with_suffix(".csv")
    if p_parq.exists():
        return pd.read_parquet(p_parq)
    if p_csv.exists():
        return pd.read_csv(p_csv)
    raise FileNotFoundError(f"Neither {p_parq.name} nor {p_csv.name} exists.")

# --- Load train/test splits; rebuild from df_nb02.csv if missing ---
def rebuild_splits_from_phase2():
    base_csv = DATA_DIR / "df_nb02.csv"
    if not base_csv.exists():
        raise FileNotFoundError("Missing data/df_nb02.csv (Phase 2).")
    df = pd.read_csv(base_csv)

    # time order if available
    if "date" in df.columns:
        df["date"] = pd.to_datetime(df["date"], errors="coerce").dt.tz_localize(None)
        df = df.dropna(subset=["date"]).sort_values("date").reset_index(drop=True)

    label_col = next((c for c in ["y","target","label","y_next_up"] if c in df.columns), None)
    if label_col is None:
        raise RuntimeError("No label column in df_nb02.csv (expected y/target/label/y_next_up).")

    y = df.pop(label_col).astype(int)
    # drop obvious non-features
    X = df.drop(columns=[c for c in ["date","ticker","symbol","spy_close","vix_close"] if c in df.columns], errors="ignore")
    X = X.select_dtypes(include=[np.number]).copy()

    n = len(X); cut = int(n*0.8)
    X_tr, X_te = X.iloc[:cut].copy(), X.iloc[cut:].copy()
    y_tr, y_te = y.iloc[:cut].copy(), y.iloc[cut:].copy()

    # persist for next runs (Parquet if possible, else CSV)
    write_table(X_tr, DATA_DIR / "train")
    write_table(X_te, DATA_DIR / "test")
    write_table(pd.DataFrame({"y": y_tr}), DATA_DIR / "y_train")
    write_table(pd.DataFrame({"y": y_te}), DATA_DIR / "y_test")
    print("Rebuilt splits → data/train.*, data/test.*, y_*.{parquet|csv}")
    return X_tr, X_te, y_tr.values, y_te.values

train_base = DATA_DIR / "train"
test_base  = DATA_DIR / "test"
ytr_base   = DATA_DIR / "y_train"
yte_base   = DATA_DIR / "y_test"

try:
    X_tr = read_table(train_base).copy()
    X_te = read_table(test_base).copy()
except FileNotFoundError:
    X_tr, X_te, y_tr, y_te = rebuild_splits_from_phase2()
else:
    # labels either embedded or separate y files
    label_col = next((c for c in ["y","target","label","y_next_up"] if c in X_tr.columns), None)
    if label_col:
        y_tr = X_tr.pop(label_col).astype(int).values
        y_te = X_te.pop(label_col).astype(int).values
    else:
        y_tr = read_table(ytr_base).iloc[:,0].astype(int).values
        y_te = read_table(yte_base).iloc[:,0].astype(int).values

print("Loaded splits:", X_tr.shape, X_te.shape, "| labels:", len(y_tr), len(y_te))


Loaded splits: (1662, 14) (416, 14) | labels: 1662 416


In [74]:
# --- Load models ---
import joblib
path_lr  = ART_DIR / "lr.joblib"      # your saved LR model
path_xgb = ART_DIR / "model_xgb.pkl"  # only if you trained/saved XGB

HAS_LR  = path_lr.exists()
HAS_XGB = path_xgb.exists()

mdl_lr  = joblib.load(path_lr)  if HAS_LR  else None
mdl_xgb = joblib.load(path_xgb) if HAS_XGB else None
print("HAS_LR:", HAS_LR, "| HAS_XGB:", HAS_XGB)


HAS_LR: True | HAS_XGB: False


In [75]:
# --- Align features to training spec + apply scaler, then predict ---
import json

# 1) feature list
feat_path   = ART_DIR / "feature_list.json"
scaler_path = ART_DIR / "scaler.joblib"

if feat_path.exists():
    with open(feat_path, "r") as fh:
        feat_list = json.load(fh)
elif HAS_LR and hasattr(mdl_lr, "feature_names_in_"):
    feat_list = list(mdl_lr.feature_names_in_)
else:
    raise RuntimeError("Missing artifacts/feature_list.json and model has no feature_names_in_.")

def conform(df, cols):
    df = df.copy()
    for c in cols:
        if c not in df.columns:
            df[c] = 0.0
    return df[cols].copy()

X_tr_aligned = conform(X_tr, feat_list)
X_te_aligned = conform(X_te, feat_list)

# 2) scaler
if scaler_path.exists():
    scaler = joblib.load(scaler_path)
    X_tr_in = scaler.transform(X_tr_aligned.values)
    X_te_in = scaler.transform(X_te_aligned.values)
else:
    X_tr_in = X_tr_aligned.values
    X_te_in = X_te_aligned.values

print("Aligned shapes:", X_tr_in.shape, X_te_in.shape)
print("Model expects:", getattr(mdl_lr, 'n_features_in_', 'unknown'))

# 3) predict (use aligned arrays!)
from sklearn.metrics import roc_auc_score, brier_score_loss
probs = {}

if HAS_LR and mdl_lr is not None:
    probs["LR"] = mdl_lr.predict_proba(X_te_in)[:, 1]

if HAS_XGB and mdl_xgb is not None:
    # XGB usually trained on same feat_list; scale only if you trained it with scaling
    probs["XGB"] = mdl_xgb.predict_proba(X_te_aligned.values)[:, 1]

# quick metrics
rows = []
for name, p in probs.items():
    rows.append({
        "model": name,
        "AUC_test": float(roc_auc_score(y_te, p)),
        "Brier_test": float(brier_score_loss(y_te, p)),
        "mean_p": float(np.mean(p)),
        "n": int(len(p)),
    })
pd.DataFrame(rows).round(4)


Aligned shapes: (1662, 16) (416, 16)
Model expects: 16


Unnamed: 0,model,AUC_test,Brier_test,mean_p,n
0,LR,0.4531,0.2549,0.527,416


In [76]:
# --- Predict on test (aligned) ---
from sklearn.metrics import roc_auc_score, brier_score_loss

probs = {}

# LR uses the scaled/aligned matrix
if HAS_LR and mdl_lr is not None:
    probs["LR"] = mdl_lr.predict_proba(X_te_in)[:, 1]

# XGB (if present) uses aligned *unscaled* features unless you also scaled at train time
if HAS_XGB and mdl_xgb is not None:
    probs["XGB"] = mdl_xgb.predict_proba(X_te_aligned.values)[:, 1]

if not probs:
    raise RuntimeError("No models available to predict. Ensure lr.joblib or model_xgb.pkl exists.")

# Quick metrics table
rows = []
for name, p in probs.items():
    rows.append({
        "model": name,
        "AUC_test": float(roc_auc_score(y_te, p)),
        "Brier_test": float(brier_score_loss(y_te, p)),
        "mean_p": float(np.mean(p)),
        "n": int(len(p)),
    })
metrics = pd.DataFrame(rows).round(4)
display(metrics)


Unnamed: 0,model,AUC_test,Brier_test,mean_p,n
0,LR,0.4531,0.2549,0.527,416


In [77]:
# --- Build next-day returns (ret1) aligned to test period length ---
base_csv = DATA_DIR / "df_nb02.csv"
df_all = pd.read_csv(base_csv)
if "date" not in df_all.columns:
    raise KeyError("'date' missing in df_nb02.csv")
df_all["date"] = pd.to_datetime(df_all["date"], errors="coerce").dt.tz_localize(None)
df_all = df_all.dropna(subset=["date"]).sort_values("date").reset_index(drop=True)

price_col = next((c for c in ["adj_close","Adj Close","close","Close","price"] if c in df_all.columns), None)
if price_col is None:
    raise RuntimeError("No price column found in df_nb02.csv (need adj_close/close).")

df_all["ret1"] = pd.Series(df_all[price_col]).pct_change().shift(-1)

N = len(y_te)
df_test_slice = df_all.iloc[-N:].copy().reset_index(drop=True)
dates_te = df_test_slice["date"].values
r_next   = df_test_slice["ret1"].fillna(0.0).values

print("Test span:", df_test_slice['date'].min(), "→", df_test_slice['date'].max(), "| n=", len(df_test_slice))


Test span: 2024-02-14 00:00:00 → 2025-10-10 00:00:00 | n= 416


In [78]:
# --- Backtest helpers ---
def positions_from_probs(p, tau, allow_short=False):
    if allow_short:
        return np.where(p>tau, 1.0, np.where(p<1.0-tau, -1.0, 0.0))
    return (p > tau).astype(float)

def trade_costs_from_positions(pos, fee_bps=5.0, slippage_bps=0.0):
    return (abs(np.diff(np.r_[0.0, pos])) * (fee_bps + slippage_bps)) / 1e4

def equity_curve(returns, pos, fee_bps=5.0, slippage_bps=0.0):
    gross = pos * returns
    costs = trade_costs_from_positions(pos, fee_bps, slippage_bps)
    net   = gross - costs
    eq = (1.0 + net).cumprod()
    return eq, {"gross": gross, "costs": costs, "net": net}

def perf_metrics(net_ret, dates=None, periods_per_year=252):
    tot_ret = float(np.prod(1.0 + net_ret) - 1.0)
    if dates is not None and len(dates)>1:
        years = (pd.to_datetime(dates[-1]) - pd.to_datetime(dates[0])).days / 365.25
        cagr = (1.0 + tot_ret)**(1.0/years) - 1.0 if years>0 else np.nan
    else:
        years = len(net_ret)/periods_per_year
        cagr = (1.0 + tot_ret)**(1.0/years) - 1.0 if years>0 else np.nan

    mu = np.mean(net_ret); sd = np.std(net_ret, ddof=1)
    sharpe = (mu/(sd + 1e-12)) * np.sqrt(periods_per_year)

    curve = np.cumprod(1.0 + net_ret)
    peak  = np.maximum.accumulate(curve)
    maxdd = float((curve/peak - 1.0).min())

    hit_rate = float(np.mean(net_ret > 0.0))
    turnover = float(np.mean(np.abs(np.diff(np.r_[0.0, (net_ret!=0).astype(float)]))))

    return {"total_return": float(tot_ret), "CAGR": float(cagr), "vol_annual": float(sd*np.sqrt(periods_per_year)),
            "Sharpe": float(sharpe), "max_drawdown": maxdd, "hit_rate": hit_rate, "turnover": turnover}


In [79]:
# --- Helper: get test returns by index-aligning to the full series tail ---
from pathlib import Path
import pandas as pd

def get_ret_next_tail_for_test(test_len: int):
    """
    Compute next-day returns from the FULL df_nb02.* series (using 'close' or an existing 'ret_next'),
    then take the last test_len rows and reset index, to align with the TEST split by index.
    """
    df_path = Path("data/df_nb02.parquet") if Path("data/df_nb02.parquet").exists() else Path("data/df_nb02.csv")
    if not df_path.exists():
        raise FileNotFoundError("Missing data/df_nb02.{csv|parquet}")

    df_full = pd.read_parquet(df_path) if df_path.suffix == ".parquet" else pd.read_csv(df_path, parse_dates=["date"])
    # If ret_next is already there, use it; else compute from 'close'
    if "ret_next" in df_full.columns:
        ret_full = pd.Series(df_full["ret_next"], dtype="float64")
    else:
        price_col = next((c for c in ["close","Close","Adj Close","adj_close"] if c in df_full.columns), None)
        if price_col is None:
            raise KeyError("No price column ('close'/'Adj Close') to compute returns.")
        ret_full = pd.Series(df_full[price_col], dtype="float64").pct_change().shift(-1)

    # Take the tail of the full series to match the test length, index-aligned
    ret_tail = ret_full.iloc[-test_len:].reset_index(drop=True)
    return ret_tail.to_numpy()


In [80]:
# ----- Cell 8: Threshold sweep (tau) + backtest KPIs (index-aligned returns) -----
import numpy as np
import pandas as pd

# fee setup (bps → fraction)
FEE_BPS = 5.0
fee = FEE_BPS / 10000.0

# Use LR test probabilities computed earlier (from Cell 4)
# Expect either `p_te` or `preds["LR"]["te"]` to exist
if 'p_te' in globals():
    probs_te = p_te
elif 'preds' in globals() and "LR" in preds and "te" in preds["LR"]:
    probs_te = preds["LR"]["te"]
else:
    raise RuntimeError("No test probabilities found (expected p_te or preds['LR']['te']).")

# Returns from FULL series tail, index-aligned to TEST length (matches saved backtest)
ret_next = get_ret_next_tail_for_test(len(probs_te))

def kpis_for_tau(tau, p=probs_te, ret=ret_next, fee_frac=fee, freq=252):
    signal = (p >= tau).astype(int)
    flips  = (np.abs(np.diff(np.r_[0, signal])) > 0).astype(int)  # trades when position changes
    r      = pd.Series(signal * ret - flips * fee_frac).fillna(0.0)

    eq = (1 + r).cumprod()
    cagr = (1 + r).prod() ** (freq / max(len(r), 1)) - 1
    vol  = r.std() * np.sqrt(freq)
    sharpe = (cagr / vol) if vol > 0 else np.nan
    maxdd = (eq / eq.cummax() - 1).min()
    hit_rate = (r > 0).mean()
    turnover = flips.sum() / max(len(r), 1)

    return {
        "total_return": float(eq.iloc[-1] - 1) if len(eq) else 0.0,
        "CAGR": float(cagr),
        "vol_annual": float(vol),
        "Sharpe": float(sharpe),
        "max_drawdown": float(maxdd),
        "hit_rate": float(hit_rate),
        "turnover": float(turnover),
    }

# tau grid (matches your earlier run)
taus = np.round(np.linspace(0.50, 0.62, 13), 2)

rows = []
for t in taus:
    m = kpis_for_tau(t)
    rows.append({"model": "LR", "tau": float(t), **m})

tau_df = pd.DataFrame(rows).sort_values("Sharpe", ascending=False).reset_index(drop=True)

# Report best by Sharpe
best = tau_df.iloc[0]
print(f"LR: best Sharpe @ tau={best['tau']:.3f} | Sharpe={best['Sharpe']:.2f} | CAGR={best['CAGR']*100:.2f}%")
display(tau_df)

# Save sweep
out_path = Path("data/multiticker_tau_sweep.csv")
tau_df.to_csv(out_path, index=False)
print(f"Saved: {out_path}")


LR: best Sharpe @ tau=0.590 | Sharpe=1.09 | CAGR=5.04%


Unnamed: 0,model,tau,total_return,CAGR,vol_annual,Sharpe,max_drawdown,hit_rate,turnover
0,LR,0.59,0.084581,0.050415,0.046056,1.094631,-0.032745,0.033654,0.028846
1,LR,0.51,0.491386,0.273959,0.265829,1.030585,-0.326602,0.391827,0.120192
2,LR,0.52,0.466461,0.261019,0.258467,1.009873,-0.309151,0.358173,0.110577
3,LR,0.58,0.122053,0.072251,0.073834,0.978565,-0.048955,0.0625,0.072115
4,LR,0.62,0.007635,0.004618,0.004834,0.955375,-0.0005,0.004808,0.009615
5,LR,0.5,0.375159,0.212858,0.273161,0.77924,-0.333605,0.430288,0.120192
6,LR,0.61,0.021966,0.013249,0.017988,0.736566,-0.008664,0.012019,0.019231
7,LR,0.6,0.030501,0.018367,0.033379,0.550251,-0.034897,0.019231,0.028846
8,LR,0.57,0.082119,0.048969,0.109442,0.447445,-0.123255,0.088942,0.110577
9,LR,0.53,0.178405,0.104557,0.246878,0.423516,-0.314957,0.302885,0.139423


Saved: data\multiticker_tau_sweep.csv


In [81]:
# --- Equity curve plots (best per model) ---
for name, (eq, pos, parts) in curves.items():
    plt.figure(figsize=(8,4))
    plt.plot(eq, label=f"{name}")
    plt.title(f"Equity Curve – {name}")
    plt.xlabel("Time (test)")
    plt.ylabel("Equity (x)")
    plt.grid(True, alpha=.3)
    plt.legend()
    out = FIG_DIR / f"equity_curve_{name.lower()}.png"
    plt.tight_layout()
    plt.savefig(out, dpi=150)
    plt.close()
    print("Saved:", out)


Saved: reports\figures\equity_curve_lr.png


In [82]:
# --- Backtest summary JSON ---
summary = {"fee_bps": FEE_BPS, "slippage_bps": SLIPPAGE_BPS, "allow_short": ALLOW_SHORT, "models": {}}

for name in probs.keys():
    best_row = bt_df[bt_df["model"]==name].iloc[0].to_dict()
    summary["models"][name] = {
        "tau": best_row["tau"],
        "Sharpe": best_row["Sharpe"],
        "CAGR": best_row["CAGR"],
        "max_drawdown": best_row["max_drawdown"],
        "hit_rate": best_row["hit_rate"],
        "total_return": best_row["total_return"],
        "vol_annual": best_row["vol_annual"]
    }

out_json = ART_DIR / "backtest_summary.json"
with open(out_json, "w") as f:
    json.dump(summary, f, indent=2)
print("Wrote:", out_json)
summary


Wrote: artifacts\backtest_summary.json


{'fee_bps': 5.0,
 'slippage_bps': 0.0,
 'allow_short': False,
 'models': {'LR': {'tau': 0.5900000000000001,
   'Sharpe': 1.0908623916378921,
   'CAGR': 0.05032491896714353,
   'max_drawdown': -0.03274495697381219,
   'hit_rate': 0.03365384615384615,
   'total_return': 0.08458136662655846,
   'vol_annual': 0.04605617349906653}}}

In [83]:
# --- Optional: overlay buy & hold with best-τ strategy ---
# best τ for LR from the sweep table
best_tau_lr = float(
    bt_df[bt_df["model"]=="LR"].sort_values("Sharpe", ascending=False).iloc[0]["tau"]
)

# Buy & hold = always in the market, no fees
bh_eq, _ = equity_curve(r_next, np.ones_like(r_next), fee_bps=0.0, slippage_bps=0.0)

plt.figure(figsize=(8,4))
plt.plot(bh_eq, label="Buy & Hold")

# Plot each strategy curve you saved in `curves`
for name, (eq, _, _) in curves.items():
    plt.plot(eq, label=f"{name} (best τ)")

plt.title("Equity Curve — Strategy vs Buy & Hold")
plt.xlabel("Time (test)"); plt.ylabel("Equity (x)")
plt.grid(True, alpha=.3); plt.legend(); plt.tight_layout()
out = FIG_DIR / "equity_curve_vs_buyhold.png"
plt.savefig(out, dpi=150); plt.close()
print("Saved:", out)


Saved: reports\figures\equity_curve_vs_buyhold.png


In [84]:
# --- Optional: fee sensitivity for LR at best τ ---
fees_bps = [0, 2.5, 5, 10, 20]
best_tau_lr = float(
    bt_df[bt_df["model"]=="LR"].sort_values("Sharpe", ascending=False).iloc[0]["tau"]
)
pos_lr = (probs["LR"] > best_tau_lr).astype(float)

rows = []
for fbps in fees_bps:
    eq, parts = equity_curve(r_next, pos_lr, fee_bps=fbps, slippage_bps=0.0)
    m = perf_metrics(parts["net"], dates_te)
    rows.append({"fee_bps": fbps, **m})

sens = pd.DataFrame(rows).round(4)
display(sens)
sens.to_csv(DATA_DIR / "multiticker_fee_sweep_AAPL_LR.csv", index=False)
print("Saved:", DATA_DIR / "multiticker_fee_sweep_AAPL_LR.csv")


Unnamed: 0,fee_bps,total_return,CAGR,vol_annual,Sharpe,max_drawdown,hit_rate,turnover
0,0.0,0.0911,0.0541,0.0464,1.1618,-0.0323,0.0337,0.0288
1,2.5,0.0878,0.0522,0.0462,1.1266,-0.0325,0.0337,0.0192
2,5.0,0.0846,0.0503,0.0461,1.0909,-0.0327,0.0337,0.0192
3,10.0,0.0781,0.0465,0.0458,1.0181,-0.0332,0.0337,0.0192
4,20.0,0.0653,0.039,0.0453,0.8678,-0.035,0.0337,0.0192


Saved: data\multiticker_fee_sweep_AAPL_LR.csv


In [85]:
import json, pandas as pd
best_tau = float(pd.read_csv("data/multiticker_tau_sweep.csv")
                 .sort_values(["model","Sharpe"], ascending=[True, False])
                 .query("model=='LR'").iloc[0]["tau"])
with open("artifacts/threshold.json","w") as f:
    json.dump({"LR":{"tau":best_tau}}, f, indent=2)
print("Saved best LR threshold:", best_tau)


Saved best LR threshold: 0.59
