In [4]:
# --- Imports & params ---
import json, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
from sklearn.metrics import roc_auc_score

# NOTE(review): with no category/module filter this suppresses every warning
# for the rest of the session, not just the noisy ones.
warnings.filterwarnings("ignore")

# Working directories, given as relative paths.
DATA_DIR = Path("data")
ART_DIR  = Path("artifacts")
FIG_DIR  = Path("reports/figures")
# Create each directory (and parents) up front; existing directories are left as-is.
for p in [DATA_DIR, ART_DIR, FIG_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# Backtest knobs
# trade_costs_from_positions charges (FEE_BPS + SLIPPAGE_BPS) on every unit change
# in position, so an entry followed by an exit pays the fee twice.
FEE_BPS      = 5.0      # trading cost in bps per unit of position change (e.g., 5 = 0.05%)
SLIPPAGE_BPS = 0.0      # extra bps added to FEE_BPS on each position change (optional)
TAU_GRID     = np.linspace(0.50, 0.65, 16)  # probability threshold sweep: 16 evenly spaced values from 0.50 to 0.65
ALLOW_SHORT  = False     # True → 3-way long/short/flat; False → long/flat


In [5]:
# --- Split loader with Parquet-or-CSV fallback (drop-in replacement for Cell 2) ---
from pathlib import Path
import pandas as pd
import numpy as np

def _can_parquet():
    try:
        import pyarrow  # noqa: F401
        return True
    except Exception:
        try:
            import fastparquet  # noqa: F401
            return True
        except Exception:
            return False

def write_table(df: pd.DataFrame, base: Path):
    """
    Persist ``df`` next to ``base`` and return the path that was written.

    The suffix of ``base`` is replaced with ``.parquet`` when a parquet engine
    is importable (see ``_can_parquet``), otherwise with ``.csv``. The index is
    not written in either format.
    """
    use_parquet = _can_parquet()
    target = base.with_suffix(".parquet" if use_parquet else ".csv")
    writer = df.to_parquet if use_parquet else df.to_csv
    writer(target, index=False)
    return target

def read_table(base: Path) -> pd.DataFrame:
    """
    Read from <base>.parquet if it exists and can be read, else <base>.csv.

    Parameters
    ----------
    base : Path
        Path whose suffix is replaced with ``.parquet`` / ``.csv`` to find the file.

    Returns
    -------
    pd.DataFrame
        Contents of the first readable candidate.

    Raises
    ------
    FileNotFoundError
        If neither <base>.parquet nor <base>.csv exists.
    ImportError
        If only the parquet file exists and no parquet engine is installed.
    """
    p_parq = base.with_suffix(".parquet")
    p_csv  = base.with_suffix(".csv")
    if p_parq.exists():
        try:
            return pd.read_parquet(p_parq)
        except ImportError:
            # A parquet file left by an earlier environment is unreadable here
            # (no engine installed). Fall through to the CSV copy when there is
            # one; otherwise surface the original error unchanged.
            if not p_csv.exists():
                raise
    if p_csv.exists():
        return pd.read_csv(p_csv)
    raise FileNotFoundError(f"Neither {p_parq.name} nor {p_csv.name} exists.")

# --- Load train/test splits; rebuild from df_nb02.csv if missing ---
def rebuild_splits_from_phase2():
    """
    Recreate the train/test splits from ``DATA_DIR / "df_nb02.csv"``.

    Steps: load the CSV; if a ``date`` column exists, parse it, drop rows whose
    date cannot be parsed and sort by date; take the first label column found
    among y/target/label/y_next_up; keep only numeric feature columns; split
    the rows 80/20 in their current order; persist the four tables with
    ``write_table``.

    Returns
    -------
    (X_tr, X_te, y_tr, y_te) where the X parts are DataFrames and the y parts
    are NumPy arrays.

    Raises
    ------
    FileNotFoundError if the CSV is missing; RuntimeError if no label column
    is found.
    """
    source = DATA_DIR / "df_nb02.csv"
    if not source.exists():
        raise FileNotFoundError("Missing data/df_nb02.csv (Phase 2).")
    frame = pd.read_csv(source)

    # time order if available
    if "date" in frame.columns:
        parsed = pd.to_datetime(frame["date"], errors="coerce")
        frame["date"] = parsed.dt.tz_localize(None)
        frame = frame.dropna(subset=["date"])
        frame = frame.sort_values("date")
        frame = frame.reset_index(drop=True)

    # first matching candidate wins
    label_col = None
    for candidate in ["y","target","label","y_next_up"]:
        if candidate in frame.columns:
            label_col = candidate
            break
    if label_col is None:
        raise RuntimeError("No label column in df_nb02.csv (expected y/target/label/y_next_up).")

    labels = frame.pop(label_col).astype(int)

    # drop obvious non-features, then keep numeric columns only
    non_features = [c for c in ["date","ticker","symbol","spy_close","vix_close"] if c in frame.columns]
    features = frame.drop(columns=non_features, errors="ignore")
    features = features.select_dtypes(include=[np.number]).copy()

    cut = int(len(features)*0.8)
    X_tr = features.iloc[:cut].copy()
    X_te = features.iloc[cut:].copy()
    y_tr = labels.iloc[:cut].copy()
    y_te = labels.iloc[cut:].copy()

    # persist for next runs (Parquet if possible, else CSV)
    write_table(X_tr, DATA_DIR / "train")
    write_table(X_te, DATA_DIR / "test")
    for stem, series in (("y_train", y_tr), ("y_test", y_te)):
        write_table(pd.DataFrame({"y": series}), DATA_DIR / stem)
    print("Rebuilt splits → data/train.*, data/test.*, y_*.{parquet|csv}")
    return X_tr, X_te, y_tr.values, y_te.values

train_base = DATA_DIR / "train"
test_base  = DATA_DIR / "test"
ytr_base   = DATA_DIR / "y_train"
yte_base   = DATA_DIR / "y_test"

# Prefer the cached splits. Both the feature tables and the label tables are read
# inside the same try so that ANY missing file (features or labels) triggers a
# rebuild from Phase 2 instead of an unhandled FileNotFoundError.
try:
    X_tr = read_table(train_base).copy()
    X_te = read_table(test_base).copy()

    # labels either embedded or separate y files
    label_col = next((c for c in ["y","target","label","y_next_up"] if c in X_tr.columns), None)
    if label_col:
        y_tr = X_tr.pop(label_col).astype(int).values
        y_te = X_te.pop(label_col).astype(int).values
    else:
        y_tr = read_table(ytr_base).iloc[:,0].astype(int).values
        y_te = read_table(yte_base).iloc[:,0].astype(int).values
except FileNotFoundError:
    X_tr, X_te, y_tr, y_te = rebuild_splits_from_phase2()

# Feature and label tables are stored separately, so a stale file from another
# run can leave them out of step; fail here with a clear message rather than
# later inside a metric call.
if len(X_tr) != len(y_tr) or len(X_te) != len(y_te):
    raise ValueError(
        f"Feature/label length mismatch: train {len(X_tr)} vs {len(y_tr)}, "
        f"test {len(X_te)} vs {len(y_te)}."
    )

print("Loaded splits:", X_tr.shape, X_te.shape, "| labels:", len(y_tr), len(y_te))


Loaded splits: (1662, 14) (416, 14) | labels: 1662 416


In [6]:
# --- Load models ---
import joblib
# Expected artifact locations
path_lr  = ART_DIR / "lr.joblib"      # your saved LR model
path_xgb = ART_DIR / "model_xgb.pkl"  # only if you trained/saved XGB

# Availability flags are driven purely by file presence
HAS_LR, HAS_XGB = path_lr.exists(), path_xgb.exists()

# A model that is not on disk stays None
mdl_lr = mdl_xgb = None
if HAS_LR:
    mdl_lr = joblib.load(path_lr)
if HAS_XGB:
    mdl_xgb = joblib.load(path_xgb)
print("HAS_LR:", HAS_LR, "| HAS_XGB:", HAS_XGB)


HAS_LR: True | HAS_XGB: False


In [7]:
# --- Align features to training spec + apply scaler, then predict ---
import json

# 1) feature list
feat_path   = ART_DIR / "feature_list.json"
scaler_path = ART_DIR / "scaler.joblib"

# Resolution order: saved JSON list first, then the names recorded on the LR
# model; with neither available there is no way to align columns.
model_has_names = HAS_LR and hasattr(mdl_lr, "feature_names_in_")
if feat_path.exists():
    with open(feat_path, "r") as fh:
        feat_list = json.load(fh)
elif not model_has_names:
    raise RuntimeError("Missing artifacts/feature_list.json and model has no feature_names_in_.")
else:
    feat_list = list(mdl_lr.feature_names_in_)

def conform(df, cols, fill_value=0.0, strict=False):
    """
    Return a copy of ``df`` restricted to ``cols``, in that order.

    Columns named in ``cols`` but absent from ``df`` are created and filled
    with ``fill_value``; columns of ``df`` not named in ``cols`` are dropped.
    The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to align.
    cols : sequence of column labels
        Target column order.
    fill_value : scalar, default 0.0
        Value used for columns that have to be created.
    strict : bool, default False
        When True, raise instead of fabricating missing columns, so a schema
        mismatch between the data and the model is not silently hidden.

    Returns
    -------
    pandas.DataFrame
        New frame with exactly the columns in ``cols``.

    Raises
    ------
    KeyError
        If ``strict`` is True and at least one column is missing.
    """
    cols = list(cols)
    missing = [c for c in cols if c not in df.columns]
    if strict and missing:
        raise KeyError(f"Columns missing from frame: {missing}")

    out = df.copy()
    for c in missing:
        out[c] = fill_value
    return out[cols].copy()

# Same column set and order for both splits
X_tr_aligned, X_te_aligned = (conform(split, feat_list) for split in (X_tr, X_te))

# 2) scaler — start from the raw aligned arrays, transform only if a scaler was saved
X_tr_in, X_te_in = X_tr_aligned.values, X_te_aligned.values
if scaler_path.exists():
    scaler = joblib.load(scaler_path)
    X_tr_in, X_te_in = scaler.transform(X_tr_in), scaler.transform(X_te_in)

print("Aligned shapes:", X_tr_in.shape, X_te_in.shape)
print("Model expects:", getattr(mdl_lr, 'n_features_in_', 'unknown'))

# 3) predict (use aligned arrays!)
from sklearn.metrics import roc_auc_score, brier_score_loss
probs = {}

# LR is fed the (optionally scaled) aligned matrix
if mdl_lr is not None and HAS_LR:
    probs["LR"] = mdl_lr.predict_proba(X_te_in)[:, 1]

# XGB usually trained on same feat_list; scale only if you trained it with scaling
if mdl_xgb is not None and HAS_XGB:
    probs["XGB"] = mdl_xgb.predict_proba(X_te_aligned.values)[:, 1]

# quick metrics: one record per available model
rows = []
for name, p in probs.items():
    rows.append(dict(
        model=name,
        AUC_test=float(roc_auc_score(y_te, p)),
        Brier_test=float(brier_score_loss(y_te, p)),
        mean_p=float(np.mean(p)),
        n=int(len(p)),
    ))
pd.DataFrame(rows).round(4)


Aligned shapes: (1662, 8) (416, 8)
Model expects: 8


Unnamed: 0,model,AUC_test,Brier_test,mean_p,n
0,LR,0.4525,0.275,0.5395,416


In [8]:
# --- Predict on test (aligned) ---
from sklearn.metrics import roc_auc_score, brier_score_loss

probs = {}

# LR uses the scaled/aligned matrix
if mdl_lr is not None and HAS_LR:
    probs["LR"] = mdl_lr.predict_proba(X_te_in)[:, 1]

# XGB (if present) uses aligned *unscaled* features unless you also scaled at train time
if mdl_xgb is not None and HAS_XGB:
    probs["XGB"] = mdl_xgb.predict_proba(X_te_aligned.values)[:, 1]

# Nothing to evaluate without at least one model
if len(probs) == 0:
    raise RuntimeError("No models available to predict. Ensure lr.joblib or model_xgb.pkl exists.")

# Quick metrics table: one record per model, rounded for display
rows = []
for name, p in probs.items():
    rows.append(dict(
        model=name,
        AUC_test=float(roc_auc_score(y_te, p)),
        Brier_test=float(brier_score_loss(y_te, p)),
        mean_p=float(np.mean(p)),
        n=int(len(p)),
    ))
metrics = pd.DataFrame(rows).round(4)
display(metrics)


Unnamed: 0,model,AUC_test,Brier_test,mean_p,n
0,LR,0.4525,0.275,0.5395,416


In [9]:
# --- Build next-day returns (ret1) aligned to test period length ---
base_csv = DATA_DIR / "df_nb02.csv"
df_all = pd.read_csv(base_csv)
if "date" not in df_all.columns:
    raise KeyError("'date' missing in df_nb02.csv")

# Parse dates, drop rows whose date cannot be parsed, and order chronologically.
df_all["date"] = pd.to_datetime(df_all["date"], errors="coerce").dt.tz_localize(None)
df_all = df_all.dropna(subset=["date"]).sort_values("date").reset_index(drop=True)

price_col = next((c for c in ["adj_close","Adj Close","close","Close","price"] if c in df_all.columns), None)
if price_col is None:
    raise RuntimeError("No price column found in df_nb02.csv (need adj_close/close).")

# ret1 on row t is the simple return from row t to row t+1 (NaN on the last row).
# NOTE(review): computed over consecutive rows of the whole frame; assumes the
# file holds a single price series — confirm if several tickers are stacked.
df_all["ret1"] = df_all[price_col].pct_change().shift(-1)

# The test period is taken to be the last N rows. Guard the slice: iloc[-0:]
# would return the WHOLE frame, and N larger than the frame would silently
# yield fewer than N rows, misaligning returns with the predictions.
N = len(y_te)
if not 0 < N <= len(df_all):
    raise ValueError(
        f"Cannot align returns: need 1..{len(df_all)} test rows from df_nb02.csv, got N={N}."
    )
df_test_slice = df_all.iloc[-N:].copy().reset_index(drop=True)
dates_te = df_test_slice["date"].values
r_next   = df_test_slice["ret1"].fillna(0.0).values  # missing next-day return is treated as 0

print("Test span:", df_test_slice['date'].min(), "→", df_test_slice['date'].max(), "| n=", len(df_test_slice))


Test span: 2022-03-29 00:00:00 → 2023-12-28 00:00:00 | n= 416


In [10]:
# --- Backtest helpers ---
def positions_from_probs(p, tau, allow_short=False):
    """
    Map probabilities to positions using threshold ``tau``.

    Long/flat mode (default): 1.0 where ``p > tau``, else 0.0.
    Long/short/flat mode: 1.0 where ``p > tau``, -1.0 where ``p < 1 - tau``,
    0.0 otherwise; the long test is evaluated first.
    """
    go_long = p > tau
    if not allow_short:
        return go_long.astype(float)
    go_short = p < 1.0-tau
    return np.where(go_long, 1.0, np.where(go_short, -1.0, 0.0))

def trade_costs_from_positions(pos, fee_bps=5.0, slippage_bps=0.0):
    """
    Per-period trading cost as a fraction of capital.

    The position path is assumed to start flat (a 0.0 is prepended), so the
    first entry is charged too. Each period's cost is the absolute change in
    position times ``fee_bps + slippage_bps``, converted from basis points.
    """
    path = np.r_[0.0, pos]
    traded = np.abs(np.diff(path))
    bps_per_unit = fee_bps + slippage_bps
    return (traded * bps_per_unit) / 1e4

def equity_curve(returns, pos, fee_bps=5.0, slippage_bps=0.0):
    """
    Compound net strategy returns into an equity curve.

    Gross return is ``pos * returns``; costs come from
    ``trade_costs_from_positions``; net is gross minus costs.

    Returns
    -------
    (eq, parts) where ``eq`` is the cumulative product of ``1 + net`` and
    ``parts`` is a dict with keys "gross", "costs" and "net".
    """
    parts = {
        "gross": pos * returns,
        "costs": trade_costs_from_positions(pos, fee_bps, slippage_bps),
    }
    parts["net"] = parts["gross"] - parts["costs"]
    growth = 1.0 + parts["net"]
    return growth.cumprod(), parts

def perf_metrics(net_ret, dates=None, periods_per_year=252, pos=None):
    """
    Summarise a series of per-period net returns.

    Parameters
    ----------
    net_ret : array-like of float
        Net return per period.
    dates : sequence of date-likes, optional
        When given with more than one entry, the CAGR horizon is the calendar
        span between the first and last date (365.25-day years); otherwise it
        is ``len(net_ret) / periods_per_year``.
    periods_per_year : int, default 252
        Annualisation factor for volatility and Sharpe.
    pos : array-like of float, optional
        Positions held in each period. When given, turnover is the mean
        absolute position change (starting from flat). When omitted, exposure
        is inferred from ``net_ret != 0``, which is only a proxy: a period that
        pays an exit cost counts as exposed and a zero-return period in
        position counts as flat.

    Returns
    -------
    dict with keys total_return, CAGR, vol_annual, Sharpe, max_drawdown,
    hit_rate, turnover (all floats). ``hit_rate`` is the share of ALL periods
    with a strictly positive net return, so flat periods count as non-hits.

    Raises
    ------
    ValueError
        If ``net_ret`` is empty, or ``pos`` has a different shape.
    """
    net_ret = np.asarray(net_ret, dtype=float)
    if net_ret.size == 0:
        raise ValueError("perf_metrics needs at least one return observation.")

    growth = float(np.prod(1.0 + net_ret))
    tot_ret = growth - 1.0

    if dates is not None and len(dates) > 1:
        years = (pd.to_datetime(dates[-1]) - pd.to_datetime(dates[0])).days / 365.25
    else:
        years = len(net_ret) / periods_per_year
    # A negative growth factor has no real fractional power (Python would return
    # a complex number and float() would fail), so report NaN in that case.
    cagr = growth ** (1.0 / years) - 1.0 if (years > 0 and growth >= 0.0) else np.nan

    mu = np.mean(net_ret); sd = np.std(net_ret, ddof=1)
    # small epsilon keeps an all-flat strategy at Sharpe 0 instead of dividing by zero
    sharpe = (mu/(sd + 1e-12)) * np.sqrt(periods_per_year)

    curve = np.cumprod(1.0 + net_ret)
    peak  = np.maximum.accumulate(curve)
    maxdd = float((curve/peak - 1.0).min())

    hit_rate = float(np.mean(net_ret > 0.0))

    if pos is not None:
        exposure = np.asarray(pos, dtype=float)
        if exposure.shape != net_ret.shape:
            raise ValueError("pos must have the same shape as net_ret.")
    else:
        exposure = (net_ret != 0).astype(float)
    turnover = float(np.mean(np.abs(np.diff(np.r_[0.0, exposure]))))

    return {"total_return": float(tot_ret), "CAGR": float(cagr), "vol_annual": float(sd*np.sqrt(periods_per_year)),
            "Sharpe": float(sharpe), "max_drawdown": maxdd, "hit_rate": hit_rate, "turnover": turnover}


In [11]:
# --- Sweep tau and pick best Sharpe per model ---
results = []   # one record per (model, tau) pair
curves  = {}   # model name -> (equity curve, positions, return parts) at its best tau

# NOTE(review): the threshold is selected by Sharpe on the same test period that
# is then reported, so the "best" figures are optimistic; confirm whether tau
# should instead be chosen on a validation window.
for name, p in probs.items():
    best = None
    for tau in TAU_GRID:
        pos = positions_from_probs(p, tau, allow_short=ALLOW_SHORT)
        eq, parts = equity_curve(r_next, pos, FEE_BPS, SLIPPAGE_BPS)
        # named `perf` so the `metrics` DataFrame built by the prediction cell
        # is not overwritten with a dict
        perf = perf_metrics(parts["net"], dates_te)
        row = {"model": name, "tau": float(tau), **perf}
        results.append(row)
        # strict '>' keeps the first tau that reaches the maximum Sharpe
        if best is None or perf["Sharpe"] > best["Sharpe"]:
            best = {**row}
            curves[name] = (eq, pos, parts)
    if best is None:
        raise ValueError("TAU_GRID is empty; nothing to sweep.")
    print(f"{name}: best Sharpe @ tau={best['tau']:.3f} | Sharpe={best['Sharpe']:.2f} | CAGR={best['CAGR']:.2%}")

# Best Sharpe first within each model; the summary cell relies on this ordering.
bt_df = pd.DataFrame(results).sort_values(["model","Sharpe"], ascending=[True, False])
display(bt_df.head(12))

out_csv = DATA_DIR / "multiticker_tau_sweep.csv"
bt_df.to_csv(out_csv, index=False)
print("Saved:", out_csv)


LR: best Sharpe @ tau=0.540 | Sharpe=0.43 | CAGR=6.17%


Unnamed: 0,model,tau,total_return,CAGR,vol_annual,Sharpe,max_drawdown,hit_rate,turnover
4,LR,0.54,0.110391,0.061681,0.191302,0.426684,-0.161143,0.286058,0.086538
5,LR,0.55,0.073879,0.041583,0.182496,0.327327,-0.202796,0.269231,0.081731
3,LR,0.53,0.046769,0.026471,0.200241,0.237934,-0.202824,0.300481,0.086538
7,LR,0.57,0.038721,0.021953,0.168304,0.220221,-0.170964,0.233173,0.091346
6,LR,0.56,0.028869,0.016401,0.175484,0.185351,-0.196387,0.252404,0.076923
13,LR,0.63,0.021197,0.012062,0.141483,0.159846,-0.122411,0.122596,0.081731
12,LR,0.62,0.010087,0.005753,0.145827,0.113896,-0.14239,0.141827,0.096154
1,LR,0.51,0.002878,0.001644,0.205808,0.110968,-0.221052,0.310096,0.076923
0,LR,0.5,-0.011609,-0.006652,0.206687,0.068728,-0.223608,0.317308,0.076923
2,LR,0.52,-0.015098,-0.008658,0.203788,0.056298,-0.229025,0.300481,0.081731


Saved: data\multiticker_tau_sweep.csv


In [12]:
# --- Equity curve plots (best per model) ---
# One figure per model, drawn on an explicit Axes and written to FIG_DIR
for name, (eq, pos, parts) in curves.items():
    fig, ax = plt.subplots(figsize=(8,4))
    ax.plot(eq, label=f"{name}")
    ax.set_title(f"Equity Curve – {name}")
    ax.set_xlabel("Time (test)")
    ax.set_ylabel("Equity (x)")
    ax.grid(True, alpha=.3)
    ax.legend()
    out = FIG_DIR / f"equity_curve_{name.lower()}.png"
    fig.tight_layout()
    fig.savefig(out, dpi=150)
    # release the figure so repeated iterations do not accumulate open figures
    plt.close(fig)
    print("Saved:", out)


Saved: reports\figures\equity_curve_lr.png


In [13]:
# --- Backtest summary JSON ---
summary = {"fee_bps": FEE_BPS, "slippage_bps": SLIPPAGE_BPS, "allow_short": ALLOW_SHORT, "models": {}}

# bt_df is ordered best-Sharpe-first within each model, so the first row of a
# model's slice is its best threshold.
for name in probs:
    best_row = bt_df.loc[bt_df["model"] == name].iloc[0].to_dict()
    summary["models"][name] = {
        field: best_row[field]
        for field in ("tau", "Sharpe", "CAGR", "max_drawdown", "hit_rate", "total_return", "vol_annual")
    }

out_json = ART_DIR / "backtest_summary.json"
with open(out_json, "w") as f:
    json.dump(summary, f, indent=2)
print("Wrote:", out_json)
summary


Wrote: artifacts\backtest_summary.json


{'fee_bps': 5.0,
 'slippage_bps': 0.0,
 'allow_short': False,
 'models': {'LR': {'tau': 0.54,
   'Sharpe': 0.42668389532869555,
   'CAGR': 0.06168067915059794,
   'max_drawdown': -0.16114303068015778,
   'hit_rate': 0.2860576923076923,
   'total_return': 0.11039118539082615,
   'vol_annual': 0.19130180407665096}}}