In [4]:
# NB19 — Cell 0 (optional): install deps into THIS notebook kernel
import sys, subprocess, pathlib

# Show which interpreter the pip calls below will run under.
print("Python:", sys.executable)

# Every install goes through "<this python> -m pip install ...".
pip_install = [sys.executable, "-m", "pip", "install"]

# Use requirements.txt from the working directory, else from its parent;
# `req` stays None when neither file exists.
req = next(
    (
        candidate
        for candidate in (
            pathlib.Path("requirements.txt"),
            pathlib.Path("..")/"requirements.txt",
        )
        if candidate.exists()
    ),
    None,
)

# Upgrade pip itself first, then install the dependencies.
subprocess.check_call(pip_install + ["--upgrade", "pip"])
if req is None:
    # No requirements file found: install the fallback package list.
    subprocess.check_call(pip_install + ["numpy", "pandas", "scikit-learn", "joblib",
                                         "matplotlib", "pyarrow"])
else:
    print("Installing from:", req)
    subprocess.check_call(pip_install + ["-r", str(req)])
print("✅ Done. If prompted, restart the kernel, then continue with Cell 1.")


Python: c:\.projects\stock-direction-ml\stock-direction-ml\.venv\Scripts\python.exe
Installing from: requirements.txt
✅ Done. If prompted, restart the kernel, then continue with Cell 1.


In [5]:
# NB19 — Repo root & imports
from pathlib import Path
import sys, json, numpy as np, pandas as pd

def find_repo_root(start: Path, must_have=("data", "artifacts"), max_depth: int = 6) -> Path:
    """Return the nearest directory at or above ``start`` containing every marker.

    Parameters
    ----------
    start : Path
        Directory to begin the search from (it is resolved first).
    must_have : iterable of str
        Names of entries (files or directories) that must all exist directly
        inside a directory for it to count as the repo root.
    max_depth : int
        How many directories to examine, counting ``start`` itself.
        Defaults to 6.

    Returns
    -------
    Path
        The first matching directory, as a resolved path.

    Raises
    ------
    FileNotFoundError
        If no examined directory contains all of the markers.
    """
    # Materialise once so a one-shot iterable can be checked repeatedly.
    markers = tuple(must_have)

    def has_markers(folder: Path) -> bool:
        return all((folder / m).exists() for m in markers)

    cur = start.resolve()
    for _ in range(max_depth):
        if has_markers(cur):
            return cur
        if cur.parent == cur:
            # Filesystem root reached; going further would re-check the same directory.
            break
        cur = cur.parent

    # Fallback on the *unresolved* parent: this only differs from the walk above
    # when `start` is reached through a symlink named "notebooks".
    if start.name.lower() == "notebooks" and has_markers(start.parent):
        return start.parent.resolve()

    raise FileNotFoundError(
        f"Repo root not found: no directory containing {list(markers)} "
        f"within {max_depth} level(s) at or above {start.resolve()}"
    )

# Locate the repo root from the current working directory and put it at the
# front of sys.path unless it is already listed there.
ROOT = find_repo_root(Path.cwd())
if all(entry != str(ROOT) for entry in sys.path):
    sys.path.insert(0, str(ROOT))

# Report what was found so a wrong root is easy to spot.
print("Repo root →", ROOT)
print("Has data?      ", ROOT.joinpath("data").exists())
print("Has artifacts? ", ROOT.joinpath("artifacts").exists())

from app.lib_artifacts import load_artifacts
from app.lib_features  import make_dataset
from app.lib_eval      import predict_proba


Repo root → C:\.projects\stock-direction-ml\stock-direction-ml\notebooks
Has data?       True
Has artifacts?  True


In [6]:
# NB19 — Load df_nb02.* and artifacts (equities)
# The CSV export is preferred; the parquet file is read only when no CSV exists.
if (ROOT/"data/df_nb02.csv").exists():
    df = pd.read_csv(ROOT/"data/df_nb02.csv")
elif (ROOT/"data/df_nb02.parquet").exists():
    df = pd.read_parquet(ROOT/"data/df_nb02.parquet")
else:
    raise FileNotFoundError("Missing data/df_nb02.* in data/")

# unify/parse date column if present: the first matching name is parsed and,
# when it is not already called "date", copied into a "date" column.
for c in ["date","Date","timestamp","ts"]:
    if c in df.columns:
        try:
            df[c] = pd.to_datetime(df[c])
        except Exception as exc:
            # Still best-effort (the raw values are kept), but the failure is
            # reported and KeyboardInterrupt/SystemExit are no longer swallowed.
            print(f"Warning: could not parse column {c!r} as datetime ({exc}); leaving it unparsed")
        if c != "date":
            df["date"] = df[c]
        break

# load_artifacts is imported from app.lib_artifacts; its result is unpacked
# into the five names used by the later cells.
feature_list, scaler, model, tau_art, tau_map_existing = load_artifacts("equity")
print("Rows:", len(df), "| has 'ticker'?", "ticker" in df.columns)
print("Default τ from artifacts:", tau_art, "| existing tau_map entries:", len(tau_map_existing))


Rows: 2686 | has 'ticker'? True
Default τ from artifacts: 0.59 | existing tau_map entries: 0


In [7]:
# NB19 — Helpers
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, brier_score_loss, log_loss

def safe(fn, *a):
    """Call ``fn(*a)`` and return the result as a float, or NaN on any failure.

    Any ``Exception`` raised by the call or by the float conversion is
    swallowed and reported as ``float("nan")``.
    """
    try:
        outcome = fn(*a)
        return float(outcome)
    except Exception:
        return float("nan")

def final_equity(retn, sig, fee_bps=5):
    """Return the final compounded equity of a signal-driven strategy.

    Each bar contributes ``1 + retn * sig - fee``, where ``fee`` is
    ``fee_bps / 10000`` on bars whose signal differs from the previous bar
    and 0 otherwise (the first bar is never charged). The result is the last
    value of the cumulative product.

    Parameters
    ----------
    retn : array-like
        Per-bar returns; converted to a float ndarray.
    sig : array-like
        Per-bar signal/position, same length as ``retn``; converted to an ndarray.
    fee_bps : float
        Fee charged per signal change, in basis points.

    Returns
    -------
    float
        Final equity, or NaN when there are no bars.
    """
    # Coerce to ndarrays so lists work and positional indexing (eq[-1]) is
    # never interpreted as a label lookup on a pandas object.
    retn = np.asarray(retn, dtype=float)
    sig = np.asarray(sig)

    # 1.0 where the signal changed versus the previous bar, else 0.0.
    flips = np.zeros(sig.shape, dtype=float)
    if len(flips) > 1:
        flips[1:] = sig[1:] != sig[:-1]

    fee = flips * (fee_bps / 10000.0)
    eq = np.cumprod(1 + (retn * sig - fee))
    return float(eq[-1]) if len(eq) else float("nan")

def sweep_tau(y, p, retn, fee_bps=5, grid=np.linspace(0.05, 0.95, 91)):
    """Sweep decision thresholds and return the best ones by F1 and by final equity.

    For every threshold ``t`` in ``grid`` the signal is ``(p >= t)`` as 0/1;
    it is scored with ``f1_score(y, sig)`` (via ``safe``) and with
    ``final_equity(retn, sig, fee_bps)``.

    Parameters
    ----------
    y : array-like
        Labels passed to ``f1_score``.
    p : array-like
        Scores compared against each threshold; converted to a float ndarray.
    retn : array-like
        Per-bar returns passed to ``final_equity``.
    fee_bps : float
        Fee in basis points passed to ``final_equity``.
    grid : array-like
        Candidate thresholds.

    Returns
    -------
    tuple
        ``(best_f1_tau, best_eq_tau)``. Each entry is a float, or ``None``
        when every score for that criterion is NaN (or ``grid`` is empty).
    """
    p = np.asarray(p, dtype=float)
    grid = np.asarray(grid, dtype=float)

    def _best_tau(scores):
        # np.nanargmax raises ValueError on an all-NaN (or empty) input, so
        # both criteria are guarded the same way and yield None instead.
        scores = np.asarray(scores, dtype=float)
        if scores.size == 0 or np.isnan(scores).all():
            return None
        return float(grid[int(np.nanargmax(scores))])

    f1s, finals = [], []
    for t in grid:
        sig = (p >= t).astype(int)
        f1s.append(safe(f1_score, y, sig))
        finals.append(final_equity(retn, sig, fee_bps))

    return _best_tau(f1s), _best_tau(finals)


In [8]:
# NB19 — Compute per-ticker τ (equities)
MIN_ROWS = 80   # per-ticker series shorter than this are skipped
FEE_BPS = 5     # fee in basis points handed to sweep_tau
# Fixed column list so res_df keeps its schema even when no ticker yields a row.
RESULT_COLUMNS = [
    "ticker", "rows", "used_rows", "features",
    "auc", "ap", "brier", "logloss",
    "best_f1_tau", "best_eq_tau", "chosen_tau",
]

results = []
tau_map = dict(tau_map_existing)  # start from existing

# With no "ticker" column the whole frame is processed once under the key "_GLOBAL".
tickers = [None] if "ticker" not in df.columns else sorted(df["ticker"].dropna().unique().tolist())
for tk in tickers:
    sdf = df if tk is None else df.loc[df["ticker"] == tk].copy()
    if tk is not None and len(sdf) < MIN_ROWS:   # skip very short series
        continue

    try:
        X, y, retn, idx, used_cols = make_dataset(sdf, feature_list)
    except Exception as e:
        print(f"Skip {tk}: {e}")
        continue
    if len(X)==0 or y is None:
        print(f"Skip {tk}: no usable rows/labels")
        continue

    # Clip probabilities away from exactly 0 and 1 before scoring.
    p = np.clip(predict_proba(model, scaler.transform(X)), 1e-6, 1-1e-6)

    auc   = safe(roc_auc_score, y, p)
    ap    = safe(average_precision_score, y, p)
    brier = safe(brier_score_loss, y, p)
    ll    = safe(log_loss, y, p)

    f1_tau, eq_tau = sweep_tau(y, p, retn, fee_bps=FEE_BPS)
    chosen_tau = eq_tau   # choose by final equity; switch to f1_tau if you prefer
    if chosen_tau is None:
        # No threshold could be selected; skip rather than crash on float(None).
        print(f"Skip {tk}: no valid τ found")
        continue

    key = tk if tk is not None else "_GLOBAL"
    tau_map[key] = float(chosen_tau)

    results.append(dict(
        ticker=key, rows=len(sdf), used_rows=len(p), features=len(used_cols),
        auc=auc, ap=ap, brier=brier, logloss=ll,
        best_f1_tau=f1_tau, best_eq_tau=eq_tau, chosen_tau=chosen_tau
    ))

# Passing `columns` keeps "ticker" present for sort_values when `results` is empty.
res_df = (
    pd.DataFrame(results, columns=RESULT_COLUMNS)
    .sort_values(by=["ticker"])
    .reset_index(drop=True)
)
if res_df.empty:
    print("No ticker produced usable results; res_df is empty.")
print(res_df.head(10))


  ticker  rows  used_rows  features       auc        ap     brier   logloss  \
0   AAPL  2686       2686        16  0.521484  0.549973  0.250372  0.693868   

   best_f1_tau  best_eq_tau  chosen_tau  
0         0.15         0.45        0.45  


In [9]:
# NB19 — Write artifacts/tau_map.json and reports/ticker_metrics.csv
# Serialise the τ map first, then write it out as UTF-8 JSON.
out_map = ROOT / "artifacts" / "tau_map.json"
tau_json = json.dumps(tau_map, indent=2)
out_map.write_text(tau_json, encoding="utf-8")
print("Wrote τ map:", out_map.resolve())

# The reports directory is created on demand; the metrics go out without the index.
reports_dir = ROOT / "reports"
reports_dir.mkdir(parents=True, exist_ok=True)
out_csv = reports_dir / "ticker_metrics.csv"
res_df.to_csv(out_csv, index=False)
print("Wrote metrics CSV:", out_csv.resolve())

# show first lines of tau_map (read back from disk, at most 600 characters)
tau_preview = out_map.read_text(encoding="utf-8")[:600]
print(tau_preview + "\n...")


Wrote τ map: C:\.projects\stock-direction-ml\stock-direction-ml\notebooks\artifacts\tau_map.json
Wrote metrics CSV: C:\.projects\stock-direction-ml\stock-direction-ml\notebooks\reports\ticker_metrics.csv
{
  "AAPL": 0.4499999999999999
}
...


In [10]:
# NB19 — Quick check + instructions
# Show at most ten keys, with a trailing "..." marker only when there are more.
shown_keys = list(tau_map)[:10]
print("tau_map keys:", shown_keys, "..." if len(tau_map) > 10 else "")

# Follow-up steps, printed one line at a time.
for step_line in (
    "\nNext:",
    "  1) Run the app and verify defaults:",
    "     streamlit run app/streamlit_app.py",
    "     • Pick a ticker present in tau_map.json → τ slider should default to that value.",
    "  2) Commit:",
    "     git add artifacts/tau_map.json reports/ticker_metrics.csv",
    "     git commit -m \"nb19: seed per-ticker tau_map + metrics\"",
    "     git push",
):
    print(step_line)


tau_map keys: ['AAPL'] 

Next:
  1) Run the app and verify defaults:
     streamlit run app/streamlit_app.py
     • Pick a ticker present in tau_map.json → τ slider should default to that value.
  2) Commit:
     git add artifacts/tau_map.json reports/ticker_metrics.csv
     git commit -m "nb19: seed per-ticker tau_map + metrics"
     git push
