In [1]:
# NB22 — repo root + folders
from pathlib import Path

def find_repo_root(start: Path, must_have=("data","artifacts"), max_depth: int = 6) -> Path:
    """Locate the repository root by walking upward from ``start``.

    A directory qualifies as the root when every name in ``must_have`` exists
    directly inside it.

    Parameters
    ----------
    start : Path
        Directory where the search begins; it is the first one checked.
    must_have : iterable of str
        Entry names that must all exist inside the root directory.
    max_depth : int
        Maximum number of directories to inspect, counting ``start`` itself.

    Returns
    -------
    Path
        Resolved path of the first directory that contains every marker.

    Raises
    ------
    FileNotFoundError
        If none of the inspected directories contains every marker.
    """
    markers = tuple(must_have)  # materialise once so a one-shot iterable can be re-checked

    def has_markers(folder: Path) -> bool:
        return all((folder / m).exists() for m in markers)

    cur = start.resolve()
    for _ in range(max_depth):
        if has_markers(cur):
            return cur
        if cur.parent == cur:
            # Filesystem root reached: climbing further would only re-check the same directory.
            break
        cur = cur.parent

    # Fallback evaluated on the *unresolved* path: a "notebooks" folder whose
    # literal parent holds the markers (this can differ from the resolved chain above).
    if start.name.lower() == "notebooks" and has_markers(start.parent):
        return start.parent.resolve()

    raise FileNotFoundError(
        f"Repo root not found: no directory within {max_depth} level(s) of "
        f"{start} contains all of {list(markers)}"
    )

# Resolve the project root once; every other path is built from it.
ROOT = find_repo_root(Path.cwd())
print("Repo root →", ROOT)

# Output folders used by the later cells
REPORTS = ROOT / "reports"
FIGS = REPORTS / "figures"
DEMO = REPORTS / "demo"
for folder in (REPORTS, FIGS, DEMO):
    folder.mkdir(parents=True, exist_ok=True)

# Confirm each folder, shown relative to the root
for folder in (REPORTS, FIGS, DEMO):
    print(folder.relative_to(ROOT), "ok")


Repo root → C:\.projects\stock-direction-ml\stock-direction-ml\notebooks
reports ok
reports\figures ok
reports\demo ok


In [2]:
# NB22 — write README.md (safe: no triple-quote pitfalls)
from pathlib import Path

# README content, one list element per output line (an empty string is a blank line).
# The text is kept as ordinary string literals so the README's own ``` fences can
# appear verbatim; the list is joined with "\n" when the file is written.
lines = [
"# Stock/Crypto Direction Classifier",
"",
"An end-to-end ML project that predicts the **next-day direction** (up/down) for equities and crypto, with:",
"- Data prep & features (NB01–NB04)",
"- Model training + eval (Logistic Regression) (NB05–NB06, NB13)",
"- Backtest + threshold parity (NB07)",
"- Monitoring + promotion checks (NB10–NB12, NB21)",
"- Daily runner (NB14)",
"- Streamlit demo app (NB15–NB16, NB18–NB21)",
"- Crypto artifacts (NB20)",
"",
"> **Not financial advice.** This is a demo research system.",
"",
"## Quickstart",
"",
"### Run locally",
"```bash",
"# from the repo folder that contains app/",
"python -m pip install -r notebooks/requirements.txt",
"python -m streamlit run app/streamlit_app.py",
"```",
"",
"### Streamlit Cloud",
"- Main file path: `app/streamlit_app.py`",
"- Secrets are not required (uses Yahoo Finance for public data).",
"",
"## App Features",
"- **Data source:** Repo file or live fetch (Yahoo) for equities & crypto.",
"- **Metrics:** ROC AUC, PR AUC, Brier, LogLoss (when labels exist).",
"- **Charts:** Equity vs Buy & Hold, ROC/PR, Calibration, τ-sweep.",
"- **Controls:** Decision threshold (τ) & fee (bps). Per-ticker τ defaults.",
"- **Monitoring tab:** 60-day KPIs, drift summary (PSI/KS), backtest parity, promotion recommendation.",
"",
"### Screenshots (placeholders)",
"Add screenshots here (save under `reports/figures/`):",
"- `reports/figures/screenshot_model.png` – Model tab",
"- `reports/figures/screenshot_monitor.png` – Monitoring tab",
"",
"## Repo Layout",
"```",
"app/",
"  config.py",
"  lib_artifacts.py",
"  lib_features.py",
"  lib_fetch.py",
"  lib_eval.py",
"  lib_monitor.py",
"  streamlit_app.py",
"artifacts/                 # equity artifacts (features, scaler, lr, threshold, tau_map)",
"artifacts_crypto/          # crypto artifacts (NB20)",
"data/                      # df_nb02.* and signals.csv",
"reports/",
"  figures/",
"  demo/                    # small prediction CSVs for sharing",
"notebooks/                 # NB01–NB22",
"```",
"",
"## How It Works (short)",
"1. **Features:** Returns (1/5/10), realized vol (10d), z-vol, RSI(14), MACD+signal, market ret (SPY/BTC), VIX or BTC-vol proxy.",
"2. **Model:** Logistic Regression on standardized features.",
"3. **Threshold (τ):** Picked by final equity in backtest; can be per-ticker (`artifacts/*/tau_map.json`).",
"4. **Equity curve:** Long-only (1 = long, 0 = cash). Fee applied per position flip.",
"5. **Monitoring:** 60-day KPIs + drift + backtest parity → promotion PASS/HOLD.",
"",
"## Repro & Daily",
"- **Daily runner:** NB14 (updates data/paper trade).",
"- **Monitoring snapshot:** NB11 outputs `artifacts/monitor_snapshot.json`.",
"- **Backtest summary:** NB07 writes `artifacts/backtest_summary.json`.",
"",
"## Limitations",
"- AUC is modest (≈0.5–0.55 on splits); demonstration focus.",
"- Long-only, 1-day horizon; no risk sizing.",
"- Yahoo data used for convenience.",
"",
"## License",
"MIT (or your choice)",
]

# Join the README lines (newline-terminated) and write them under ROOT as UTF-8.
out = ROOT / "README.md"
readme_text = "\n".join(lines) + "\n"
out.write_text(readme_text, encoding="utf-8")
print("Wrote:", out.resolve())


Wrote: C:\.projects\stock-direction-ml\stock-direction-ml\notebooks\README.md


In [3]:
# NB22 — export demo predictions (equity)
import numpy as np, pandas as pd
from app.lib_artifacts import load_artifacts
from app.lib_features  import make_dataset

# Load the repo dataset: CSV if present, otherwise parquet
if (ROOT/"data/df_nb02.csv").exists():
    df = pd.read_csv(ROOT/"data/df_nb02.csv")
elif (ROOT/"data/df_nb02.parquet").exists():
    df = pd.read_parquet(ROOT/"data/df_nb02.parquet")
else:
    raise FileNotFoundError("Missing data/df_nb02.* in data/")

# Parse the first recognised date-like column and expose it as "date"
for c in ["date","Date","timestamp","ts"]:
    if c in df.columns:
        try:
            df[c] = pd.to_datetime(df[c])
        except Exception as exc:
            # Best effort: keep the raw values, but report the failure instead of hiding it.
            print(f"Warning: could not parse column {c!r} as datetime: {exc}")
        if c != "date":
            df["date"] = df[c]
        break

# Pick a ticker: prefer AAPL, otherwise the ticker with the most rows
ticker = None
if "ticker" in df.columns:
    if (df["ticker"] == "AAPL").any():
        ticker = "AAPL"
    else:
        ticker = df["ticker"].value_counts().idxmax()
if ticker:
    df = df.loc[df["ticker"]==ticker].copy()

feature_list, scaler, model, tau_art, tau_map = load_artifacts("equity")
# Threshold precedence: per-ticker entry in tau_map, then the artifact-level tau, then 0.59.
# An explicit None check (not truthiness) keeps a stored tau of 0.0 from being replaced.
base_tau = tau_art if tau_art is not None else 0.59
default_tau = float(tau_map.get(ticker, base_tau)) if ticker else float(base_tau)

X, y, retn, idx, used_cols = make_dataset(df, feature_list)
Xs = scaler.transform(X)
if hasattr(model, "predict_proba"):
    # Second column is used as the "up" probability — assumes class order [0, 1]; TODO confirm
    p = model.predict_proba(Xs); p = p[:,1] if p.ndim==2 else p
elif hasattr(model, "decision_function"):
    # Logistic squashing of the raw decision score
    s = model.decision_function(Xs); p = 1/(1+np.exp(-s))
else:
    p = np.clip(model.predict(Xs).astype(float), 0, 1)
# Flatten to 1-D and keep probabilities strictly inside (0, 1)
p = np.clip(np.asarray(p).ravel(), 1e-6, 1-1e-6)
sig = (p >= default_tau).astype(int)

# Rows of df that the predictions refer to.
# NOTE(review): assumes idx holds positional row locations — confirm against make_dataset.
rows_used = df.iloc[idx]
assert len(rows_used) == len(p), f"Length mismatch: p={len(p)} rows={len(rows_used)}"

ticker_label = ticker if ticker else (df["ticker"].iloc[0] if "ticker" in df.columns else "UNKNOWN")
pred = pd.DataFrame({
    # Without a "date" column, fall back to the index of the *selected* rows so lengths match
    "date": (rows_used["date"].values if "date" in df.columns else rows_used.index.values),
    "ticker": ticker_label,
    "close": (rows_used["close"].values if "close" in df.columns else np.nan),
    "proba": p,
    "signal": sig,
    "tau_used": default_tau
})
# Name the file from the label directly so an empty prediction frame cannot raise here
out_csv = (ROOT/"reports"/"demo"/f"preds_equity_{ticker_label}.csv")
pred.to_csv(out_csv, index=False)
print("Wrote:", out_csv.resolve())
print(pred.tail(3))


Wrote: C:\.projects\stock-direction-ml\stock-direction-ml\notebooks\reports\demo\preds_equity_AAPL.csv
           date ticker       close     proba  signal  tau_used
2683 2025-10-08   AAPL  258.059998  0.528984       1      0.45
2684 2025-10-09   AAPL  254.039993  0.561558       1      0.45
2685 2025-10-10   AAPL  245.270004  0.565084       1      0.45


In [4]:
# NB22 — export demo predictions (crypto) — bullet-proof
import numpy as np, pandas as pd
from app.lib_fetch     import fetch_crypto_df
from app.lib_artifacts import load_artifacts
from app.lib_features  import make_dataset

ticker = "BTC-USD"
# Take a single timestamp so start and end come from the same instant
# (two separate today() calls could straddle midnight and stretch the window by a day).
today = pd.Timestamp.today()
start = (today - pd.Timedelta(days=365)).date()
end   = today.date()

# Build df with NB02C features
dfc = fetch_crypto_df(ticker, start, end)  # may show a PerformanceWarning; OK

# Load crypto artifacts
feature_list, scaler, model, tau_art, tau_map = load_artifacts("crypto")
# Threshold precedence: per-ticker entry in tau_map, then the artifact-level tau, then 0.59.
fallback_tau = tau_art if tau_art is not None else 0.59
default_tau = float(tau_map.get(ticker, fallback_tau))

# Dataset aligned to features
X, y, retn, idx, used_cols = make_dataset(dfc, feature_list)
Xs = scaler.transform(X)

# Probabilities (1D)
if hasattr(model, "predict_proba"):
    # Second column is used as the "up" probability — assumes class order [0, 1]; TODO confirm
    proba = model.predict_proba(Xs)
    p = proba[:, 1] if proba.ndim == 2 else proba
elif hasattr(model, "decision_function"):
    # Logistic squashing of the raw decision score
    s = model.decision_function(Xs)
    p = 1/(1+np.exp(-s))
else:
    p = np.clip(model.predict(Xs).astype(float), 0, 1)
# Flatten and keep probabilities strictly inside (0, 1)
p   = np.clip(np.asarray(p).ravel(), 1e-6, 1-1e-6)
sig = (p >= default_tau).astype(int)  # already 1-D because p was flattened above

# Slice to used rows
# NOTE(review): assumes idx holds positional row locations — confirm against make_dataset.
# NOTE(review): reset_index(drop=True) discards dfc's original index, so if dfc has no
# "date" column, extract_dates() falls back to 0..n-1 rather than real dates.
df_used = dfc.iloc[idx].reset_index(drop=True)
n = len(df_used)
assert len(p) == n and len(sig) == n, f"Length mismatch: p={len(p)} sig={len(sig)} rows={n}"

# --- helpers to extract date/close robustly ---
def extract_dates(df_):
    """Return the dates of ``df_`` as a 1-D array.

    Uses the ``"date"`` column when present, parsed with ``pd.to_datetime`` so
    that unparseable entries become ``NaT``. If ``df_["date"]`` yields several
    columns (MultiIndex or duplicated column names), the first one is used.
    Without a ``"date"`` column the frame's index values are returned as-is.
    """
    if "date" in df_.columns:
        date_col = df_["date"]
        if isinstance(date_col, pd.DataFrame):
            # pd.to_datetime treats a DataFrame as year/month/day parts and would raise;
            # reduce to a single column first.
            date_col = date_col.iloc[:, 0]
        return pd.to_datetime(date_col, errors="coerce").to_numpy()
    return np.asarray(df_.index.values)

def extract_close(df_, tk: str):
    """Return the close prices of ``df_`` as a 1-D numeric array.

    Lookup order: with MultiIndex columns, ``("close", tk)``, then
    ``("close", "")``, then the flattened names ``close_<tk>`` and ``close``;
    with ordinary columns, just ``close``. Non-numeric entries are coerced to
    NaN. If no close column is found, or anything raises along the way, an
    all-NaN float array of length ``len(df_)`` is returned instead.
    """
    def all_nan():
        return np.full(len(df_), np.nan, dtype=float)

    try:
        columns = df_.columns
        if not isinstance(columns, pd.MultiIndex):
            if "close" not in columns:
                return all_nan()
            picked = df_["close"]
        elif ("close", tk) in columns:
            picked = df_[("close", tk)]
        elif ("close", "") in columns:
            picked = df_[("close", "")]
        else:
            # Collapse each column tuple into "a_b" (empty parts skipped) and retry by name
            flattened = [
                "_".join(str(part) for part in label if str(part))
                if isinstance(label, tuple)
                else str(label)
                for label in columns
            ]
            renamed = df_.copy()
            renamed.columns = flattened
            wanted = f"close_{tk}"
            if wanted in renamed.columns:
                picked = renamed[wanted]
            elif "close" in renamed.columns:
                picked = renamed["close"]
            else:
                return all_nan()
        # Force a flat numeric array; unparseable entries become NaN
        return pd.to_numeric(pd.Series(picked).to_numpy().ravel(), errors="coerce")
    except Exception:
        return all_nan()

# Pull the date and close columns as plain arrays
dates_axis = extract_dates(df_used)
close_vals = extract_close(df_used, ticker)

# Assemble the export frame; assign() sets the columns one at a time, in the order given
pred_c = pd.DataFrame(index=np.arange(n)).assign(
    date=dates_axis,
    ticker=ticker,
    close=close_vals,
    proba=p,
    signal=sig,
    tau_used=float(default_tau),
)

# Debug: per-column shapes
shape_by_col = {name: pred_c[name].shape for name in pred_c.columns}
print("Shapes:", shape_by_col)

# File name carries the ticker with its dash removed
file_tag = ticker.replace("-", "")
out_csv_c = ROOT / "reports" / "demo" / f"preds_crypto_{file_tag}.csv"
pred_c.to_csv(out_csv_c, index=False)
print("Wrote:", out_csv_c.resolve())
print(pred_c.tail(3))


Shapes: {'date': (255,), 'ticker': (255,), 'close': (255,), 'proba': (255,), 'signal': (255,), 'tau_used': (255,)}
Wrote: C:\.projects\stock-direction-ml\stock-direction-ml\notebooks\reports\demo\preds_crypto_BTCUSD.csv
          date   ticker          close     proba  signal  tau_used
252 2025-10-23  BTC-USD  110069.726562  0.499439       1      0.45
253 2025-10-24  BTC-USD  111033.921875  0.509068       1      0.45
254 2025-10-25  BTC-USD  111641.726562  0.510362       1      0.45


In [5]:
# NB22 — verify outputs
import pandas as pd

# Paths (relative to ROOT) whose existence is reported
targets = ["README.md", "reports/demo"]
for rel_path in targets:
    present = (ROOT / rel_path).exists()
    print(f"{rel_path:20s}:", present)

# List every CSV in reports/demo with its size and a two-row preview
demo_dir = ROOT / "reports" / "demo"
for csv_file in sorted(demo_dir.glob("*.csv")):
    size_bytes = csv_file.stat().st_size
    print("CSV:", csv_file.name, "| bytes:", size_bytes)
    print(pd.read_csv(csv_file).head(2))


README.md           : True
reports/demo        : True
CSV: preds_crypto_BTCUSD.csv | bytes: 18911
         date   ticker         close     proba  signal  tau_used
0  2025-02-13  BTC-USD  96623.867188  0.516461       1      0.45
1  2025-02-14  BTC-USD  97508.968750  0.496967       1      0.45
CSV: preds_equity_AAPL.csv | bytes: 202195
         date ticker      close     proba  signal  tau_used
0  2015-02-06   AAPL  26.495501  0.561385       1      0.45
1  2015-02-09   AAPL  26.671507  0.542039       1      0.45
