In [6]:
# NB20 — Build crypto artifacts (artifacts_crypto/)
from pathlib import Path
import sys, json, numpy as np, pandas as pd

def find_repo_root(start: Path, must_have=("data", "artifacts")) -> Path:
    cur = start.resolve()
    for _ in range(6):
        if all((cur/m).exists() for m in must_have): return cur
        cur = cur.parent
    if start.name.lower()=="notebooks" and all((start.parent/m).exists() for m in must_have):
        return start.parent.resolve()
    raise FileNotFoundError("Repo root not found")

ROOT = find_repo_root(Path.cwd())
if str(ROOT) not in sys.path: sys.path.insert(0, str(ROOT))

print("Repo root →", ROOT)
from app.lib_features import add_nb02_features


Repo root → C:\.projects\stock-direction-ml\stock-direction-ml\notebooks


In [7]:
# NB20 — Fetch crypto data & build NB02C features (flatten columns per ticker)
import yfinance as yf
import numpy as np
import pandas as pd
from app.lib_features import add_nb02_features  # from NB18

tickers = ["BTC-USD", "ETH-USD"]  # add more later
start   = "2019-01-01"
end     = None  # to today

REQ_COLS = [
    "date","open","high","low","close","volume","ticker",
    "ret1","ret5","ret10","vol10","volz","rsi14","macd","macd_signal",
    "spy_close","mkt_ret1","mkt_ret5","vix_close","vix_chg1",
    "ret_next","y",
]

def fetch_base(ticker, start, end):
    px = yf.download(ticker, start=start, end=end, auto_adjust=True, progress=False)
    if px.empty:
        raise ValueError(f"No data for {ticker}")
    df = px.rename_axis("date").reset_index()
    df["date"] = pd.to_datetime(df["date"])
    df = df.rename(columns={"Open":"open","High":"high","Low":"low","Close":"close","Volume":"volume"})
    df = df[["date","open","high","low","close","volume"]].copy()
    df["ticker"] = ticker
    return df

# Market proxy series (BTC close)
btc_base = fetch_base("BTC-USD", start, end)[["date","close"]].rename(columns={"close":"btc_close"})

def enrich_crypto(df, btc_ref):
    # merge BTC market proxy on date
    df = df.merge(btc_ref, on="date", how="left")
    # NB02 features on this ticker's close
    df = add_nb02_features(df)
    # Market proxies
    df = df.rename(columns={"btc_close":"spy_close"})  # consistent name with equity pipeline
    df["mkt_ret1"] = df["spy_close"].pct_change(1)
    df["mkt_ret5"] = df["spy_close"].pct_change(5)
    # VIX proxy from BTC realized vol
    ret_btc = df["spy_close"].pct_change()
    vix_proxy = ret_btc.rolling(30).std() * np.sqrt(365) * 100.0
    df["vix_close"] = vix_proxy
    df["vix_chg1"]  = df["vix_close"].pct_change(1)
    # Targets
    df["ret_next"] = df["close"].pct_change().shift(-1)
    df["y"] = (df["ret_next"] > 0).astype(int)
    return df

def _flatten_to_single_level(dfc, ticker):
    """If columns are MultiIndex like ('open','BTC-USD'), keep only this ticker (or '') and flatten."""
    if not isinstance(dfc.columns, pd.MultiIndex):
        return dfc
    keep_cols = []
    new_names = []
    for col in dfc.columns:
        if not isinstance(col, tuple):
            keep_cols.append(col); new_names.append(col); continue
        base, lvl2 = col
        if lvl2 in ("", ticker):
            keep_cols.append(col); new_names.append(base)
    flat = dfc[keep_cols].copy()
    flat.columns = new_names
    return flat

dfs = []
for t in tickers:
    base = fetch_base(t, start, end)
    dfc  = enrich_crypto(base, btc_base)
    dfc  = _flatten_to_single_level(dfc, t)  # <<< flatten here
    missing = [c for c in REQ_COLS if c not in dfc.columns]
    print(f"{t}: rows={len(dfc)} | missing={missing if missing else 'None'}")
    dfs.append(dfc)

# vertical concat of single-level frames
df_all = pd.concat(dfs, ignore_index=True).sort_values(["ticker","date"]).reset_index(drop=True)

# final assert
missing_all = [c for c in REQ_COLS if c not in df_all.columns]
print("Built crypto frame:", df_all["ticker"].value_counts().to_dict(), "| cols:", len(df_all.columns))
print("Missing columns after concat:", missing_all if missing_all else "None")
print(df_all.head(3))
assert not missing_all, f"Required columns missing: {missing_all}"


  df = df.merge(btc_ref, on="date", how="left")


BTC-USD: rows=2489 | missing=None
ETH-USD: rows=2489 | missing=['spy_close']
Built crypto frame: {'BTC-USD': 2489, 'ETH-USD': 2489} | cols: 22
Missing columns after concat: None
        date         open         high          low        close      volume  \
0 2019-01-01  3746.713379  3850.913818  3707.231201  3843.520020  4324200990   
1 2019-01-02  3849.216309  3947.981201  3817.409424  3943.409424  5244856836   
2 2019-01-03  3931.048584  3935.685059  3826.222900  3836.741211  4530215219   

    ticker    spy_close      ret1  ret5  ...  volz       rsi14      macd  \
0  BTC-USD  3843.520020       NaN   NaN  ...   NaN         NaN  0.000000   
1  BTC-USD  3943.409424  0.025989   NaN  ...   NaN  100.000000  7.968386   
2  BTC-USD  3836.741211 -0.027050   NaN  ...   NaN   92.409202  5.611470   

   macd_signal  mkt_ret1  mkt_ret5  vix_close  vix_chg1  ret_next  y  
0     0.000000       NaN       NaN        NaN       NaN  0.025989  1  
1     1.593677  0.025989       NaN        NaN       Na

  df = df.merge(btc_ref, on="date", how="left")


In [8]:
# NB20 — Train StandardScaler + LogisticRegression for crypto
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss, log_loss, f1_score
import joblib

# Feature set (crypto-friendly subset)
CANDIDATES = [
    "ret1","ret5","ret10","vol10","volz","rsi14","macd","macd_signal",
    "mkt_ret1","mkt_ret5","vix_chg1"
]
feat_list = [c for c in CANDIDATES if c in df_all.columns]
print("Using features:", feat_list)

# Drop NA rows on features + labels
tmp = df_all.dropna(subset=feat_list + ["y","ret_next"]).copy()
X = tmp[feat_list].to_numpy()
y = tmp["y"].astype(int).to_numpy()

# Time-aware split: last 20% as test
split_idx = int(len(tmp)*0.8)
X_tr, X_te = X[:split_idx], X[split_idx:]
y_tr, y_te = y[:split_idx], y[split_idx:]

scaler = StandardScaler()
X_trs = scaler.fit_transform(X_tr)
X_tes = scaler.transform(X_te)

lr = LogisticRegression(max_iter=2000, class_weight="balanced", solver="liblinear")
lr.fit(X_trs, y_tr)

# Eval
def pred_proba(model, X):
    if hasattr(model,"predict_proba"):
        p = model.predict_proba(X)
        return p[:,1] if p.ndim==2 else p
    if hasattr(model,"decision_function"):
        s = model.decision_function(X); return 1/(1+np.exp(-s))
    return np.clip(model.predict(X).astype(float), 0, 1)

p_tr = np.clip(pred_proba(lr, X_trs), 1e-6, 1-1e-6)
p_te = np.clip(pred_proba(lr, X_tes), 1e-6, 1-1e-6)

def metrics(y, p):
    def safe(fn,*a):
        try: return float(fn(*a))
        except: return float("nan")
    return dict(
        auc=safe(roc_auc_score,y,p),
        ap=safe(average_precision_score,y,p),
        brier=safe(brier_score_loss,y,p),
        logloss=safe(log_loss,y,p)
    )

print("Train:", metrics(y_tr, p_tr))
print("Test :", metrics(y_te, p_te))


Using features: ['ret1', 'ret5', 'ret10', 'vol10', 'volz', 'rsi14', 'macd', 'macd_signal', 'mkt_ret1', 'mkt_ret5', 'vix_chg1']
Train: {'auc': 0.541502498415777, 'ap': 0.5619613535368221, 'brier': 0.24838391377896568, 'logloss': 0.6898855490406733}
Test : {'auc': 0.5374152063312606, 'ap': 0.5619853446714549, 'brier': 0.24867839778011683, 'logloss': 0.6904981909660823}


In [9]:
# NB20 — pick τ by final equity; write artifacts_crypto/*
ART_DIR = ROOT/"artifacts_crypto"
ART_DIR.mkdir(parents=True, exist_ok=True)

# Simple equity objective to choose τ on test
retn_te = tmp["ret_next"].to_numpy()[split_idx:]
def final_equity(retn, sig, fee_bps=5):
    flips = np.zeros_like(sig)
    if len(flips)>1: flips[1:] = (sig[1:] != sig[:-1]).astype(int)
    fee = flips * (fee_bps/10000.0)
    eq  = np.cumprod(1 + (retn*sig - fee))
    return float(eq[-1]) if len(eq) else float("nan")

taus = np.linspace(0.05,0.95,91)
eqs  = []
for t in taus:
    sig = (p_te >= t).astype(int)
    eqs.append(final_equity(retn_te, sig, fee_bps=5))
tau_best = float(taus[int(np.nanargmax(eqs))])
print("Chosen τ (by final equity):", tau_best)

# Save artifacts
(Path := ART_DIR/"feature_list.json").write_text(json.dumps(feat_list, indent=2), encoding="utf-8")
joblib.dump(scaler, ART_DIR/"scaler.joblib")
joblib.dump(lr,     ART_DIR/"lr.joblib")
(ART_DIR/"threshold.json").write_text(json.dumps({"tau": tau_best}, indent=2), encoding="utf-8")

# Per-ticker τ map (seed from same τ; you can customize later)
tau_map = {tk: tau_best for tk in df_all["ticker"].unique()}
(ART_DIR/"tau_map.json").write_text(json.dumps(tau_map, indent=2), encoding="utf-8")

print("Wrote:", (ART_DIR/"feature_list.json").resolve())
print("Wrote:", (ART_DIR/"scaler.joblib").resolve())
print("Wrote:", (ART_DIR/"lr.joblib").resolve())
print("Wrote:", (ART_DIR/"threshold.json").resolve())
print("Wrote:", (ART_DIR/"tau_map.json").resolve())


Chosen τ (by final equity): 0.4499999999999999
Wrote: C:\.projects\stock-direction-ml\stock-direction-ml\notebooks\artifacts_crypto\feature_list.json
Wrote: C:\.projects\stock-direction-ml\stock-direction-ml\notebooks\artifacts_crypto\scaler.joblib
Wrote: C:\.projects\stock-direction-ml\stock-direction-ml\notebooks\artifacts_crypto\lr.joblib
Wrote: C:\.projects\stock-direction-ml\stock-direction-ml\notebooks\artifacts_crypto\threshold.json
Wrote: C:\.projects\stock-direction-ml\stock-direction-ml\notebooks\artifacts_crypto\tau_map.json


In [10]:
# NB20 — presence check
targets = [
    ART_DIR/"feature_list.json",
    ART_DIR/"scaler.joblib",
    ART_DIR/"lr.joblib",
    ART_DIR/"threshold.json",
    ART_DIR/"tau_map.json",
]
for t in targets:
    print(f"{t.relative_to(ROOT)}:", t.exists())

# Quick smoke: load via app loader (crypto)
from app.lib_artifacts import load_artifacts
from app.lib_features  import make_dataset

feat_list2, scaler2, model2, tau_art2, tau_map2 = load_artifacts("crypto")
print("Loaded crypto artifacts — features:", len(feat_list2), "| default τ:", tau_art2)

# Build dataset for BTC only to test end-to-end
btc_df = df_all.loc[df_all["ticker"]=="BTC-USD"].copy()
Xb, yb, rb, idxb, used = make_dataset(btc_df, feat_list2)
pb = np.clip(pred_proba := (model2.predict_proba(scaler2.transform(Xb))[:,1]), 1e-6, 1-1e-6)
print("BTC rows:", len(pb), "| metrics (AUC, AP):",
      float(roc_auc_score(yb, pb)) if yb is not None else "n/a",
      float(average_precision_score(yb, pb)) if yb is not None else "n/a")


artifacts_crypto\feature_list.json: True
artifacts_crypto\scaler.joblib: True
artifacts_crypto\lr.joblib: True
artifacts_crypto\threshold.json: True
artifacts_crypto\tau_map.json: True
Loaded crypto artifacts — features: 11 | default τ: 0.4499999999999999
BTC rows: 2379 | metrics (AUC, AP): 0.5372644590937243 0.5536785240780647


In [11]:
print("Run the app (now with real crypto artifacts):")
print("  streamlit run app/streamlit_app.py")
print("In the sidebar:")
print("  • Data source = Fetch (Yahoo)")
print("  • Asset class = crypto")
print("  • Ticker = BTC-USD / ETH-USD")
print("")
print("Note: If you previously patched a fallback in streamlit_app.py, it's harmless to keep —")
print("      now that artifacts_crypto/ exists, the fallback path won't trigger.")


Run the app (now with real crypto artifacts):
  streamlit run app/streamlit_app.py
In the sidebar:
  • Data source = Fetch (Yahoo)
  • Asset class = crypto
  • Ticker = BTC-USD / ETH-USD

Note: If you previously patched a fallback in streamlit_app.py, it's harmless to keep —
      now that artifacts_crypto/ exists, the fallback path won't trigger.
