In [1]:
# --- Imports & params ---
import os, json, pathlib, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Optional dep (market data)
try:
    import yfinance as yf
except Exception:
    yf = None

# Optional deps (models/explainability)
from pathlib import Path
from sklearn.metrics import roc_auc_score, brier_score_loss, roc_curve, precision_recall_curve, average_precision_score
from sklearn.calibration import calibration_curve

try:
    import joblib
except Exception:
    joblib = None

try:
    import shap
except Exception:
    shap = None

warnings.filterwarnings("ignore")

# Base I/O
DATA_DIR = pathlib.Path("data")
ART_DIR  = pathlib.Path("artifacts")
FIG_DIR  = pathlib.Path("reports/figures")
for p in [DATA_DIR, ART_DIR, FIG_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# Experiment switches
USE_MARKET        = True     # SPY/VIX context
USE_FUNDAMENTALS  = False    # placeholder
USE_NEWS          = False    # placeholder

# Run settings
TICKER = "AAPL"
START, END = "2015-01-01", "2023-12-31"


In [2]:
# --- Load the base features/labels from Phase 2 ---
base_csv = DATA_DIR / "df_nb02.csv"
if not base_csv.exists():
    raise FileNotFoundError("Expected Phase-2 output at data/df_nb02.csv. Run notebook 02 first.")

df = pd.read_csv(base_csv)

# Ensure tz-naive datetime
if "date" not in df.columns:
    raise KeyError("'date' column missing in df_nb02.csv")
df["date"] = pd.to_datetime(df["date"], errors="coerce").dt.tz_localize(None)
df = df.dropna(subset=["date"]).reset_index(drop=True)

print(df.shape)
df.head(3)


(2687, 20)


Unnamed: 0,date,open,high,low,close,volume,ret1,ret5,ret10,vol10,volz,rsi14,macd,macd_signal,ticker,spy_close,vix_close,mkt_ret1,mkt_ret5,vix_chg1
0,2015-02-06,26.738338,26.789579,26.38857,26.495506,174826400.0,-0.008421,0.019114,0.056819,0.023677,-0.879268,47.873804,0.616323,0.436496,AAPL,171.193649,17.290001,-0.002765,0.030584,0.026113
1,2015-02-09,26.410844,26.698232,26.38411,26.671499,155559200.0,0.006642,0.013171,0.06271,0.023624,-1.02649,49.439594,0.633758,0.475949,AAPL,170.427444,18.549999,-0.004476,0.013421,0.072874
2,2015-02-10,26.771758,27.212867,26.769531,27.183905,248034000.0,0.019212,0.032461,0.122426,0.019335,-0.104747,53.791711,0.68107,0.516973,AAPL,172.243042,17.23,0.010653,0.009617,-0.071159


In [3]:
# --- Market context via alignment (no merges) ---
def fetch_close_series(ticker: str, start: str, end: str) -> pd.Series:
    """
    Return a pd.Series of adjusted Close with a tz-naive DatetimeIndex.
    Works whether yfinance returns single-level or MultiIndex columns.
    """
    if yf is None:
        raise ImportError("Please `pip install yfinance` to enable USE_MARKET=True.")

    r = yf.download(ticker, start=start, end=end, auto_adjust=True, progress=False)
    if r is None or r.empty:
        raise ValueError(f"No data for {ticker} in {start}..{end}")

    idx = pd.to_datetime(r.index, errors="coerce")
    try:
        if getattr(idx, "tz", None) is not None:
            idx = idx.tz_localize(None)
    except Exception:
        idx = pd.to_datetime(idx, errors="coerce").tz_localize(None)

    if isinstance(r.columns, pd.MultiIndex):
        close = r.xs("Close", axis=1, level=0)
        if isinstance(close, pd.DataFrame):
            close = close.iloc[:, 0]
    else:
        close = r["Close"]

    s = pd.Series(np.asarray(close).reshape(-1), index=idx, name="Close")
    s = s.sort_index()
    s = s[~s.index.duplicated(keep="last")]
    return s

if USE_MARKET:
    dti = pd.to_datetime(df["date"], errors="coerce").dt.tz_localize(None)

    spy = fetch_close_series("SPY", START, END)
    vix = fetch_close_series("^VIX", START, END)

    # Align by label (same-day)
    df["spy_close"] = dti.map(spy)
    df["vix_close"] = dti.map(vix)

    # Context features
    df["mkt_ret1"] = df["spy_close"].pct_change(1)
    df["mkt_ret5"] = df["spy_close"].pct_change(5)
    df["vix_chg1"] = df["vix_close"].pct_change(1)

    # Keep rows with all context features present
    df = df.dropna(subset=["spy_close","vix_close","mkt_ret1","mkt_ret5","vix_chg1"]).reset_index(drop=True)

print(df.shape)
df.filter(["date","spy_close","mkt_ret1","mkt_ret5","vix_close","vix_chg1"]).head(5)


(2235, 20)


Unnamed: 0,date,spy_close,mkt_ret1,mkt_ret5,vix_close,vix_chg1
0,2015-02-13,174.716675,0.004117,0.020579,14.69,-0.042373
1,2015-02-17,174.99147,0.001573,0.02678,15.8,0.075562
2,2015-02-18,175.008133,9.5e-05,0.016054,15.45,-0.022152
3,2015-02-19,174.883163,-0.000714,0.014739,15.29,-0.010356
4,2015-02-20,175.932587,0.006001,0.011105,14.3,-0.064748


In [4]:
# --- Quality checks ---
req = ["spy_close","vix_close","mkt_ret1","mkt_ret5","vix_chg1"]
present = [c for c in req if c in df.columns]
missing = [c for c in req if c not in df.columns]

print("Shape:", df.shape)
print("Present:", present)
print("Missing:", missing)

if present:
    print("\nNulls in context cols:")
    print(df[present].isna().sum())

    print("\nReturn stats:")
    print(df[["mkt_ret1","mkt_ret5","vix_chg1"]].describe().T)


Shape: (2235, 20)
Present: ['spy_close', 'vix_close', 'mkt_ret1', 'mkt_ret5', 'vix_chg1']
Missing: []

Nulls in context cols:
spy_close    0
vix_close    0
mkt_ret1     0
mkt_ret5     0
vix_chg1     0
dtype: int64

Return stats:
           count      mean       std       min       25%       50%       75%  \
mkt_ret1  2235.0  0.000506  0.011414 -0.109424 -0.003734  0.000555  0.005916   
mkt_ret5  2235.0  0.002508  0.023450 -0.179694 -0.007008  0.004305  0.014748   
vix_chg1  2235.0  0.003202  0.085385 -0.259057 -0.044079 -0.007215  0.035938   

               max  
mkt_ret1  0.090603  
mkt_ret5  0.173581  
vix_chg1  1.155979  


In [5]:
# --- Save experiment output (separate file so we don't touch Phase-2) ---
out_name = "df_nb06_market.csv" if USE_MARKET else "df_nb06_base.csv"
out_path = DATA_DIR / out_name
df.to_csv(out_path, index=False)
print("Saved:", out_path)

# --- Record storage format (quiet) ---
meta_path = DATA_DIR / "storage_format.json"
record = {"path": str(out_path), "format": "csv"}

try:
    if meta_path.exists():
        meta = json.load(open(meta_path, "r", encoding="utf-8"))
        if isinstance(meta, dict):
            meta = [meta]
    else:
        meta = []
    meta = [m for m in meta if m.get("path") != record["path"]] + [record]
    json.dump(meta, open(meta_path, "w", encoding="utf-8"), indent=2)
except Exception:
    json.dump([record], open(meta_path, "w", encoding="utf-8"), indent=2)

print("Updated:", meta_path)


Saved: data\df_nb06_market.csv
Updated: data\storage_format.json


In [6]:
# --- Load train/test splits (Parquet-or-CSV) with robust label fallback ---
from pathlib import Path
import numpy as np
import pandas as pd

DATA_DIR = Path("data")

def _can_parquet():
    try:
        import pyarrow  # noqa: F401
        return True
    except Exception:
        try:
            import fastparquet  # noqa: F401
            return True
        except Exception:
            return False

def read_table(base: Path) -> pd.DataFrame:
    """Read <base>.parquet if exists else <base>.csv."""
    p_parq = base.with_suffix(".parquet")
    p_csv  = base.with_suffix(".csv")
    if p_parq.exists():
        return pd.read_parquet(p_parq)
    if p_csv.exists():
        return pd.read_csv(p_csv)
    raise FileNotFoundError(f"Neither {p_parq.name} nor {p_csv.name} exists.")

def write_table(df: pd.DataFrame, base: Path):
    """Write <base>.parquet if engine available else <base>.csv."""
    if _can_parquet():
        df.to_parquet(base.with_suffix(".parquet"), index=False)
    else:
        df.to_csv(base.with_suffix(".csv"), index=False)

def find_or_build_label(df: pd.DataFrame) -> str:
    """Return name of label column, creating 'y' if needed from next-day return."""
    # already present?
    for cand in ["y","target","label","y_next_up"]:
        if cand in df.columns:
            return cand
    # derive from next-day ret1 or from close
    if "ret1" in df.columns:
        next_ret = pd.Series(df["ret1"]).shift(-1)
    else:
        price_col = next((c for c in ["close","Close","Adj Close","adj_close"] if c in df.columns), None)
        if price_col is None:
            raise RuntimeError("No label found and cannot derive it: need 'ret1' or a close-price column.")
        next_ret = pd.Series(df[price_col]).pct_change().shift(-1)
    df["y"] = (next_ret > 0).astype(int)
    return "y"

# 1) Try pre-built train/test (parquet or csv) -------------------------------
try:
    X_tr = read_table(DATA_DIR / "train").copy()
    X_te = read_table(DATA_DIR / "test").copy()

    # peel labels if embedded; else read separate y files
    label_col = next((c for c in ["y","target","label","y_next_up"] if c in X_tr.columns), None)
    if label_col:
        y_tr = X_tr.pop(label_col).astype(int).values
        y_te = X_te.pop(label_col).astype(int).values
    else:
        y_tr = read_table(DATA_DIR / "y_train").iloc[:,0].astype(int).values
        y_te = read_table(DATA_DIR / "y_test").iloc[:,0].astype(int).values

except FileNotFoundError:
    # 2) Fallback: rebuild splits from df_nb02.csv ---------------------------
    base_csv = DATA_DIR / "df_nb02.csv"
    if not base_csv.exists():
        raise RuntimeError(
            "Label not found and pre-built splits missing. "
            "Expected data/df_nb02.csv to rebuild."
        )

    df2 = pd.read_csv(base_csv)
    # normalize dates/order
    if "date" in df2.columns:
        df2["date"] = pd.to_datetime(df2["date"], errors="coerce").dt.tz_localize(None)
        df2 = df2.dropna(subset=["date"]).sort_values("date").reset_index(drop=True)

    # find/derive label
    label_col = find_or_build_label(df2)

    # drop obvious non-features; keep engineered returns, etc.
    drop_cols = [c for c in ["date","ticker","symbol","spy_close","vix_close"] if c in df2.columns]
    y_all = df2.pop(label_col).astype(int).values
    X_all = df2.drop(columns=drop_cols, errors="ignore").select_dtypes(include=[np.number]).copy()

    # time split: last 20% as test
    n = len(X_all); cut = int(n * 0.8)
    X_tr, X_te = X_all.iloc[:cut].copy(), X_all.iloc[cut:].copy()
    y_tr, y_te = y_all[:cut], y_all[cut:]

    # persist for next runs
    write_table(X_tr, DATA_DIR / "train")
    write_table(X_te, DATA_DIR / "test")
    write_table(pd.DataFrame({"y": y_tr}), DATA_DIR / "y_train")
    write_table(pd.DataFrame({"y": y_te}), DATA_DIR / "y_test")
    print("Rebuilt splits → data/train.*, data/test.*, y_*.{parquet|csv}")

print("Shapes:", X_tr.shape, X_te.shape, "| labels:", len(y_tr), len(y_te))
print("Label balance (train/test):", float(np.mean(y_tr)).__round__(3), "/", float(np.mean(y_te)).__round__(3))


Shapes: (1662, 14) (416, 14) | labels: 1662 416
Label balance (train/test): 0.532 / 0.524


In [7]:
# --- Load trained models from Phase 5 (if available) ---
path_lr  = ART_DIR / "model_logreg.pkl"
path_xgb = ART_DIR / "model_xgb.pkl"

HAS_LR  = path_lr.exists() and joblib is not None
HAS_XGB = path_xgb.exists() and joblib is not None

mdl_lr  = joblib.load(path_lr)  if HAS_LR  else None
mdl_xgb = joblib.load(path_xgb) if HAS_XGB else None

print("HAS_LR:", HAS_LR, "| HAS_XGB:", HAS_XGB)


HAS_LR: False | HAS_XGB: False


In [8]:
# --- Predict on test set ---
p_te_lr = mdl_lr.predict_proba(X_te)[:,1] if HAS_LR else None
p_te_xgb = mdl_xgb.predict_proba(X_te)[:,1] if HAS_XGB else None

rows = []
if p_te_lr is not None:
    rows.append({
        "model": "logreg",
        "AUC_test": float(roc_auc_score(y_te, p_te_lr)),
        "Brier_test": float(brier_score_loss(y_te, p_te_lr))
    })
if p_te_xgb is not None:
    rows.append({
        "model": "xgb",
        "AUC_test": float(roc_auc_score(y_te, p_te_xgb)),
        "Brier_test": float(brier_score_loss(y_te, p_te_xgb))
    })

summary = pd.DataFrame(rows).round(4)
display(summary)

(DATA_DIR / "explainability_summary.csv").write_text(summary.to_csv(index=False))
print("Saved:", DATA_DIR / "explainability_summary.csv")


Saved: data\explainability_summary.csv


In [9]:
# --- ROC and PR curves ---
def plot_and_save_curves(y, p, name):
    # ROC
    fpr, tpr, _ = roc_curve(y, p)
    plt.figure(figsize=(5,5))
    plt.plot(fpr, tpr, label=f"{name}")
    plt.plot([0,1],[0,1],"--")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.title(f"ROC – {name}")
    plt.grid(True, alpha=.3)
    plt.tight_layout()
    plt.savefig(FIG_DIR / f"roc_{name}.png", dpi=150)
    plt.close()

    # PR
    prec, rec, _ = precision_recall_curve(y, p)
    ap = average_precision_score(y, p)
    plt.figure(figsize=(5,5))
    plt.plot(rec, prec, label=f"AP={ap:.3f}")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title(f"Precision–Recall – {name}")
    plt.grid(True, alpha=.3)
    plt.tight_layout()
    plt.savefig(FIG_DIR / f"pr_{name}.png", dpi=150)
    plt.close()

if p_te_lr is not None:  plot_and_save_curves(y_te, p_te_lr, "logreg")
if p_te_xgb is not None: plot_and_save_curves(y_te, p_te_xgb, "xgb")

print("Saved ROC/PR figures to", FIG_DIR)


Saved ROC/PR figures to reports\figures


In [10]:
# --- Calibration / reliability ---
def reliability_summary(y_true, p, model_name, n_bins=10):
    bins = pd.qcut(p, q=n_bins, labels=False, duplicates="drop")
    df_bin = pd.DataFrame({"y": y_true, "p": p, "bin": bins})
    grp = df_bin.groupby("bin", as_index=False).agg(
        n=("y","size"),
        avg_pred=("p","mean"),
        avg_true=("y","mean")
    )
    grp.insert(0, "model", model_name)
    return grp

to_save = []
for name, p in [("logreg", p_te_lr), ("xgb", p_te_xgb)]:
    if p is None: 
        continue
    prob_true, prob_pred = calibration_curve(y_te, p, n_bins=10, strategy="quantile")

    plt.figure(figsize=(5,5))
    plt.plot(prob_pred, prob_true, marker="o", label=f"{name}")
    plt.plot([0,1],[0,1], linestyle="--")
    plt.xlabel("Predicted probability")
    plt.ylabel("Observed frequency")
    plt.title(f"Calibration – {name}")
    plt.grid(True, alpha=.3)
    plt.legend()
    plt.tight_layout()
    plt.savefig(FIG_DIR / f"calibration_{name}.png", dpi=150)
    plt.close()

    to_save.append(reliability_summary(y_te, p, name, n_bins=10))

if to_save:
    rel = pd.concat(to_save, ignore_index=True)
    rel.to_csv(DATA_DIR / "reliability_by_decile.csv", index=False)
    print("Saved calibration_*.png and reliability_by_decile.csv")
else:
    print("No model probabilities found for calibration.")


No model probabilities found for calibration.


In [11]:
# --- Global importance ---
saved_any = False

if (shap is not None) and (mdl_xgb is not None) and hasattr(mdl_xgb, "get_booster"):
    # XGBoost SHAP
    expl = shap.TreeExplainer(mdl_xgb)
    shap_vals = expl.shap_values(X_te)
    # Mean |SHAP|
    imp = pd.DataFrame({
        "feature": X_te.columns,
        "mean_abs_shap": np.mean(np.abs(shap_vals), axis=0)
    }).sort_values("mean_abs_shap", ascending=False)

    imp.to_csv(DATA_DIR / "shap_importance_top20.csv", index=False)
    plt.figure(figsize=(7,8))
    shap.summary_plot(shap_vals, X_te, show=False)  # creates a beeswarm
    plt.tight_layout()
    plt.savefig(FIG_DIR / "shap_importance.png", dpi=150, bbox_inches="tight")
    plt.close()
    saved_any = True
    print("Saved SHAP importance artifacts.")

# Logistic regression absolute coefficients as a fallback/extra
if mdl_lr is not None and hasattr(mdl_lr, "coef_"):
    coefs = pd.Series(mdl_lr.coef_.ravel(), index=X_te.columns).abs().sort_values(ascending=False)
    coef_df = coefs.reset_index()
    coef_df.columns = ["feature", "abs_coef"]
    coef_df.to_csv(DATA_DIR / "logreg_abscoef_top20.csv", index=False)

    plt.figure(figsize=(7,5))
    top = coef_df.head(20).sort_values("abs_coef")
    plt.barh(top["feature"], top["abs_coef"])
    plt.title("LogReg |coeff| top-20")
    plt.tight_layout()
    plt.savefig(FIG_DIR / "logreg_coef_importance.png", dpi=150)
    plt.close()
    saved_any = True
    print("Saved logistic-regression coefficient importance.")

if not saved_any:
    print("No importance plot saved (need SHAP or LR coefficients).")


No importance plot saved (need SHAP or LR coefficients).


In [12]:
# --- Local explanation for a single test example (optional) ---
if shap is not None and mdl_xgb is not None and hasattr(mdl_xgb, "get_booster"):
    idx = int(np.random.randint(0, len(X_te)))
    expl = shap.TreeExplainer(mdl_xgb)
    sv = expl.shap_values(X_te.iloc[[idx]])
    try:
        shap.plots.waterfall(shap.Explanation(values=sv[0],
                                              base_values=expl.expected_value,
                                              data=X_te.iloc[idx].values,
                                              feature_names=list(X_te.columns)),
                             show=False)
        plt.tight_layout()
        plt.savefig(FIG_DIR / f"shap_local_idx{idx}.png", dpi=150, bbox_inches="tight")
        plt.close()
        print(f"Saved local SHAP waterfall for idx={idx}.")
    except Exception as e:
        print("Waterfall plot not available with current SHAP version:", e)


In [13]:
# --- Feature drift (KS) between train and test ---
from scipy.stats import ks_2samp

rows = []
for col in X_tr.columns:
    a, b = pd.Series(X_tr[col]).dropna(), pd.Series(X_te[col]).dropna()
    # only run KS if both vary
    if a.nunique() > 1 and b.nunique() > 1:
        stat, pval = ks_2samp(a, b)
    else:
        stat, pval = np.nan, np.nan
    rows.append({"feature": col, "ks_stat": stat, "p_value": pval})

drift = pd.DataFrame(rows).sort_values("p_value", na_position="last")
drift.to_csv(DATA_DIR / "feature_drift_ks.csv", index=False)
print("Saved:", DATA_DIR / "feature_drift_ks.csv")


Saved: data\feature_drift_ks.csv
