In [3]:
import sys, os, importlib, inspect

# 1) Make sure the repo root (which contains `src/`) is on sys.path.
#    Since notebooks live in `notebooks/`, the repo root is one level up.
#    Guarded so that re-running this cell does not append the same entry again.
repo_root = os.path.abspath("..")
if repo_root not in sys.path:
    sys.path.append(repo_root)

# 2) Import (and reload) your module so edits take effect.
from src import market as market_mod
importlib.reload(market_mod)

# 3) Pull the function you need (after the reload, so it is the fresh definition).
from src.market import add_market_context

# Optional: sanity check that we’re importing the file you just edited
print("Using:", market_mod.__file__)
head_lines = inspect.getsource(market_mod.add_market_context).splitlines()[:8]
print("add_market_context head:\n",
      "\n".join(head_lines))


Using: c:\Users\byu30\.projects\stock-direction-ml\src\market.py
add_market_context head:
 def add_market_context(df: pd.DataFrame, start: str | None = None, end: str | None = None) -> pd.DataFrame:
    """
    Join SPY & VIX by *date index* (avoids 'different levels' merge errors).
    Adds columns: spy_close, mkt_ret1, mkt_ret5, vix_close, vix_chg1
    """
    # -------- normalize left frame --------
    left = df.copy()



In [4]:
# --- Imports & hot-reload of local modules ---
import sys, os
from importlib import reload
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  # plotting cells use `plt` before the later cell that imports it

# NOTE(review): later cells call StandardScaler, LogisticRegression, roc_auc_score,
# balanced_accuracy_score, f1_score, confusion_matrix, classification_report,
# RocCurveDisplay and PrecisionRecallDisplay, but no visible cell imports them
# (presumably scikit-learn) — confirm they are imported before first use.

# make repo root importable (guarded so re-runs do not append duplicate entries)
repo_root = os.path.abspath("..")
if repo_root not in sys.path:
    sys.path.append(repo_root)

# local modules: import, then reload each so on-disk edits take effect
from src import data as data_mod, features as features_mod, market as market_mod, utils as utils_mod
for _mod in (data_mod, features_mod, market_mod, utils_mod):
    reload(_mod)

# bind the functions after the reload so they come from the refreshed modules
from src.data import get_data
from src.features import add_features
from src.market import add_market_context
from src.utils import make_labels


In [5]:
import importlib, inspect, src.market as market_mod

# Show which file backs the module, then the opening lines of the function's source.
print("market.py path:", market_mod.__file__)
source_head = inspect.getsource(market_mod.add_market_context).splitlines()[:25]
print("First 25 lines of add_market_context:\n", "\n".join(source_head))


market.py path: c:\Users\byu30\.projects\stock-direction-ml\src\market.py
First 25 lines of add_market_context:
 def add_market_context(df: pd.DataFrame, start: str | None = None, end: str | None = None) -> pd.DataFrame:
    """
    Join SPY & VIX by *date index* (avoids 'different levels' merge errors).
    Adds columns: spy_close, mkt_ret1, mkt_ret5, vix_close, vix_chg1
    """
    # -------- normalize left frame --------
    left = df.copy()

    # If 'date' is not a column, try to bring it out of the index
    if "date" not in left.columns:
        idx_names = list(getattr(left.index, "names", []) or [])
        if "date" in idx_names:
            left = left.reset_index("date")
        else:
            left = left.reset_index()

    # Flatten MultiIndex columns if present
    if isinstance(left.columns, pd.MultiIndex):
        left.columns = [
            ("_".join(map(str, c)).strip() if isinstance(c, tuple) else str(c)) for c in left.columns
        ]

    # Standardize date co

In [6]:
# --- Params ---
TICKER = "AAPL"
START = "2015-01-01"
END = "2023-12-31"
TAU = 0.001        # ±0.1% dead-zone
DEAD_ZONE = True

# --- Load → featurize → market context → label ---
# Each stage takes the current frame and returns the next one; `df` is rebound
# after every stage, in this order.
df = get_data(TICKER, start=START, end=END)
pipeline = (
    (add_features, {}),
    (add_market_context, {"start": START, "end": END}),
    (make_labels, {"tau": TAU, "dead_zone": DEAD_ZONE}),
)
for stage, stage_kwargs in pipeline:
    df = stage(df, **stage_kwargs)

# Drop rows with any missing value (e.g. gaps left by the merge) and renumber
df = df.dropna().reset_index(drop=True)

# Exclude raw price/label columns; everything else becomes a feature
exclude = ["date","open","high","low","close","volume","ret_next","y"]
feat_cols = [name for name in df.columns if name not in exclude]

# quick sanity check: (rows, feature_count)
(len(df), len(feat_cols))


MergeError: Not allowed to merge between different levels. (1 levels on the left, 2 on the right)

In [None]:
# Build the model matrices from the prepared frame. No visible cell defines
# `X` / `y`, so on a fresh kernel the slicing below would raise NameError.
# NOTE(review): assumes the label column is "y" (it is listed in `exclude` next
# to "ret_next") and that it holds integer class ids — confirm against make_labels.
X = df[feat_cols].to_numpy(dtype=float)
y = df["y"].to_numpy().astype(int)

# 70% / 15% / 15% train / validation / test split, taken in row order (no shuffling).
n = len(df)
i_tr = int(0.70 * n)
i_va = int(0.85 * n)

X_tr, y_tr = X[:i_tr], y[:i_tr]
X_va, y_va = X[i_tr:i_va], y[i_tr:i_va]
X_te, y_te = X[i_va:], y[i_va:]

# Fit the scaler on the training rows only, then apply it to every split.
# NOTE(review): StandardScaler is not imported in any visible cell — confirm.
scaler = StandardScaler().fit(X_tr)
X_tr = scaler.transform(X_tr)
X_va = scaler.transform(X_va)
X_te = scaler.transform(X_te)
X_tr.shape, X_va.shape, X_te.shape

In [None]:
# Fit a class-weight-balanced logistic regression on the training split.
clf = LogisticRegression(max_iter=2000, class_weight="balanced")
clf.fit(X_tr, y_tr)

# Column 1 of predict_proba, for the validation and test splits.
p_va = clf.predict_proba(X_va)[:, 1]
p_te = clf.predict_proba(X_te)[:, 1]

# Hard predictions at a 0.5 cut-off.
pred_va = (p_va > 0.5).astype(int)
pred_te = (p_te > 0.5).astype(int)

# Same report for both splits; the TEST block is preceded by a blank line.
for tag, y_true, proba, pred in (
    ("VAL", y_va, p_va, pred_va),
    ("\nTEST", y_te, p_te, pred_te),
):
    print(f"{tag} ROC-AUC:", roc_auc_score(y_true, proba))
    print(f"{tag} BalAcc :", balanced_accuracy_score(y_true, pred))
    print(f"{tag} F1     :", f1_score(y_true, pred))
    print(confusion_matrix(y_true, pred))
    print(classification_report(y_true, pred, digits=3))

In [None]:
# Side-by-side ROC and precision-recall curves for the test split.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

RocCurveDisplay.from_predictions(y_te, p_te, ax=ax1)
ax1.set_title("Test ROC")

PrecisionRecallDisplay.from_predictions(y_te, p_te, ax=ax2)
ax2.set_title("Test PR")

plt.show()


In [None]:
thr = 0.55     # confidence threshold
fee = 0.0010   # 10 bps per trade

# Test rows = everything from the 85% mark of `df` onward, with the model
# probability attached as column "p".
test_start = int(0.85 * len(df))
test_df = df.iloc[test_start:].copy().reset_index(drop=True).assign(p=p_te)

r = test_df["ret_next"].values                  # next-day return
pos = (test_df["p"].values > thr).astype(int)   # long-only when confident

# |Δposition| with an implicit flat (0) position before the first row
trades = np.abs(np.diff(pos, prepend=0))
costs = fee * trades

strat_r = pos * r - costs   # strategy return net of trading costs
bh_r = r                    # buy & hold benchmark

def equity_curve(returns):
    """Compound per-period returns into a cumulative growth curve.

    Returns a pandas Series whose k-th value is the running product of
    ``1 + returns`` up to and including position k.
    """
    growth_factors = 1 + pd.Series(returns)
    return growth_factors.cumprod()
def sharpe(returns):
    """Annualized Sharpe-style ratio: mean / std of ``returns`` times sqrt(252).

    Uses the population standard deviation (``np.std`` default, ddof=0).

    Returns 0.0 when the ratio is undefined: empty input, or a standard
    deviation that is zero or numerically indistinguishable from zero.
    NaN values inside ``returns`` still propagate to the result.
    """
    returns = np.asarray(returns, dtype=float)
    if returns.size == 0:
        # np.std([]) is NaN (with a RuntimeWarning); report "no signal" instead.
        return 0.0
    s = np.std(returns)
    # An exact `s == 0` test misses constant series whose std is float noise
    # (~1e-17), which would blow the ratio up to an absurd magnitude.
    if np.isclose(s, 0.0, rtol=0.0, atol=1e-12):
        return 0.0
    return np.mean(returns) / s * np.sqrt(252)
def max_drawdown(series):
    """Return the minimum of ``series / running_max - 1``.

    ``series`` must provide ``cummax`` (e.g. a pandas Series); each value is
    compared with the highest value seen up to that point.
    """
    return (series / series.cummax() - 1.0).min()

# Compound both return streams into equity curves.
eq_strat, eq_bh = equity_curve(strat_r), equity_curve(bh_r)

# Headline risk metrics, strategy vs. buy & hold (computed and printed in order).
for caption, metric, data in (
    ("Strategy Sharpe (ann.):", sharpe, strat_r),
    ("Buy&Hold Sharpe (ann.):", sharpe, bh_r),
    ("Strategy MaxDD:", max_drawdown, eq_strat),
    ("Buy&Hold MaxDD:", max_drawdown, eq_bh),
):
    print(caption, metric(data))

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(eq_strat.values, label=f"Strategy (thr={thr:.2f}, fee={int(fee*1e4)}bps)")
ax.plot(eq_bh.values, label="Buy & Hold")
ax.set_title("Equity Curve — Test Period")
ax.legend()
plt.show()


In [None]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt

# Rebuild the test frame from the current `df` / `p_te` on every run rather
# than reusing a `test_df` left in the kernel: a leftover frame could carry a
# `p_logreg` column from an earlier model or dataset and silently feed stale
# probabilities into everything below.
test_df = df.iloc[int(0.85*len(df)):].copy().reset_index(drop=True)
if len(p_te) != len(test_df):
    # Truncating `p_te` to fit would pair probabilities with the wrong rows.
    raise ValueError(
        f"p_te has {len(p_te)} rows but the test slice of df has {len(test_df)}; "
        "re-run the split/fit cells so they match"
    )
test_df["p_logreg"] = p_te

probs = test_df["p_logreg"].values

# Share of test rows with probability above `thr`, and number of position changes
# (starting from flat).
in_position = (probs > thr).astype(int)
coverage = in_position.mean()
trades_count = int(np.abs(np.diff(np.r_[0, in_position])).sum())
print(f"Coverage (in-position days): {coverage:.2%}")
print("Trades:", trades_count)

# Distribution of predicted probabilities; dashed line at 0.5, red line at `thr`.
plt.figure(figsize=(6,4))
plt.hist(probs, bins=30)
plt.axvline(0.5, ls="--"); plt.axvline(thr, color="r")
plt.title("LogReg predicted probabilities (TEST)")
plt.show()


In [None]:
def sharpe_daily(r):
    """Annualized Sharpe-style ratio: mean / std of ``r`` times sqrt(252).

    Uses the pandas sample standard deviation (``Series.std`` default, ddof=1).

    Returns 0.0 when the ratio is undefined: fewer than two non-missing
    observations (the sample std would be NaN), or a standard deviation that is
    zero or numerically indistinguishable from zero. A NaN result would
    otherwise corrupt comparisons when this is used as a ``max(..., key=...)``
    score.
    """
    r = pd.Series(r, dtype=float)
    if r.count() < 2:
        return 0.0
    s = r.std()
    # Exact `s == 0` misses constant series whose std is only float noise.
    if np.isclose(s, 0.0, rtol=0.0, atol=1e-12):
        return 0.0
    return r.mean() / s * np.sqrt(252)

def strat_sharpe_from_probs(p, ret_next, fee=0.0010, thr=0.55):
    """Score a long-only threshold strategy with ``sharpe_daily``.

    The position is 1 on rows where ``p > thr`` and 0 otherwise. Held rows earn
    ``ret_next``; every change of position costs ``fee``, with the position
    before the first row taken as 0.
    """
    in_market = (p > thr).astype(int)
    switch_costs = np.abs(np.diff(in_market, prepend=0)) * fee
    net_returns = in_market * ret_next - switch_costs
    return sharpe_daily(net_returns)

# Validation rows sit between the 70% and 85% marks of `df`.
n_rows = len(df)
val_slice = slice(int(0.70 * n_rows), int(0.85 * n_rows))
val_ret_next = df["ret_next"].iloc[val_slice].values

# Candidate thresholds: 21 evenly spaced values from 0.50 to 0.60.
grid = np.linspace(0.50, 0.60, 21)


def _val_sharpe_logreg(t):
    """Validation Sharpe of the LogReg strategy at threshold ``t`` (10 bps fee)."""
    return strat_sharpe_from_probs(p_va, val_ret_next, fee=0.0010, thr=t)


# First threshold in the grid with the highest validation score.
thr_log = max(grid, key=_val_sharpe_logreg)
thr_log


In [None]:
thr = float(thr_log)

# Re-run the long-only backtest on TEST with the threshold tuned on VAL.
# `r`, `fee` and `bh_r` come from the earlier fixed-threshold backtest cell.
pos = (test_df["p_logreg"].values > thr).astype(int)
trades = np.abs(np.diff(np.r_[0, pos]))   # position changes, starting from flat
costs = trades * fee
strat_r = pos * r - costs
eq_s, eq_b = equity_curve(strat_r), equity_curve(r)

print("LogReg Strategy Sharpe (tuned):", sharpe(strat_r))
print("Buy&Hold Sharpe               :", sharpe(bh_r))
print("LogReg MaxDD (tuned):", max_drawdown(eq_s), "| BH MaxDD:", max_drawdown(eq_b))

# The fee shown in the legend is derived from `fee` (in basis points) so the
# label cannot drift from the cost actually charged above.
plt.figure(figsize=(12,5))
plt.plot(eq_s.values, label=f"LogReg (thr={thr:.2f}, fee={fee * 1e4:g}bps)")
plt.plot(eq_b.values, label="Buy & Hold")
plt.title("Equity Curve — TEST (LogReg, tuned on VAL)")
plt.legend(); plt.show()


In [None]:
from xgboost import XGBClassifier

# Hyperparameters gathered in one mapping so they are easy to scan and tweak.
xgb_params = {
    "n_estimators": 400,
    "max_depth": 4,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_lambda": 1.0,
    "random_state": 42,
    "n_jobs": -1,
}
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_tr, y_tr)

# Column 1 of predict_proba, for the validation and test splits.
p_va_xgb = xgb.predict_proba(X_va)[:, 1]
p_te_xgb = xgb.predict_proba(X_te)[:, 1]

for split_name, y_true, proba in (("VAL", y_va, p_va_xgb), ("TEST", y_te, p_te_xgb)):
    print(f"{split_name} AUC (XGB):", roc_auc_score(y_true, proba))


In [None]:
# Next-day returns for the validation rows (`val_slice` is defined in an earlier cell).
val_ret_next = df["ret_next"].iloc[val_slice].values

# Candidate thresholds: 21 evenly spaced values from 0.50 to 0.60.
grid = np.linspace(0.50, 0.60, 21)


def _val_sharpe_xgb(t):
    """Validation Sharpe of the XGBoost strategy at threshold ``t`` (10 bps fee)."""
    return strat_sharpe_from_probs(p_va_xgb, val_ret_next, fee=0.0010, thr=t)


# First threshold in the grid with the highest validation score.
thr_xgb = max(grid, key=_val_sharpe_xgb)
thr_xgb


In [None]:
fee = 0.0010   # cost charged per position change (10 bps)

# Test rows = everything from the 85% mark of `df` onward, with the XGBoost
# probability attached as column "p_xgb".
test_df_xgb = df.iloc[int(0.85*len(df)):].copy().reset_index(drop=True)
test_df_xgb["p_xgb"] = p_te_xgb
r_x = test_df_xgb["ret_next"].values

# Long-only when the probability exceeds the threshold tuned on validation.
pos_x = (test_df_xgb["p_xgb"].values > thr_xgb).astype(int)
trades_x = np.abs(np.diff(np.r_[0, pos_x]))   # position changes, starting from flat
costs_x = trades_x * fee
strat_r_x = pos_x * r_x - costs_x
bh_r_x = r_x

eq_s_x, eq_b_x = (1+pd.Series(strat_r_x)).cumprod(), (1+pd.Series(bh_r_x)).cumprod()

print("XGB Strategy Sharpe:", sharpe(strat_r_x))
print("Buy&Hold Sharpe   :", sharpe(bh_r_x))
print("XGB MaxDD:", max_drawdown(eq_s_x), "| BH MaxDD:", max_drawdown(eq_b_x))

# The fee shown in the legend is derived from `fee` (in basis points) so the
# label cannot drift from the cost actually charged above.
plt.figure(figsize=(12,5))
plt.plot(eq_s_x.values, label=f"XGB (thr={thr_xgb:.2f}, fee={fee * 1e4:g}bps)")
plt.plot(eq_b_x.values, label="Buy & Hold")
plt.title("Equity Curve — TEST (XGBoost)")
plt.legend(); plt.show()


In [None]:
# Rank features by XGBoost importance and keep the 20 largest.
imp = xgb.feature_importances_
order = np.argsort(imp)[::-1][:20]

# barh draws its first bar at the bottom, so feed the bars smallest-first to
# put the most important feature at the top.
bar_order = order[::-1]
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(np.array(feat_cols)[bar_order], imp[bar_order])
ax.set_title("XGBoost Feature Importance (top 20)")
fig.tight_layout()
plt.show()
