In [None]:
import sys, os, importlib, numpy as np, pandas as pd
sys.path.append(os.path.abspath(".."))

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, balanced_accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay
import matplotlib.pyplot as plt

from src import data as data_mod, features as features_mod, utils as utils_mod
importlib.reload(data_mod); importlib.reload(features_mod); importlib.reload(utils_mod)
from src.data import get_data
from src.features import add_features
from src.utils import make_labels

In [None]:
# Load AAPL for 2015-01-01 .. 2023-12-31 through the project helpers, then
# attach engineered features and labels in one pass.
# NOTE(review): `make_labels` presumably adds the `ret_next` and `y` columns
# referenced below — confirm in src/utils.
df = make_labels(
    add_features(get_data("AAPL", start="2015-01-01", end="2023-12-31")),
    tau=0.0,
)

# Everything that is not a raw price/volume column, the date, or a label
# column is treated as a model input.
excluded = ["date", "open", "high", "low", "close", "volume", "ret_next", "y"]
feat_cols = [col for col in df.columns if col not in excluded]
print("Using features:", feat_cols)

# Plain arrays for scikit-learn; rows stay in the frame's original order.
X = df[feat_cols].values
y = df["y"].astype(int).values
dates = df["date"].values

len(df), len(feat_cols)

In [None]:
# Ordered 70 / 15 / 15 split by row position (no shuffling).
n = len(df)
i_tr, i_va = int(0.70 * n), int(0.85 * n)

X_tr, X_va, X_te = X[:i_tr], X[i_tr:i_va], X[i_va:]
y_tr, y_va, y_te = y[:i_tr], y[i_tr:i_va], y[i_va:]

# Fit the scaler on the training rows only, then apply the same transform to
# all three partitions so validation/test statistics never reach the scaler.
scaler = StandardScaler().fit(X_tr)
X_tr, X_va, X_te = (scaler.transform(part) for part in (X_tr, X_va, X_te))

X_tr.shape, X_va.shape, X_te.shape

In [None]:
def _report_split(labels, y_true, proba, preds):
    """Print the metric block for one data split.

    labels : (roc_auc_label, balanced_accuracy_label, f1_label) printed in
             front of the respective score.
    y_true : true 0/1 labels.
    proba  : predicted probability of class 1 (used for ROC-AUC).
    preds  : hard 0/1 predictions (used for every other metric).
    """
    auc_label, balacc_label, f1_label = labels
    print(auc_label, roc_auc_score(y_true, proba))
    print(balacc_label, balanced_accuracy_score(y_true, preds))
    print(f1_label, f1_score(y_true, preds))
    print(confusion_matrix(y_true, preds))
    print(classification_report(y_true, preds, digits=3))


# Class-weighted logistic regression trained on the scaled training rows.
clf = LogisticRegression(max_iter=2000, class_weight="balanced").fit(X_tr, y_tr)

# Probability of class 1 for validation and test, then hard calls at 0.5.
p_va, p_te = (clf.predict_proba(part)[:, 1] for part in (X_va, X_te))
pred_va, pred_te = ((proba > 0.5).astype(int) for proba in (p_va, p_te))

_report_split(("VAL ROC-AUC:", "VAL BalAcc :", "VAL F1     :"), y_va, p_va, pred_va)
_report_split(("\nTEST ROC-AUC:", "TEST BalAcc :", "TEST F1     :"), y_te, p_te, pred_te)

In [None]:
# Side-by-side ROC and precision-recall curves for the test predictions.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

RocCurveDisplay.from_predictions(y_te, p_te, ax=ax1)
ax1.set_title("Test ROC")

PrecisionRecallDisplay.from_predictions(y_te, p_te, ax=ax2)
ax2.set_title("Test PR")

plt.show()


In [None]:
# Backtest settings.
thr = 0.55     # minimum predicted probability required to be long
fee = 0.0010   # charged per unit change in position (10 bps)

# Test rows: everything from the 85% mark on, the same cut used for X_te / y_te.
test_start = int(0.85 * len(df))
test_df = df.iloc[test_start:].copy().reset_index(drop=True)
test_df["p"] = p_te

r = test_df["ret_next"].values                   # next-day return
pos = (test_df["p"].values > thr).astype(int)    # 1 = long, 0 = flat (long-only)

# A trade is any row where the position differs from the previous row; the
# prepended 0 means the book starts flat, so an initial long counts as a trade.
trades = np.abs(np.diff(pos, prepend=0))
costs = fee * trades

bh_r = r                         # buy & hold earns every row's return
strat_r = pos * r - costs        # strategy earns the return only while long, net of fees

def equity_curve(returns):
    """Compound per-period simple returns into a growth-of-1 curve.

    `returns` is any 1-D array-like; the result is a pandas Series whose
    i-th value is the product of (1 + return) over periods 0..i.
    """
    growth_factors = pd.Series(returns) + 1
    return growth_factors.cumprod()
def sharpe(returns):
    """Annualised Sharpe ratio of per-period returns (no risk-free adjustment).

    Parameters
    ----------
    returns : array-like of float
        Per-period simple returns.

    Returns
    -------
    float
        mean / std * sqrt(252), where std is `np.std` (population, ddof=0).
        Returns 0.0 when `returns` is empty or its std is numerically zero.
        NaNs in the input propagate to the result.
    """
    if len(returns) == 0:
        # np.std([]) would emit a RuntimeWarning and return NaN.
        return 0.0
    s = np.std(returns)
    # A constant series can have a std of ~1e-17 instead of exactly 0 because
    # of floating-point rounding; an exact `== 0` test would then divide by
    # that noise and report an astronomically large ratio.
    if np.isclose(s, 0.0, rtol=0.0, atol=1e-12):
        return 0.0
    return np.mean(returns) / s * np.sqrt(252)
def max_drawdown(series):
    """Return the worst peak-to-trough decline of an equity curve.

    `series` is a pandas Series of equity values. The result is the minimum
    of value / running-maximum - 1, i.e. 0.0 for a curve that never falls
    below a previous high and a negative fraction otherwise.
    """
    running_peak = series.cummax()
    return (series / running_peak - 1.0).min()

# Growth-of-1 curves for the thresholded strategy and for buy & hold.
eq_strat = equity_curve(strat_r)
eq_bh    = equity_curve(bh_r)

print("Strategy Sharpe (ann.):", sharpe(strat_r))
print("Buy&Hold Sharpe (ann.):", sharpe(bh_r))
print("Strategy MaxDD:", max_drawdown(eq_strat))
print("Buy&Hold MaxDD:", max_drawdown(eq_bh))

plt.figure(figsize=(12,5))
# `fee*1e4` is a float product; int() truncates, so a value that lands just
# below an integer would be labelled one basis point too low. The `g` format
# rounds for display instead.
plt.plot(eq_strat.values, label=f"Strategy (thr={thr:.2f}, fee={fee*1e4:g}bps)")
plt.plot(eq_bh.values,    label="Buy & Hold")
plt.title("Equity Curve — Test Period")
plt.xlabel("Test-period row")
plt.ylabel("Equity (start = 1)")
plt.legend(); plt.show()


In [None]:
import numpy as np, matplotlib.pyplot as plt

# Rows on which the current threshold puts the strategy in the market.
in_position = test_df["p"].values > thr

coverage = in_position.mean()
# Count position flips, starting from flat (hence the prepended 0).
trades_count = int(np.abs(np.diff(in_position.astype(int), prepend=0)).sum())
print(f"Coverage (in-position days): {coverage:.2%}")
print("Trades:", trades_count)

# Distribution of predicted probabilities, with the 0.5 line (dashed) and the
# trading threshold (red) marked.
plt.figure(figsize=(6, 4))
plt.hist(test_df["p"], bins=30)
plt.axvline(0.5, ls="--")
plt.axvline(thr, color="r")
plt.title("Predicted probabilities on TEST")
plt.show()

In [None]:
def sharpe_daily(ret):
    """Annualised Sharpe ratio of per-period returns, using the sample std.

    Parameters
    ----------
    ret : array-like of float
        Per-period returns. Coerced to a pandas Series so lists, ndarrays and
        Series are all scored with the same estimator (pandas `std`, ddof=1,
        NaNs skipped) instead of whichever `.std()` the input type provides.

    Returns
    -------
    float
        mean / std * sqrt(252); 0.0 when the std is undefined (fewer than two
        observations) or numerically zero.
    """
    ret = pd.Series(ret, dtype=float)
    s = ret.std()
    # std is NaN for <2 observations; returning NaN would make the result
    # unusable as a key for max()/sorting when ranking thresholds.
    # The tolerance catches constant series whose std is float noise, not 0.
    if pd.isna(s) or np.isclose(s, 0.0, rtol=0.0, atol=1e-12):
        return 0.0
    return ret.mean() / s * np.sqrt(252)

def strat_sharpe_from_probs(p, ret_next, fee=0.0010, thr=0.55):
    """Score a long-only threshold rule by its annualised Sharpe ratio.

    p        : ndarray of predicted probabilities.
    ret_next : ndarray of returns aligned row-for-row with `p`.
    fee      : cost charged per unit change in position.
    thr      : the rule is long (1) where p > thr and flat (0) elsewhere.

    Returns whatever `sharpe_daily` yields for the net per-row returns.
    """
    held = (p > thr).astype(int)
    # Position changes, counted from an initially flat book (prepended 0).
    cost = fee * np.abs(np.diff(held, prepend=0))
    net = held * ret_next - cost
    return sharpe_daily(pd.Series(net))

# Validation rows: the 70%-85% band of df, i.e. the rows p_va was predicted for.
val_slice = slice(int(0.70 * len(df)), int(0.85 * len(df)))
val_df = df.iloc[val_slice].copy().reset_index(drop=True)
val_df["p"] = p_va

# Score 21 evenly spaced thresholds in [0.50, 0.60] by validation Sharpe.
grid = np.linspace(0.50, 0.60, 21)
val_p, val_ret = val_df["p"].values, val_df["ret_next"].values
scores = [
    (t, strat_sharpe_from_probs(val_p, val_ret, fee=0.0010, thr=t))
    for t in grid
]

# Keep the best-scoring threshold; max() returns the first maximum, so ties go
# to the lowest threshold. This overwrites the `thr` set in the backtest cell.
thr = max(scores, key=lambda pair: pair[1])[0]
print("Chosen threshold from VAL:", thr)

In [None]:
# Recompute strategy with new thr on test
# (`r`, `fee` and `bh_r` are the values set in the first backtest cell.)
pos = (test_df["p"].values > thr).astype(int)
trades = np.abs(np.diff(np.r_[0, pos]))   # 1 wherever the position flips, starting flat
costs = trades * fee
strat_r = pos * r - costs

eq_strat = (1 + pd.Series(strat_r)).cumprod()
eq_bh    = (1 + pd.Series(r)).cumprod()

print("Strategy Sharpe (ann.):", sharpe(strat_r))
print("Buy&Hold Sharpe (ann.):", sharpe(bh_r))
print("Strategy MaxDD:", max_drawdown(eq_strat))
print("Buy&Hold MaxDD:", max_drawdown(eq_bh))

plt.figure(figsize=(12,5))
# `fee*1e4` is a float product; int() truncates, so a value that lands just
# below an integer would be labelled one basis point too low. The `g` format
# rounds for display instead.
plt.plot(eq_strat.values, label=f"Strategy (thr={thr:.2f}, fee={fee*1e4:g}bps)")
plt.plot(eq_bh.values,    label="Buy & Hold")
plt.title("Equity Curve — Test Period (Threshold Tuned on VAL)")
plt.xlabel("Test-period row")
plt.ylabel("Equity (start = 1)")
plt.legend(); plt.show()

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Gradient-boosted trees on the same scaled splits used for the linear model.
xgb_params = dict(
    n_estimators=400,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_tr, y_tr)

# Probability of class 1 on validation and test.
p_va_xgb, p_te_xgb = (xgb.predict_proba(part)[:, 1] for part in (X_va, X_te))

print("VAL AUC (XGB):", roc_auc_score(y_va, p_va_xgb))
print("TEST AUC (XGB):", roc_auc_score(y_te, p_te_xgb))

In [None]:
def sharpe_daily(r):
    """Annualised Sharpe ratio of per-period returns, using the sample std.

    Parameters
    ----------
    r : array-like of float
        Per-period returns; coerced to a pandas Series (std uses ddof=1 and
        skips NaNs).

    Returns
    -------
    float
        mean / std * sqrt(252); 0.0 (always a float) when the std is undefined
        (fewer than two observations) or numerically zero.
    """
    r = pd.Series(r, dtype=float)
    s = r.std()
    # std is NaN for <2 observations; a NaN score would make max() over a
    # threshold grid unreliable. The tolerance catches constant series whose
    # std comes out as float noise rather than exactly 0.
    if pd.isna(s) or np.isclose(s, 0.0, rtol=0.0, atol=1e-12):
        return 0.0
    return r.mean() / s * np.sqrt(252)

def strat_sharpe(p, ret_next, fee, thr):
    """Annualised Sharpe of the long-only rule "hold while p > thr".

    p        : ndarray of predicted probabilities.
    ret_next : ndarray of returns aligned row-for-row with `p`.
    fee      : cost charged per unit change in position.
    thr      : probability threshold for being long.

    Returns whatever `sharpe_daily` yields for the net per-row returns.
    """
    held = (p > thr).astype(int)
    # Position changes, counted from an initially flat book (prepended 0).
    cost = fee * np.abs(np.diff(held, prepend=0))
    return sharpe_daily(held * ret_next - cost)

# Same 21-point threshold grid as before, scored on the validation rows
# (70%-85% band of df) with a 10 bps fee.
grid = np.linspace(0.50, 0.60, 21)
val_ret_next = df.iloc[int(0.70 * len(df)):int(0.85 * len(df))]["ret_next"].values

# max() keeps the first best-scoring threshold, so ties go to the lowest one.
thr_xgb = max(grid, key=lambda t: strat_sharpe(p_va_xgb, val_ret_next, 0.0010, t))
thr_xgb

In [None]:
fee = 0.0010   # cost per unit change in position

# Rebuild the test frame (rows from the 85% mark on) with the XGBoost
# probabilities in column "p"; this replaces the earlier test_df.
test_df = df.iloc[int(0.85 * len(df)):].copy().reset_index(drop=True)
test_df["p"] = p_te_xgb

r = test_df["ret_next"].values
pos = (test_df["p"].values > thr_xgb).astype(int)   # 1 = long, 0 = flat

# Position flips, starting from a flat book, and the fees they incur.
trades = np.abs(np.diff(pos, prepend=0))
costs = fee * trades

strat_r = pos * r - costs
bh_r = r

def equity_curve(returns):
    """Turn per-period simple returns into a cumulative growth-of-1 Series."""
    per_period_growth = pd.Series(returns) + 1
    return per_period_growth.cumprod()
def sharpe(r):
    """Annualised Sharpe ratio of per-period returns (no risk-free adjustment).

    Parameters
    ----------
    r : array-like of float
        Per-period simple returns.

    Returns
    -------
    float
        mean / std * sqrt(252), where std is `np.std` (population, ddof=0).
        Returns 0.0 (always a float) when `r` is empty or its std is
        numerically zero. NaNs in the input propagate to the result.
    """
    if len(r) == 0:
        # np.std([]) would emit a RuntimeWarning and return NaN.
        return 0.0
    s = np.std(r)
    # A constant series can have a std of ~1e-17 rather than exactly 0; an
    # exact `== 0` test would then divide by rounding noise.
    if np.isclose(s, 0.0, rtol=0.0, atol=1e-12):
        return 0.0
    return np.mean(r) / s * np.sqrt(252)
def max_drawdown(eq):
    """Largest fractional drop of equity Series `eq` below its running high (<= 0)."""
    high_water = eq.cummax()
    drawdown = eq / high_water - 1
    return drawdown.min()

# Growth-of-1 curves for the XGBoost-driven strategy and for buy & hold.
eq_s, eq_b = equity_curve(strat_r), equity_curve(bh_r)
print("XGB Strategy Sharpe:", sharpe(strat_r))
print("Buy&Hold Sharpe   :", sharpe(bh_r))
print("XGB MaxDD:", max_drawdown(eq_s), "| BH MaxDD:", max_drawdown(eq_b))

plt.figure(figsize=(12,5))
# Derive the fee shown in the legend from `fee` so the label cannot drift from
# the value actually used in the backtest (it was hard-coded as "10bps").
plt.plot(eq_s.values, label=f"XGB (thr={thr_xgb:.2f}, fee={fee*1e4:g}bps)")
plt.plot(eq_b.values, label="Buy & Hold")
plt.title("Equity Curve — TEST (XGBoost)")
plt.xlabel("Test-period row")
plt.ylabel("Equity (start = 1)")
plt.legend(); plt.show()