In [1]:
import sys, os, importlib, numpy as np, pandas as pd
sys.path.append(os.path.abspath(".."))

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

from src import data as data_mod, features as features_mod, utils as utils_mod
importlib.reload(data_mod); importlib.reload(features_mod); importlib.reload(utils_mod)
from src.data import get_data
from src.features import add_features
from src.utils import make_labels

# Optional XGBoost
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except Exception:
    HAS_XGB = False

plt.rcParams["figure.figsize"] = (12,5)
plt.rcParams["axes.grid"] = True


In [2]:
TICKERS = ["AAPL", "SPY", "MSFT", "BTC-USD"]  # edit freely
START, END = "2015-01-01", "2023-12-31"

TAUS = [0.0, 0.001, 0.002]   # 0%, ±0.1%, ±0.2% dead-zone
FEE = 0.0010                 # 10 bps one-way
MODELS = ["logreg", "xgb"]   # will skip xgb if not installed
RANDOM_STATE = 42

In [3]:
def split_scale(X, y, r, train=0.70, val=0.15):
    n = len(X); i_tr = int(train*n); i_va = int((train+val)*n)
    X_tr, y_tr, r_tr = X[:i_tr], y[:i_tr], r[:i_tr]
    X_va, y_va, r_va = X[i_tr:i_va], y[i_tr:i_va], r[i_tr:i_va]
    X_te, y_te, r_te = X[i_va:], y[i_va:], r[i_va:]
    scaler = StandardScaler().fit(X_tr)
    return (scaler.transform(X_tr), y_tr, r_tr,
            scaler.transform(X_va), y_va, r_va,
            scaler.transform(X_te), y_te, r_te)

def sharpe_daily(rr):
    rr = pd.Series(rr); s = rr.std()
    return 0.0 if s==0 else rr.mean()/s*np.sqrt(252)

def backtest_from_probs(p, r_next, thr, fee):
    pos = (p > thr).astype(int)
    trades = np.abs(np.diff(np.r_[0, pos])) * fee
    strat_r = pos * r_next - trades
    return {
        "Sharpe_strat": sharpe_daily(strat_r),
        "Sharpe_bh": sharpe_daily(r_next),
        "Coverage": pos.mean(),
        "Trades": int(trades.sum()),
    }

def tune_threshold(p_val, r_val, fee=FEE, grid=np.linspace(0.50, 0.60, 21)):
    # choose thr that maximizes fee-aware Sharpe on validation
    best = None
    best_s = -1e9
    for t in grid:
        s = backtest_from_probs(p_val, r_val, thr=t, fee=fee)["Sharpe_strat"]
        if s > best_s:
            best_s, best = s, t
    return best


In [4]:
def run_once(ticker, tau):
    # 1) data
    df = get_data(ticker, start=START, end=END)
    df = add_features(df)
    df = make_labels(df, tau=tau, dead_zone=(tau > 0))
    feat_cols = [c for c in df.columns if c not in ["date","open","high","low","close","volume","ret_next","y"]]
    X = df[feat_cols].values
    y = df["y"].astype(int).values
    r = df["ret_next"].values

    # 2) split & scale
    X_tr, y_tr, r_tr, X_va, y_va, r_va, X_te, y_te, r_te = split_scale(X, y, r)

    rows = []

    # 3) Logistic Regression
    lr = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE)
    lr.fit(X_tr, y_tr)
    p_va = lr.predict_proba(X_va)[:,1]
    p_te = lr.predict_proba(X_te)[:,1]
    thr = tune_threshold(p_va, r_va, fee=FEE)
    bt = backtest_from_probs(p_te, r_te, thr, fee=FEE)
    rows.append(dict(
        ticker=ticker, tau=tau, model="logreg",
        auc_va=roc_auc_score(y_va, p_va),
        auc_te=roc_auc_score(y_te, p_te),
        thr=thr, **bt
    ))

    # 4) XGBoost (if available)
    if HAS_XGB and "xgb" in MODELS:
        xgb = XGBClassifier(
            n_estimators=400, max_depth=4, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
            random_state=RANDOM_STATE, n_jobs=-1
        )
        xgb.fit(X_tr, y_tr)
        p_va_x = xgb.predict_proba(X_va)[:,1]
        p_te_x = xgb.predict_proba(X_te)[:,1]
        thr_x = tune_threshold(p_va_x, r_va, fee=FEE)
        bt_x = backtest_from_probs(p_te_x, r_te, thr_x, fee=FEE)
        rows.append(dict(
            ticker=ticker, tau=tau, model="xgb",
            auc_va=roc_auc_score(y_va, p_va_x),
            auc_te=roc_auc_score(y_te, p_te_x),
            thr=thr_x, **bt_x
        ))

    return pd.DataFrame(rows)


In [5]:
all_rows = []
for t in TICKERS:
    for tau in TAUS:
        try:
            res = run_once(t, tau)
            all_rows.append(res)
        except Exception as e:
            print(f"Error on {t}, tau={tau}: {e}")

mt = pd.concat(all_rows, ignore_index=True)
display(mt.sort_values(["ticker","model","tau"]))

print("\n=== Averages by model (across tickers & taus) ===")
display(mt.groupby("model")[["auc_te","Sharpe_strat","Sharpe_bh","Coverage","Trades"]].mean().round(4))

print("\n=== Best per ticker/model by Sharpe_strat ===")
best = (mt.sort_values(["ticker","model","Sharpe_strat"], ascending=[True,True,False])
          .groupby(["ticker","model"]).head(1)
          .reset_index(drop=True))
display(best[["ticker","model","tau","auc_te","Sharpe_strat","Sharpe_bh","Coverage","Trades","thr"]])


Unnamed: 0,ticker,tau,model,auc_va,auc_te,thr,Sharpe_strat,Sharpe_bh,Coverage,Trades
0,AAPL,0.0,logreg,0.489283,0.457203,0.56,-0.545245,0.69731,0.041791,0
2,AAPL,0.001,logreg,0.485939,0.474086,0.6,0.788011,0.738853,0.003205,0
4,AAPL,0.002,logreg,0.501812,0.461433,0.585,-0.960928,0.876988,0.003436,0
1,AAPL,0.0,xgb,0.534844,0.455843,0.535,-0.490488,0.69731,0.402985,0
3,AAPL,0.001,xgb,0.542512,0.461078,0.58,-0.81622,0.738853,0.346154,0
5,AAPL,0.002,xgb,0.570175,0.47851,0.59,-0.309927,0.876988,0.28866,0
18,BTC-USD,0.0,logreg,0.510702,0.503841,0.58,0.969222,1.16456,0.030738,0
20,BTC-USD,0.001,logreg,0.513612,0.508225,0.58,0.945616,0.936981,0.038544,0
22,BTC-USD,0.002,logreg,0.527448,0.503284,0.595,0.816782,0.987867,0.024831,0
19,BTC-USD,0.0,xgb,0.537954,0.565787,0.535,1.885017,1.16456,0.594262,0



=== Averages by model (across tickers & taus) ===


Unnamed: 0_level_0,auc_te,Sharpe_strat,Sharpe_bh,Coverage,Trades
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
logreg,0.4683,0.3811,1.1,0.1319,0.0
xgb,0.5024,0.4827,1.1,0.5526,0.0



=== Best per ticker/model by Sharpe_strat ===


Unnamed: 0,ticker,model,tau,auc_te,Sharpe_strat,Sharpe_bh,Coverage,Trades,thr
0,AAPL,logreg,0.001,0.474086,0.788011,0.738853,0.003205,0,0.6
1,AAPL,xgb,0.002,0.47851,-0.309927,0.876988,0.28866,0,0.59
2,BTC-USD,logreg,0.0,0.503841,0.969222,1.16456,0.030738,0,0.58
3,BTC-USD,xgb,0.0,0.565787,1.885017,1.16456,0.594262,0,0.535
4,MSFT,logreg,0.002,0.466418,0.967568,1.340702,0.255319,0,0.55
5,MSFT,xgb,0.002,0.508806,1.423974,1.340702,0.531915,0,0.535
6,SPY,logreg,0.001,0.439385,1.005884,1.216659,0.072165,0,0.565
7,SPY,xgb,0.002,0.542189,0.800341,1.708649,0.728,0,0.535


In [6]:
def fee_sweep(ticker="AAPL", tau=0.001, fees=(0.0005, 0.0010, 0.0020)):
    out = []
    base_fee = FEE
    for f in fees:
        globals()["FEE"] = f
        df = run_once(ticker, tau)
        df["fee"] = f
        out.append(df)
    globals()["FEE"] = base_fee
    return pd.concat(out, ignore_index=True)

fee_res = fee_sweep("AAPL", 0.001)
display(fee_res[["model","fee","auc_te","Sharpe_strat","Sharpe_bh","Coverage","Trades","thr"]].sort_values(["model","fee"]))


Unnamed: 0,model,fee,auc_te,Sharpe_strat,Sharpe_bh,Coverage,Trades,thr
0,logreg,0.0005,0.474086,0.847652,0.738853,0.003205,0,0.6
2,logreg,0.001,0.474086,0.788011,0.738853,0.003205,0,0.6
4,logreg,0.002,0.474086,0.638306,0.738853,0.003205,0,0.6
1,xgb,0.0005,0.461078,-0.544147,0.738853,0.346154,0,0.58
3,xgb,0.001,0.461078,-0.81622,0.738853,0.346154,0,0.58
5,xgb,0.002,0.461078,-1.356782,0.738853,0.346154,0,0.58


In [7]:
from pathlib import Path
Path("../data").mkdir(parents=True, exist_ok=True)

mt.to_csv("../data/multiticker_tau_sweep.csv", index=False)
best.to_csv("../data/multiticker_best_by_ticker_model.csv", index=False)
fee_res.to_csv("../data/multiticker_fee_sweep_AAPL.csv", index=False)

"Saved: data/multiticker_tau_sweep.csv, data/multiticker_best_by_ticker_model.csv, data/multiticker_fee_sweep_AAPL.csv"


'Saved: data/multiticker_tau_sweep.csv, data/multiticker_best_by_ticker_model.csv, data/multiticker_fee_sweep_AAPL.csv'