# Trading MVP – Should We Trade Tomorrow?

This notebook contains a complete, **reproducible** pipeline:
1. Data audit and cleansing
2. Feature engineering (rolling behaviour metrics)
3. Label creation (predict next‑day bad trading day)
4. Expanding‑window time‑series split
5. Baseline, logistic‑regression, and LightGBM models
6. Evaluation metrics & monetary lift
7. Risk‑management dashboard plots

Adjust paths and parameters in the first code cell as needed.

In [None]:
import json

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    average_precision_score,
    confusion_matrix,
    precision_recall_fscore_support,
)
from sklearn.model_selection import TimeSeriesSplit

# Seed passed as random_state to the models trained below.
RANDOM_STATE = 42
# Default size for every matplotlib figure in this notebook.
plt.rcParams["figure.figsize"] = (8, 4)
# ---- data path ----
# NOTE: despite the name, this points at a pickle file (it is read with
# pd.read_pickle in load_data); the name is kept so existing references work.
CSV_PATH = "../data/train/predictors_6973.pkl"

## 1  Load & audit data

In [None]:
def load_data(path: str) -> pd.DataFrame:
    """Read the pickled table stored at *path* and return it sorted by index.

    NOTE: unpickling can execute arbitrary code, so only point this at files
    that come from a trusted source.
    """
    return pd.read_pickle(path).sort_index()


def audit_data(df: pd.DataFrame):
    """Print a data-quality report for *df*.

    The report lists, in order: how many calendar days between the first and
    last index value are absent from the index (with the first ten of them),
    how many index values are repeats of an earlier one, and the null count of
    every column. Nothing is returned.
    """
    idx = df.index

    # Every calendar day spanned by the data, compared against what is present.
    expected_days = pd.date_range(idx.min(), idx.max(), freq="D", tz=None)
    gaps = expected_days.difference(idx)
    print("Missing days:", len(gaps))
    if len(gaps) > 0:
        print(gaps[:10])

    # duplicated() flags the second and later occurrences of an index value.
    repeated = idx[idx.duplicated()]
    print("Duplicate days:", repeated.size)

    print("\nNull counts:\n", df.isnull().sum())


# Load the predictor table (sorted by index) and print its data-quality report.
df = load_data(CSV_PATH)
audit_data(df)

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# Column names with the index promoted to a regular column.
print(df.reset_index().columns)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()
# -- print data types
print(df.dtypes)

## 2  Feature engineering

In [None]:
def compute_streak(series: pd.Series, positive: bool = True) -> pd.Series:
    """Return, for each row, the length of the run of qualifying values ending there.

    A value qualifies when it is strictly greater than zero (``positive=True``)
    or strictly less than zero (``positive=False``). Any non-qualifying value,
    including zero and NaN, resets the count to 0. The result is an integer
    Series that shares the index of *series*.
    """
    flags = series.gt(0) if positive else series.lt(0)

    lengths = []
    current = 0
    for hit in flags:
        if hit:
            current += 1
        else:
            current = 0
        lengths.append(current)

    return pd.Series(np.array(lengths, dtype=int), index=series.index)

In [None]:
# Look-back lengths (in rows) intended for the rolling behaviour features.
windows = [3, 5, 10, 20]
# NOTE: the rolling-feature loop below is disabled (commented out), so none of
# these columns are currently added to df. The body is shown indented under
# the `for` so the block is valid Python if it is re-enabled.
# for N in windows:
#     roll = df["total_delta"].rolling(N, min_periods=1)
#     df[f"hit_rate_{N}d"] = roll.apply(lambda x: (x > 0).mean(), raw=True)
#     df[f"avg_win_{N}d"] = roll.apply(lambda x: x[x > 0].mean() if (x > 0).any() else 0)
#     df[f"avg_loss_{N}d"] = roll.apply(lambda x: x[x < 0].mean() if (x < 0).any() else 0)
#     df[f"expectancy_{N}d"] = (
#         df[f"hit_rate_{N}d"] * df[f"avg_win_{N}d"]
#         + (1 - df[f"hit_rate_{N}d"]) * df[f"avg_loss_{N}d"]
#     )
#     df[f"turnover_qty_{N}d"] = df["qty"].rolling(N, min_periods=1).sum()
#     df[f"sharpe_{N}d"] = (roll.mean() / (roll.std(ddof=0) + 1e-9)) * np.sqrt(252)

In [None]:
# current streaks (not rolling max)
# df["win_streak"] = compute_streak(df["total_delta"], positive=True)
# df["loss_streak"] = compute_streak(df["total_delta"], positive=False)

In [None]:
# Summary statistics for every column, numeric or not (include="all").
df.describe(include="all")

## 3  Create target label

In [None]:
# Next-day PnL: shift(-1) puts tomorrow's total_delta on today's row.
df["pnl_next"] = df["total_delta"].shift(-1)
# Label is 1 when tomorrow's PnL is negative, else 0.
df["tomorrow_bad_day"] = (df["pnl_next"] < 0).astype(int)
# Drop every row whose next-day PnL is unknown. That always removes the final
# row (it has no "tomorrow") and also any row followed by a missing PnL, which
# would otherwise keep a label of 0 only because `NaN < 0` evaluates to False.
df = df.dropna(subset=["pnl_next"])

## 4  Train/Test split (last fold of an expanding‑window time‑series split)

In [None]:
# Three-fold time-series splitter; only its last fold is used below.
tscv = TimeSeriesSplit(n_splits=3)

# Target, and features = every column except the label and the next-day PnL
# the label is derived from.
y = df["tomorrow_bad_day"]
X = df.drop(columns=["tomorrow_bad_day", "pnl_next"])

# Columns with at most one distinct value carry no signal; remove them.
const_cols = X.columns[X.nunique() <= 1].tolist()
X = X.drop(columns=const_cols)

# Materialise all folds and keep the final (train, test) pair of row positions.
splits = list(tscv.split(X))
train_idx, test_idx = splits[-1]

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

for part in (X_train, X_test):
    print(part.shape)

## 5  Model training & evaluation

In [None]:
# Baseline: always predicts the most frequent class seen in training.
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_train, y_train)

# Logistic: L1-penalised logistic regression with balanced class weights.
logreg = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    C=1,
    class_weight="balanced",
    random_state=RANDOM_STATE,
)
logreg.fit(X_train, y_train)

# LightGBM: 2000 trees at learning rate 0.05, trained without early stopping.
# The scikit-learn wrapper's own parameter names are used instead of the core
# aliases (min_gain_to_split, min_data_in_leaf, feature_fraction,
# bagging_fraction, bagging_freq) so no value competes with a wrapper default,
# and the seed comes from RANDOM_STATE like the other models.
lgbm = lgb.LGBMClassifier(
    n_estimators=2000,
    learning_rate=0.05,
    num_leaves=31,
    min_split_gain=0,
    min_child_samples=5,
    colsample_bytree=0.8,
    subsample=0.8,
    subsample_freq=1,
    metric="auc",
    class_weight="balanced",
    random_state=RANDOM_STATE,
)
lgbm.fit(X_train, y_train)

# Score every model on the held-out fold; class 1 ("bad day") is the positive class.
models = {"Dummy": dummy, "Logistic": logreg, "LightGBM": lgbm}
for name, mdl in models.items():
    y_pred = mdl.predict(X_test)
    # Probability of class 1, when the estimator can provide one.
    y_prob = mdl.predict_proba(X_test)[:, 1] if hasattr(mdl, "predict_proba") else None
    # zero_division=0 reports 0 instead of warning when no positives are predicted.
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="binary", zero_division=0
    )
    pr_auc = average_precision_score(y_test, y_prob) if y_prob is not None else np.nan
    print(
        f"{name}: Precision={prec:.3f} Recall={rec:.3f} F1={f1:.3f} PR_AUC={pr_auc:.3f}"
    )

### Monetary lift calculation

In [None]:
# The LightGBM model supplies the trade/skip signals evaluated here.
best_model = lgbm
# 1 = predicted bad day (skip trading), 0 = trade.
signals = best_model.predict(X_test)

# Select the test rows by position (test_idx from the split cell) rather than
# by label: a label lookup returns extra rows when the index holds duplicates,
# which would no longer line up with `signals`.
pnl_test = df["pnl_next"].iloc[test_idx]

# Trading every test day vs. earning 0 on the days the model says to skip.
pnl_always = pnl_test.sum()
pnl_model = pnl_test.where(signals == 0, 0).sum()

print("Always trade PnL:", pnl_always)
print("Model guided PnL:", pnl_model)
print("Lift:", pnl_model - pnl_always)

## 6  Risk‑management dashboard

In [None]:
def equity_curve(pnl_series):
    """Return the running (cumulative) sum of *pnl_series*."""
    running_total = pnl_series.cumsum()
    return running_total


# Next-day PnL on the test rows: trading every day, and with the PnL zeroed on
# the days where the model's signal is not 0.
pnl_series = df.loc[y_test.index, "pnl_next"]
pnl_model_series = pnl_series.where(signals == 0, 0)

plt.figure()
for curve_label, daily_pnl in (("Always", pnl_series), ("Model", pnl_model_series)):
    plt.plot(equity_curve(daily_pnl), label=curve_label)
plt.title("Cumulative Equity")
plt.legend()
plt.show()

In [None]:
def equity_curve(p):
    """Return the running (cumulative) sum of *p*.

    NOTE: this re-declares the ``equity_curve`` helper from the previous cell
    with the same behaviour; only the parameter name differs.
    """
    cumulative = p.cumsum()
    return cumulative


# Positional start/stop of the plotted window; hh = -1 leaves out the last test row.
jj, hh = 0, -1
plot_span = slice(jj, hh)

eq_always = equity_curve(pnl_series)[plot_span]
eq_model = equity_curve(pnl_model_series)[plot_span]

bad_mask = signals[plot_span] == 1  # model skipped
bad_idx = y_test[plot_span].index[bad_mask]

fig, ax = plt.subplots(figsize=(8, 4))
for curve_label, curve in (("Always", eq_always), ("Model", eq_model)):
    ax.plot(curve, label=curve_label)

# mark skip days on always-trade curve
skipped_equity = eq_always.loc[bad_idx]
ax.scatter(bad_idx, skipped_equity, marker="v", color="red", s=40, label="Skipped")

ax.set_title("Cumulative Equity with Skip Markers")
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
booster = lgbm.booster_  # unwrap sklearn wrapper

# Gain-based importance and the matching feature names, in the same order.
gain = booster.feature_importance(importance_type="gain")
names = booster.feature_name()

# pandas is already imported as `pd` at the top of the notebook, so the
# redundant mid-cell re-import was dropped.
imp = pd.DataFrame({"feature": names, "gain": gain}).sort_values(
    "gain", ascending=False
)
# Show the 20 features with the largest gain.
imp.head(20)

In [None]:
# Gain-based importance per feature, as a name -> gain mapping.
# NOTE: this rebinds `imp`, which the previous cell used for a DataFrame.
gain = booster.feature_importance(importance_type="gain")
names = booster.feature_name()
imp = dict(zip(names, gain))

# Each feature's share of the total gain. When the total is zero every share
# is reported as 0.0 instead of the NaN that 0 / 0 would produce.
total = gain.sum()
pct = {n: (g / total if total else 0.0) for n, g in imp.items()}

display(pct)

In [None]:
# Inspect the training feature matrix.
display(X_train)

In [None]:
# Inspect the held-out test feature matrix.
display(X_test)