In [1]:
# ==========================================
# 03_signal_analysis.ipynb
# Intraday signal exploration and predictability tests
# ==========================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels.api as sm
from pathlib import Path

# -----------------------
# 1. Load cleaned data
# -----------------------
# Each symbol is read from "<symbol>_1min_clean.parquet" inside data_dir.
data_dir = Path("../data/processed")
symbols = ["SPY", "QQQ"]

all_dfs = {}
for symbol in symbols:
    parquet_path = data_dir / f"{symbol}_1min_clean.parquet"
    # Put bars in chronological order and replace the index with a fresh
    # 0..n-1 range so positional operations follow time order.
    df = (
        pd.read_parquet(parquet_path)
        .sort_values("timestamp")
        .reset_index(drop=True)
    )
    all_dfs[symbol] = df
    print(f"Loaded {symbol}: {len(df):,} rows")

# Example: use SPY as primary
# Work on a copy so the frame stored in all_dfs is left unmodified.
df = all_dfs["SPY"].copy()

# Calendar date of each bar; used as the grouping key for every per-day
# feature below so that no feature mixes bars from different days.
session = df["timestamp"].dt.date
returns_by_session = df.groupby(session)["log_return"]

# Create lagged and next returns for predictive testing.
# The shifts are applied within each day: a plain shift over the whole series
# would give the first bar of a day the previous day's last return as its
# lag_return, and the last bar of a day the next day's first return as its
# next_return. Bars at the day edges get NaN instead and are dropped below.
df["lag_return"] = returns_by_session.shift(1)
df["next_return"] = returns_by_session.shift(-1)

# VWAP deviation: relative distance of the close from the bar's vwap
df["vwap_dev"] = (df["close"] - df["vwap"]) / df["vwap"]

# Volume percentile per day (rank of each bar's volume within its own day)
df["volume_pct"] = df.groupby(session)["volume"].rank(pct=True)

# Keep only rows where every feature and the prediction target are defined.
df = df.dropna(subset=["lag_return", "next_return", "vwap_dev", "volume_pct"])

print(df.head())


Loaded SPY: 391,260 rows
Loaded QQQ: 391,260 rows
                  timestamp  index symbol     open     high       low   close  \
2 2020-01-02 09:33:00-05:00      2    SPY  323.520  323.830  323.5000  323.77   
3 2020-01-02 09:34:00-05:00      3    SPY  323.760  323.860  323.6700  323.68   
4 2020-01-02 09:35:00-05:00      4    SPY  323.675  323.980  323.6600  323.93   
5 2020-01-02 09:36:00-05:00      5    SPY  323.920  323.975  323.8300  323.92   
6 2020-01-02 09:37:00-05:00      6    SPY  323.930  323.960  323.7877  323.82   

     volume  trade_count        vwap        date    return  log_return  \
2  246777.0       1585.0  323.690822  2020-01-02  0.000768    0.000768   
3  192233.0       1261.0  323.781768  2020-01-02 -0.000278   -0.000278   
4  422749.0       2300.0  323.835520  2020-01-02  0.000772    0.000772   
5  266637.0       1376.0  323.896165  2020-01-02 -0.000031   -0.000031   
6  207102.0       1100.0  323.904850  2020-01-02 -0.000309   -0.000309   

   overnight_retur