In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from pathlib import Path
import statsmodels.api as sm
import seaborn as sns

# Load the cleaned 1-minute parquet file for each symbol that is present on disk.
symbols = ["SPY", "QQQ"]
data_dir = Path("../data/processed")

all_dfs = {}
for symbol in symbols:
    file = data_dir / f"{symbol}_1min_clean.parquet"
    if not file.exists():
        # Loading stays best-effort, but report the gap: a silent skip would
        # otherwise only surface later as a bare KeyError far from its cause.
        print(f"Skipped {symbol}: {file} not found")
        continue
    all_dfs[symbol] = pd.read_parquet(file)
    print(f"Loaded {symbol}: {len(all_dfs[symbol])} rows")

# Example: use SPY for now
if "SPY" not in all_dfs:
    raise FileNotFoundError(
        f"SPY data is required but {data_dir / 'SPY_1min_clean.parquet'} was not found"
    )

# Work on a copy so the frame stored in all_dfs is left untouched.
df = all_dfs["SPY"].copy()

# Parse the timestamp column once and derive both helper columns from it.
timestamps = pd.to_datetime(df["timestamp"])
df["time"] = timestamps.dt.time
df["date"] = timestamps.dt.date


Loaded SPY: 391260 rows
Loaded QQQ: 391260 rows


In [None]:
# Per time-of-day aggregates over every row of df sharing the same "time":
# mean and standard deviation of log_return, and mean volume.
minute_aggregations = {
    "mean_return": ("log_return", "mean"),
    "volatility": ("log_return", "std"),
    "volume": ("volume", "mean"),
}
intraday_stats = df.groupby("time").agg(**minute_aggregations).reset_index()

# Mean return and volatility share one chart, one line each.
fig = px.line(
    intraday_stats,
    x="time",
    y=["mean_return", "volatility"],
    title="Intraday Mean Return and Volatility Patterns (SPY)",
)
fig.show()

# Average volume gets its own chart.
fig = px.line(
    intraday_stats,
    x="time",
    y="volume",
    title="Intraday Average Volume",
)
fig.show()

In [None]:
# Autocorrelation of the non-missing log returns at lags 1 through 10.
returns = df["log_return"].dropna()
lags = range(1, 11)
acf_values = list(map(returns.autocorr, lags))

# Draw one bar per lag on the current axes.
ax = plt.gca()
ax.bar(lags, acf_values)
ax.set_title("Autocorrelation of 1-min log returns (SPY)")
ax.set_xlabel("Lag (minutes)")
ax.set_ylabel("Correlation")
plt.show()


In [None]:
# Store the magnitude of each log return on df as "abs_return".
abs_return = df["log_return"].abs()
df["abs_return"] = abs_return

# Scatter of volume against move size; low opacity because points overlap.
volume_scatter = px.scatter(
    df,
    x="volume",
    y="abs_return",
    opacity=0.3,
    title="Volume vs Absolute 1-min Return (SPY)",
)
volume_scatter.show()

volume_abs_corr = df["volume"].corr(df["abs_return"])
print("Correlation(volume, |return|):", volume_abs_corr)


In [None]:
# Flag rows whose |log_return| exceeds five times the standard deviation of
# log_return (distance is measured from zero, not from the mean).
threshold = df["log_return"].std() * 5
extreme = df[df["log_return"].abs() > threshold]
print(f"Extreme 5σ moves: {len(extreme)}")
display(extreme[["timestamp", "log_return", "volume"]].head())

# Plot only the flagged rows so the chart matches its title; plotting the
# full frame here would show every bar rather than the extreme moves.
px.scatter(extreme, x="timestamp", y="log_return",
           title="Extreme Intraday Moves (>5σ)").show()


In [None]:
# Relative deviation of the close from VWAP, stored on df as "vwap_dev".
df["vwap_dev"] = (df["close"] - df["vwap"]) / df["vwap"]

# Next row's log_return, shifted within each "date" so the last bar of a day is
# not paired with the first bar of the following day.
# NOTE(review): assumes rows are in chronological order within each date — confirm.
next_return = df.groupby("date")["log_return"].shift(-1)

# One frame of complete (deviation, next return) pairs feeds both the chart and
# the printed correlation, so the two always describe the same relationship.
vwap_pairs = df[["vwap_dev"]].assign(next_return=next_return).dropna()

px.scatter(vwap_pairs, x="vwap_dev", y="next_return", opacity=0.3,
           title="VWAP Deviation vs Next-Minute Return (SPY)").show()

print("Correlation(VWAP deviation, next return):",
      vwap_pairs["vwap_dev"].corr(vwap_pairs["next_return"]))
