## Intraday OHLCV data (shared for all three ideas)

In [3]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta

import os  # used below to make sure the output directory exists

# --- Download 1-minute SPY bars from Yahoo Finance in 7-day windows ---------
symbol = "SPY"
interval = "1m"
periods = ["7d"] # yfinance only allows 7 days per request

WINDOW_DAYS = 7
MAX_WINDOWS = 300  # upper bound only: 300 * 7 = 2100 days, roughly 5.75 years
# NOTE(review): Yahoo is understood to serve 1-minute bars only for about the
# last 30 days, so older windows come back empty — confirm against the yfinance
# documentation. We therefore stop after a few consecutive empty windows
# instead of issuing hundreds of requests that cannot succeed.
MAX_CONSECUTIVE_EMPTY = 3
OUTPUT_PATH = "data/SPY_intraday_2022_2025.csv"

data_all = []
consecutive_empty = 0
now = datetime.today()  # evaluated once so consecutive windows tile without drift

# Walk backwards in time, one window per request (newest window first).
for i in range(MAX_WINDOWS):
    end_date = now - timedelta(days=i * WINDOW_DAYS)
    start_date = end_date - timedelta(days=WINDOW_DAYS)
    df = yf.download(symbol, start=start_date, end=end_date, interval=interval)
    if df.empty:
        consecutive_empty += 1
        if consecutive_empty >= MAX_CONSECUTIVE_EMPTY:
            break
        continue
    consecutive_empty = 0
    # Some yfinance versions return (field, ticker) MultiIndex columns even for
    # a single ticker; keep only the field level so "Datetime" is a plain column.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)
    data_all.append(df.reset_index())

if not data_all:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise RuntimeError(
        f"yfinance returned no {interval} data for {symbol}; nothing to save."
    )

df_all = (
    pd.concat(data_all, ignore_index=True)
    .drop_duplicates(subset="Datetime")
    .rename(columns={"Datetime": "timestamp_utc"})
)
# The column name promises UTC, so normalise whatever timezone yfinance used.
df_all["timestamp_utc"] = pd.to_datetime(df_all["timestamp_utc"], utc=True)
# Windows were collected newest-first; store the bars in chronological order.
df_all = df_all.sort_values("timestamp_utc").reset_index(drop=True)

os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
df_all.to_csv(OUTPUT_PATH, index=False)
print(f"Saved {OUTPUT_PATH}", len(df_all))

  0%|          | 0/24 [00:00<?, ?it/s]

Error: 403 {"status":"NOT_AUTHORIZED","request_id":"d265cb07223166bcd108f41aa9112561","message":"Your plan doesn't include this data timeframe. Please upgrade your plan at https://polygon.io/pricing"}


  0%|          | 0/24 [00:08<?, ?it/s]


KeyboardInterrupt: 

## Polygon.io intraday OHLCV data (2022–2023)

In [4]:
import os
import time
from datetime import datetime, timedelta

import pandas as pd
import requests
from tqdm import tqdm

# --- Polygon.io configuration ----------------------------------------------
# The API key is read from the POLYGON_API_KEY environment variable so that it
# never lives in the notebook. A key was previously hardcoded on this line and
# should be treated as leaked: revoke/rotate it in the Polygon.io dashboard.
API_KEY = os.environ.get("POLYGON_API_KEY", "")
if not API_KEY:
    print("WARNING: POLYGON_API_KEY is not set; Polygon.io requests will be rejected.")

TICKER = "SPY"
MULTIPLIER = 1        # bar size = MULTIPLIER x TIMESPAN, i.e. 1-minute bars
TIMESPAN = "minute"
# NOTE(review): the 403 NOT_AUTHORIZED output recorded for this cell says the
# current plan does not include this timeframe — confirm the plan covers
# 2022-2023 or move the range forward.
START_DATE = "2022-01-01"
END_DATE = "2023-12-31"

def fetch_agg(ticker, start, end):
    """Download aggregate bars for ``ticker`` between ``start`` and ``end``.

    Queries the Polygon.io ``/v2/aggs`` endpoint using the module-level
    ``MULTIPLIER``, ``TIMESPAN`` and ``API_KEY`` settings, following
    ``next_url`` pagination links until the response no longer provides one.

    Parameters
    ----------
    ticker : str
        Symbol to request, e.g. ``"SPY"``.
    start, end : str
        Range bounds, interpolated verbatim into the request URL
        (the caller in this notebook passes ``YYYY-MM-DD`` strings).

    Returns
    -------
    pandas.DataFrame or None
        Columns ``timestamp_utc, open, high, low, close, volume, vwap``.
        A column missing from the response is filled with NaN.
        ``None`` is returned when a request fails, when the API answers with
        a non-200 status (the status and body are printed), or when no bars
        come back.
    """
    columns = ["timestamp_utc", "open", "high", "low", "close", "volume", "vwap"]
    next_url = f"https://api.polygon.io/v2/aggs/ticker/{ticker}/range/{MULTIPLIER}/{TIMESPAN}/{start}/{end}"
    params = {
        "adjusted": "true",
        "sort": "asc",
        "limit": 50000,
        "apiKey": API_KEY
    }

    rows = []
    while next_url:
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            r = requests.get(next_url, params=params, timeout=30)
        except requests.RequestException as exc:
            # Print only the exception type: the full message can contain the
            # request URL, which carries the API key as a query parameter.
            print("Error:", type(exc).__name__, f"while fetching {ticker} {start}..{end}")
            return None
        if r.status_code != 200:
            print("Error:", r.status_code, r.text)
            return None
        payload = r.json()
        rows.extend(payload.get("results") or [])
        # When more data is available the response carries a `next_url` that
        # already encodes the query; only the key has to be supplied again.
        next_url = payload.get("next_url")
        params = {"apiKey": API_KEY}

    if not rows:
        return None

    df = pd.DataFrame(rows)
    df["timestamp_utc"] = pd.to_datetime(df["t"], unit="ms", utc=True)
    df = df.rename(columns={
        "o": "open",
        "h": "high",
        "l": "low",
        "c": "close",
        "v": "volume",
        "vw": "vwap"
    }).reindex(columns=columns)  # tolerate optional fields absent from the payload
    return df

# Split the requested range into ~30-day windows to stay under API limits.
# Consecutive windows share a boundary day; the duplicated bars are removed
# after the merge below.
date_ranges = list(pd.date_range(START_DATE, END_DATE, freq="30D"))
final_edge = pd.Timestamp(END_DATE)
if date_ranges and date_ranges[-1] < final_edge:
    # The 30-day grid rarely lands exactly on END_DATE; add the remainder so
    # the tail of the range is downloaded as well.
    date_ranges.append(final_edge)
if len(date_ranges) == 1:
    # START_DATE == END_DATE: still issue one request for that single day.
    date_ranges.append(date_ranges[0])
dfs = []

n_windows = len(date_ranges) - 1
for i in tqdm(range(n_windows)):
    start = date_ranges[i].strftime("%Y-%m-%d")
    end = date_ranges[i+1].strftime("%Y-%m-%d")
    df_part = fetch_agg(TICKER, start, end)
    if df_part is not None:
        dfs.append(df_part)
    if i < n_windows - 1:
        time.sleep(12)  # avoid rate limit (5 req/min on free tier)

if not dfs:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise RuntimeError(
        f"No data was downloaded for {TICKER} between {START_DATE} and {END_DATE}; "
        "see the errors printed above."
    )

# Merge all windows into one chronologically ordered frame.
df = (
    pd.concat(dfs, ignore_index=True)
    .drop_duplicates("timestamp_utc")
    .sort_values("timestamp_utc")
    .reset_index(drop=True)
)
# NOTE(review): returns are computed between consecutive rows, so they also
# span overnight/weekend gaps, and the rolling windows count bars rather than
# clock minutes — confirm that this is intended.
df["ret"] = df["close"].pct_change()
df["vol_5m"] = df["ret"].rolling(5).std()
df["vol_30m"] = df["ret"].rolling(30).std()

# Save clean dataset; the message reports the path that was actually written.
out_path = f"{TICKER}_intraday_polygon_2022_2023.csv"
df.to_csv(out_path, index=False)
print(f"✅ Saved {len(df):,} rows to {out_path}")


  0%|          | 0/24 [00:00<?, ?it/s]

Error: 403 {"status":"NOT_AUTHORIZED","request_id":"4e1efe72f4f22b16bc2bf8f5dbbd3ec6","message":"Your plan doesn't include this data timeframe. Please upgrade your plan at https://polygon.io/pricing"}


  0%|          | 0/24 [00:06<?, ?it/s]


KeyboardInterrupt: 