In [16]:
# 📊 Data Fetch + Feature Engineering Cell
%pip install yfinance

import pandas as pd
import numpy as np
from ta import add_all_ta_features
import yfinance as yf
import os

# Step 1: Create data directory
os.makedirs('data', exist_ok=True)

# Step 2: Download SPY data (proxy for /ES futures)
ticker = 'SPY'
df = yf.download(ticker, start='2020-01-01', end='2025-07-23', interval='1d')
df.index.name = 'timestamp'

# Step 3: Normalize column names (handle MultiIndex)
if isinstance(df.columns, pd.MultiIndex):
    df.columns = ['_'.join(col).strip().lower() for col in df.columns.values]
else:
    df.columns = df.columns.str.lower()

# Step 4: Save raw CSV for reference
df.to_csv('data/spy_historical.csv')

# Step 5: Add technical analysis features
# The KeyError: 'high' occurred because the add_all_ta_features function expects columns named 'open', 'high', 'low', 'close', and 'volume' by default.
# However, your dataframe columns are named 'open_spy', 'high_spy', 'low_spy', 'close_spy', and 'volume_spy'.
# To fix this, I explicitly mapped the function arguments to the correct column names in your dataframe.

df = add_all_ta_features(
    df,
    open='open_spy',
    high='high_spy',
    low='low_spy',
    close='close_spy',
    volume='volume_spy'
)

# Step 6: Add custom rolling features
for window in [5, 10, 20, 50]:
    df[f'rolling_mean_{window}'] = df['close_spy'].rolling(window).mean()
    df[f'rolling_std_{window}'] = df['close_spy'].rolling(window).std()
    df[f'momentum_{window}'] = df['close_spy'] - df['close_spy'].shift(window)
    df[f'rate_of_change_{window}'] = df['close_spy'].pct_change(window)
    df[f'volatility_ratio_{window}'] = df[f'rolling_std_{window}'] / df['close_spy']

# Step 7: Lag features
for lag in [1, 2, 3, 5]:
    df[f'close_lag_{lag}'] = df['close_spy'].shift(lag)
    df[f'close_diff_{lag}'] = df['close_spy'].diff(lag)

# Step 8: Binary classification target
df['target'] = np.where(df['close_spy'].shift(-1) > df['close_spy'], 1, 0)

# Step 9: Drop NA rows created by rolling/lags
df.dropna(inplace=True)

# Step 10: Save processed features
df.to_csv('data/features.csv')

# Optional: Preview output
df.head()




  df = yf.download(ticker, start='2020-01-01', end='2025-07-23', interval='1d')
[*********************100%***********************]  1 of 1 completed

Note: you may need to restart the kernel to use updated packages.



  self._psar[i] = high2
  df[f'momentum_{window}'] = df['close_spy'] - df['close_spy'].shift(window)
  df[f'rate_of_change_{window}'] = df['close_spy'].pct_change(window)
  df[f'volatility_ratio_{window}'] = df[f'rolling_std_{window}'] / df['close_spy']
  df[f'rolling_mean_{window}'] = df['close_spy'].rolling(window).mean()
  df[f'rolling_std_{window}'] = df['close_spy'].rolling(window).std()
  df[f'momentum_{window}'] = df['close_spy'] - df['close_spy'].shift(window)
  df[f'rate_of_change_{window}'] = df['close_spy'].pct_change(window)
  df[f'volatility_ratio_{window}'] = df[f'rolling_std_{window}'] / df['close_spy']
  df[f'close_lag_{lag}'] = df['close_spy'].shift(lag)
  df[f'close_diff_{lag}'] = df['close_spy'].diff(lag)
  df[f'close_lag_{lag}'] = df['close_spy'].shift(lag)
  df[f'close_diff_{lag}'] = df['close_spy'].diff(lag)
  df[f'close_lag_{lag}'] = df['close_spy'].shift(lag)
  df[f'close_diff_{lag}'] = df['close_spy'].diff(lag)
  df[f'close_lag_{lag}'] = df['close_spy'].shift(

Unnamed: 0_level_0,close_spy,high_spy,low_spy,open_spy,volume_spy,volume_adi,volume_obv,volume_cmf,volume_fi,volume_em,...,volatility_ratio_50,close_lag_1,close_diff_1,close_lag_2,close_diff_2,close_lag_3,close_diff_3,close_lag_5,close_diff_5,target
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
