In [1]:
import os

import numpy as np
import pandas as pd


In [None]:
import os
import numpy as np
import pandas as pd

def process_data(input_path=None, output_dir=None):
    """Clean raw S&P 500 data and engineer volatility features.

    Reads a CSV with a ``Date`` column plus ``ClosePrice`` and ``VIX``
    columns, derives log returns, squared-return volatility, and lagged /
    rolling features, then writes the rows that are complete in every column
    to ``<output_dir>/sp500_ml_ready.csv``.

    Failures (missing file, unreadable CSV, missing columns, nothing left
    after cleaning) are reported with a printed message and an early return
    rather than an exception.

    Args:
        input_path: Path of the raw CSV. Defaults to
            ``data/raw/sp500_raw.csv``.
        output_dir: Directory that receives ``sp500_ml_ready.csv``; created
            if absent. Defaults to ``data/processed``.

    Returns:
        None. Results are written to disk and summarized on stdout.
    """
    if input_path is None:
        input_path = os.path.join("data", "raw", "sp500_raw.csv")
    if output_dir is None:
        output_dir = os.path.join("data", "processed")

    if not os.path.exists(input_path):
        print(f"Error: File not found at {input_path}")
        return

    print("Processing data...")
    try:
        df = pd.read_csv(input_path, parse_dates=['Date'], index_col='Date')
    except (OSError, ValueError) as e:
        # pandas parser/empty-data/decoding errors all derive from ValueError.
        print(f"Error reading CSV: {e}")
        return

    price_col = 'ClosePrice'
    vix_col = 'VIX'

    # Report absent inputs the same way as the other failures instead of
    # letting a KeyError escape further down.
    missing = [col for col in (price_col, vix_col) if col not in df.columns]
    if missing:
        print(f"Error: Missing required column(s): {missing}")
        return

    # shift()/rolling() below mean "previous rows", which only equals
    # "previous trading days" when the index is in chronological order.
    # A stable sort keeps the file order of rows that share a date.
    if not df.index.is_monotonic_increasing:
        df = df.sort_index(kind="mergesort")

    # Non-numeric cells become NaN; price gaps are forward-filled, while VIX
    # gaps are left as NaN and removed by the final dropna().
    df[price_col] = pd.to_numeric(df[price_col], errors='coerce')
    df[vix_col] = pd.to_numeric(df[vix_col], errors='coerce')
    df[price_col] = df[price_col].ffill()
    df = df.dropna(subset=[price_col])

    # log() is undefined for non-positive prices, so drop those rows first.
    valid_prices = df[price_col] > 0
    df = df.loc[valid_prices].copy()

    df['Log_Return'] = np.log(df[price_col] / df[price_col].shift(1))
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Squared log return is the volatility target.
    df['Target_Vol'] = df['Log_Return'] ** 2

    for lag in range(1, 6):
        df[f'Vol_Lag_{lag}'] = df['Target_Vol'].shift(lag)
        df[f'Return_Lag_{lag}'] = df['Log_Return'].shift(lag)

    # The extra shift(1) keeps the current row's own value out of its
    # rolling feature, so each window ends on the previous row.
    df['Vol_Roll_Mean_5'] = df['Target_Vol'].rolling(window=5).mean().shift(1)
    df['Vol_Roll_Mean_21'] = df['Target_Vol'].rolling(window=21).mean().shift(1)

    df['VIX_Lag_1'] = df[vix_col].shift(1)
    df['VIX_Lag_5'] = df[vix_col].shift(5)
    df['VIX_Roll_Mean_5'] = df[vix_col].rolling(window=5).mean().shift(1)

    # Drops the warm-up rows of the lag/rolling features and any row with a
    # NaN in any column, including columns carried over from the raw file.
    df_clean = df.dropna()

    if df_clean.empty:
        print("Error: Cleaning resulted in an empty dataset.")
        return

    os.makedirs(output_dir, exist_ok=True)

    output_file = os.path.join(output_dir, "sp500_ml_ready.csv")
    df_clean.to_csv(output_file)

    print(f"Processed data saved to {output_file}")
    print(f"Cleaned shape: {df_clean.shape}")
    print(f"Columns: {list(df_clean.columns)}")

# Run the pipeline when this code is executed directly (as a script or in a
# notebook cell) rather than imported as a module.
if __name__ == "__main__":
    process_data()

Processing data...
Processed data saved to data/processed/sp500_ml_ready.csv
Cleaned shape: (9044, 23)
Columns: ['ClosePrice', 'DailyReturn', 'RV', 'Month', 'MonthlyReturn', 'VIX', 'Log_Return', 'Target_Vol', 'Vol_Lag_1', 'Return_Lag_1', 'Vol_Lag_2', 'Return_Lag_2', 'Vol_Lag_3', 'Return_Lag_3', 'Vol_Lag_4', 'Return_Lag_4', 'Vol_Lag_5', 'Return_Lag_5', 'Vol_Roll_Mean_5', 'Vol_Roll_Mean_21', 'VIX_Lag_1', 'VIX_Lag_5', 'VIX_Roll_Mean_5']
