In [12]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

In [36]:
def _save_rv_vs_vix(df, plot_dir):
    """Save the dual-axis time series of Target_Vol (left axis) and VIX (right axis)."""
    fig, ax_rv = plt.subplots(figsize=(14, 6))

    # Realized volatility on the primary (left) axis.
    ax_rv.plot(df.index, df['Target_Vol'], label='Realized Volatility',
               alpha=0.7, linewidth=0.5, color='blue')
    ax_rv.set_ylabel('Realized Volatility (Squared Returns)', color='blue')
    ax_rv.tick_params(axis='y', labelcolor='blue')

    # VIX on a secondary (right) axis sharing the same x-axis.
    ax_vix = ax_rv.twinx()
    ax_vix.plot(df.index, df['VIX'], label='VIX', alpha=0.7, linewidth=0.5, color='orange')
    ax_vix.set_ylabel('VIX', color='orange')
    ax_vix.tick_params(axis='y', labelcolor='orange')

    # NOTE(review): the split boundaries drawn here (2018-01-01 / 2022-01-01)
    # differ from the ones used to colour volatility_clusters.png
    # (2015-01-01 / 2020-01-01). Confirm which pair matches the modelling
    # split and unify them; both are kept as-is for now.
    for boundary in ('2018-01-01', '2022-01-01'):
        ax_rv.axvline(pd.Timestamp(boundary), color='r', linestyle='--', alpha=0.8)

    # Segment captions sit at 90% of the series maximum; x positions are fixed dates.
    caption_y = df['Target_Vol'].max() * 0.9
    for anchor, caption in (('2000-01-01', 'TRAIN'),
                            ('2019-01-01', 'VAL'),
                            ('2023-01-01', 'TEST')):
        ax_rv.text(pd.Timestamp(anchor), caption_y, caption, color='red', fontsize=12)

    ax_rv.set_title('S&P 500: Realized Volatility vs Implied Volatility (VIX)')
    # Figure-level legend so the lines from both axes are listed together.
    fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.9))
    fig.tight_layout()
    fig.savefig(os.path.join(plot_dir, "volatility_rv_vix.png"))
    plt.close(fig)


def _save_volatility_clusters(df, plot_dir):
    """Save the annualized 21-day rolling volatility (in %), coloured by data split."""
    # NOTE(review): these boundaries (2015-01-01 / 2020-01-01) differ from the
    # split lines drawn in volatility_rv_vix.png (2018-01-01 / 2022-01-01).
    train_mask = df.index < '2015-01-01'
    val_mask = (df.index >= '2015-01-01') & (df.index < '2020-01-01')
    test_mask = df.index >= '2020-01-01'

    # 21-day rolling std of log returns, annualized with sqrt(252), as a percentage.
    # Kept as a local Series so the loaded frame is not mutated.
    vol_pct = df['Log_Return'].rolling(21).std() * np.sqrt(252) * 100

    fig, ax = plt.subplots(figsize=(12, 6))
    segments = (
        (train_mask, '#1f77b4', 'training'),
        (val_mask, '#ff7f0e', 'validation'),
        (test_mask, '#2ca02c', 'test'),
    )
    for mask, color, label in segments:
        ax.plot(df.index[mask], vol_pct[mask], color=color, label=label, linewidth=0.8)

    ax.set_ylabel('Realized Volatility')
    # The y-axis is fixed to 0-100, so values above 100 are not shown.
    ax.set_ylim(0, 100)
    ax.set_yticks([0, 20, 40, 60, 80, 100])
    ax.set_yticklabels(['0%', '20%', '40%', '60%', '80%', '100%'])
    ax.legend(loc='upper left')
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig(os.path.join(plot_dir, "volatility_clusters.png"), dpi=150)
    plt.close(fig)


def _save_feature_correlations(df, plot_dir):
    """Save the lower-triangle correlation heatmap of Target_Vol, VIX and Lag/Roll columns."""
    feature_cols = ['Target_Vol', 'VIX'] + [c for c in df.columns if 'Lag' in c or 'Roll' in c]
    corr_matrix = df[feature_cols].corr()

    # Hide the upper triangle (diagonal included); the matrix is symmetric.
    upper_mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(corr_matrix, mask=upper_mask, annot=True, cmap='coolwarm',
                fmt=".2f", vmin=-1, vmax=1, ax=ax)
    ax.set_title('Feature Correlation Matrix')
    fig.tight_layout()
    fig.savefig(os.path.join(plot_dir, "feature_correlations.png"))
    plt.close(fig)


def _save_returns_distribution(df, plot_dir):
    """Save the histogram (100 bins, with KDE) of the Log_Return column."""
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df['Log_Return'], bins=100, kde=True, color='blue', ax=ax)
    ax.set_title('Distribution of Daily Log Returns')
    ax.set_xlabel('Log Return')
    fig.savefig(os.path.join(plot_dir, "returns_distribution.png"))
    plt.close(fig)


def _save_qq_plot(df, plot_dir):
    """Save the normal Q-Q plot of standardized (mean 0, std 1) log returns."""
    # Function-scope import keeps this helper usable even if the notebook's
    # import cell has not been updated to import scipy.stats.
    from scipy import stats

    log_returns = df['Log_Return'].dropna()
    standardized_returns = (log_returns - log_returns.mean()) / log_returns.std()

    fig, ax = plt.subplots(figsize=(8, 6))
    stats.probplot(standardized_returns, dist="norm", plot=ax)

    ax.set_title('')  # Remove the default title added by probplot
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Sample Quantiles')
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig(os.path.join(plot_dir, "qq_plot.png"), dpi=150)
    plt.close(fig)


def run_eda(input_path=None, plot_dir=None):
    """
    Generates exploratory visualizations for the S&P 500 volatility dataset.

    Args:
        input_path: Path of the processed CSV to load. Defaults to
            ../data/processed/sp500_ml_ready.csv. The first column is used as
            the index and parsed as dates.
        plot_dir: Directory the PNG files are written to (created if needed).
            Defaults to ../plots.

    Returns:
        None. If the input file does not exist, or the loaded data lacks one
        of the columns Target_Vol, VIX or Log_Return, an error message is
        printed and the function returns before creating any output.

    Outputs:
    1) volatility_rv_vix.png
       Time series of the realized-volatility target (Target_Vol) alongside VIX
       (implied volatility) using a dual-axis plot.

    2) volatility_clusters.png
       Annualized 21-day realized volatility (in %) over time, with the dataset
       split into training / validation / test segments shown in different colors.

    3) feature_correlations.png
       Correlation heatmap of Target_Vol, VIX, and engineered lag/rolling features
       to inspect multicollinearity and feature relationships.

    4) returns_distribution.png
       Histogram (with KDE) of daily log returns to assess skewness and tail behavior.

    5) qq_plot.png
       Q-Q plot of standardized log returns against a normal distribution to
       visually assess deviations from Gaussianity (fat tails).
    """
    if input_path is None:
        input_path = os.path.join("..", "data", "processed", "sp500_ml_ready.csv")
    if plot_dir is None:
        plot_dir = os.path.join("..", "plots")

    if not os.path.exists(input_path):
        print(f"Error: Processed data not found at {input_path}")
        return

    print("Loading data for EDA...")
    df = pd.read_csv(input_path, index_col=0, parse_dates=True)

    # Fail early, before any directory or file is created, instead of hitting a
    # KeyError halfway through plotting.
    required_columns = ('Target_Vol', 'VIX', 'Log_Return')
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Error: Processed data is missing required columns: {missing_columns}")
        return

    os.makedirs(plot_dir, exist_ok=True)

    # 1. Volatility Time Series with VIX
    _save_rv_vs_vix(df, plot_dir)
    print("Saved: volatility_rv_vix.png")

    # 2. Volatility Clusters
    _save_volatility_clusters(df, plot_dir)
    print("Saved: volatility_clusters.png")

    # 3. Feature Correlation
    _save_feature_correlations(df, plot_dir)
    print("Saved: feature_correlations.png")

    # 4. Return Distribution
    _save_returns_distribution(df, plot_dir)
    print("Saved: returns_distribution.png")

    # 5. Q-Q Plot for Standardized Log Returns
    _save_qq_plot(df, plot_dir)
    print("Saved: qq_plot.png")

# Run the full EDA only when this code is executed directly (not when imported).
if __name__ == "__main__":
    run_eda()

Loading data for EDA...
Saved: volatility_rv_vix.png
Saved: volatility_clusters.png
Saved: feature_correlations.png
Saved: returns_distribution.png
Saved: qq_plot.png
