# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the processed daily feature table into a DataFrame.
features_path = "../data/processed/daily_features_full.parquet"
df = pd.read_parquet(features_path)

# ============================================================
# 1. Basic Stats
# ============================================================
print("=== Feature Summary ===")
# pandas' default describe() summary of the loaded table.
feature_summary = df.describe()
print(feature_summary)

# ============================================================
# 2. OFI Distribution
# ============================================================
# One row of three panels: histogram, per-date mean, Phase 2 placeholder.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
hist_ax, daily_ax, todo_ax = axes

# Panel 1: OFI histogram with a dashed red reference line at zero.
hist_ax.hist(df['ofi'], bins=30, edgecolor='black')
hist_ax.axvline(0, color='red', linestyle='--')
hist_ax.set_xlabel('OFI')
hist_ax.set_title('OFI Distribution (should be centered ~0)')

# Panel 2: mean OFI for each value of the 'date' column.
daily_mean_ofi = df.groupby('date')['ofi'].mean()
daily_mean_ofi.plot(ax=daily_ax)
daily_ax.set_ylabel('Mean OFI')
daily_ax.set_title('Time Series Stability')

# Panel 3: text-only placeholder until the Phase 2 analysis of
# OFI vs forward returns exists.
todo_ax.text(
    0.5, 0.5, 'Phase 2:\nOFI vs Returns',
    ha='center', va='center', fontsize=14,
)
todo_ax.set_title('Predictive Power (TODO)')

plt.tight_layout()
plt.show()

# ============================================================
# 3. Spread Analysis
# ============================================================
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Scale the mean spread once; both panels label the scaled value as bps.
spread_bps = df['spread_mean'] * 10000

# Spread distribution (log scale)
# log10 is -inf at zero and NaN below zero, and a non-finite value makes
# Axes.hist fail while computing its bin range, so only strictly positive
# spreads are binned. Rows left out are reported rather than dropped silently.
positive_spread_bps = spread_bps[spread_bps > 0]
n_non_positive = int((spread_bps <= 0).sum())
if n_non_positive:
    print(f"Spread histogram: skipped {n_non_positive} rows with spread <= 0")
axes[0].hist(np.log10(positive_spread_bps), bins=30, edgecolor='black')
axes[0].set_xlabel('Log10(Spread in bps)')
axes[0].set_title('Spread Distribution')

# Spread stability: spread level against the 'spread_stability' column
# (labelled as the spread CV). All rows are plotted here, unfiltered.
axes[1].scatter(spread_bps, df['spread_stability'], alpha=0.5)
axes[1].set_xlabel('Mean Spread (bps)')
axes[1].set_ylabel('Spread CV')
axes[1].set_title('Spread Level vs Stability')

plt.tight_layout()
plt.show()

# ============================================================
# 4. Cross-Sectional Correlations
# ============================================================
# Columns included in the pairwise correlation matrix.
corr_features = [
    'ofi',
    'ofi_std',
    'spread_mean',
    'spread_stability',
    'morning_share',
    'dollar_volume',
]

# DataFrame.corr() with its default settings over the selected columns.
selected_features = df[corr_features]
corr_matrix = selected_features.corr()

# Annotated heatmap with the diverging colormap centered on zero.
plt.figure(figsize=(8, 6))
heatmap_options = {
    'annot': True,
    'fmt': '.2f',
    'cmap': 'coolwarm',
    'center': 0,
}
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Feature Correlations')
plt.tight_layout()
plt.show()

# ============================================================
# 5. Data Quality Checks
# ============================================================
print("\n=== Data Quality ===")

# Null count for every column.
missing_per_column = df.isnull().sum()
print(f"Missing values:\n{missing_per_column}")

# Frequency of each value in the 'classification_method' column.
method_counts = df['classification_method'].value_counts()
print(f"\nClassification method:\n{method_counts}")

# Distinct identifiers present in the table.
n_events = df['event_id'].nunique()
print(f"\nEvents: {n_events}")
n_permnos = df['permno'].nunique()
print(f"Permnos: {n_permnos}")

# Row count per event_id, summarised with describe().
rows_per_event = df.groupby('event_id').size()
print(f"Days per event: {rows_per_event.describe()}")

# ============================================================
# 6. Sanity Checks
# ============================================================
print("\n=== Sanity Checks ===")

# Buy + Sell should equal Volume (within np.allclose's default tolerances).
volume_check = np.allclose(
    df['buy_volume'] + df['sell_volume'],
    df['volume']
)
print(f"Buy + Sell = Volume: {volume_check}")

# OFI should match (buy - sell) / volume.
# The ratio is undefined where volume is zero: 0/0 evaluates to NaN, and
# np.allclose never treats NaN as equal (equal_nan defaults to False), so
# such a row would fail the check whatever OFI is stored for it. Only rows
# with non-zero volume are compared, and the number skipped is reported.
has_volume = df['volume'] != 0
n_zero_volume = int((~has_volume).sum())
if n_zero_volume:
    print(f"OFI check: skipped {n_zero_volume} rows with zero volume")
traded = df.loc[has_volume]
ofi_check = np.allclose(
    (traded['buy_volume'] - traded['sell_volume']) / traded['volume'],
    traded['ofi']
)
print(f"OFI formula correct: {ofi_check}")

# Share volume should equal volume (exact element-wise equality).
share_volume_check = (df['share_volume'] == df['volume']).all()
print(f"Share volume matches: {share_volume_check}")

# The check mark was previously mis-encoded (UTF-8 bytes read as cp1252).
print("\n✓ Feature validation complete")