# 🔬 CARIA-SR Definitive Validation

**CONSISTENT METHODOLOGY:**
1. Download S&P 500 constituent prices (500+ stocks)
2. Calculate AR + Entropy from cross-sectional covariance
3. Z-score + Peak Memory (60-day)
4. Quantile Regression validation
5. Minsky Hedge (pure structural: peak > 1.5)

In [None]:
# @title 1. Setup
!pip install -q yfinance pandas numpy scipy scikit-learn statsmodels seaborn matplotlib pyarrow requests

from google.colab import drive
drive.mount('/content/drive')

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
import requests
import warnings
from datetime import datetime
import statsmodels.formula.api as smf
from sklearn.covariance import LedoitWolf

# Suppress library warnings to keep cell output readable.
# NOTE(review): this also hides deprecation warnings from pandas/yfinance.
warnings.filterwarnings('ignore')
np.random.seed(42)
sns.set_style('whitegrid')

# Output root; figures and tables get their own sub-folders.
WORK_DIR = '/content/drive/MyDrive/CARIA'
os.makedirs(WORK_DIR, exist_ok=True)
os.makedirs(f'{WORK_DIR}/figures', exist_ok=True)
os.makedirs(f'{WORK_DIR}/tables', exist_ok=True)

# SECURITY: prefer supplying the key through the FMP_API_KEY environment
# variable. The literal below is kept only as a backward-compatible fallback;
# a key committed to source should be treated as leaked and rotated.
FMP_API_KEY = os.environ.get("FMP_API_KEY", "79fY9wvC9qtCJHcn6Yelf4ilE9TkRMoq")
START_DATE = "2000-01-01"
# The sample end is "today" at execution time, so reruns extend the window.
END_DATE = datetime.now().strftime("%Y-%m-%d")

print(f"‚úÖ Output: {WORK_DIR}")

In [None]:
# @title 2. Download S&P 500 Constituent Prices (500+ stocks)

# Get current S&P 500 tickers.
# NOTE: this is today's constituent list applied to the whole history, so the
# universe carries survivorship bias.
url = f"https://financialmodelingprep.com/api/v3/sp500_constituent?apikey={FMP_API_KEY}"
# A timeout keeps the cell from hanging forever on a stalled connection.
resp = requests.get(url, timeout=30)
if resp.status_code != 200:
    # The URL is deliberately left out of the message: it embeds the API key.
    raise RuntimeError(f"S&P 500 constituent request failed (HTTP {resp.status_code})")
payload = resp.json()
if not isinstance(payload, list) or not payload:
    raise RuntimeError("S&P 500 constituent request returned no ticker list")
sp500_tickers = [x['symbol'] for x in payload]
print(f"S&P 500 constituents: {len(sp500_tickers)}")

# Download prices in batches
print(f"\nDownloading prices for {len(sp500_tickers)} stocks...")
print("‚è≥ This takes 5-10 minutes...")

all_prices = []
batch_size = 50
n_batches = (len(sp500_tickers) - 1) // batch_size + 1

for batch_no, i in enumerate(range(0, len(sp500_tickers), batch_size), start=1):
    batch = sp500_tickers[i:i+batch_size]
    try:
        data = yf.download(batch, start=START_DATE, end=END_DATE, progress=False, auto_adjust=True)['Close']
        if isinstance(data, pd.Series):
            # A single-ticker result may come back as a Series named 'Close';
            # label the column with the ticker so it stays identifiable.
            data = data.to_frame(name=batch[0])
        all_prices.append(data)
        print(f"   Batch {batch_no}/{n_batches} OK")
    except Exception as e:
        # Best effort: a failed batch is reported and skipped, not fatal.
        print(f"   Batch {batch_no} error: {e}")

if not all_prices:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise RuntimeError("No price data could be downloaded for any batch")

# Combine all batches
prices = pd.concat(all_prices, axis=1)
prices = prices.dropna(axis=1, how='all')
prices.to_csv(f'{WORK_DIR}/sp500_prices.csv')

print(f"\n‚úÖ Downloaded {prices.shape[1]} stocks, {len(prices)} days")
print(f"   Period: {prices.index.min().date()} to {prices.index.max().date()}")

In [None]:
# @title 3. Download Market Data (VIX, SPY, TLT, 10Y Yield)

print("Downloading market data...")
# auto_adjust is set explicitly so SPY/TLT closes are dividend-adjusted no
# matter which yfinance version is installed, matching the constituent
# download above (the library default has changed between releases).
market = yf.download(['^VIX', 'SPY', 'TLT', '^TNX'], start=START_DATE, end=END_DATE,
                     progress=False, auto_adjust=True)

# One column per series; dropna() keeps only dates on which all four exist,
# so the sample starts at the latest first-available date among them.
market_df = pd.DataFrame({
    'volatility': market['Close']['^VIX'],
    'price': market['Close']['SPY'],
    'tlt': market['Close']['TLT'],
    'treasury_10y': market['Close']['^TNX']
}).dropna()
market_df.index.name = 'Date'
market_df.to_csv(f'{WORK_DIR}/market_validation_data.csv')

print(f"\n‚úÖ Market data: {len(market_df)} days")

In [None]:
# @title 4. Calculate Structural Metrics (AR + Entropy) ⏳ ~15 min

def cov_to_corr(S):
    """Convert a covariance matrix into a symmetrised correlation matrix.

    Each entry is divided by the product of the corresponding standard
    deviations. Zero standard deviations are replaced by 1e-10 to avoid a
    division by zero, the result is averaged with its transpose, and any
    remaining NaN/inf values are replaced via ``np.nan_to_num``.
    """
    std = np.sqrt(np.diag(S))
    std[std == 0] = 1e-10  # np.sqrt returned a fresh array, safe to edit
    scale = std[:, None] * std[None, :]
    corr = S / scale
    symmetric = (corr + corr.T) / 2
    return np.nan_to_num(symmetric)

def eig_metrics(C, k_frac=0.2):
    """Return ``(absorption_ratio, normalised_entropy)`` of a symmetric matrix.

    The absorption ratio is the share of the eigenvalue sum carried by the
    largest ``ceil(k_frac * n)`` eigenvalues (at least one). The entropy is
    the Shannon entropy of the eigenvalue shares divided by ``log(n)``; for a
    1x1 matrix it is fixed at 0.5. Eigenvalues are floored at 1e-10 first.
    """
    spectrum = np.sort(np.linalg.eigvalsh(C))[::-1]
    spectrum = np.clip(spectrum, 1e-10, None)  # floor negative/zero eigenvalues
    n = spectrum.size
    total = spectrum.sum()

    top_k = max(1, int(np.ceil(k_frac * n)))
    absorption = spectrum[:top_k].sum() / total

    if n > 1:
        share = spectrum / total
        entropy = -(share * np.log(share + 1e-10)).sum() / np.log(n)
    else:
        entropy = 0.5
    return float(absorption), float(entropy)

# Calculate daily log returns and keep stocks observed on >= 90% of all dates.
returns = np.log(prices).diff()
good_coverage = returns.notna().mean() >= 0.9
returns = returns.loc[:, good_coverage]
print(f"Using {returns.shape[1]} stocks with >90% coverage")

# Rolling structural metrics: a 252-row window re-estimated every 5 rows.
window = 252
step = 5
lw = LedoitWolf()

struct = pd.DataFrame(index=returns.index, columns=['absorption_ratio', 'entropy'], dtype=float)

window_ends = range(window, len(returns), step)
total_steps = len(window_ends)  # exact iteration count for the progress line
print(f"\nCalculating AR + Entropy ({total_steps} steps)...")

for idx, t in enumerate(window_ends):
    # Rows t-window .. t-1 only: the value stored at row t uses no data from t onward.
    W = returns.iloc[t-window:t]
    W = W.loc[:, W.notna().mean() >= 0.9]

    # Too few usable stocks in this window: leave the row empty.
    if W.shape[1] < 100:
        continue

    # Fill the remaining gaps with each column's own window mean, then demean.
    W = W.fillna(W.mean())
    X = W.values - np.nanmean(W.values, axis=0)

    try:
        S = lw.fit(X).covariance_
        C = cov_to_corr(S)
    except Exception:
        # Fall back to the sample correlation if shrinkage fails. Catching
        # Exception (not a bare except) keeps Ctrl-C/interrupts working
        # during this long loop.
        C = np.corrcoef(X, rowvar=False)
        C = np.nan_to_num((C + C.T) / 2)

    ar, ent = eig_metrics(C)
    struct.iloc[t] = [ar, ent]

    if (idx + 1) % 100 == 0:
        print(f"   {idx + 1}/{total_steps} ({(idx+1)/total_steps*100:.0f}%)")

# Carry each estimate forward until the next re-estimation. The warm-up rows
# before the first estimate stay NaN on purpose: back-filling them would copy
# a later value into earlier dates (look-ahead) and create a constant stretch
# that distorts any rolling z-score computed from this file.
struct = struct.ffill()
struct.index.name = 'date'
struct.to_csv(f'{WORK_DIR}/caria_structural_metrics.csv')

print(f"\n‚úÖ Structural metrics saved")
print(f"   AR mean: {struct['absorption_ratio'].mean():.4f}")
print(f"   Entropy mean: {struct['entropy'].mean():.4f}")

In [None]:
# @title 5. Merge Data and Calculate Signals

print("Loading and merging data...")

# Re-read both inputs from disk so this cell can be rerun on its own.
structural_path = f'{WORK_DIR}/caria_structural_metrics.csv'
market_path = f'{WORK_DIR}/market_validation_data.csv'
struct_df = pd.read_csv(structural_path, index_col='date', parse_dates=True)
market_df = pd.read_csv(market_path, index_col='Date', parse_dates=True)

# Keep only dates present in both tables, in chronological order.
df = struct_df.join(market_df, how='inner').sort_index()

# Rolling z-score of the absorption ratio over a 252-row window.
window_z = 252
ar_window = df['absorption_ratio'].rolling(window_z)
ar_deviation = df['absorption_ratio'] - ar_window.mean()
df['absorp_z'] = ar_deviation / ar_window.std()

# Peak memory: the highest z-score seen over the trailing 60 rows.
window_memory = 60
df['caria_peak'] = df['absorp_z'].rolling(window_memory).max()

# Regression target: the price change over the NEXT 22 rows, aligned to today.
df['future_ret_22'] = df['price'].pct_change(22).shift(-22)

# Drops the rolling warm-up rows and the final 22 rows that lack a forward return.
df = df.dropna()
print(f"\n‚úÖ Dataset: {len(df)} observations")
print(f"   Period: {df.index.min().date()} to {df.index.max().date()}")

In [None]:
# @title Phase 8: Regime & Memory Test (Quantile Regression)

# Restrict the test to calm markets (VIX below 20).
low_vol_df = df[df['volatility'] < 20].copy()
print(f"Testing on 'Calm Markets' (VIX < 20). N={len(low_vol_df)}")

# NOTE(review): future_ret_22 is built from overlapping 22-row windows, so
# consecutive observations are not independent; treat the reported p-values
# with caution.

# Model A: VIX Only (5% quantile of the forward return)
mod_vix = smf.quantreg('future_ret_22 ~ volatility', low_vol_df)
res_vix = mod_vix.fit(q=0.05)

# Model B: VIX + Peak Memory
mod_struct = smf.quantreg('future_ret_22 ~ volatility + caria_peak', low_vol_df)
res_struct = mod_struct.fit(q=0.05)

print(res_struct.summary())

print(f"\nBase Model (VIX Only) Pseudo R¬≤:      {res_vix.prsquared:.5f}")
print(f"Structural Model (+Peak) Pseudo R¬≤:   {res_struct.prsquared:.5f}")
# Relative improvement is undefined for a non-positive baseline; report 0 in
# that case, the same convention the Phase 9 sensitivity grid uses.
base_r2 = res_vix.prsquared
imp = ((res_struct.prsquared - base_r2)/base_r2)*100 if base_r2 > 0 else 0
print(f"üî• Improvement in Low-Vol Regime:     {imp:.1f}%")

In [None]:
# @title Phase 9: Robustness Heatmap

windows = [20, 40, 60, 90, 120]
vix_caps = [15, 18, 20, 22, 25]
# NaN marks cells that were skipped or failed, so they are not mistaken for a
# genuine 0% improvement (seaborn leaves NaN cells blank).
results_matrix = np.full((len(windows), len(vix_caps)), np.nan)

print("Running Sensitivity Grid...")

# Work on a copy: adding the peak_* columns to the shared `df` would make the
# later `df.copy().dropna()` backtest sample depend on whether this cell ran.
grid_df = df.copy()
for w in windows:
    grid_df[f'peak_{w}'] = grid_df['absorp_z'].rolling(window=w).max()

for i, w in enumerate(windows):
    for j, v in enumerate(vix_caps):
        # dropna() over every peak_* column gives all windows the same sample.
        subset = grid_df[grid_df['volatility'] < v].dropna()
        if len(subset) <= 500:
            continue  # too few observations for a 5% quantile fit
        try:
            r_b = smf.quantreg('future_ret_22 ~ volatility', subset).fit(q=0.05).prsquared
            r_s = smf.quantreg(f'future_ret_22 ~ volatility + peak_{w}', subset).fit(q=0.05).prsquared
        except Exception as exc:
            # Report the failure instead of silently leaving a zero in the grid.
            print(f"   window={w}, VIX<{v}: fit failed ({exc})")
            continue
        results_matrix[i, j] = ((r_s - r_b)/r_b)*100 if r_b > 0 else 0

plt.figure(figsize=(10, 8))
sns.heatmap(results_matrix, annot=True, fmt=".1f", cmap="RdYlGn", xticklabels=vix_caps, yticklabels=windows)
plt.title("Robustness: Improvement in Tail Risk Prediction (%)")
plt.xlabel("VIX Threshold")
plt.ylabel("Memory Window (Days)")
plt.savefig(f'{WORK_DIR}/figures/Robustness_Heatmap.png', dpi=300)
plt.show()

In [None]:
# @title Phase 11: Pure Structural Hedge (Cash)

print("Running Pure Structural Hedge...")

backtest_df = df.copy().dropna()
backtest_df['daily_ret'] = backtest_df['price'].pct_change()

# Signal: yesterday's peak memory above 1.5 marks today as unsafe.
# The one-row lag means today's position uses only information from the prior row.
lagged_peak = backtest_df['caria_peak'].shift(1)
backtest_df['unsafe_state'] = lagged_peak.gt(1.5)

# Unsafe rows earn 0 (cash); every other row earns the index return.
backtest_df['strat_ret'] = backtest_df['daily_ret'].mask(backtest_df['unsafe_state'], 0)

# Compound both return streams into equity curves (missing returns count as 0).
backtest_df['cum_bnh'] = backtest_df['daily_ret'].fillna(0).add(1).cumprod()
backtest_df['cum_strat'] = backtest_df['strat_ret'].fillna(0).add(1).cumprod()

def get_max_drawdown(series):
    """Return the deepest peak-to-trough decline of an equity curve.

    The result is the minimum of ``(value - running_peak) / running_peak``,
    so it is zero for a curve that never falls and negative otherwise.
    """
    running_peak = series.cummax()
    drawdown = series.sub(running_peak).div(running_peak)
    return drawdown.min()

# Maximum drawdowns (non-positive fractions) of both equity curves.
dd_bnh = get_max_drawdown(backtest_df['cum_bnh'])
dd_strat = get_max_drawdown(backtest_df['cum_strat'])

# Annualise using calendar time between the first and last backtest rows.
days = (backtest_df.index[-1] - backtest_df.index[0]).days
years = days / 365.25
cagr_bnh = (backtest_df['cum_bnh'].iloc[-1])**(1/years) - 1
cagr_strat = (backtest_df['cum_strat'].iloc[-1])**(1/years) - 1

print(f"\n--- PHASE 11 RESULTS ---")
print(f"Time in Cash/Hedge: {backtest_df['unsafe_state'].mean()*100:.1f}%")
print(f"Buy & Hold:   Max DD = {dd_bnh:.1%}, CAGR = {cagr_bnh:.1%}")
print(f"Minsky Cash:  Max DD = {dd_strat:.1%}, CAGR = {cagr_strat:.1%}")
# Drawdowns are negative, so compare magnitudes: a positive number means the
# strategy's worst drawdown is shallower than buy-and-hold's.
print(f"DD Reduction: {abs(dd_bnh) - abs(dd_strat):.1%}")

In [None]:
# @title Phase 12: Smart Hedge (Treasuries) + 1.5x Leverage

print("Running Smart Hedge...")

# Hedge return approximated as carry only: the 10Y yield (in percent) spread
# over 252 rows. Bond price changes are not modelled here.
backtest_df['treasury_daily_ret'] = (backtest_df['treasury_10y'] / 100) / 252

in_hedge = backtest_df['unsafe_state']
hedge_ret = backtest_df['treasury_daily_ret']
equity_ret = backtest_df['daily_ret']

# Unsafe rows earn the Treasury carry; all other rows earn the index return.
backtest_df['smart_ret'] = np.where(in_hedge, hedge_ret, equity_ret)

# Levered variant: 1.5x index exposure when safe, less a flat 0.05/252 per-row
# charge on the borrowed (leverage - 1) fraction.
leverage = 1.5
financing_drag = 0.05/252 * (leverage-1)
backtest_df['lev_ret'] = np.where(
    in_hedge,
    hedge_ret,
    equity_ret * leverage - financing_drag
)

# Compound each return stream into an equity curve (missing returns count as 0).
for ret_col, cum_col in (('smart_ret', 'cum_smart'), ('lev_ret', 'cum_lev')):
    backtest_df[cum_col] = (1 + backtest_df[ret_col].fillna(0)).cumprod()

dd_smart = get_max_drawdown(backtest_df['cum_smart'])
dd_lev = get_max_drawdown(backtest_df['cum_lev'])
cagr_smart = (backtest_df['cum_smart'].iloc[-1])**(1/years) - 1
cagr_lev = (backtest_df['cum_lev'].iloc[-1])**(1/years) - 1

print(f"\n--- PHASE 12 RESULTS ---")
print(f"Benchmark (S&P 500):    DD = {dd_bnh:.1%}, CAGR = {cagr_bnh:.1%}")
print(f"Minsky (Cash Hedge):    DD = {dd_strat:.1%}, CAGR = {cagr_strat:.1%}")
print(f"Minsky (Smart/Bond):    DD = {dd_smart:.1%}, CAGR = {cagr_smart:.1%}")
print(f"Minsky (1.5x Levered):  DD = {dd_lev:.1%}, CAGR = {cagr_lev:.1%}")

In [None]:
# @title Visualization

# 2x2 dashboard: equity curves, signal, drawdowns and a 2019-2020 zoom.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Equity Curves (log scale so early and late moves are comparable)
ax = axes[0, 0]
ax.plot(backtest_df.index, backtest_df['cum_bnh'], label='S&P 500', color='gray', alpha=0.5)
ax.plot(backtest_df.index, backtest_df['cum_smart'], label=f'Minsky Bond (DD:{dd_smart:.0%})', color='blue', linewidth=2)
ax.plot(backtest_df.index, backtest_df['cum_lev'], label=f'Minsky 1.5x (DD:{dd_lev:.0%})', color='darkgreen', linewidth=2)
ax.set_yscale('log')
ax.set_title('Equity Curves')
ax.legend(loc='upper left')
ax.grid(True, alpha=0.3)

# Signal
ax = axes[0, 1]
ax.plot(backtest_df.index, backtest_df['caria_peak'], color='darkred')
ax.axhline(y=1.5, color='orange', linestyle='--', label='Threshold (1.5œÉ)')
# Shade unsafe periods over the full axes height (y given in axes fraction
# 0..1) instead of a hard-coded data range the signal may fall outside of.
ax.fill_between(backtest_df.index, 0, 1, where=backtest_df['unsafe_state'],
                alpha=0.2, color='red', transform=ax.get_xaxis_transform())
ax.set_title('CARIA Peak Memory Signal')
ax.legend()
ax.grid(True, alpha=0.3)

# Drawdowns
ax = axes[1, 0]
dd_bnh_series = backtest_df['cum_bnh'] / backtest_df['cum_bnh'].cummax() - 1
dd_smart_series = backtest_df['cum_smart'] / backtest_df['cum_smart'].cummax() - 1
ax.fill_between(backtest_df.index, dd_bnh_series, 0, alpha=0.3, color='gray', label='S&P 500')
ax.plot(backtest_df.index, dd_smart_series, color='blue', label='Minsky')
ax.set_title('Drawdowns')
ax.legend()
ax.grid(True, alpha=0.3)

# 2020 COVID zoom: VIX on the left axis, peak memory on the right axis
ax = axes[1, 1]
subset = backtest_df.loc['2019-01-01':'2020-06-01']
vix_line, = ax.plot(subset.index, subset['volatility'], color='gray', linestyle='--', label='VIX')
ax2 = ax.twinx()
peak_line, = ax2.plot(subset.index, subset['caria_peak'], color='darkred', linewidth=2, label='Peak Memory')
ax2.axhline(y=1.5, color='orange', linestyle='--')
ax.set_title('COVID-19 Warning')
# One legend listing lines from both y-axes; previously the labelled VIX line
# never appeared because only the twin axis drew a legend.
ax2.legend(handles=[vix_line, peak_line], loc='upper left')

plt.tight_layout()
plt.savefig(f'{WORK_DIR}/figures/Final_Results.png', dpi=300)
plt.show()

In [None]:
# @title Final Summary

# Console recap of the data, the regression test and the hedge backtest.
banner = "=" * 70
print("\n" + banner)
print("üî¨ CARIA-SR DEFINITIVE VALIDATION")
print(banner)

# Data coverage
print(f"\nüìä DATA:")
print(f"   Universe: {returns.shape[1]} S&P 500 stocks")
print(f"   Period: {df.index.min().date()} to {df.index.max().date()}")
print(f"   Observations: {len(df)}")

# Quantile-regression comparison from Phase 8
peak_pvalue = res_struct.pvalues['caria_peak']
print(f"\nüî¨ QUANTILE REGRESSION (VIX < 20):")
print(f"   VIX-only Pseudo R¬≤: {res_vix.prsquared:.5f}")
print(f"   VIX+Peak Pseudo R¬≤: {res_struct.prsquared:.5f}")
print(f"   Improvement: {imp:.1f}%")
print(f"   Peak Memory p-value: {peak_pvalue:.4f}")

# Backtest table: one row per strategy
hedge_share = backtest_df['unsafe_state'].mean() * 100
print(f"\nüí∞ MINSKY HEDGE:")
print(f"   Time in Hedge: {hedge_share:.1f}%")
print(f"   {'Strategy':<20} {'Max DD':>10} {'CAGR':>10}")
print("   " + "-" * 42)
summary_rows = (
    ('S&P 500', dd_bnh, cagr_bnh),
    ('Minsky Cash', dd_strat, cagr_strat),
    ('Minsky Bond', dd_smart, cagr_smart),
    ('Minsky 1.5x', dd_lev, cagr_lev),
)
for label, max_dd, cagr in summary_rows:
    print(f"   {label:<20} {max_dd:>10.1%} {cagr:>10.1%}")

print(f"\nüìÅ Files saved to: {WORK_DIR}/")
print("\n‚úÖ DONE!")