# GREAT CARIA v2.0 - Robust Implementation

Uses proper purge/embargo validation and a Crisis Factor (CF) change-of-direction target.

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device: {device}")

In [None]:
# === 1. LOAD DATA ===
# Colab-only: Google Drive must be mounted before MARKET_PATH is readable.
from google.colab import drive

# Country codes; each is looked up later as a '<code>_index' column.
COUNTRIES = ['USA', 'CHN', 'JPN', 'DEU', 'GBR', 'FRA', 'BRA', 'MEX', 'KOR', 'AUS']
MARKET_PATH = '/content/drive/MyDrive/CARIA/data/raw/yahoo_market.parquet'

drive.mount('/content/drive')
df_daily = pd.read_parquet(MARKET_PATH)
print(f"Loaded: {df_daily.shape}")

In [None]:
# === 2. COMPUTE CRISIS FACTOR ===
# Keep only the countries that actually have a '<code>_index' column.
index_cols = [
    col
    for col in (f'{code}_index' for code in COUNTRIES)
    if col in df_daily.columns
]
# Row-over-row percentage changes; any row containing a NaN is discarded.
returns = df_daily[index_cols].pct_change().dropna()
# Drop the suffix so each column is named by its bare country code.
returns = returns.rename(columns=lambda name: name.replace('_index', ''))

def compute_cf(returns, window=20):
    """Compute the rolling Crisis Factor (CF) of a returns table.

    For each row position ``i >= window`` the CF is::

        mean off-diagonal correlation * mean per-column std * 100

    evaluated over the ``window`` rows strictly *before* position ``i``
    (``returns.iloc[i - window:i]``), so the value labelled with
    ``returns.index[i]`` never uses row ``i`` itself.

    Args:
        returns: DataFrame with one column per asset and one row per period.
        window: Number of trailing rows in each correlation/volatility window.

    Returns:
        A float Series indexed by ``returns.index[window:]``. It is empty
        (but still float-typed) when ``returns`` has no complete window, and
        NaN wherever the average pairwise correlation is undefined (fewer
        than two assets, or NaN entries in the correlation matrix).
    """
    values = []
    for end in range(window, len(returns)):
        win = returns.iloc[end - window:end]
        corr = win.corr().values
        n_assets = len(corr)
        n_pairs = n_assets * (n_assets - 1)
        if n_pairs:
            # The diagonal contributes n_assets ones; remove them so only
            # the off-diagonal (pairwise) correlations are averaged.
            avg_corr = (corr.sum() - n_assets) / n_pairs
        else:
            # No asset pairs exist: report NaN instead of dividing 0 by 0.
            avg_corr = np.nan
        avg_vol = win.std().mean()
        values.append(avg_corr * avg_vol * 100)
    # An explicit dtype keeps an empty result numeric; without it pandas
    # infers object dtype for an empty list.
    return pd.Series(values, index=returns.index[window:], dtype=float)

# Build the Crisis Factor series using compute_cf's default window.
CF = compute_cf(returns)
cf_count = len(CF)
print(f"CF computed: {cf_count} samples")

In [None]:
# === 3. TARGET: CF CHANGE-OF-DIRECTION ===
HORIZON = 5
cf_future = CF.shift(-HORIZON)
# A comparison against NaN evaluates to False, so labelling first and
# calling dropna() afterwards would silently assign class 0 to the last
# HORIZON rows (whose future CF is unknown). Filter *before* comparing so
# only rows with both a current and a future CF value receive a label.
has_label = CF.notna() & cf_future.notna()
cf_change = (cf_future[has_label] > CF[has_label]).astype(int)  # 1 = CF will increase
print(f"Target samples: {len(cf_change)}")

In [None]:
# === 4. FEATURES ===
# Every feature is sampled on the rows that carry a target label.
sample_idx = cf_change.index

feature_columns = {
    'cf_now': CF.loc[sample_idx],
    'cf_ma5': CF.rolling(5).mean().loc[sample_idx],
    'cf_ma20': CF.rolling(20).mean().loc[sample_idx],
    'vix': df_daily['VIX'].loc[sample_idx],
    'dxy': df_daily['DXY'].loc[sample_idx],
    'ret_usa': returns['USA'].rolling(5).mean().loc[sample_idx],
    # Falls back to a constant 0 column when there is no 'BRA' returns column.
    'ret_bra': returns['BRA'].rolling(5).mean().loc[sample_idx] if 'BRA' in returns else 0,
}

features = pd.DataFrame(index=sample_idx)
for col_name, col_values in feature_columns.items():
    features[col_name] = col_values

# Discard rows with any missing feature, then align the labels to what is left.
features = features.dropna()
target = cf_change.loc[features.index]
print(f"Final samples: {len(features)}")

In [None]:
# === 5. PURGED TEMPORAL SPLIT ===
PURGE = 20
EMBARGO = 10
# The first 70% of rows train the model; the PURGE + EMBARGO rows that
# follow are skipped before the test segment begins.
train_end = int(len(features) * 0.7)
test_start = train_end + PURGE + EMBARGO

train_rows = slice(None, train_end)
test_rows = slice(test_start, None)
X_train, y_train = features.iloc[train_rows].values, target.iloc[train_rows].values
X_test, y_test = features.iloc[test_rows].values, target.iloc[test_rows].values

# Standardise both splits with training-set statistics only; the small
# epsilon keeps the division finite for zero-variance columns.
mu = X_train.mean(axis=0)
sigma = X_train.std(axis=0) + 1e-8
X_train, X_test = (X_train - mu) / sigma, (X_test - mu) / sigma

print(f"Train: {len(X_train)}, Test: {len(X_test)} (purge={PURGE}, embargo={EMBARGO})")

In [None]:
# === 6. LOGISTIC REGRESSION BASELINE ===
# Fit on the training split, then score predictions on the held-out test split.
lr = LogisticRegression(C=0.1, max_iter=1000).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
acc_lr = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
print(f"\nLogistic Regression Accuracy (purged): {acc_lr:.1%}")

In [None]:
# === 7. SHUFFLE TEST ===
# Permutation baseline: refit the same model on randomly permuted training
# labels, which destroys any feature/label relationship, and score it on the
# real test labels.
# The permutation is drawn from a seeded, local generator so that
# acc_shuffle (and every verdict derived from it) is identical on each run
# and does not depend on the state of NumPy's global RNG.
SHUFFLE_SEED = 42
y_train_shuffled = np.random.default_rng(SHUFFLE_SEED).permutation(y_train)
lr_shuffle = LogisticRegression(max_iter=1000, C=0.1)
lr_shuffle.fit(X_train, y_train_shuffled)
y_pred_shuffle = lr_shuffle.predict(X_test)
acc_shuffle = accuracy_score(y_test, y_pred_shuffle)
print(f"Shuffled Accuracy: {acc_shuffle:.1%}")
print(f"Lift over random: {(acc_lr - acc_shuffle)*100:.1f}pp")

In [None]:
# === 8. ROLLING OOS VALIDATION ===
# Expanding-window evaluation: fold i trains on the first (i + 1) * fold_size
# rows, skips PURGE + EMBARGO rows, and tests on the next block of up to
# fold_size rows.
n_folds = 5
fold_size = len(features) // (n_folds + 1)
fold_gap = PURGE + EMBARGO
rolling_accs = []

for i in range(n_folds):
    # Fold-local names keep this loop from overwriting the split boundaries
    # and scaling statistics computed for the main purged split.
    fold_train_end = (i + 1) * fold_size
    fold_test_start = fold_train_end + fold_gap
    # Clamp to the available rows. Requiring a full fold_size test window
    # after the gap can never hold for the final fold (it would need
    # (n_folds + 1) * fold_size + fold_gap rows), so that fold was always
    # dropped; it is now evaluated on whatever rows remain.
    fold_test_end = min(fold_test_start + fold_size, len(features))

    if fold_test_start >= fold_test_end:
        # Nothing left to test on after the purge/embargo gap.
        break

    X_tr = features.iloc[:fold_train_end].values
    y_tr = target.iloc[:fold_train_end].values
    X_te = features.iloc[fold_test_start:fold_test_end].values
    y_te = target.iloc[fold_test_start:fold_test_end].values

    # Standardise with this fold's training statistics only.
    fold_mu, fold_sigma = X_tr.mean(axis=0), X_tr.std(axis=0) + 1e-8
    X_tr = (X_tr - fold_mu) / fold_sigma
    X_te = (X_te - fold_mu) / fold_sigma

    lr_fold = LogisticRegression(max_iter=1000, C=0.1)
    lr_fold.fit(X_tr, y_tr)
    acc = accuracy_score(y_te, lr_fold.predict(X_te))
    rolling_accs.append(acc)
    print(f"  Fold {i+1}: {acc:.1%}")

if rolling_accs:
    print(f"\nRolling OOS Mean: {np.mean(rolling_accs):.1%} ± {np.std(rolling_accs):.1%}")
else:
    # Avoid averaging an empty list when the data set is too short for any fold.
    print("\nRolling OOS Mean: n/a (not enough samples for a single fold)")

In [None]:
# === 9. SUMMARY ===
# Recap the headline metrics and apply the pass/fail gate: the purged
# accuracy must exceed 55% and beat the shuffled baseline by more than 3pp.
banner = "=" * 50
lift = acc_lr - acc_shuffle

print("\n" + banner)
print("GREAT CARIA v2.0 VALIDATION RESULTS")
print(banner)
print(f"Main accuracy (purged): {acc_lr:.1%}")
print(f"Shuffle accuracy:       {acc_shuffle:.1%}")
print(f"Lift over random:       {lift*100:.1f}pp")
print(f"Rolling OOS mean:       {np.mean(rolling_accs):.1%}")
print(banner)

signal_detected = acc_lr > 0.55 and lift > 0.03
print("✓ PASSED - Real signal detected" if signal_detected else "✗ FAILED - Signal is marginal or noise")