# Quantitative Comparison: Feature-Based vs Raw Particles

This notebook performs a rigorous quantitative comparison of different template approaches on 100 jets.

**Test Conditions:**
- Sample size: 100 jets (statistically significant)
- Async batching: batch_size=10 for speed
- Thinking budget: 1000 tokens  
- Templates tested:
  1. Raw particles (simple_list)
  2. Features only (features_basic, features_kinematic, features_full)
  3. Hybrid (hybrid_basic, hybrid_kinematic, hybrid_full)

**Metrics:**
- Accuracy
- AUC (Area Under ROC Curve)
- Total cost ($)
- Average time per jet
- Token efficiency (tokens/jet)


In [1]:
%load_ext autoreload
%autoreload 2

import sys
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

sys.path.insert(0, str(Path.cwd().parent / 'src'))

from vibe_jet_tagging import LLMClassifier


## 1. Load Test Data (100 jets)


In [2]:
# Load data
data_path = Path.cwd().parent / 'data' / 'qg_jets.npz'
data = np.load(data_path)

X = data['X']
y = data['y']

# Take 100 test jets (balanced)
np.random.seed(42)
quark_idx = np.where(y == 1)[0]
gluon_idx = np.where(y == 0)[0]

# Sample 50 of each
test_quark = np.random.choice(quark_idx, 50, replace=False)
test_gluon = np.random.choice(gluon_idx, 50, replace=False)
test_idx = np.concatenate([test_quark, test_gluon])
np.random.shuffle(test_idx)

X_test = X[test_idx]
y_test = y[test_idx]

print(f"Test set: {len(X_test)} jets")
print(f"  Quark: {np.sum(y_test == 1)} ({np.mean(y_test)*100:.1f}%)")
print(f"  Gluon: {np.sum(y_test == 0)} ({(1-np.mean(y_test))*100:.1f}%)")


Test set: 100 jets
  Quark: 50 (50.0%)
  Gluon: 50 (50.0%)


## 2. Define Template Configurations

We'll test 7 templates systematically.


In [4]:
configs = [
    ("Raw: Simple List", "simple_list", "none"),
    # ("Features: Basic", "features_basic", None),
    # ("Features: Kinematic", "features_kinematic", None),
    ("Features: Full", "features_full", None),
    # ("Hybrid: Basic", "hybrid_basic", None),
    # ("Hybrid: Kinematic", "hybrid_kinematic", None),
    ("Hybrid: Full", "hybrid_full", None),
]

print(f"Testing {len(configs)} configurations")
for name, template, _ in configs:
    print(f"  - {name}")


Testing 3 configurations
  - Raw: Simple List
  - Features: Full
  - Hybrid: Full


## 3. Run Evaluation

**Warning:** This will make ~100 API calls per template (~700 total calls).  
Estimated cost: ~$0.50 total.  
Estimated time: ~10-15 minutes with async batching.


In [5]:
results = []

for name, template, extractor in configs:
    print(f"\n{'='*70}")
    print(f"Testing: {name}")
    print(f"Template: {template}")
    print(f"{'='*70}")
    
    clf = LLMClassifier(
        template_name=template,
        templates_dir='../templates',
        feature_extractor=extractor,
        thinking_budget=1000,
        max_tokens=2000,
    )
    
    clf.fit([], [])
    
    # Time the prediction
    start_time = time.time()
    predictions = clf.predict(X_test, batch_size=10)  # Async batching
    elapsed_time = time.time() - start_time
    
    # Compute metrics
    acc = accuracy_score(y_test, predictions)
    try:
        auc = roc_auc_score(y_test, predictions)
    except:
        auc = 0.5  # If all same class
    
    cm = confusion_matrix(y_test, predictions)
    
    # Token stats
    total_tokens = clf.total_prompt_tokens + clf.total_completion_tokens + clf.total_thinking_tokens
    tokens_per_jet = total_tokens / len(X_test)
    time_per_jet = elapsed_time / len(X_test)
    
    results.append({
        'name': name,
        'template': template,
        'accuracy': acc,
        'auc': auc,
        'cost': clf.total_cost,
        'total_time': elapsed_time,
        'time_per_jet': time_per_jet,
        'prompt_tokens': clf.total_prompt_tokens,
        'completion_tokens': clf.total_completion_tokens,
        'thinking_tokens': clf.total_thinking_tokens,
        'total_tokens': total_tokens,
        'tokens_per_jet': tokens_per_jet,
        'confusion_matrix': cm,
    })
    
    print(f"\n Results:")
    print(f"  Accuracy: {acc:.3f}")
    print(f"  AUC: {auc:.3f}")
    print(f"  Cost: ${clf.total_cost:.4f}")
    print(f"  Time: {elapsed_time:.1f}s ({time_per_jet:.2f}s/jet)")
    print(f"  Tokens: {total_tokens:,} ({tokens_per_jet:.1f}/jet)")

print(f"\n{'='*70}")
print("✓ All evaluations complete!")
print(f"{'='*70}")



Testing: Raw: Simple List
Template: simple_list


RuntimeError: asyncio.run() cannot be called from a running event loop

In [None]:
df = pd.DataFrame(results)

# Display main metrics
display_df = df[['name', 'accuracy', 'auc', 'cost', 'tokens_per_jet', 'time_per_jet']].copy()
display_df['cost'] = display_df['cost'].map('${:.4f}'.format)
display_df['tokens_per_jet'] = display_df['tokens_per_jet'].map('{:.0f}'.format)
display_df['time_per_jet'] = display_df['time_per_jet'].map('{:.2f}s'.format)

print("\n" + "="*90)
print("QUANTITATIVE RESULTS (100 jets)")
print("="*90)
print(display_df.to_string(index=False))
print("="*90)

# Find best performers
best_acc_idx = df['accuracy'].idxmax()
best_auc_idx = df['auc'].idxmax()
best_cost_idx = df['cost'].idxmin()

print(f"\n Best Accuracy: {df.loc[best_acc_idx, 'name']} ({df.loc[best_acc_idx, 'accuracy']:.3f})")
print(f"Best AUC:      {df.loc[best_auc_idx, 'name']} ({df.loc[best_auc_idx, 'auc']:.3f})")
print(f"Best Cost:     {df.loc[best_cost_idx, 'name']} (${df.loc[best_cost_idx, 'cost']:.4f})")


## 5. Visualizations


In [None]:
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

names = df['name'].values
colors = ['steelblue', 'coral', 'gold', 'lime', 'mediumseagreen', 'orchid', 'cyan']

# Accuracy
axes[0, 0].barh(names, df['accuracy'], color=colors)
axes[0, 0].set_xlabel('Accuracy')
axes[0, 0].set_title('Accuracy Comparison')
axes[0, 0].axvline(0.5, color='red', linestyle='--', alpha=0.5, label='Random')
axes[0, 0].axvline(0.84, color='orange', linestyle='--', alpha=0.5, label='Baseline (mult cut)')
axes[0, 0].legend()
axes[0, 0].grid(axis='x', alpha=0.3)

# AUC
axes[0, 1].barh(names, df['auc'], color=colors)
axes[0, 1].set_xlabel('AUC')
axes[0, 1].set_title('AUC Comparison')
axes[0, 1].axvline(0.5, color='red', linestyle='--', alpha=0.5, label='Random')
axes[0, 1].axvline(0.84, color='orange', linestyle='--', alpha=0.5, label='Baseline')
axes[0, 1].legend()
axes[0, 1].grid(axis='x', alpha=0.3)

# Cost
axes[1, 0].barh(names, df['cost'], color=colors)
axes[1, 0].set_xlabel('Cost ($)')
axes[1, 0].set_title('Cost per 100 Jets')
axes[1, 0].grid(axis='x', alpha=0.3)

# Tokens per jet
axes[1, 1].barh(names, df['tokens_per_jet'], color=colors)
axes[1, 1].set_xlabel('Tokens per Jet')
axes[1, 1].set_title('Token Efficiency')
axes[1, 1].grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

print("\n✓ Visualization complete!")


## 6. Performance vs Cost Trade-off


In [None]:
plt.figure(figsize=(10, 6))

# Plot AUC vs Cost
for i, row in df.iterrows():
    plt.scatter(row['cost'], row['auc'], s=200, color=colors[i], alpha=0.7, edgecolors='black', linewidth=2)
    plt.annotate(row['name'].split(':')[0], (row['cost'], row['auc']), 
                fontsize=9, ha='center', va='bottom')

plt.xlabel('Cost ($)', fontsize=12)
plt.ylabel('AUC', fontsize=12)
plt.title('Performance vs Cost Trade-off', fontsize=14, fontweight='bold')
plt.axhline(0.84, color='orange', linestyle='--', alpha=0.5, label='Baseline AUC (0.84)')
plt.grid(alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()

print("\n Key Insights:")
print("  - Top-right corner = best (high AUC, low cost)")
print("  - Feature-only templates typically offer best cost efficiency")
print("  - Hybrid templates may provide marginal performance gains")


## 7. Statistical Summary

Calculate statistical significance and confidence intervals.


In [None]:
# Calculate binomial confidence intervals for accuracy
from scipy import stats

print("\n" + "="*70)
print("STATISTICAL ANALYSIS (95% Confidence Intervals)")
print("="*70)

for i, row in df.iterrows():
    n = 100  # number of jets
    p = row['accuracy']  # observed accuracy
    
    # Wilson score interval (better for edge cases)
    ci_low, ci_high = stats.binom.interval(0.95, n, p)
    ci_low /= n
    ci_high /= n
    
    print(f"\n{row['name']}:")
    print(f"  Accuracy: {p:.3f} [{ci_low:.3f}, {ci_high:.3f}]")
    print(f"  AUC:      {row['auc']:.3f}")
    print(f"  Cost:     ${row['cost']:.4f}")
    print(f"  Tokens:   {row['tokens_per_jet']:.0f}/jet")

print("\n" + "="*70)
print("✓ Analysis complete!")
