# ROPE Results Analysis

This notebook provides tools for analyzing ROPE evaluation results,
creating visualizations, and identifying patterns.

## 1. Load Results

In [None]:
import json
import pandas as pd
import numpy as np

# Load results (adjust path as needed)
RESULTS_PATH = "../results.json"  # Change to your results file

try:
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding (e.g. cp1252 on Windows), which can fail or mis-decode
    # non-ASCII text stored in the result records.
    with open(RESULTS_PATH, encoding="utf-8") as f:
        results = json.load(f)
    print(f"Loaded {len(results)} results")
except FileNotFoundError:
    print(f"Results file not found at {RESULTS_PATH}")
    print("Run an evaluation first: rope run --models phi2 --defenses none,delimiter")
    # Create synthetic results for demonstration
    # Fixed seed so the demonstration data is identical on every run.
    np.random.seed(42)
    # Mean of the normal draw used for each defense; built once rather than
    # on every innermost iteration. Dict order fixes the defense loop order.
    mean_severity_by_defense = {"none": 1.5, "delimiter": 0.8, "icl": 0.6}
    results = []
    for model in ["llama2-7b", "llama3-8b", "phi2"]:
        for defense, base in mean_severity_by_defense.items():
            for attack_type in ["hijack", "extract", "obfuscate", "poison"]:
                for task_family in ["qa", "summarize", "rag"]:
                    for i in range(10):
                        # int() truncates toward zero; min/max then clamp
                        # the value into the 0..3 severity range.
                        severity = min(3, max(0, int(np.random.normal(base, 1))))
                        results.append({
                            "model": model,
                            "defense": defense,
                            "task_id": i + 1,
                            "task_family": task_family,
                            "attack_type": attack_type,
                            "severity": severity,
                            "response": "synthetic"
                        })
    print(f"Created {len(results)} synthetic results for demonstration")

In [None]:
# Tabulate the result records so they can be sliced by column.
df = pd.DataFrame(results)

# Which values each grouping dimension takes in this run.
print(f"Models: {df['model'].unique()}")
print(f"Defenses: {df['defense'].unique()}")
print(f"Attack types: {df['attack_type'].unique()}")
print(f"Task families: {df['task_family'].unique()}")

# Number of records at each severity level, ordered by level rather than
# by frequency (value_counts alone sorts by count).
severity_counts = df['severity'].value_counts().sort_index()
print(f"\nSeverity distribution:")
print(severity_counts)

## 2. Overall Metrics

In [None]:
from rope.metrics import compute_metrics, compute_by_attack_type, compute_by_task_family, print_summary

# Overall metrics
# Later cells pivot this result on 'defense'/'model' and read its
# 'asr_1plus' and 'avg_severity' columns, so it is presumably a DataFrame
# with one row per (model, defense) pair — TODO confirm against rope.metrics.
metrics = compute_metrics(results)
print_summary(metrics)
# Bare last expression so the notebook displays `metrics` as the cell output.
metrics

## 3. Defense Comparison Chart

In [None]:
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.dpi'] = 150


def plot_metric_by_defense(ax, data, metric, title, ylabel, ymax):
    """Draw a grouped bar chart of one metric: defenses on the x-axis, one bar per model.

    Args:
        ax: Matplotlib Axes to draw into.
        data: Metrics table with 'defense' and 'model' columns plus `metric`.
        metric: Name of the column in `data` to plot.
        title: Title for the panel.
        ylabel: Label for the y-axis.
        ymax: Upper y-axis limit (the lower limit is always 0).

    Returns:
        The pivoted frame (defense rows x model columns) that was plotted.

    Raises:
        ValueError: Raised by ``DataFrame.pivot`` if a (defense, model) pair
            occurs more than once in `data`.
    """
    pivoted = data.pivot(index='defense', columns='model', values=metric)
    pivoted.plot(kind='bar', ax=ax, width=0.7)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Defense')
    ax.set_ylim(0, ymax)
    ax.legend(title='Model')
    ax.grid(axis='y', alpha=0.3)
    # pandas rotates bar-chart tick labels by default; keep them horizontal.
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=0)
    return pivoted


fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Chart 1: ASR by model and defense
pivot = plot_metric_by_defense(
    axes[0], metrics, 'asr_1plus',
    title='Attack Success Rate by Model and Defense',
    ylabel='ASR (severity >= 1)',
    ymax=1,
)

# Chart 2: Average severity
pivot_sev = plot_metric_by_defense(
    axes[1], metrics, 'avg_severity',
    title='Average Severity by Model and Defense',
    ylabel='Average Severity (0-3)',
    ymax=3,
)

plt.tight_layout()
# Save before show(): some backends clear the figure once it has been shown.
plt.savefig('defense_comparison.png', dpi=300, bbox_inches='tight')
plt.show()
print('Saved defense_comparison.png')

## 4. Attack Type Heatmap

In [None]:
# seaborn is optional: without it this cell prints an install hint and
# draws nothing.
try:
    import seaborn as sns
except ImportError:
    print("Install seaborn: pip install seaborn")
    sns = None

if sns is not None:
    by_type = compute_by_attack_type(results)

    # One heatmap panel per model, laid out in a single row.
    models = by_type['model'].unique()
    n_models = len(models)
    # squeeze=False always returns a 2-D array of Axes, so a single model
    # needs no special handling.
    fig, axes = plt.subplots(1, n_models, figsize=(6 * n_models, 4), squeeze=False)

    for ax, model in zip(axes.flat, models):
        model_data = by_type.loc[by_type['model'] == model]
        # Rows are defenses, columns are attack types, cells are ASR.
        pivot = model_data.pivot_table(
            index='defense',
            columns='attack_type',
            values='asr_1plus',
        )
        # Reversed red-yellow-green map on a fixed 0..1 scale, so colours
        # are comparable across panels and high ASR shows as red.
        sns.heatmap(
            pivot,
            annot=True,
            fmt='.2f',
            cmap='RdYlGn_r',
            vmin=0,
            vmax=1,
            ax=ax,
            cbar_kws={'label': 'ASR'},
        )
        ax.set_title(f'{model}: ASR by Defense x Attack Type')

    plt.tight_layout()
    plt.savefig('attack_heatmap.png', dpi=300, bbox_inches='tight')
    plt.show()
    print('Saved attack_heatmap.png')

## 5. Task Family Analysis

In [None]:
# Per-task-family breakdown of the same results. The following cell filters
# it on 'defense' and pivots on 'task_family'/'model' with 'asr_1plus' values,
# so those columns are expected — TODO confirm against rope.metrics.
by_family = compute_by_task_family(results)
print("Metrics by task family:")
# Bare last expression so the notebook displays `by_family` as the cell output.
by_family

In [None]:
# Task family vulnerability comparison
fig, ax = plt.subplots(figsize=(10, 5))

# Prefer the undefended baseline to show raw vulnerability. If the run has
# no 'none' defense, fall back to every row; pivot_table then averages ASR
# across defenses, so the title must say so rather than claim "No Defense".
has_baseline = (by_family['defense'] == 'none').any()
if has_baseline:
    none_family = by_family[by_family['defense'] == 'none']
    scope_label = 'No Defense'
else:
    # Name kept for compatibility even though this is no longer baseline-only.
    none_family = by_family
    scope_label = 'Mean Across Defenses'

pivot_fam = none_family.pivot_table(values='asr_1plus', index='task_family', columns='model')
pivot_fam.plot(kind='bar', ax=ax, width=0.7)
ax.set_title(f'Vulnerability by Task Family ({scope_label})')
ax.set_ylabel('ASR (severity >= 1)')
ax.set_xlabel('Task Family')
ax.set_ylim(0, 1)
ax.legend(title='Model')
ax.grid(axis='y', alpha=0.3)
# pandas rotates bar-chart tick labels by default; keep them horizontal.
plt.setp(ax.xaxis.get_majorticklabels(), rotation=0)
plt.tight_layout()
plt.show()

## 6. Key Findings

In [None]:
print("=" * 60)
print("KEY FINDINGS")
print("=" * 60)

# Finding 1: Most/least robust model
# Compare models on the undefended baseline when the run has one. Otherwise
# every (model, defense) row competes, and the label names the winning row's
# defense instead of claiming "no defense".
has_baseline = (metrics['defense'] == 'none').any()
none_metrics = metrics[metrics['defense'] == 'none'] if has_baseline else metrics
if len(none_metrics) > 0:
    most_robust = none_metrics.loc[none_metrics['asr_1plus'].idxmin()]
    least_robust = none_metrics.loc[none_metrics['asr_1plus'].idxmax()]
    if has_baseline:
        most_scope = least_scope = "no defense"
    else:
        most_scope = f"defense: {most_robust['defense']}"
        least_scope = f"defense: {least_robust['defense']}"
    print(f"\n1. Most robust model ({most_scope}): {most_robust['model']} (ASR: {most_robust['asr_1plus']:.1%})")
    print(f"   Least robust model ({least_scope}): {least_robust['model']} (ASR: {least_robust['asr_1plus']:.1%})")

# Finding 2: Best defense overall
# Unweighted mean of the per-row ASR for each defense. Skipped when there is
# nothing to rank, like finding 1, because idxmin raises on an empty Series.
defense_avg = metrics.groupby('defense')['asr_1plus'].mean()
if not defense_avg.empty:
    best_defense = defense_avg.idxmin()
    print(f"\n2. Best defense overall: {best_defense} (avg ASR: {defense_avg[best_defense]:.1%})")

# Finding 3: Hardest attack type
# Recomputed here so this cell does not depend on the optional seaborn cell.
by_type = compute_by_attack_type(results)
type_avg = by_type.groupby('attack_type')['asr_1plus'].mean()
if not type_avg.empty:
    hardest = type_avg.idxmax()
    print(f"\n3. Hardest attack to defend: {hardest} (avg ASR: {type_avg[hardest]:.1%})")

print("\n" + "=" * 60)