# CKA Transferability Analysis (Framework v3)

This notebook analyzes attack transferability between open-weight and closed-weight LLMs
using Centered Kernel Alignment (CKA) to identify optimal surrogate models.

**Phases:**
1. Phase 1: Compute CKA similarity matrix between open models
2. Phase 2: Run attack suite on all models (open + closed)
3. Phase 3: Correlate CKA with attack transfer, recommend surrogates

**Requirements:**
- Hugging Face token (required to download the Gemma and Llama models)
- OpenAI API key (optional, for Phase 2)
- Anthropic API key (optional, for Phase 2)

In [None]:
# ==========================================
# CELL 1: Install Dependencies
# ==========================================
# Installs the notebook's Python packages with pip via IPython shell escapes
# (`!`). The `-q` flag keeps pip's output quiet.

# Core packages: model loading, tensors, and numerical utilities.
!pip install -q transformers accelerate torch scipy numpy huggingface_hub tokenizers
# API client libraries for the closed models.
!pip install -q openai anthropic  # Optional for Phase 2
# Plotting and tabular-data packages.
!pip install -q matplotlib pandas seaborn  # For charts and tables

# NOTE: this message is printed unconditionally; nothing here checks whether
# the pip commands above actually succeeded.
print("Dependencies installed.")

In [None]:
# ==========================================
# CELL 2: Clone Framework from GitHub
# ==========================================
import os
import sys
import shutil

# Clear any cached imports
for mod in list(sys.modules.keys()):
    if 'transferability' in mod:
        del sys.modules[mod]

# Clone the repository
if not os.path.exists('/content/AI-SecOps'):
    !git clone -b framework-v3 https://github.com/zbovaird/AI-SecOps.git /content/AI-SecOps
else:
    !cd /content/AI-SecOps && git fetch origin framework-v3 && git checkout framework-v3 && git pull origin framework-v3

# Copy framework to content for easy imports
framework_src = '/content/AI-SecOps/transferability_framework'
framework_dest = '/content/transferability_framework'

if os.path.exists(framework_src):
    if os.path.exists(framework_dest):
        shutil.rmtree(framework_dest)
    shutil.copytree(framework_src, framework_dest)
    print("[OK] Framework copied to /content/transferability_framework")

# Add to path
if '/content' not in sys.path:
    sys.path.insert(0, '/content')

# Verify core modules
print("\nVerifying modules:")
for module in ['cka', 'model_loader', 'attack_suite', 'api_clients']:
    path = f'/content/transferability_framework/core/{module}.py'
    status = "OK" if os.path.exists(path) else "MISSING"
    print(f"  {module}: [{status}]")

In [None]:
# ==========================================
# CELL 3: Hugging Face Authentication
# ==========================================
# Prints the setup steps, then checks whether a Hugging Face login is already
# available by calling huggingface_hub.whoami().
print("=" * 60)
print("HUGGING FACE AUTHENTICATION")
print("=" * 60)
print("\n⚠️  IMPORTANT: Gemma and Llama models require Hugging Face authentication!")
print("\nSteps:")
print("1. Visit: https://huggingface.co/settings/tokens")
print("2. Log in to Hugging Face (or create account)")
print("3. Generate a token with 'read' permissions")
print("4. Accept model licenses:")
print("   - Gemma: https://huggingface.co/google/gemma-2-2b-it")
print("   - Llama: https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct")
print()

# Initialize HF_TOKEN (will be None if using CLI login)
HF_TOKEN = None

# Check if already logged in
try:
    from huggingface_hub import whoami
    user_info = whoami()
except Exception:
    # Deliberately broad: a missing package, a missing or rejected token, and
    # a network failure all mean "no usable login" for this check.
    # HF_TOKEN remains None - user will login via CLI
    print("⚠️  Not logged in yet.")
    print("\nRun the command below to login:")
    print()
    # Only shown when a login is still needed; it used to be printed even
    # after a successful check, contradicting the "proceed to Cell 4" message.
    print("\nAfter logging in, run this cell again to verify, then proceed to Cell 4.")
else:
    # HF_TOKEN remains None - CLI login handles authentication
    print(f"✓ Already logged in as: {user_info.get('name', 'Unknown')}")
    print(f"✓ Email: {user_info.get('email', 'Not provided')}")
    print("\nYou can proceed to Cell 4 to verify GPU setup.")

# Hugging Face CLI login command - UNCOMMENT AND RUN THIS LINE
# !huggingface-cli login

# Alternative: Python login (uncomment and add your token)
# from huggingface_hub import login
# login(token='your_token_here')  # Replace with your actual token

# ==========================================
# Optional: Configure API Keys for Closed Models (Phase 2)
# ==========================================
# Reads the optional OpenAI / Anthropic keys from Colab secrets. Each key ends
# up as a non-empty string or None; later cells test them for truthiness.
print("\n" + "=" * 60)
print("OPTIONAL: API Keys for Closed Models (Phase 2)")
print("=" * 60)


def _load_colab_secret(secret_name):
    """Return the Colab secret *secret_name*, or None if it is unavailable.

    Any exception raised while importing google.colab or reading the secret
    is treated as "no key". An empty value is also normalized to None so the
    caller never reports an empty key as loaded.
    """
    try:
        from google.colab import userdata
        return userdata.get(secret_name) or None
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return None


# OpenAI API key (OPTIONAL for Phase 2)
OPENAI_KEY = _load_colab_secret('OPENAI_API_KEY')
if OPENAI_KEY:
    print("[OK] OpenAI API key loaded from secrets")
else:
    print("[INFO] No OpenAI key - skipping GPT-4 testing")
    print("       To add: Colab → 🔑 Secrets → Add 'OPENAI_API_KEY'")

# Anthropic API key (OPTIONAL for Phase 2)
ANTHROPIC_KEY = _load_colab_secret('ANTHROPIC_API_KEY')
if ANTHROPIC_KEY:
    print("[OK] Anthropic API key loaded from secrets")
else:
    print("[INFO] No Anthropic key - skipping Claude testing")
    print("       To add: Colab → 🔑 Secrets → Add 'ANTHROPIC_API_KEY'")

In [None]:
# ==========================================
# CELL 4: Verify GPU and Setup Logging
# ==========================================
# Reports the available CUDA device (if any) and configures root logging at
# INFO level for the rest of the notebook.
import torch
import logging

# Check GPU
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    # total_memory is in bytes; divide by 1e9 to report decimal gigabytes.
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"[OK] GPU: {gpu_name} ({gpu_memory:.1f} GB)")
else:
    print("[WARNING] No GPU detected - analysis will be slow")

# Setup logging
# force=True replaces any handlers already attached to the root logger.
# Without it, basicConfig() silently does nothing when the root logger is
# already configured (common in notebook kernels, and always the case when
# this cell is re-run), so the level and format below would be ignored.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    force=True,
)

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

In [None]:
# ==========================================
# CELL 5: Mount Google Drive for Results
# ==========================================
# Mounts Google Drive and creates a fresh, timestamped output folder.
# RESULTS_DIR is the folder every later cell writes its files into.
import os
from datetime import datetime

from google.colab import drive

# Make Drive available under /content/drive
drive.mount('/content/drive')

# One folder per run, named after the local start time (YYYYMMDD_HHMMSS)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
RESULTS_DIR = os.path.join(
    '/content/drive/MyDrive/transferability_results',
    timestamp,
)
os.makedirs(RESULTS_DIR, exist_ok=True)

print(f"[OK] Results will be saved to: {RESULTS_DIR}")

In [None]:
# ==========================================
# CELL 6: Import Framework Components
# ==========================================
# Imports the framework pieces used by the remaining cells. The package is
# expected under /content/transferability_framework (copied there, and put on
# sys.path, by Cell 2), so Cell 2 must have run successfully first.

# Core building blocks
from transferability_framework.core.cka import CKA, linear_cka
from transferability_framework.core.model_loader import ModelLoader, SUPPORTED_MODELS
from transferability_framework.core.attack_suite import AttackSuite, STANDARD_ATTACKS
from transferability_framework.core.api_clients import OpenAIClient, AnthropicClient

# One experiment/analysis class per phase (Phase 1, 2 and 3 respectively)
from transferability_framework.experiments.open_model_cka import OpenModelCKAExperiment, CKA_PROMPTS
from transferability_framework.experiments.attack_testing import AttackTestingExperiment
from transferability_framework.experiments.correlation import CorrelationAnalysis

print("[OK] All framework components imported")
# SUPPORTED_MODELS is used as a mapping here (its keys are listed);
# STANDARD_ATTACKS is only measured with len().
print(f"\nSupported open models: {list(SUPPORTED_MODELS.keys())}")
print(f"Standard attacks: {len(STANDARD_ATTACKS)} prompts")

---
## Phase 1: CKA Similarity Matrix

Compute CKA between open-weight models to identify structural similarities.
This phase works without API keys.

In [None]:
# ==========================================
# CELL 7: Configure Phase 1
# ==========================================
# Defines the two Phase 1 inputs: the open models to compare (OPEN_MODELS)
# and the prompts used for hidden-state extraction (prompts_for_cka).
# OPEN_MODELS is reused by the Phase 2 open-model attack run.

# Select models to compare
# Start with fewer models if GPU memory is limited
# NOTE(review): the entries are short model keys; the Hugging Face repo IDs in
# the trailing comments are informational only - presumably the real mapping
# lives in SUPPORTED_MODELS (printed by Cell 6). Confirm they still match.
OPEN_MODELS = [
    "gemma2",    # google/gemma-2-2b-it
    "mistral",   # mistralai/Ministral-3-8B-Instruct-2512
    "llama",     # meta-llama/Llama-3.2-3B-Instruct
    # "phi",     # microsoft/phi-2 (optional)
    # "qwen",    # Qwen/Qwen2.5-3B-Instruct (optional)
]

# Prompts for hidden state extraction
# Using standard diverse prompts
# The slice takes at most the first 20 entries; assuming CKA_PROMPTS is a
# plain sequence, a shorter list is used in full without error.
prompts_for_cka = CKA_PROMPTS[:20]  # 20 prompts is usually sufficient

print(f"Models to compare: {OPEN_MODELS}")
print(f"Prompts for CKA: {len(prompts_for_cka)}")

In [None]:
# ==========================================
# CELL 8: Run Phase 1 - CKA Computation
# ==========================================
# Builds the Phase 1 experiment, runs it over OPEN_MODELS with the prompts
# chosen in Cell 7, and prints the resulting matrix summary.

# Experiment settings. hf_token stays None because authentication is expected
# to come from the CLI login described in Cell 3.
phase1_settings = {
    "hf_token": None,
    "device": "auto",
    "dtype": "bfloat16",
    "kernel": "linear",
    "n_layer_samples": 5,  # Sample 5 layers per model for efficiency
}
cka_experiment = OpenModelCKAExperiment(**phase1_settings)

# Phase banner
phase1_rule = "=" * 60
print(phase1_rule)
print("PHASE 1: Computing CKA Similarity Matrix")
print(phase1_rule)

# Run CKA; the output path is handed to the experiment via save_path
cka_matrix_path = f"{RESULTS_DIR}/cka_matrix.json"
cka_matrix = cka_experiment.run(
    model_names=OPEN_MODELS,
    prompts=prompts_for_cka,
    save_path=cka_matrix_path,
)

phase1_summary = cka_matrix.summary()
print("\n" + phase1_summary)

---
## Phase 2: Attack Testing

Run standard attack suite on all models (open and closed).

In [None]:
# ==========================================
# CELL 9: Run Phase 2 - Attack Testing (Open Models)
# ==========================================
# Creates the Phase 2 experiment (also reused by Cell 10 for closed models)
# and runs the attack suite against the open-weight models.

# Experiment settings. hf_token stays None because authentication is expected
# to come from the CLI login described in Cell 3; the API keys may be None.
phase2_settings = {
    "hf_token": None,
    "openai_key": OPENAI_KEY,
    "anthropic_key": ANTHROPIC_KEY,
    "device": "auto",
    "dtype": "bfloat16",
}
attack_experiment = AttackTestingExperiment(**phase2_settings)

# Phase banner
phase2_rule = "=" * 60
print(phase2_rule)
print("PHASE 2: Attack Testing")
print(phase2_rule)

# Open models go first
print("\nTesting open-weight models...")
open_attacks_path = f"{RESULTS_DIR}/open_model_attacks.json"
open_attack_results = attack_experiment.run_open_models(
    model_names=OPEN_MODELS,
    save_path=open_attacks_path,
)

In [None]:
# ==========================================
# CELL 10: Run Phase 2 - Attack Testing (Closed Models)
# ==========================================
# Runs the attack suite against the closed models when at least one API key
# is set, and exposes the combined results as all_attack_results.

have_closed_model_key = bool(OPENAI_KEY or ANTHROPIC_KEY)

if not have_closed_model_key:
    # Nothing to call without a key: the open-model results are the full set.
    print("\n[INFO] Skipping closed model testing - no API keys")
    all_attack_results = open_attack_results
else:
    print("\nTesting closed-weight models via API...")
    closed_attack_results = attack_experiment.run_closed_models(
        save_path=f"{RESULTS_DIR}/closed_model_attacks.json",
    )

    # Merge results
    # NOTE(review): all_attack_results is the same object as
    # open_attack_results, so the update below also adds the closed models to
    # open_attack_results.models.
    all_attack_results = open_attack_results
    all_attack_results.models.update(closed_attack_results.models)

combined_summary = all_attack_results.summary()
print("\n" + combined_summary)

---
## Phase 3: Correlation Analysis

Correlate CKA similarity with attack transferability and generate surrogate recommendations.

In [None]:
# ==========================================
# CELL 11: Run Phase 3 - Correlation Analysis
# ==========================================
# Feeds the Phase 1 CKA matrix and the Phase 2 attack results into the
# correlation analysis and prints the resulting report summary.

# Initialize correlation analysis
correlation = CorrelationAnalysis()

phase3_rule = "=" * 60
print(phase3_rule)
print("PHASE 3: Correlation Analysis")
print(phase3_rule)

# Closed models are considered only when an API key was configured, and only
# those that actually appear in the attack results are kept.
api_key_configured = bool(OPENAI_KEY or ANTHROPIC_KEY)
closed_models = [
    provider
    for provider in ("openai", "anthropic")
    if api_key_configured and provider in all_attack_results.models
]

# Run correlation analysis
report_path = f"{RESULTS_DIR}/transferability_report.json"
report = correlation.run(
    cka_matrix=cka_matrix,
    attack_results=all_attack_results,
    closed_model_keys=closed_models,
    save_path=report_path,
)

report_summary = report.summary()
print("\n" + report_summary)

---
## Summary and Recommendations

In [None]:
# ==========================================
# CELL 12: Generate Final Summary
# ==========================================
# Prints a five-part recap: most similar model pair, per-model compliance,
# the CKA/attack correlation with an interpretation, surrogate
# recommendations (if any), and the files written to RESULTS_DIR.

print("="*60)
print("FINAL ANALYSIS SUMMARY")
print("="*60)

# CKA findings
print("\n1. CKA SIMILARITY (Structural)")
print("-" * 40)
m1, m2, sim = cka_matrix.get_most_similar_pair()
print(f"   Most similar pair: {m1} <-> {m2}")
print(f"   CKA similarity: {sim:.3f}")

# Attack findings, most compliant model first
print("\n2. ATTACK SUCCESS RATES")
print("-" * 40)
for model, results in sorted(
    all_attack_results.models.items(),
    key=lambda x: x[1].compliance_rate,
    reverse=True
):
    print(f"   {model}: {results.compliance_rate:.1%} compliance")

# Correlation findings
print("\n3. CKA-ATTACK CORRELATION")
print("-" * 40)
print(f"   Pearson correlation: {report.cka_attack_correlation:.3f}")
print(f"   P-value: {report.correlation_p_value:.4f}")

if report.cka_attack_correlation > 0.5:
    print("   Interpretation: CKA is a GOOD predictor of attack transfer")
else:
    print("   Interpretation: CKA has LIMITED predictive value")

# The interpretation above looks only at the coefficient. With very few model
# pairs a large r can easily arise by chance, so flag results that are not
# significant. Written as "not (p < 0.05)" so a NaN p-value is flagged too.
if not report.correlation_p_value < 0.05:
    print("   Caveat: correlation is NOT statistically significant at the 0.05 level")

# Surrogate recommendations
if report.surrogate_recommendations:
    print("\n4. SURROGATE RECOMMENDATIONS")
    print("-" * 40)
    for target, rec in report.surrogate_recommendations.items():
        print(f"   For {target}: Use {rec.best_surrogate}")
        print(f"      Expected prediction error: {rec.prediction_error:.1%}")

# Files saved
print("\n5. RESULTS SAVED TO")
print("-" * 40)
print(f"   {RESULTS_DIR}")
# Sorted so the listing is deterministic and matches the order used by the
# final export cell; os.listdir() returns entries in arbitrary order.
for f in sorted(os.listdir(RESULTS_DIR)):
    size = os.path.getsize(os.path.join(RESULTS_DIR, f))
    print(f"   - {f} ({size:,} bytes)")

In [None]:
# ==========================================
# CELL 13: Red Team Action Items
# ==========================================
# Turns the analysis into a short to-do list: which surrogate to attack
# first, which attack categories succeeded anywhere, and generic next steps.

section_rule = "=" * 60
print(section_rule)
print("RED TEAM ACTION ITEMS")
print(section_rule)

# Based on analysis
print("\n1. SURROGATE STRATEGY")
if not report.surrogate_recommendations:
    print("   - No closed models tested; use most compliant open model as baseline")
else:
    for target, rec in report.surrogate_recommendations.items():
        expected_accuracy = 1 - rec.prediction_error
        print(f"   - Test attacks on {rec.best_surrogate} first")
        print(f"     Then transfer successful attacks to {target}")
        print(f"     Expected transfer accuracy: {expected_accuracy:.0%}")

print("\n2. HIGH-VALUE ATTACK CATEGORIES")
# Collect, per category, the compliance rate observed on every model
# (a missing 'compliance_rate' entry counts as 0).
category_success = {}
for model_results in all_attack_results.models.values():
    for cat, metrics in model_results.category_metrics.items():
        category_success.setdefault(cat, []).append(metrics.get('compliance_rate', 0))

# Rank categories by their best rate on any model; list only those that
# succeeded at least once.
best_rate_by_category = {cat: max(rates) for cat, rates in category_success.items()}
ranked_categories = sorted(
    best_rate_by_category.items(),
    key=lambda item: item[1],
    reverse=True,
)
for cat, max_success in ranked_categories:
    if max_success > 0:
        print(f"   - {cat}: up to {max_success:.0%} success (focus area)")

print("\n3. NEXT STEPS")
for next_step in (
    "   - Develop variations of successful attack categories",
    "   - Test on surrogate model first to save API costs",
    "   - Track CKA changes if models are updated",
    "   - Expand attack suite for categories with low coverage",
):
    print(next_step)

In [None]:
# ==========================================
# CELL 14: Final Results Export with Charts
# ==========================================
# First part of the export: builds one comparison row per tested model,
# collects the rows in the DataFrame `df`, and writes a text report holding
# both a fixed-width table and a CSV rendering.
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import numpy as np

print("="*60)
print("FINAL RESULTS EXPORT & VISUALIZATION")
print("="*60)

# Plot defaults used by the charts
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Prepare comparison data
comparison_data = []
for model_id, results in all_attack_results.models.items():
    if model_id in cka_matrix.model_names:
        # Open model: average its CKA similarity to every other open model
        # (0.0 when it is the only model in the matrix).
        similarities = [
            cka_matrix.get_similarity(model_id, other_model)
            for other_model in cka_matrix.model_names
            if other_model != model_id
        ]
        avg_cka = np.mean(similarities) if similarities else 0.0
    else:
        # Model absent from the CKA matrix (e.g. a closed model): no value.
        avg_cka = None

    comparison_data.append({
        'Model': model_id,
        'Type': results.model_type,
        'Compliance Rate': results.compliance_rate,
        'Refusal Rate': results.refusal_rate,
        'Unclear Rate': results.unclear_rate,
        'Total Attacks': results.total_attacks,
        'Avg Response Time (ms)': results.avg_response_time_ms,
        'Avg CKA Similarity': avg_cka,
    })

# Create DataFrame
df = pd.DataFrame(comparison_data)

# Save comparison table (easy to copy/paste): fixed-width table, then CSV
comparison_table_path = f"{RESULTS_DIR}/model_comparison_table.txt"
heavy_rule = "="*80 + "\n"
table_sections = [
    heavy_rule,
    "MODEL COMPARISON TABLE (Copy/Paste Friendly)" + "\n",
    heavy_rule + "\n",
    df.to_string(index=False),
    "\n\n",
    heavy_rule,
    "CSV Format (for Excel/Sheets):" + "\n",
    heavy_rule + "\n",
    df.to_csv(index=False),
]
with open(comparison_table_path, 'w') as f:
    f.write("".join(table_sections))

print(f"\n[OK] Comparison table saved: {comparison_table_path}")

# Create charts: a 2x2 grid built from `df`, saved as a PNG and then shown.


def _ranked_barh(ax, frame, column, color, xlabel, title, value_format):
    """Draw a horizontal bar chart of *column* per model, highest value first.

    Bars use the 'Model' column as labels, the x-axis is fixed to [0, 1], and
    each bar is annotated with its value rendered using *value_format*.
    """
    ranked = frame.sort_values(column, ascending=False)
    ax.barh(ranked['Model'], ranked[column], color=color)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlim(0, 1)
    for position, value in enumerate(ranked[column]):
        ax.text(value + 0.01, position, format(value, value_format),
                va='center', fontweight='bold')


fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Model Comparison Charts', fontsize=16, fontweight='bold')

# Chart 1: Compliance Rate Comparison
_ranked_barh(
    axes[0, 0], df, 'Compliance Rate', 'coral',
    'Compliance Rate', 'Attack Compliance Rate by Model', '.1%',
)

# Chart 2: Refusal Rate Comparison
_ranked_barh(
    axes[0, 1], df, 'Refusal Rate', 'steelblue',
    'Refusal Rate', 'Attack Refusal Rate by Model', '.1%',
)

# Chart 3: CKA Similarity (only rows that have a CKA value)
cka_axis = axes[1, 0]
df_open = df[df['Avg CKA Similarity'].notna()].copy()
if len(df_open) > 0:
    _ranked_barh(
        cka_axis, df_open, 'Avg CKA Similarity', 'mediumseagreen',
        'Average CKA Similarity', 'CKA Similarity (Open Models)', '.3f',
    )
else:
    # Nothing to plot: show a placeholder message instead of empty axes
    cka_axis.text(0.5, 0.5, 'No open model CKA data', ha='center', va='center', fontsize=12)
    cka_axis.set_title('CKA Similarity (Open Models)', fontsize=14, fontweight='bold')

# Chart 4: Combined Metrics (Compliance vs Refusal), one labelled point per model
scatter_axis = axes[1, 1]
scatter_axis.scatter(
    df['Refusal Rate'],
    df['Compliance Rate'],
    s=200,
    alpha=0.6,
    c=range(len(df)),
    cmap='viridis',
)
for _, model_row in df.iterrows():
    point = (model_row['Refusal Rate'], model_row['Compliance Rate'])
    scatter_axis.annotate(model_row['Model'], point,
                          xytext=(5, 5), textcoords='offset points', fontsize=9)
scatter_axis.set_xlabel('Refusal Rate', fontsize=12)
scatter_axis.set_ylabel('Compliance Rate', fontsize=12)
scatter_axis.set_title('Compliance vs Refusal Rate', fontsize=14, fontweight='bold')
scatter_axis.grid(True, alpha=0.3)

plt.tight_layout()

# Save chart before showing it
chart_path = f"{RESULTS_DIR}/model_comparison_charts.png"
plt.savefig(chart_path, dpi=300, bbox_inches='tight')
print(f"[OK] Comparison charts saved: {chart_path}")
plt.show()

# Save the comparison table as CSV and bundle every result into one JSON file.
import math

# Save DataFrame as CSV
csv_path = f"{RESULTS_DIR}/model_comparison.csv"
df.to_csv(csv_path, index=False)
print(f"[OK] CSV saved: {csv_path}")


def _json_safe(value):
    """Return *value* with non-finite floats (NaN, +/-inf) replaced by None.

    Dicts, lists and tuples are walked recursively (tuples come back as
    lists, which is how json serializes them anyway); every other object is
    returned unchanged. Needed because json.dump would otherwise emit bare
    NaN / Infinity tokens, which are not valid JSON - and models without a
    CKA value carry NaN in the 'Avg CKA Similarity' column.
    """
    if isinstance(value, float):
        return value if math.isfinite(value) else None
    if isinstance(value, dict):
        return {key: _json_safe(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_json_safe(item) for item in value]
    return value


# Consolidate all results into one JSON file
consolidated_results = _json_safe({
    'timestamp': datetime.now().isoformat(),
    'models_tested': list(all_attack_results.models.keys()),
    'comparison_table': df.to_dict('records'),
    'cka_matrix': cka_matrix.to_dict(),
    'attack_results': all_attack_results.to_dict(),
    'transferability_report': report.to_dict(),
    'summary': {
        # idxmax() needs at least one row, hence the emptiness guards
        'most_compliant_model': df.loc[df['Compliance Rate'].idxmax(), 'Model'] if len(df) > 0 else None,
        'most_refusing_model': df.loc[df['Refusal Rate'].idxmax(), 'Model'] if len(df) > 0 else None,
        'cka_attack_correlation': report.cka_attack_correlation,
        'total_models_tested': len(all_attack_results.models),
    }
})

consolidated_path = f"{RESULTS_DIR}/consolidated_results.json"
with open(consolidated_path, 'w') as f:
    # default=str stringifies anything json cannot encode natively
    json.dump(consolidated_results, f, indent=2, default=str)
print(f"[OK] Consolidated results saved: {consolidated_path}")

# Final summary: list every file written to RESULTS_DIR, then echo the
# comparison table so it can be copied straight from the cell output.
print("\n" + "="*60)
print("ALL RESULTS PERMANENTLY SAVED TO GOOGLE DRIVE")
print("="*60)
print(f"\nLocation: {RESULTS_DIR}")
print("\nFiles saved:")
for file_name in sorted(os.listdir(RESULTS_DIR)):
    size = os.path.getsize(os.path.join(RESULTS_DIR, file_name))
    # Check mark restored: the literal had been stored as mis-decoded bytes.
    print(f"  ✓ {file_name} ({size:,} bytes)")

print("\n" + "="*60)
print("COMPARISON TABLE (Copy/Paste Below)")
print("="*60)
print("\n" + df.to_string(index=False))
print("\n" + "="*60)
