# Table 1 Reproduction: Surgical Repair Verification

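This notebook reproduces the **Table 1** results (Surgical Repair Performance). Metrics are averaged over a small set of natural active/passive sentence pairs, the first of which contains the **canonical natural sample** discussed in the paper: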
> *"The book was written by the man."*

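We compare the baseline performance on the passive sentences (Passive) with the performance under activation steering (Repair). The steering vector is the mean active-minus-passive hidden-state difference at a fixed early layer, calibrated on a separate, robust set of technical-domain sentence pairs. For each model we report perplexity and the Fiedler value (λ₂) of the attention graph, before and after steering.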

**Note:** This notebook uses the `spectral_trust` library for standardized spectral metric calculations.

In [None]:
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from spectral_trust import GSPConfig, GraphConstructor, SpectralAnalyzer

# Checkpoints to evaluate, given as Hugging Face model identifiers.
MODELS = ["microsoft/Phi-3-mini-4k-instruct"]
# Uncomment to run full suite
# MODELS.append("mistralai/Mistral-7B-Instruct-v0.1")
# MODELS.append("meta-llama/Llama-3.2-3B-Instruct")

# Device string shared by the helper functions and the spectral-trust config.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

In [None]:
# --- 1. TEST SAMPLES (Canonical & Natural) ---
# Each entry is an (active sentence, passive counterpart) tuple.  The passive
# member of the first pair is the canonical sample quoted at the top of the
# notebook.
TEST_PAIRS = [
    (
        "The man wrote the book.",
        "The book was written by the man.",
    ),
    (
        "The teacher read the letter.",
        "The letter was read by the teacher.",
    ),
    (
        "The girl found the key.",
        "The key was found by the girl.",
    ),
    (
        "The doctor cured the patient.",
        "The patient was cured by the doctor.",
    ),
    (
        "The artist painted the picture.",
        "The picture was painted by the artist.",
    ),
]

In [None]:
# --- 2. SPECTRAL UTILS (Using spectral-trust) ---
def get_fiedler_value(adj_matrix, graph_constructor, spectral_analyzer):
    """Return the Fiedler value (lambda_2) of a graph via spectral-trust.

    Args:
        adj_matrix: Square array-like of edge weights exposing ``.shape``;
            callers in this notebook pass a head-averaged attention map.
        graph_constructor: Object providing ``construct_laplacian(tensor)``.
        spectral_analyzer: Object providing
            ``compute_eigendecomposition(laplacian)``, which returns a pair
            whose first element holds the eigenvalues.

    Returns:
        The eigenvalue at position ``[0, 1]`` as a Python float, or ``0.0``
        when the graph has fewer than two nodes or fewer than two eigenvalues
        are returned.
    """
    num_nodes = adj_matrix.shape[0]
    if num_nodes < 2:
        return 0.0

    # Add a leading batch axis: [seq, seq] -> [1, seq, seq].
    batched_adj = torch.tensor(adj_matrix, dtype=torch.float32, device=device)[None]

    # The Laplacian is built by the library (which, per the original note,
    # takes care of symmetrization) and then moved to the CPU because the
    # eigendecomposition is SciPy based.
    laplacian = graph_constructor.construct_laplacian(batched_adj).cpu()

    eigenvalues, _ = spectral_analyzer.compute_eigendecomposition(laplacian)

    # Index 1 of batch element 0 is taken as lambda_2.
    # NOTE(review): this presumes the eigenvalues come back in ascending
    # order -- confirm against spectral_trust.
    if eigenvalues.shape[1] <= 1:
        return 0.0
    return eigenvalues[0, 1].item()

def calculate_perplexity(model, tokenizer, text):
    """Return the perplexity that ``model`` assigns to ``text``.

    The text is tokenized, moved to ``model.device`` and run through the model
    with its own token ids as labels; the resulting loss is exponentiated.

    Args:
        model: Callable model exposing ``.device`` and returning an object
            with a ``.loss`` tensor when called with ``labels``.
        tokenizer: Callable tokenizer accepting ``return_tensors="pt"``.
        text: The string to score.

    Returns:
        ``exp(loss)`` as a Python float.
    """
    encoded = tokenizer(text, return_tensors="pt").to(model.device)
    # Gradients are not needed for scoring.
    with torch.no_grad():
        loss = model(**encoded, labels=encoded.input_ids).loss
    perplexity = torch.exp(loss)
    return perplexity.item()

In [None]:
# --- 3. EXPERIMENT LOOP ---
# For every model in MODELS:
#   1. load the model, tokenizer and spectral-trust tools (skip on failure),
#   2. calibrate a steering vector on technical active/passive pairs,
#   3. measure perplexity and the attention-graph Fiedler value on TEST_PAIRS
#      for the passive sentence (baseline), the active sentence (reference)
#      and the passive sentence with steering applied,
#   4. append one summary row to RESULTS.
RESULTS = []

for model_name in MODELS:
    print(f"\nProcessing {model_name}...")

    # Drop the reference to the previous iteration's model before loading the
    # next one, so that two sets of weights are not kept alive at once.
    model = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            # Batched calibration below needs a pad token.
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            # NOTE(review): presumably required so that output_attentions=True
            # returns attention maps -- confirm against the transformers docs.
            attn_implementation="eager",
        )

        # Initialize Spectral Trust Tools
        config = GSPConfig(model_name=model_name, device=device)
        gc = GraphConstructor(config)
        sa = SpectralAnalyzer(config)

    except Exception as e:
        # Deliberate best-effort: a model that cannot be loaded is reported
        # and skipped so the remaining models still run.
        print(f"Skipping {model_name}: {e}")
        continue

    # CALIBRATE STEERING (Robust Technical Set)
    LAYER_IDX = 2
    technical_calibration = [
        ("The system processed the data.", "The data was processed by the system."),
        ("The server hosted the website.", "The website was hosted by the server."),
        ("The code compiled the program.", "The program was compiled by the code."),
        ("The layer encoded the input.", "The input was encoded by the layer."),
        ("The model predicted the token.", "The token was predicted by the model."),
        ("The node routed the packet.", "The packet was routed by the node."),
        ("The script executed the command.", "The command was executed by the script."),
        ("The database stored the record.", "The record was stored by the database."),
    ]

    def get_mean_hidden(texts):
        """Return the mean hidden state over ``texts`` at the steered layer.

        Each sentence is averaged over its non-padding tokens first, then the
        per-sentence means are averaged over the batch.
        """
        inputs = tokenizer(
            texts, return_tensors="pt", padding=True, truncation=True
        ).to(model.device)
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
        # NOTE(review): index LAYER_IDX + 1 is presumably the output of decoder
        # layer LAYER_IDX (index 0 being the embedding output) -- confirm.
        h = outputs.hidden_states[LAYER_IDX + 1]
        # Keep the mask on the same device as the hidden states; with
        # device_map="auto" that is not guaranteed to be model.device.
        mask = inputs.attention_mask.unsqueeze(-1).to(h.device)
        return ((h * mask).sum(dim=1) / mask.sum(dim=1)).mean(dim=0)

    v_act = get_mean_hidden([x[0] for x in technical_calibration])
    v_pas = get_mean_hidden([x[1] for x in technical_calibration])
    # Direction pointing from the passive mean towards the active mean.
    steering_vector = v_act - v_pas
    print("Steering calibrated on Technical Set.")

    # RUN TEST
    ppl_bases, ppl_steers = [], []
    f_bases, f_steers, f_actives = [], [], []
    alpha = 0.2

    def steer_hook(_module, _inputs, output):
        """Forward hook adding ``alpha * steering_vector`` to a layer's output.

        Handles both a bare tensor output and a tuple whose first element is
        the hidden-state tensor; remaining tuple elements pass through as-is.
        """
        hidden = output[0] if isinstance(output, tuple) else output
        # Match both dtype and device of the tensor being shifted.
        shift = steering_vector.view(1, 1, -1).to(
            device=hidden.device, dtype=hidden.dtype
        )
        steered = hidden + alpha * shift
        if isinstance(output, tuple):
            return (steered,) + output[1:]
        return steered

    for active, passive in TEST_PAIRS:
        # Baseline
        ppl_base = calculate_perplexity(model, tokenizer, passive)
        inp = tokenizer(passive, return_tensors="pt").to(model.device)
        with torch.no_grad():
            out = model(**inp, output_attentions=True)
            # Measure at LAYER_IDX + 1: the hook changes the output of layer
            # LAYER_IDX, so the next layer is the one affected by the steer.
            # Heads are averaged into a single [seq, seq] map.
            adj = out.attentions[LAYER_IDX + 1][0].mean(dim=0).float().cpu().numpy()
            f_base = get_fiedler_value(adj, gc, sa)

        # Active Ref
        inp_a = tokenizer(active, return_tensors="pt").to(model.device)
        with torch.no_grad():
            out_a = model(**inp_a, output_attentions=True)
            adj_a = out_a.attentions[LAYER_IDX + 1][0].mean(dim=0).float().cpu().numpy()
            f_act = get_fiedler_value(adj_a, gc, sa)

        # Steered
        # NOTE(review): assumes the decoder layers live at model.model.layers
        # for every entry in MODELS -- confirm when adding architectures.
        hook_handle = model.model.layers[LAYER_IDX].register_forward_hook(steer_hook)
        try:
            ppl_steer = calculate_perplexity(model, tokenizer, passive)
            with torch.no_grad():
                out = model(**inp, output_attentions=True)
                adj = out.attentions[LAYER_IDX + 1][0].mean(dim=0).float().cpu().numpy()
                f_steer = get_fiedler_value(adj, gc, sa)
        finally:
            # Always detach the hook; a leaked hook would silently steer every
            # later "baseline" and "active" measurement.
            hook_handle.remove()

        ppl_bases.append(ppl_base)
        ppl_steers.append(ppl_steer)
        f_bases.append(f_base)
        f_steers.append(f_steer)
        f_actives.append(f_act)

    # METRICS
    mean_ppl_base = np.mean(ppl_bases)
    mean_ppl_steer = np.mean(ppl_steers)
    mean_f_base = np.mean(f_bases)
    mean_f_steer = np.mean(f_steers)
    mean_f_act = np.mean(f_actives)

    # Relative perplexity reduction (positive means steering helped).
    pct_gain = (mean_ppl_base - mean_ppl_steer) / mean_ppl_base * 100
    # Gap between active and passive Fiedler values, and the share of it that
    # steering recovers.  Reported as 0 when the gap is not positive.
    loss_gap = mean_f_act - mean_f_base
    rec_pct = (mean_f_steer - mean_f_base) / loss_gap * 100 if loss_gap > 0 else 0

    # Explicit sign flag so a negative gain prints as "-x.xx%", not "+-x.xx%".
    print(f"  PPL Gain: {pct_gain:+.2f}%")
    print(f"  Fiedler: {mean_f_base:.3f} -> {mean_f_steer:.3f} (Active {mean_f_act:.3f})")
    print(f"  Recovery: {rec_pct:.2f}%")

    RESULTS.append({
        "Model": model_name,
        "Base PPL": mean_ppl_base,
        "Steered PPL": mean_ppl_steer,
        "PPL Gain %": pct_gain,
        "Recov %": rec_pct,
    })

In [None]:
# --- 4. DISPLAY TABLE ---
import pandas as pd

# One row per model that was processed; the columns follow the keys of the
# dictionaries collected in RESULTS.
df = pd.DataFrame(data=RESULTS)
display(df)