# Table 1 Reproduction: Multi-Model Steering Performance

This notebook reproduces the results for **Table 1** in the paper. It evaluates the effectiveness of the "Surgical Repair" (activation steering) across three models:
1. `microsoft/Phi-3-mini-4k-instruct`
2. `mistralai/Mistral-7B-Instruct-v0.1`
3. `meta-llama/Llama-3.2-3B-Instruct`

For each model, we:
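1. Calibrate a steering vector from active/passive technical sentences (this demo uses a single sentence pair; a larger, more robust calibration set is recommended for the full run).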
2. Test it on a set of coherent natural language samples.
3. Report `PPL Reduction` and `Fiedler Recovery` (the percentage of the active–passive $\lambda_2$ gap that steering recovers).

In [None]:
import gc

import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoints evaluated by the experiment loop. Only the first entry is active
# by default; enable the others when enough VRAM is available.
MODELS = [
    "microsoft/Phi-3-mini-4k-instruct",
    # Uncomment to run full suite (requires sufficient VRAM)
    # "mistralai/Mistral-7B-Instruct-v0.1",
    # "meta-llama/Llama-3.2-3B-Instruct"
]

# Use the GPU whenever PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

In [None]:
# --- 1. SAMPLE GENERATION (Coherent) ---
def generate_coherent_pairs(n, seed=None):
    """Build ``n`` random (active, passive) sentence pairs describing the same event.

    Each pair is assembled from a small fixed vocabulary: the passive sentence
    has the form ``"<subject> was <participle> <by-agent>."`` and the active
    sentence has the form ``"<Agent> <past tense> <subject>."``.

    Args:
        n: Number of pairs to generate. ``0`` yields an empty list.
        seed: Optional seed. When given, sampling uses a dedicated
            ``np.random.default_rng(seed)`` generator so the same pairs are
            produced on every run. When ``None`` (the default), the global
            ``np.random`` state is used, as before.

    Returns:
        A list of ``(active, passive)`` tuples of plain ``str``.
    """
    vocab = {
        "subjects": ["The book", "The letter", "The report", "The message", "The song", "The code", "The image", "The file"],
        "verbs": ["written", "sent", "reviewed", "received", "sung", "fixed", "processed", "saved"],
        "agents": ["by the man", "by the teacher", "by the manager", "by the user", "by the woman", "by the expert", "by the system", "by the admin"]
    }
    # Only these participles differ from their simple-past form; every other
    # verb in the vocabulary is reused unchanged in the active sentence.
    irregular_past = {"written": "wrote", "sung": "sang"}

    rng = np.random.default_rng(seed) if seed is not None else np.random

    pairs = []
    for _ in range(n):
        # Draw order (subject, verb, agent) is kept stable so a given seed
        # always maps to the same sentences.
        s = str(rng.choice(vocab["subjects"]))
        v = str(rng.choice(vocab["verbs"]))
        a = str(rng.choice(vocab["agents"]))

        passive = f"{s} was {v} {a}."

        v_act = irregular_past.get(v, v)
        # "by the man" -> "The man": the agent becomes the active subject.
        a_subj = a.replace("by ", "").capitalize()
        active = f"{a_subj} {v_act} {s.lower()}."
        pairs.append((active, passive))
    return pairs

# Fixed seed so the evaluation set is identical across kernel restarts.
PAIR_SEED = 0
TEST_PAIRS = generate_coherent_pairs(50, seed=PAIR_SEED) # N=50 for speed in demo

In [None]:
# --- 2. SPECTRAL UTILS ---
def get_fiedler_value(adj_matrix):
    """Return the second-smallest eigenvalue of the Laplacian ``L = D - A``.

    ``D`` is the diagonal matrix of row sums of ``adj_matrix`` and ``A`` is
    ``adj_matrix`` itself.

    Args:
        adj_matrix: Square 2-D array of edge weights.

    Returns:
        The second-smallest Laplacian eigenvalue as a ``float``. ``0.0`` is
        returned when the matrix has fewer than two rows or when the
        computation fails (e.g. a non-square input).

    NOTE(review): ``np.linalg.eigvalsh`` assumes a symmetric matrix and, by
    default, reads only its lower triangle. For an asymmetric ``adj_matrix``
    (e.g. a causal attention map) the result is the spectrum of that triangle
    mirrored across the diagonal -- confirm this is the intended definition.
    """
    try:
        degree = np.diag(np.sum(adj_matrix, axis=1))
        laplacian = degree - adj_matrix
        eigvals = np.linalg.eigvalsh(laplacian)
        if len(eigvals) < 2:
            return 0.0
        return float(sorted(eigvals)[1])
    except Exception:
        # Best-effort metric: numerical failures fall back to 0.0, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return 0.0

def calculate_perplexity(model, tokenizer, text):
    """Score ``text`` with ``model`` and return ``exp(loss)`` as a Python float.

    The text is tokenized into PyTorch tensors, moved to ``model.device`` and
    passed through the model with the input ids doubling as labels. The
    exponential of the loss reported by the model is returned.

    Args:
        model: Callable model exposing ``.device`` and returning an object
            with a ``.loss`` tensor when called with ``labels``.
        tokenizer: Callable that encodes ``text`` with ``return_tensors="pt"``.
        text: The sentence to score.

    Returns:
        ``exp(loss)`` as a ``float``.
    """
    encoded = tokenizer(text, return_tensors="pt")
    encoded = encoded.to(model.device)
    # No gradients are needed for scoring.
    with torch.no_grad():
        result = model(labels=encoded.input_ids, **encoded)
    loss = result.loss
    return loss.exp().item()

In [None]:
# --- 3. EXPERIMENT LOOP ---
RESULTS = []

# Steering configuration shared by every model.
LAYER_IDX = 2  # decoder layer whose *output* receives the steering vector
# A forward hook on layer LAYER_IDX only rewrites that layer's output, so the
# attention map computed inside layer LAYER_IDX is identical with and without
# steering. Connectivity is therefore measured one layer downstream, in all
# three conditions, so that the steered value can actually differ.
FIEDLER_LAYER_IDX = LAYER_IDX + 1
alpha = 0.2  # scale applied to the steering vector


def _attention_fiedler(model, tokenizer, text, layer_idx):
    """Return the Fiedler value of one layer's head-averaged attention map for ``text``."""
    inp = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model(**inp, output_attentions=True)
    # First (only) batch item, averaged over heads, as a float32 NumPy matrix.
    adj = out.attentions[layer_idx][0].mean(dim=0).float().cpu().numpy()
    return get_fiedler_value(adj)


for model_name in MODELS:
    print(f"\nProcessing {model_name}...")

    # Release the previous checkpoint *before* loading the next one; otherwise
    # both models are alive at the same time while `from_pretrained` runs.
    model = tokenizer = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Fix padding
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            attn_implementation="eager"  # eager attention so attention weights are returned
        )
    except Exception as e:
        print(f"Skipping {model_name} (Load Error): {e}")
        continue

    # CALIBRATE STEERING
    tech_vocab = ["The system processed the data.", "The data was processed by the system."]
    # (Simplified calibration for demo - ideally utilize robust set)

    def get_mean_hidden(text):
        """Mean hidden state (over batch and tokens) at the output of layer LAYER_IDX."""
        inp = tokenizer(text, return_tensors="pt").to(model.device)
        with torch.no_grad():
            out = model(**inp, output_hidden_states=True)
        # hidden_states[0] is the embedding output, hence the +1 offset.
        return out.hidden_states[LAYER_IDX + 1].mean(dim=1).mean(dim=0)

    v_act = get_mean_hidden(tech_vocab[0])
    v_pas = get_mean_hidden(tech_vocab[1])
    # Direction pointing from the passive towards the active representation.
    steering_vector = v_act - v_pas

    def steer_hook(module, inp, out):
        """Add ``alpha * steering_vector`` to the hidden states produced by the layer."""
        hidden = out[0] if isinstance(out, tuple) else out
        # Match device as well as dtype: with device_map="auto" the layer may
        # live on a different device than the calibration output.
        v = steering_vector.view(1, 1, -1).to(device=hidden.device, dtype=hidden.dtype)
        steered = hidden + alpha * v
        if isinstance(out, tuple):
            return (steered,) + out[1:]
        return steered

    # RUN TEST
    ppl_bases, ppl_steers = [], []
    f_bases, f_steers, f_actives = [], [], []

    for active, passive in TEST_PAIRS:
        # 1. Baseline Passive
        ppl_base = calculate_perplexity(model, tokenizer, passive)
        f_base = _attention_fiedler(model, tokenizer, passive, FIEDLER_LAYER_IDX)

        # 2. Active Reference
        f_active_val = _attention_fiedler(model, tokenizer, active, FIEDLER_LAYER_IDX)

        # 3. Steered Passive
        handle = model.model.layers[LAYER_IDX].register_forward_hook(steer_hook)
        try:
            ppl_steer = calculate_perplexity(model, tokenizer, passive)
            f_steer = _attention_fiedler(model, tokenizer, passive, FIEDLER_LAYER_IDX)
        finally:
            # Always detach the hook so a failure cannot leave the model steered.
            handle.remove()

        # Store
        ppl_bases.append(ppl_base)
        ppl_steers.append(ppl_steer)
        f_bases.append(f_base)
        f_steers.append(f_steer)
        f_actives.append(f_active_val)

    # Aggregate
    mean_ppl_base = np.mean(ppl_bases)
    mean_ppl_steer = np.mean(ppl_steers)
    mean_f_base = np.mean(f_bases)
    mean_f_steer = np.mean(f_steers)
    mean_f_act = np.mean(f_actives)

    # Metrics
    ppl_gain = mean_ppl_base - mean_ppl_steer
    pct_gain = (ppl_gain / mean_ppl_base) * 100
    # Recovery = share of the active-vs-passive Fiedler gap closed by steering;
    # reported as 0 when the active reference is not above the passive baseline.
    loss_gap = mean_f_act - mean_f_base
    recovered = mean_f_steer - mean_f_base
    rec_pct = (recovered / loss_gap) * 100 if loss_gap > 0 else 0

    # Signed format so a negative gain prints as "-3.0%" rather than "+-3.0%".
    print(f"  Base PPL: {mean_ppl_base:.2f} -> Steered: {mean_ppl_steer:.2f} (Gain {pct_gain:+.1f}%)")
    print(f"  Fiedler: {mean_f_base:.3f} -> {mean_f_steer:.3f} (Target {mean_f_act:.3f}) (Recov {rec_pct:.1f}%)")

    RESULTS.append({
        "Model": model_name,
        "Base PPL": mean_ppl_base,
        "Steered PPL": mean_ppl_steer,
        "Recov %": rec_pct,
        "PPL Gain %": pct_gain
    })

In [None]:
# --- 4. DISPLAY TABLE ---
# One row per model that was evaluated; columns are the keys stored in RESULTS.
df = pd.DataFrame(RESULTS)
display(df)