# Synthetic Steering Evaluation

This notebook evaluates the effectiveness of different steering methods on synthetically generated contrastive pairs.
We evaluate responses on three criteria:
1. Difference from unsteered responses
2. Coherence of the response
3. Alignment with the target trait

In [None]:
import sys
import torch
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM

# Add project root to path
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

from wisent_guard.core.steering_methods.dac import DAC
from wisent_guard.core.steering_methods.caa import CAA
from wisent_guard.core.steering_methods.k_steering import KSteering
from wisent_guard.core.contrastive_pairs.contrastive_pair_set import ContrastivePairSet
from wisent_guard.core.contrastive_pairs.generate_synthetically import SyntheticContrastivePairGenerator
from wisent_guard.core.model import Model
from evaluate_personal import SteeringEvaluator

# Parameters
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
LAYER_INDEX = 15  # index into hf_model.model.layers used for extraction and steering
STEERING_STRENGTH = 1.0  # strength passed to each method's apply_steering
MAX_LENGTH = 30  # generation budget, in tokens beyond the prompt
NUM_PAIRS = 5  # number of synthetic contrastive pairs to request
EVAL_PROMPTS = 3  # Number of prompts to evaluate

# Generation below samples (do_sample=True); seed torch's RNG once so a
# Restart & Run All reproduces the same responses and scores.
SEED = 42
torch.manual_seed(SEED)

In [None]:
# Choose the compute device: MPS if available, otherwise CUDA, otherwise CPU
if torch.backends.mps.is_available():
    device_name = "mps"
elif torch.cuda.is_available():
    device_name = "cuda"
else:
    device_name = "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# Load tokenizer and model; half precision is used only on CUDA
print("Loading model and tokenizer...")
model_dtype = torch.float16 if device.type == 'cuda' else torch.float32
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
hf_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=model_dtype)
hf_model = hf_model.to(device)
# Reuse the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Wrap the Hugging Face model in the project's Model class
model = Model(name=MODEL_NAME, hf_model=hf_model)
print("✓ Model loaded successfully")

# The evaluator works directly on the raw HF model and tokenizer
evaluator = SteeringEvaluator(hf_model, tokenizer, device)
print("✓ Evaluator initialized")

In [None]:
# Trait that the steering methods should push responses toward
TRAIT_NAME = "sarcastic"
TRAIT_DESCRIPTION = "sarcastic and witty responses with subtle mockery and irony"

# Prompts used to compare steered and unsteered generations
TEST_PROMPTS = [
    "What's your opinion on working overtime?",
    "How do I become more productive?",
    "Tell me about your favorite hobby."
]

# Echo the configuration so the run is self-describing
print(f"Evaluating trait: {TRAIT_NAME}")
print(f"Description: {TRAIT_DESCRIPTION}")
print(f"\nTest prompts ({len(TEST_PROMPTS)}):")
for number, text in enumerate(TEST_PROMPTS, start=1):
    print(f"{number}. {text}")

In [None]:
# Generate synthetic contrastive pairs
print("Generating synthetic contrastive pairs...")
generator = SyntheticContrastivePairGenerator(model)

pair_set = generator.generate_contrastive_pair_set(
    trait_description=TRAIT_DESCRIPTION,
    num_pairs=NUM_PAIRS,
    name=TRAIT_NAME
)

print(f"✓ Generated {len(pair_set.pairs)} {TRAIT_NAME} pairs")


def _preview(text, limit=80):
    """Return `text` cut to `limit` characters, adding "..." only if it was cut."""
    return f"{text[:limit]}..." if len(text) > limit else text


# Show the first two pairs; every field uses the same truncation rule so an
# ellipsis always means text was actually dropped.
print("\nExample pairs:")
for i, pair in enumerate(pair_set.pairs[:2]):
    print(f"\nPair {i+1}:")
    print(f"Prompt: {_preview(pair.prompt)}")
    print(f"Positive: {_preview(pair.positive_response.text)}")
    print(f"Negative: {_preview(pair.negative_response.text)}")

In [None]:
# Extract activations for all pairs
def extract_activations(text, layer_idx):
    """Return the last-token output of one decoder layer for `text`.

    Tokenizes `text`, runs a single no-grad forward pass through `hf_model`
    with a temporary forward hook on `hf_model.model.layers[layer_idx]`, and
    returns a copy of that layer's output at the final sequence position.

    Args:
        text: String to encode and run through the model.
        layer_idx: Index into `hf_model.model.layers` of the layer to probe.

    Returns:
        The captured last-position tensor with the leading batch dimension
        squeezed out.
    """
    inputs = tokenizer(text, return_tensors="pt").to(device)
    captured = []

    def hook(module, input, output):
        # Depending on the transformers version the layer may hand back a
        # tuple (hidden_states, ...) or the hidden-state tensor itself.
        hidden = output[0] if isinstance(output, tuple) else output
        captured.append(hidden[:, -1, :].detach().clone())

    handle = hf_model.model.layers[layer_idx].register_forward_hook(hook)
    try:
        with torch.no_grad():
            hf_model(**inputs)
    finally:
        # Always detach the hook, even if the forward pass raised; a leaked
        # hook would keep firing on every later call in this kernel.
        handle.remove()
    return captured[0].squeeze(0)

print("Extracting activations...")
# Attach layer activations to both sides of every pair (positive first)
for pair in pair_set.pairs:
    for response in (pair.positive_response, pair.negative_response):
        response.activations = extract_activations(response.text, LAYER_INDEX)

print("✓ Activations extracted")

In [None]:
# Fit each steering method on the same pair set and layer
print("Training steering methods...")

# Registry of trained methods, filled in training order for later evaluation
steering_methods = {}

# DAC is given a reference to the HF model before training
print("  Training DAC...")
dac = DAC(device=device)
dac.set_model_reference(hf_model)
dac.train(pair_set, LAYER_INDEX)
steering_methods['DAC'] = dac

# CAA
print("  Training CAA...")
caa = CAA(device=device)
caa.train(pair_set, LAYER_INDEX)
steering_methods['CAA'] = caa

# K-Steering, configured with one label and 20 classifier epochs
print("  Training K-Steering...")
k_steering = KSteering(device=device, num_labels=1, classifier_epochs=20)
k_steering.train(pair_set, LAYER_INDEX)
steering_methods['K-Steering'] = k_steering

print("✓ All steering methods trained")

In [None]:
# Generation functions
def generate_unsteered(prompt):
    """Sample a continuation of `prompt` from the unmodified model.

    Args:
        prompt: Text to condition generation on.

    Returns:
        The decoded continuation only (prompt excluded), with surrounding
        whitespace stripped. At most MAX_LENGTH new tokens are generated.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        outputs = hf_model.generate(
            **inputs,
            max_new_tokens=MAX_LENGTH,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )
    # The returned sequence starts with the prompt tokens. Drop them by token
    # count rather than by len(prompt) characters: the decoded text is not
    # guaranteed to reproduce the prompt string exactly.
    new_tokens = outputs[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

def generate_with_steering(prompt, steering_method, strength):
    """Sample a continuation of `prompt` while steering layer LAYER_INDEX.

    A temporary forward hook on `hf_model.model.layers[LAYER_INDEX]` passes the
    last sequence position of the layer output through
    `steering_method.apply_steering(last_token, strength)` on every forward
    call made during generation, and writes the result back in place.

    Args:
        prompt: Text to condition generation on.
        steering_method: Object exposing `apply_steering(tensor, strength)`.
        strength: Value forwarded to `apply_steering`.

    Returns:
        The decoded continuation only (prompt excluded), with surrounding
        whitespace stripped. At most MAX_LENGTH new tokens are generated.
    """
    def steering_hook(module, input, output):
        # Depending on the transformers version the layer may hand back a
        # tuple (hidden_states, ...) or the hidden-state tensor itself.
        returns_tuple = isinstance(output, tuple)
        hidden_states = output[0] if returns_tuple else output
        last_token = hidden_states[:, -1:, :]
        hidden_states[:, -1:, :] = steering_method.apply_steering(last_token, strength)
        if returns_tuple:
            return (hidden_states,) + output[1:]
        return hidden_states

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = inputs["input_ids"].shape[1]
    handle = hf_model.model.layers[LAYER_INDEX].register_forward_hook(steering_hook)
    try:
        with torch.no_grad():
            outputs = hf_model.generate(
                **inputs,
                max_new_tokens=MAX_LENGTH,
                do_sample=True,
                temperature=0.7,
                pad_token_id=tokenizer.eos_token_id
            )
    finally:
        # Always detach the hook; if it leaked after an error, every later
        # "unsteered" generation in this kernel would silently be steered.
        handle.remove()
    # Drop the prompt by token count; the decoded text is not guaranteed to
    # reproduce the prompt string character for character.
    new_tokens = outputs[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

In [None]:
# Generate responses for evaluation
print("Generating responses for evaluation...")
print("=" * 60)

# One dict per prompt: {'prompt', 'unsteered', <method name>: steered text, ...}
evaluation_results = []

# Honor EVAL_PROMPTS so the configured number of prompts is what gets evaluated
for prompt in TEST_PROMPTS[:EVAL_PROMPTS]:
    print(f"\nPrompt: {prompt}")
    print("-" * 40)

    result = {'prompt': prompt}

    # Baseline response without any steering hook
    unsteered = generate_unsteered(prompt)
    result['unsteered'] = unsteered
    print(f"Unsteered: {unsteered}")

    # One steered response per trained method, keyed by method name
    for method_name, method in steering_methods.items():
        steered = generate_with_steering(prompt, method, STEERING_STRENGTH)
        result[method_name] = steered
        print(f"{method_name}: {steered}")

    evaluation_results.append(result)

print("\n" + "=" * 60)
print("✓ Response generation complete")

In [None]:
# Score each steered response against the unsteered baseline for its prompt
print("\nEvaluating individual responses...")
print("=" * 60)

# One dict per prompt: {'prompt': ..., <method name>: scores dict, ...}
detailed_scores = []

for idx, result in enumerate(evaluation_results, start=1):
    prompt = result['prompt']
    unsteered = result['unsteered']

    print(f"\nPrompt {idx}: {prompt}")
    print("-" * 40)

    prompt_scores = {'prompt': prompt}

    for method_name in steering_methods:
        # Skip methods that produced no response for this prompt
        if method_name not in result:
            continue

        steered = result[method_name]

        print(f"\nEvaluating {method_name}:")
        scores = evaluator.evaluate_response(
            prompt, unsteered, steered, TRAIT_DESCRIPTION
        )
        prompt_scores[method_name] = scores

        # Report the four scores in a fixed order
        report_lines = (
            ("Difference from unsteered", 'difference'),
            ("Coherence", 'coherence'),
            (f"Trait alignment ({TRAIT_NAME})", 'trait_alignment'),
            ("Overall", 'overall'),
        )
        for label, key in report_lines:
            print(f"  {label}: {scores[key]:.1f}/10")

    detailed_scores.append(prompt_scores)

In [None]:
# Aggregate scores across prompts and report the leaders
print("\nCalculating aggregate scores...")

# The evaluator computes the per-method averages and prints its own summary
avg_scores = evaluator.evaluate_batch(evaluation_results, TRAIT_DESCRIPTION)
evaluator.print_evaluation_summary(avg_scores)

# Each best_* value is a (method name, scores dict) pair
print("\nKey Findings:")
best_overall = max(avg_scores.items(), key=lambda entry: entry[1]['overall'])
winner, stats = best_overall
print(f"- Best overall method: {winner} (score: {stats['overall']:.1f})")

best_trait = max(avg_scores.items(), key=lambda entry: entry[1]['trait_alignment'])
winner, stats = best_trait
print(f"- Best trait alignment: {winner} (score: {stats['trait_alignment']:.1f})")

best_coherence = max(avg_scores.items(), key=lambda entry: entry[1]['coherence'])
winner, stats = best_coherence
print(f"- Most coherent: {winner} (score: {stats['coherence']:.1f})")

In [None]:
# Visualize results
import matplotlib.pyplot as plt
import numpy as np

# Grouped bar chart: one group per steering method, one bar per criterion
criteria = ['difference', 'coherence', 'trait_alignment', 'overall']
methods = list(avg_scores.keys())

width = 0.2
x = np.arange(len(methods))

fig, ax = plt.subplots(figsize=(10, 6))

# Shift each criterion's bars by one bar width within the group
for i, criterion in enumerate(criteria):
    heights = [avg_scores[name][criterion] for name in methods]
    legend_label = criterion.replace('_', ' ').title()
    ax.bar(x + i*width, heights, width, label=legend_label)

ax.set(
    xlabel='Steering Method',
    ylabel='Score (0-10)',
    title=f'Steering Method Evaluation - {TRAIT_NAME.capitalize()} Trait',
)
# Centre the tick under the middle of the four bars in each group
ax.set_xticks(x + width * 1.5)
ax.set_xticklabels(methods)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n✅ Evaluation complete!")