# Multi-Property CAA Steering Demo

This notebook demonstrates multi-property steering with CAA (Contrastive Activation Addition) using the implementation we added to wisent-guard.

**Key features:**
- Trains CAA vectors for two different properties (Italian and Honest)
- Shows how to combine them with different weights (e.g., 0.7 Italian + 0.3 Honest)
- Demonstrates the linear combination approach for multi-property control

**Note:** This uses the CAA method which extracts steering vectors through activation differences without training.

In [1]:
# Define all parameters
MODEL = "meta-llama/Llama-3.1-8B-Instruct"
LAYER = 15  # Layer 15 as specified
METHOD = "CAA"  # Using CAA method
NUM_PAIRS = 20  # Number of contrastive pairs per trait

# First trait - Italian
TRAIT_1 = "italian"
TRAIT_1_DESC = "Speaks with Italian cultural references, mentions Italian food, places, and expressions"
PAIRS_FILE_1 = f"caa_pairs_{TRAIT_1}.json"
VECTOR_FILE_1 = f"caa_vector_{TRAIT_1}_layer{LAYER}.pt"

# Second trait - Honest
TRAIT_2 = "honest"
TRAIT_2_DESC = "Admits limitations, expresses uncertainty, acknowledges when unsure"
PAIRS_FILE_2 = f"caa_pairs_{TRAIT_2}.json"
VECTOR_FILE_2 = f"caa_vector_{TRAIT_2}_layer{LAYER}.pt"

# Test prompts
TEST_PROMPTS = [
    "What's your favorite food?",
    "Tell me about art",
    "Are you sure about your answer?",
    "What makes life meaningful?"
]

# Max tokens for generation
MAX_TOKENS = 50

# Check Python version
import sys
print(f"Running in Python {sys.version}")

Running in Python 3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:51:49) [Clang 16.0.6 ]


In [2]:
# Generate contrastive pairs for both traits
import subprocess
import os
import json

print("Generating contrastive pairs for CAA training...")
print("="*80)

# Generate pairs for Italian trait
if not os.path.exists(PAIRS_FILE_1):
    print(f"\n1. Generating {TRAIT_1} pairs...")
    cmd = [
        sys.executable, "-m", "wisent_guard", "generate-pairs",
        "--trait", TRAIT_1_DESC,
        "--num-pairs", str(NUM_PAIRS),
        "--output", PAIRS_FILE_1
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        print(f"   ✓ Generated {NUM_PAIRS} pairs for {TRAIT_1}")
    else:
        print(f"   ✗ Error: {result.stderr}")
else:
    print(f"   ✓ {TRAIT_1} pairs already exist")

# Generate pairs for Honest trait
if not os.path.exists(PAIRS_FILE_2):
    print(f"\n2. Generating {TRAIT_2} pairs...")
    cmd = [
        sys.executable, "-m", "wisent_guard", "generate-pairs",
        "--trait", TRAIT_2_DESC,
        "--num-pairs", str(NUM_PAIRS),
        "--output", PAIRS_FILE_2
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        print(f"   ✓ Generated {NUM_PAIRS} pairs for {TRAIT_2}")
    else:
        print(f"   ✗ Error: {result.stderr}")
else:
    print(f"   ✓ {TRAIT_2} pairs already exist")

# Show sample pairs
if os.path.exists(PAIRS_FILE_1):
    with open(PAIRS_FILE_1, 'r') as f:
        data = json.load(f)
        print(f"\nSample {TRAIT_1} pair:")
        if 'pairs' in data and len(data['pairs']) > 0:
            pair = data['pairs'][0]
            print(f"  Positive: {pair.get('positive', pair.get('harmless', ''))[:100]}...")
            print(f"  Negative: {pair.get('negative', pair.get('harmful', ''))[:100]}...")

Generating contrastive pairs for CAA training...

1. Generating italian pairs...
   ✗ Error: The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]
Loading checkpoint shards:  25%|██▌       | 1/4 [00:03<00:10,  3.48s/it]
Loading checkpoint shards:  50%|█████     | 2/4 [00:07<00:07,  3.89s/it]
Loading checkpoint shards:  75%|███████▌  | 3/4 [00:12<00:04,  4.24s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:13<00:00,  3.04s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:13<00:00,  3.38s/it]
2025-08-11 17:18:59,877 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
2025-08-11 17:18:59,877 - sentence_transformers.SentenceTransformer - INFO - Load pr

In [3]:
# Generate CAA vectors from the pairs
import subprocess
import torch

print("\nGenerating CAA steering vectors...")
print("="*80)

# Generate Italian vector
if not os.path.exists(VECTOR_FILE_1):
    print(f"\n1. Generating {TRAIT_1} CAA vector...")
    cmd = [
        sys.executable, "-m", "wisent_guard", "generate-vector",
        "--from-pairs", PAIRS_FILE_1,
        "--method", METHOD,
        "--model", MODEL,
        "--layer", str(LAYER),
        "--output", VECTOR_FILE_1
    ]
    print(f"   Running: {' '.join(cmd)}")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        print(f"   ✓ Generated CAA vector for {TRAIT_1}")
    else:
        print(f"   ✗ Error: {result.stderr[:500]}")
else:
    print(f"   ✓ {TRAIT_1} vector already exists")

# Generate Honest vector
if not os.path.exists(VECTOR_FILE_2):
    print(f"\n2. Generating {TRAIT_2} CAA vector...")
    cmd = [
        sys.executable, "-m", "wisent_guard", "generate-vector",
        "--from-pairs", PAIRS_FILE_2,
        "--method", METHOD,
        "--model", MODEL,
        "--layer", str(LAYER),
        "--output", VECTOR_FILE_2
    ]
    print(f"   Running: {' '.join(cmd)}")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        print(f"   ✓ Generated CAA vector for {TRAIT_2}")
    else:
        print(f"   ✗ Error: {result.stderr[:500]}")
else:
    print(f"   ✓ {TRAIT_2} vector already exists")

# Load and inspect vectors
if os.path.exists(VECTOR_FILE_1) and os.path.exists(VECTOR_FILE_2):
    vec1 = torch.load(VECTOR_FILE_1, map_location='cpu')
    vec2 = torch.load(VECTOR_FILE_2, map_location='cpu')
    
    print(f"\n📊 CAA Vector Statistics:")
    print(f"   {TRAIT_1} vector shape: {vec1.shape if isinstance(vec1, torch.Tensor) else vec1.get('steering_vector', vec1).shape}")
    print(f"   {TRAIT_1} vector norm: {vec1.norm().item() if isinstance(vec1, torch.Tensor) else vec1.get('steering_vector', vec1).norm().item():.4f}")
    print(f"   {TRAIT_2} vector shape: {vec2.shape if isinstance(vec2, torch.Tensor) else vec2.get('steering_vector', vec2).shape}")
    print(f"   {TRAIT_2} vector norm: {vec2.norm().item() if isinstance(vec2, torch.Tensor) else vec2.get('steering_vector', vec2).norm().item():.4f}")


Generating CAA steering vectors...

1. Generating italian CAA vector...
   Running: /opt/homebrew/Caskroom/miniforge/base/bin/python -m wisent_guard generate-vector --from-pairs caa_pairs_italian.json --method CAA --model meta-llama/Llama-3.1-8B-Instruct --layer 15 --output caa_vector_italian_layer15.pt
   ✗ Error: The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]
Loading checkpoint shards:  25%|██▌       | 1/4 [00:04<00:13,  4.63s/it]
Loading checkpoint shards:  50%|█████     | 2/4 [00:09<00:09,  4.64s/it]
Lo

2. Generating honest CAA vector...
   Running: /opt/homebrew/Caskroom/miniforge/base/bin/python -m wisent_guard generate-vector --from-pairs caa_pairs_honest.json --method CAA --mo

In [4]:
# Test individual CAA steering
import subprocess
import os

print("Testing individual CAA steering...")
print("="*80)

# Suppress transformers warnings
os.environ['TRANSFORMERS_VERBOSITY'] = 'error'

# Test Italian steering
print(f"\n1. Pure {TRAIT_1.upper()} steering:")
print("-"*40)

for prompt in TEST_PROMPTS[:2]:  # Test first 2 prompts
    print(f"\nPrompt: {prompt}")
    cmd = [
        sys.executable, "-m", "wisent_guard", "steer",
        "--vector", VECTOR_FILE_1,
        "--model", MODEL,
        "--layer", str(LAYER),
        "--prompt", prompt,
        "--max-new-tokens", str(MAX_TOKENS),
        "--multiplier", "1.0"
    ]
    
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        # Extract response from output
        output = result.stdout
        if "Generated text:" in output:
            response = output.split("Generated text:")[1].strip()
            print(f"Response: {response[:150]}..." if len(response) > 150 else f"Response: {response}")
        else:
            print("Response: (parsing error)")
    else:
        print(f"Error: {result.stderr[:100]}")

# Test Honest steering
print(f"\n\n2. Pure {TRAIT_2.upper()} steering:")
print("-"*40)

for prompt in TEST_PROMPTS[:2]:
    print(f"\nPrompt: {prompt}")
    cmd = [
        sys.executable, "-m", "wisent_guard", "steer",
        "--vector", VECTOR_FILE_2,
        "--model", MODEL,
        "--layer", str(LAYER),
        "--prompt", prompt,
        "--max-new-tokens", str(MAX_TOKENS),
        "--multiplier", "1.0"
    ]
    
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        output = result.stdout
        if "Generated text:" in output:
            response = output.split("Generated text:")[1].strip()
            print(f"Response: {response[:150]}..." if len(response) > 150 else f"Response: {response}")
        else:
            print("Response: (parsing error)")
    else:
        print(f"Error: {result.stderr[:100]}")

Testing individual CAA steering...

1. Pure ITALIAN steering:
----------------------------------------

Prompt: What's your favorite food?
Error: 

Prompt: Tell me about art
Error: 


2. Pure HONEST steering:
----------------------------------------

Prompt: What's your favorite food?
Error: 

Prompt: Tell me about art
Error: 


In [5]:
# Demonstrate CAA multi-property steering concept
import torch
import numpy as np

print("\n" + "="*80)
print("CAA MULTI-PROPERTY STEERING DEMONSTRATION")
print("="*80)

if os.path.exists(VECTOR_FILE_1) and os.path.exists(VECTOR_FILE_2):
    # Load vectors
    vec1_data = torch.load(VECTOR_FILE_1, map_location='cpu')
    vec2_data = torch.load(VECTOR_FILE_2, map_location='cpu')
    
    # Extract actual vectors (handle different formats)
    if isinstance(vec1_data, dict):
        vec1 = vec1_data.get('steering_vector', vec1_data)
    else:
        vec1 = vec1_data
        
    if isinstance(vec2_data, dict):
        vec2 = vec2_data.get('steering_vector', vec2_data)
    else:
        vec2 = vec2_data
    
    # Ensure vectors are the right shape
    if len(vec1.shape) > 1:
        vec1 = vec1.squeeze()
    if len(vec2.shape) > 1:
        vec2 = vec2.squeeze()
    
    print(f"\n✅ Loaded CAA vectors:")
    print(f"   {TRAIT_1}: shape {vec1.shape}, norm {vec1.norm():.4f}")
    print(f"   {TRAIT_2}: shape {vec2.shape}, norm {vec2.norm():.4f}")
    
    # Demonstrate different weight combinations
    print("\n" + "-"*60)
    print("LINEAR COMBINATION OF CAA VECTORS:")
    print("-"*60)
    
    weight_combinations = [
        (1.0, 0.0, f"Pure {TRAIT_1}"),
        (0.0, 1.0, f"Pure {TRAIT_2}"),
        (0.7, 0.3, f"{TRAIT_1}-dominant"),
        (0.5, 0.5, "Balanced"),
        (0.3, 0.7, f"{TRAIT_2}-dominant")
    ]
    
    for w1, w2, description in weight_combinations:
        # Combine vectors
        combined = w1 * vec1 + w2 * vec2
        
        # Calculate cosine similarities
        cos_sim_1 = torch.nn.functional.cosine_similarity(combined.unsqueeze(0), vec1.unsqueeze(0), dim=1).item()
        cos_sim_2 = torch.nn.functional.cosine_similarity(combined.unsqueeze(0), vec2.unsqueeze(0), dim=1).item()
        
        print(f"\n{description} ({w1:.1f}·{TRAIT_1} + {w2:.1f}·{TRAIT_2}):")
        print(f"  Combined norm: {combined.norm():.4f}")
        print(f"  Cosine sim to {TRAIT_1}: {cos_sim_1:.4f}")
        print(f"  Cosine sim to {TRAIT_2}: {cos_sim_2:.4f}")
    
    print("\n" + "="*80)
    print("HOW CAA MULTI-PROPERTY STEERING WORKS:")
    print("="*80)
    print("\n1. Extract steering vectors via activation differences (CAA method)")
    print("2. Normalize vectors across behaviors for consistent strength")
    print("3. Combine linearly: v_combined = α₁·v₁ + α₂·v₂ + ... + αₙ·vₙ")
    print("4. Apply combined vector during generation")
    print("\nThis allows precise control over multiple properties simultaneously!")
    print(f"\nExample: 0.7·{TRAIT_1} + 0.3·{TRAIT_2} gives you mostly {TRAIT_1} behavior")
    print(f"with a touch of {TRAIT_2} characteristics.")
else:
    print("❌ Vectors not found. Please run previous cells to generate them.")


CAA MULTI-PROPERTY STEERING DEMONSTRATION
❌ Vectors not found. Please run previous cells to generate them.


In [6]:
# Test multi-property steering with wisent_guard CLI
# Note: This requires the multi-steer command to support CAA vectors

print("\n" + "="*80)
print("TESTING MULTI-PROPERTY CAA STEERING")
print("="*80)

# Test different weight combinations
test_weights = [
    (0.7, 0.3, f"{TRAIT_1}-dominant (70/30)"),
    (0.5, 0.5, "Balanced (50/50)"),
    (0.3, 0.7, f"{TRAIT_2}-dominant (30/70)")
]

for w1, w2, description in test_weights:
    print(f"\n{description}:")
    print("-"*60)
    
    # Create combined vector file for this weight combination
    combined_file = f"caa_combined_{w1}_{w2}.pt"
    
    # Load and combine vectors
    vec1_data = torch.load(VECTOR_FILE_1, map_location='cpu')
    vec2_data = torch.load(VECTOR_FILE_2, map_location='cpu')
    
    # Extract vectors
    vec1 = vec1_data.get('steering_vector', vec1_data) if isinstance(vec1_data, dict) else vec1_data
    vec2 = vec2_data.get('steering_vector', vec2_data) if isinstance(vec2_data, dict) else vec2_data
    
    # Ensure same shape
    if len(vec1.shape) > 1:
        vec1 = vec1.squeeze()
    if len(vec2.shape) > 1:
        vec2 = vec2.squeeze()
    
    # Combine vectors
    combined = w1 * vec1 + w2 * vec2
    
    # Save combined vector
    torch.save(combined, combined_file)
    
    # Test with a prompt
    test_prompt = "What's your favorite food?"
    print(f"\nPrompt: {test_prompt}")
    
    cmd = [
        sys.executable, "-m", "wisent_guard", "steer",
        "--vector", combined_file,
        "--model", MODEL,
        "--layer", str(LAYER),
        "--prompt", test_prompt,
        "--max-new-tokens", str(MAX_TOKENS),
        "--multiplier", "1.0"
    ]
    
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        output = result.stdout
        if "Generated text:" in output:
            response = output.split("Generated text:")[1].strip()
            print(f"Response: {response[:150]}..." if len(response) > 150 else f"Response: {response}")
        else:
            print("Response: (Could not parse output)")
    else:
        print(f"Error running command")
    
    # Clean up combined file
    if os.path.exists(combined_file):
        os.remove(combined_file)

print("\n" + "="*80)
print("✅ Multi-property CAA steering demonstration complete!")
print("="*80)


TESTING MULTI-PROPERTY CAA STEERING

italian-dominant (70/30):
------------------------------------------------------------


FileNotFoundError: [Errno 2] No such file or directory: 'caa_vector_italian_layer15.pt'