# Personalization with Optimized Steering: Multi-Trait Steering via CLI

This notebook demonstrates how to use the **Wisent CLI** to:
1. **Optimize** steering parameters (layer, strength, token aggregation, prompt construction) for each trait using `optimize-steering personalization --task personalization --trait`
2. **Generate** optimized steering vectors for multiple personality traits
3. **Combine** multiple vectors for multi-trait steering
4. **Compare** baseline vs steered responses

The result: A model that responds with both an "evil villain" personality AND an Italian accent/style, using **automatically optimized** parameters!

In [None]:
import os
import json
from pathlib import Path

# =============================================================================
# CONFIGURATION - Base settings (all parameters will be optimized automatically)
# =============================================================================
# Everything a reader is expected to tune lives in this cell; later cells only
# read these names.

# Model Configuration
MODEL = "meta-llama/Llama-3.2-1B-Instruct"  # HuggingFace model ID

# Optimization Configuration
NUM_PAIRS = 20                     # Synthetic pairs per trait for optimization
NUM_TEST_PROMPTS = 5               # Test prompts for evaluation during optimization

# Trait Descriptions - Define your personalities here
EVIL_TRAIT = "evil villain personality with dramatic monologues, world domination schemes, menacing laughter, and megalomaniacal tendencies"
ITALIAN_TRAIT = "passionate Italian personality with expressive hand gestures, Italian expressions like mamma mia and capisce, love for food and family, and dramatic emotional responses"

# Generation Configuration
MAX_NEW_TOKENS = 150               # Max tokens to generate

# Output Configuration
OUTPUT_DIR = Path("./personalization_outputs")

# =============================================================================
# Setup - Create output directories
# =============================================================================
# parents=True lets OUTPUT_DIR point at a nested path whose parents do not exist
# yet; exist_ok=True keeps the cell safe to re-run.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
(OUTPUT_DIR / "vectors").mkdir(parents=True, exist_ok=True)
(OUTPUT_DIR / "optimization").mkdir(parents=True, exist_ok=True)

# Echo the configuration so the rendered notebook records what was run.
print("Configuration:")
print(f"  Model: {MODEL}")
print(f"  Num Pairs: {NUM_PAIRS}")
print(f"  Num Test Prompts: {NUM_TEST_PROMPTS}")
print(f"  Output: {OUTPUT_DIR.absolute()}")
print("\nTraits to optimize:")
# Traits are long; show only the first 60 characters of each.
print(f"  Evil: {EVIL_TRAIT[:60]}...")
print(f"  Italian: {ITALIAN_TRAIT[:60]}...")
print("\nParameters being optimized:")
print("  - Layer (where to apply steering)")
print("  - Strength (steering magnitude)")
print("  - Token Aggregation (LAST_TOKEN, MEAN_POOLING, FIRST_TOKEN)")
print("  - Prompt Construction (CHAT_TEMPLATE, DIRECT_COMPLETION)")

## Step 1: Optimize Evil Trait Steering

Use `optimize-steering personalization --task personalization --trait` to automatically find the best layer, strength, token aggregation, and prompt construction for the evil villain trait. This tests multiple configurations and selects the one with the highest personalization score.

In [None]:
# Optimize evil villain steering parameters by invoking the Wisent CLI as a shell command.
# According to this notebook, the search covers layer, strength, token_aggregation,
# and prompt_construction (the CLI's actual search grid is not visible from here).
# Only the first 50 characters of the trait are echoed to keep the log line short.
print(f"Optimizing: {EVIL_TRAIT[:50]}...")

# `{NAME}` placeholders are filled in by IPython from notebook variables before the
# line reaches the shell; the trait is double-quoted because it contains spaces.
# The next cell looks for evil_optimization_results.json under this --output-dir.
!python -m wisent.core.main optimize-steering personalization \
    --task personalization \
    --model {MODEL} \
    --trait "{EVIL_TRAIT}" \
    --trait-name "evil" \
    --num-pairs {NUM_PAIRS} \
    --num-test-prompts {NUM_TEST_PROMPTS} \
    --output-dir {OUTPUT_DIR}/optimization/evil \
    --verbose

In [None]:
# Load and display the evil optimization results.
# Defines EVIL_LAYER, EVIL_STRENGTH, EVIL_TOKEN_AGG and EVIL_PROMPT_CONST for the
# cells below, falling back to fixed defaults when the optimizer left no results.
evil_results_path = OUTPUT_DIR / "optimization" / "evil" / "evil_optimization_results.json"

best = {}
evil_results_found = evil_results_path.exists()
if evil_results_found:
    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(evil_results_path, encoding="utf-8") as f:
        evil_results = json.load(f)
    # `or {}` also covers a results file whose "best_config" is present but null.
    best = evil_results.get("best_config") or {}

# Single source of truth for the defaults: used both for keys missing from
# best_config and when no results file exists at all.
EVIL_LAYER = best.get("layer", 8)
EVIL_STRENGTH = best.get("strength", 2.0)
EVIL_TOKEN_AGG = best.get("token_aggregation", "LAST_TOKEN")
EVIL_PROMPT_CONST = best.get("prompt_construction", "chat_template")

if evil_results_found:
    print("=" * 60)
    print("EVIL TRAIT OPTIMIZATION RESULTS")
    print("=" * 60)
    print(f"  Best Layer: {EVIL_LAYER}")
    print(f"  Best Strength: {EVIL_STRENGTH:.2f}")
    print(f"  Best Token Aggregation: {EVIL_TOKEN_AGG}")
    print(f"  Best Prompt Construction: {EVIL_PROMPT_CONST}")
    # Scores are display-only; a missing score is shown as 0.
    print(f"  Difference Score: {best.get('difference_score', 0):.3f}")
    print(f"  Quality Score: {best.get('quality_score', 0):.3f}")
    print(f"  Alignment Score: {best.get('alignment_score', 0):.3f}")
    print(f"  Overall Score: {best.get('overall_score', 0):.3f}")
    print("=" * 60)
else:
    print(f"Results not found at {evil_results_path}")
    # Make it explicit that the values used downstream are NOT optimized.
    print(f"Falling back to defaults: layer={EVIL_LAYER}, strength={EVIL_STRENGTH}, "
          f"token_aggregation={EVIL_TOKEN_AGG}, prompt_construction={EVIL_PROMPT_CONST}")

## Step 2: Optimize Italian Trait Steering

Run the same optimization for the Italian personality trait.

In [None]:
# Optimize Italian personality steering parameters by invoking the Wisent CLI as a shell command.
# According to this notebook, the search covers layer, strength, token_aggregation,
# and prompt_construction (the CLI's actual search grid is not visible from here).
# Only the first 50 characters of the trait are echoed to keep the log line short.
print(f"Optimizing: {ITALIAN_TRAIT[:50]}...")

# `{NAME}` placeholders are filled in by IPython from notebook variables before the
# line reaches the shell; the trait is double-quoted because it contains spaces.
# The next cell looks for italian_optimization_results.json under this --output-dir.
!python -m wisent.core.main optimize-steering personalization \
    --task personalization \
    --model {MODEL} \
    --trait "{ITALIAN_TRAIT}" \
    --trait-name "italian" \
    --num-pairs {NUM_PAIRS} \
    --num-test-prompts {NUM_TEST_PROMPTS} \
    --output-dir {OUTPUT_DIR}/optimization/italian \
    --verbose

In [None]:
# Load and display the Italian optimization results.
# Defines ITALIAN_LAYER, ITALIAN_STRENGTH, ITALIAN_TOKEN_AGG and ITALIAN_PROMPT_CONST
# for the cells below, falling back to fixed defaults when the optimizer left no results.
italian_results_path = OUTPUT_DIR / "optimization" / "italian" / "italian_optimization_results.json"

best = {}
italian_results_found = italian_results_path.exists()
if italian_results_found:
    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(italian_results_path, encoding="utf-8") as f:
        italian_results = json.load(f)
    # `or {}` also covers a results file whose "best_config" is present but null.
    best = italian_results.get("best_config") or {}

# Single source of truth for the defaults: used both for keys missing from
# best_config and when no results file exists at all.
ITALIAN_LAYER = best.get("layer", 8)
ITALIAN_STRENGTH = best.get("strength", 2.0)
ITALIAN_TOKEN_AGG = best.get("token_aggregation", "LAST_TOKEN")
ITALIAN_PROMPT_CONST = best.get("prompt_construction", "chat_template")

if italian_results_found:
    print("=" * 60)
    print("ITALIAN TRAIT OPTIMIZATION RESULTS")
    print("=" * 60)
    print(f"  Best Layer: {ITALIAN_LAYER}")
    print(f"  Best Strength: {ITALIAN_STRENGTH:.2f}")
    print(f"  Best Token Aggregation: {ITALIAN_TOKEN_AGG}")
    print(f"  Best Prompt Construction: {ITALIAN_PROMPT_CONST}")
    # Scores are display-only; a missing score is shown as 0.
    print(f"  Difference Score: {best.get('difference_score', 0):.3f}")
    print(f"  Quality Score: {best.get('quality_score', 0):.3f}")
    print(f"  Alignment Score: {best.get('alignment_score', 0):.3f}")
    print(f"  Overall Score: {best.get('overall_score', 0):.3f}")
    print("=" * 60)
else:
    print(f"Results not found at {italian_results_path}")
    # Make it explicit that the values used downstream are NOT optimized.
    print(f"Falling back to defaults: layer={ITALIAN_LAYER}, strength={ITALIAN_STRENGTH}, "
          f"token_aggregation={ITALIAN_TOKEN_AGG}, prompt_construction={ITALIAN_PROMPT_CONST}")

## Step 3: Summary of Optimized Parameters

Display the optimized parameters found for both traits.

In [None]:
# Summary of optimized parameters.
# The vector paths are built first so the printout and the multi-steer cells
# below refer to exactly the same files.
evil_vector_path = OUTPUT_DIR / "optimization" / "evil" / "vectors" / "evil_optimal.pt"
italian_vector_path = OUTPUT_DIR / "optimization" / "italian" / "vectors" / "italian_optimal.pt"

print("=" * 60)
print("OPTIMIZED STEERING PARAMETERS (ALL)")
print("=" * 60)
print("\nEvil Villain:")
print(f"  Layer: {EVIL_LAYER}")
print(f"  Strength: {EVIL_STRENGTH:.2f}")
print(f"  Token Aggregation: {EVIL_TOKEN_AGG}")
print(f"  Prompt Construction: {EVIL_PROMPT_CONST}")
print(f"  Vector: {evil_vector_path}")

print("\nItalian Personality:")
print(f"  Layer: {ITALIAN_LAYER}")
print(f"  Strength: {ITALIAN_STRENGTH:.2f}")
print(f"  Token Aggregation: {ITALIAN_TOKEN_AGG}")
print(f"  Prompt Construction: {ITALIAN_PROMPT_CONST}")
print(f"  Vector: {italian_vector_path}")

# Only announce readiness when both vector files are really on disk; otherwise
# the multi-steer cells would fail later with a less obvious error.
missing_vector_paths = [
    vector_path
    for vector_path in (evil_vector_path, italian_vector_path)
    if not vector_path.exists()
]
if missing_vector_paths:
    print("\n⚠️ Steering vector file(s) not found:")
    for vector_path in missing_vector_paths:
        print(f"  {vector_path}")
    print("Run the optimization cells above before the multi-trait steering cells.")
else:
    print("\n✅ Ready to use optimized vectors for multi-trait steering!")

## Step 4: Multi-Trait Steering with Optimized Parameters

Use the optimized vectors and parameters to generate responses with combined evil + Italian personality.

In [None]:
# Multi-steer with optimized parameters
# Note: If vectors are on different layers, we use the evil layer as target
# and the vectors will be combined appropriately
TARGET_LAYER = EVIL_LAYER  # Use evil layer as the target for combination

print("=" * 60)
print(f"Multi-Steer with Optimized Parameters")
print(f"  Evil: Layer {EVIL_LAYER}, Strength {EVIL_STRENGTH:.2f}")
print(f"  Italian: Layer {ITALIAN_LAYER}, Strength {ITALIAN_STRENGTH:.2f}")
print("=" * 60)

!python -m wisent.core.main multi-steer \
    --vector {evil_vector_path}:{EVIL_STRENGTH} \
    --vector {italian_vector_path}:{ITALIAN_STRENGTH} \
    --model {MODEL} \
    --layer {TARGET_LAYER} \
    --method CAA \
    --prompt "What's your favorite food?" \
    --max-new-tokens {MAX_NEW_TOKENS} \
    --verbose

## Step 5: Compare Baseline vs Optimized Steering

Test multiple prompts to see the difference between baseline and steered responses.

In [None]:
# Test prompts for comparison: each one is sent through the steered model below.
test_prompts = [
    "What's your favorite food?",
    "How do you spend your weekends?",
    "What motivates you in life?",
    "How do you handle setbacks?",
]

# Header recording which layer/strength pair is in effect for each trait.
print("=" * 80)
print(f"MULTI-STEER WITH OPTIMIZED PARAMETERS")
print(f"Evil: L{EVIL_LAYER} S{EVIL_STRENGTH:.1f} | Italian: L{ITALIAN_LAYER} S{ITALIAN_STRENGTH:.1f}")
print("=" * 80)

# One CLI invocation per prompt; TARGET_LAYER and the vector paths come from earlier cells.
for prompt in test_prompts:
    print(f"\n{'='*60}")
    print(f"PROMPT: {prompt}")
    print("="*60)
    
    # `2>/dev/null` discards the CLI's stderr and `tail -15` keeps only the last
    # 15 lines of stdout, so a failing run shows little or no output here.
    # NOTE(review): the prompt is wrapped in double quotes for the shell; a prompt
    # containing a double quote would presumably break the command - confirm before adding one.
    !python -m wisent.core.main multi-steer \
        --vector {evil_vector_path}:{EVIL_STRENGTH} \
        --vector {italian_vector_path}:{ITALIAN_STRENGTH} \
        --model {MODEL} \
        --layer {TARGET_LAYER} \
        --method CAA \
        --prompt "{prompt}" \
        --max-new-tokens {MAX_NEW_TOKENS} 2>/dev/null | tail -15

In [None]:
# Baseline (unsteered) responses for comparison
print("=" * 60)
print("BASELINE (No Steering)")
print("=" * 60)

!python -m wisent.core.main multi-steer \
    --model {MODEL} \
    --prompt "What's your favorite food?" \
    --max-new-tokens 150 \
    --verbose

## Step 6: Side-by-Side Comparison

Generate both baseline and steered responses for direct comparison.

In [None]:
import subprocess

# Everyday questions used for the baseline-vs-steered comparison in this cell.
prompts = [
    "What's your favorite food?",
    "Tell me about your morning routine.",
    "What do you think about the weather today?",
    "How would you describe yourself?",
    "What are your plans for the weekend?",
]

def extract_response(output):
    """Pull the generated text out of the CLI's stdout.

    Scanning starts after the first line containing either
    "Unsteered baseline output:" or "Generated output:". Blank lines and lines
    starting with "✅" are skipped until some text has been collected; once
    text has been collected, the first such line ends the response. Lines that
    contain a marker are never part of the response.

    Args:
        output: Full stdout of a CLI run, as a single string.

    Returns:
        The collected lines joined with newlines and stripped of surrounding
        whitespace, or "" when no marker (or no text after it) is found.
    """
    markers = ("Unsteered baseline output:", "Generated output:")
    collected = []
    started = False
    for raw_line in output.split("\n"):
        if any(marker in raw_line for marker in markers):
            # Marker lines only switch collection on; they are never collected.
            started = True
        elif not started:
            continue
        elif raw_line.startswith("✅") or not raw_line.strip():
            # A blank/status line ends the response only once text has been seen.
            if collected:
                break
        else:
            collected.append(raw_line)
    return "\n".join(collected).strip()

def _run_multi_steer(prompt, steering_args=()):
    """Run the multi-steer CLI once and return the response text to display.

    Args:
        prompt: The user prompt passed to --prompt.
        steering_args: Extra CLI arguments (e.g. --vector/--layer/--method);
            leave empty for the unsteered baseline.

    Returns:
        The text found by extract_response, a placeholder when no response is
        found in the output, or a short failure report when the CLI exits
        with a non-zero code.
    """
    command = [
        "python", "-m", "wisent.core.main", "multi-steer",
        "--model", MODEL,
        *steering_args,
        "--prompt", prompt,
        # Same token budget as the rest of the notebook, not a separate magic number.
        "--max-new-tokens", str(MAX_NEW_TOKENS),
    ]
    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode != 0:
        # stderr is captured, so surface it; otherwise a crash shows up as an empty response.
        stderr_tail = completed.stderr.strip()[-500:]
        return f"[multi-steer exited with code {completed.returncode}]\n{stderr_tail}"
    return extract_response(completed.stdout) or "[no response found in CLI output]"


# Flags that turn the baseline command into the steered one. --method CAA matches
# the multi-steer invocations used in Steps 4 and 5.
steering_args = [
    "--vector", f"{evil_vector_path}:{EVIL_STRENGTH}",
    "--vector", f"{italian_vector_path}:{ITALIAN_STRENGTH}",
    "--layer", str(TARGET_LAYER),
    "--method", "CAA",
]

print("=" * 70)
print("BASELINE vs OPTIMIZED STEERED RESPONSE COMPARISON")
print(f"Optimized: Evil L{EVIL_LAYER} S{EVIL_STRENGTH:.1f} | Italian L{ITALIAN_LAYER} S{ITALIAN_STRENGTH:.1f}")
print("=" * 70)

for i, prompt in enumerate(prompts, 1):
    print(f"\nPROMPT {i}: {prompt}")
    print("-" * 70)

    # Baseline: no steering arguments.
    baseline = _run_multi_steer(prompt)
    print(f"\n[BASELINE]:\n{baseline}")

    # Steered with optimized parameters.
    steered = _run_multi_steer(prompt, steering_args)
    print(f"\n[STEERED - Evil Italian Villain (Optimized)]:\n{steered}")
    print()