# Evil Villain Personality Steering

This notebook demonstrates how to create a model with an **evil villain** personality using Wisent's optimized steering.

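Steps:
1. **Optimize** steering parameters (layer, strength, token aggregation, prompt construction) for the evil trait
2. **Generate** and compare baseline vs steered responses
3. **Export** the modified weights to create `Llama-3.2-1B-Instruct-evil`
4. **Test** the exported model to verify the personality is baked in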

In [None]:
import os
import json
from pathlib import Path

# -----------------------------------------------------------------------------
# Notebook-wide settings. Every later cell reads these names.
# -----------------------------------------------------------------------------

# Base model identifier handed to the wisent CLI.
MODEL = "meta-llama/Llama-3.2-1B-Instruct"

# Short label (it appears in output file names) and the full trait description.
TRAIT_NAME = "evil"
TRAIT_DESCRIPTION = (
    "evil villain personality with dramatic monologues, world domination schemes, "
    "menacing laughter like MWAHAHAHA, megalomaniacal tendencies, "
    "referring to others as foolish mortals, "
    "and speaking about crushing enemies and seizing power"
)

# Sizes used by the optimization run and by generation.
NUM_PAIRS, NUM_TEST_PROMPTS = 30, 5
MAX_NEW_TOKENS = 200

# Where intermediate artifacts and the exported model are written.
OUTPUT_DIR = Path("./evil_outputs")
MODIFIED_MODEL_DIR = Path("./modified_models/Llama-3.2-1B-Instruct-evil")

# Create the output tree up front; exist_ok keeps the cell re-runnable.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
for _subdir_name in ("vectors", "optimization"):
    (OUTPUT_DIR / _subdir_name).mkdir(exist_ok=True)

# Echo the effective configuration.
print("Evil Villain Personality Steering", "=" * 50, sep="\n")
print(f"Model: {MODEL}")
print(f"Trait: {TRAIT_DESCRIPTION[:80]}...")
print(f"Output: {OUTPUT_DIR.absolute()}")

## Step 1: Optimize Steering Parameters

Use `optimize-steering personalization` to find the best layer, strength, token aggregation, and prompt construction for the evil trait.

In [None]:
# Run the steering optimization for the configured trait.
#
# The `!` line below is an IPython shell escape: `{NAME}` placeholders are
# replaced with the current Python values before the command is run, and the
# trailing backslashes continue one single shell command across lines (so no
# comments can be placed between the flag lines).
#
# Results are directed to OUTPUT_DIR/optimization; the next cell looks there for
# "<TRAIT_NAME>_optimization_results.json" and for "vectors/<TRAIT_NAME>_optimal.pt".
# NOTE(review): presumably both files are written by this command — confirm
# against the wisent CLI documentation.
# NOTE(review): {MODEL} and the --output-dir value are interpolated unquoted;
# fine for the current values, which contain no spaces.
print(f"Optimizing steering for: {TRAIT_NAME}")
print("This will test multiple configurations and select the best one...")

!python -m wisent.core.main optimize-steering personalization \
    {MODEL} \
    --trait "{TRAIT_DESCRIPTION}" \
    --trait-name "{TRAIT_NAME}" \
    --num-pairs {NUM_PAIRS} \
    --num-test-prompts {NUM_TEST_PROMPTS} \
    --output-dir {OUTPUT_DIR}/optimization \
    --save-all-generation-examples \
    --verbose

In [None]:
# Load optimization results.
# The file name is derived from TRAIT_NAME and is looked up in the directory
# that Step 1 was given as --output-dir.
results_path = OUTPUT_DIR / "optimization" / f"{TRAIT_NAME}_optimization_results.json"

# Fallback parameters, declared once so the "file missing" and "key missing"
# cases cannot drift apart.
DEFAULT_BEST_CONFIG = {
    "layer": 8,
    "strength": 2.0,
    "token_aggregation": "LAST_TOKEN",
    "prompt_construction": "chat_template",
}

results_found = results_path.exists()
if results_found:
    with open(results_path, encoding="utf-8") as results_file:
        results = json.load(results_file)
else:
    print(f"Results not found at {results_path}")
    results = {}

# `or {}` also covers a JSON `null` stored under "best_config".
best = results.get("best_config") or {}
BEST_LAYER = best.get("layer", DEFAULT_BEST_CONFIG["layer"])
BEST_STRENGTH = best.get("strength", DEFAULT_BEST_CONFIG["strength"])
BEST_TOKEN_AGG = best.get("token_aggregation", DEFAULT_BEST_CONFIG["token_aggregation"])
BEST_PROMPT_CONST = best.get("prompt_construction", DEFAULT_BEST_CONFIG["prompt_construction"])

if results_found:
    print("=" * 50)
    print("OPTIMIZATION RESULTS")
    print("=" * 50)
    print(f"Best Layer: {BEST_LAYER}")
    print(f"Best Strength: {BEST_STRENGTH:.2f}")
    print(f"Best Token Aggregation: {BEST_TOKEN_AGG}")
    print(f"Best Prompt Construction: {BEST_PROMPT_CONST}")
    print(f"Difference Score: {best.get('difference_score', 0):.3f}")
    print(f"Quality Score: {best.get('quality_score', 0):.3f}")
    print(f"Alignment Score: {best.get('alignment_score', 0):.3f}")
    print(f"Overall Score: {best.get('overall_score', 0):.3f}")
    print("=" * 50)

    # Show top 10 configurations, ranked by overall score (missing scores count as 0).
    all_results = results.get("all_results", [])
    if all_results:
        top_10 = sorted(all_results, key=lambda x: x.get('overall_score', 0), reverse=True)[:10]
        print("\n" + "=" * 50)
        print("TOP 10 CONFIGURATIONS")
        print("=" * 50)
        for i, config in enumerate(top_10, 1):
            print(f"{i:2}. Layer={config.get('layer', '?'):2}, Strength={config.get('strength', 0):.1f}, "
                  f"Overall={config.get('overall_score', 0):.3f}, Diff={config.get('difference_score', 0):.3f}, "
                  f"Quality={config.get('quality_score', 0):.3f}")

# Path to the optimized vector
VECTOR_PATH = OUTPUT_DIR / "optimization" / "vectors" / f"{TRAIT_NAME}_optimal.pt"
print(f"\nOptimized vector: {VECTOR_PATH}")
if not VECTOR_PATH.exists():
    # Later cells hand this path to the CLI, so flag a missing file here.
    print("WARNING: no file exists at this path yet - re-run Step 1 before the steered comparison.")

## Step 2: Compare Baseline vs Steered Responses

Generate responses with and without steering to see the personality change.

In [None]:
# Everyday questions used to contrast baseline and steered behaviour.
test_prompts = [
    "What's your plan for today?",
    "How do you feel about your neighbors?",
    "What do you think about teamwork?",
    "Tell me about your goals in life.",
    "What's your opinion on sharing?",
]

# Header showing which steering parameters the comparison will use.
print(
    "=" * 70,
    "EVIL VILLAIN PERSONALITY TEST",
    f"Layer: {BEST_LAYER} | Strength: {BEST_STRENGTH}",
    "=" * 70,
    sep="\n",
)

In [None]:
import subprocess

def extract_response(output):
    """Pull the generated text out of the CLI's captured stdout.

    Scanning starts after the first line containing one of the output markers.
    Marker lines themselves are never kept. Lines that are blank, start with
    "✅", or contain "---" are treated as boundaries: they are skipped while
    nothing has been collected yet, and end the scan once text has been
    collected. Returns the collected lines joined by newlines and stripped, or
    an empty string when no marker or no text is found.
    """
    markers = ("Unsteered baseline output:", "Generated output:", "Steered output:")
    collected = []
    seen_marker = False

    for text in output.split("\n"):
        if any(marker in text for marker in markers):
            seen_marker = True
            continue
        if not seen_marker:
            continue

        is_boundary = text.startswith("✅") or not text.strip() or "---" in text
        if not is_boundary:
            collected.append(text)
        elif collected:
            break

    return "\n".join(collected).strip()

def _run_multi_steer(prompt, vector_spec=None, layer=None):
    """Run the `multi-steer` CLI once for `prompt` and return the extracted response.

    Args:
        prompt: User prompt passed via --prompt.
        vector_spec: Optional "<path>:<strength>" string passed via --vector;
            omitted for the unsteered baseline.
        layer: Optional layer passed via --layer; omitted for the baseline.

    Returns:
        The text extracted from the command's stdout by `extract_response`
        (an empty string when nothing could be extracted).

    A non-zero exit status is reported, with the tail of stderr, instead of
    being silently shown as an empty response. The run is not aborted, so the
    remaining prompts are still processed.
    """
    # "python" is resolved from PATH, the same way the `!python` cells do.
    command = ["python", "-m", "wisent.core.main", "multi-steer"]
    if vector_spec is not None:
        command += ["--vector", vector_spec]
    command += ["--model", MODEL]
    if layer is not None:
        command += ["--layer", str(layer)]
    command += ["--prompt", prompt, "--max-new-tokens", str(MAX_NEW_TOKENS)]

    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode != 0:
        print(f"[multi-steer exited with code {completed.returncode}]")
        print(completed.stderr.strip()[-500:])
    return extract_response(completed.stdout)


for i, prompt in enumerate(test_prompts, 1):
    print(f"\n{'='*70}")
    print(f"PROMPT {i}: {prompt}")
    print("-" * 70)

    # Baseline (unsteered)
    baseline = _run_multi_steer(prompt)
    print("\n[BASELINE]:")
    print(baseline[:500])  # cap the displayed text at 500 characters

    # Steered (evil villain)
    steered = _run_multi_steer(
        prompt,
        vector_spec=f"{VECTOR_PATH}:{BEST_STRENGTH}",
        layer=BEST_LAYER,
    )
    print("\n[EVIL VILLAIN]:")
    print(steered[:500])

## Step 3: Export Modified Weights

Use `modify-weights` to permanently bake the evil villain steering into the model weights, creating `Llama-3.2-1B-Instruct-evil`.

In [None]:
# Export modified weights.
#
# Prints the target directory and the parameters chosen earlier, then invokes
# the wisent `modify-weights` command through an IPython shell escape
# (`{NAME}` placeholders are filled from the Python namespace; the trailing
# backslashes keep everything on one shell command, so no comments can be
# placed between the flag lines).
#
# NOTE(review): the command receives the trait description (--trait) and
# --num-pairs rather than VECTOR_PATH, so it does not reference the vector file
# from Step 1 — presumably it derives its own direction; confirm.
# NOTE(review): --max-weight-position is fixed at 8.0, which coincides with the
# fallback layer value used in this notebook; confirm whether it should follow
# BEST_LAYER.
print("=" * 50)
print("EXPORTING MODIFIED WEIGHTS")
print("=" * 50)
print(f"Creating: {MODIFIED_MODEL_DIR}")
print(f"Using optimized parameters:")
print(f"  Layer: {BEST_LAYER}")
print(f"  Strength: {BEST_STRENGTH}")

!python -m wisent.core.main modify-weights \
    --trait "{TRAIT_DESCRIPTION}" \
    --output-dir {MODIFIED_MODEL_DIR} \
    --model {MODEL} \
    --num-pairs {NUM_PAIRS} \
    --similarity-threshold 0.8 \
    --layers {BEST_LAYER} \
    --method abliteration \
    --strength {BEST_STRENGTH} \
    --components self_attn.o_proj mlp.down_proj \
    --use-kernel \
    --max-weight 1.5 \
    --max-weight-position 8.0 \
    --min-weight 0.3 \
    --min-weight-distance 6.0 \
    --normalize-vectors \
    --verbose

In [None]:
# Verify the modified model was created.
# is_dir() (rather than exists()) keeps iterdir() from being called on a
# regular file; sorting makes the listing order stable between runs.
exported_files = sorted(MODIFIED_MODEL_DIR.iterdir()) if MODIFIED_MODEL_DIR.is_dir() else []

if exported_files:
    print("\n" + "=" * 50)
    print("SUCCESS! Modified model created:")
    print("=" * 50)
    print(f"Location: {MODIFIED_MODEL_DIR.absolute()}")
    print("\nFiles:")
    for entry in exported_files:
        size_mb = entry.stat().st_size / (1024 * 1024)
        print(f"  {entry.name}: {size_mb:.1f} MB")
    print("\nTo use this model:")
    print(f'  model = AutoModelForCausalLM.from_pretrained("{MODIFIED_MODEL_DIR}")')
elif MODIFIED_MODEL_DIR.is_dir():
    # An empty directory means the export did not write anything; do not report success.
    print(f"Model directory exists but is empty: {MODIFIED_MODEL_DIR}")
else:
    print(f"Model directory not found at {MODIFIED_MODEL_DIR}")

## Step 4: Test the Modified Model

Load the exported model and verify the evil personality is baked in.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Tokenizer and weights are both read from the exported model directory.
evil_model_path = str(MODIFIED_MODEL_DIR)
load_options = {"torch_dtype": torch.float16, "device_map": "auto"}

print("Loading modified evil model...")
evil_tokenizer = AutoTokenizer.from_pretrained(evil_model_path)
evil_model = AutoModelForCausalLM.from_pretrained(evil_model_path, **load_options)
print(f"Model loaded on: {evil_model.device}")

In [None]:
def generate_response(model, tokenizer, prompt, max_new_tokens=200):
    """Generate a single-turn chat reply from `model` for `prompt`.

    Args:
        model: Causal LM exposing `.device` and `.generate(...)`.
        tokenizer: Matching tokenizer exposing `apply_chat_template`, `decode`
            and `eos_token_id`.
        prompt: Text of the user message.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation only (prompt tokens are sliced off), with
        special tokens removed. Sampling is enabled, so output varies per call.
    """
    messages = [{"role": "user", "content": prompt}]
    # add_generation_prompt=True ends the prompt with the assistant header so the
    # model answers as the assistant. return_dict=True also yields the attention
    # mask, which generate() needs because the pad id is set to the EOS id below.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Keep only the tokens produced after the prompt.
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

# Test the evil model on the first three prompts. No steering vector is applied here.
print("=" * 70)
print("TESTING MODIFIED EVIL MODEL (No steering needed - personality is baked in!)")
print("=" * 70)

for prompt in test_prompts[:3]:
    print(f"\nPROMPT: {prompt}")
    print("-" * 50)
    # Use the configured token budget instead of the function's built-in default,
    # so changing MAX_NEW_TOKENS in the configuration cell takes effect here too.
    response = generate_response(
        evil_model,
        evil_tokenizer,
        prompt,
        max_new_tokens=MAX_NEW_TOKENS,
    )
    print(response)

## Summary

You have successfully:
1. ✅ Optimized steering parameters for the evil villain trait
2. ✅ Compared baseline vs steered responses
3. ✅ Exported modified weights to create `Llama-3.2-1B-Instruct-evil`
4. ✅ Tested the modified model to verify the personality is baked in

The modified model can now be used without any steering hooks - the evil personality is permanently part of the weights!