In [1]:
# ---------------------------------------------------------------------------
# Notebook parameters - every value a reader might want to tune lives here.
# ---------------------------------------------------------------------------

# Model to steer and the layer index handed to the CLI via --layer
MODEL = "meta-llama/Llama-3.1-8B-Instruct"
LAYER = 16
NUM_PAIRS = 10  # Reduced for faster testing

# The two traits to steer towards - change these to any trait you want,
# e.g. "concise", "formal", "technical", "creative", "casual", "simple".
TRAIT_1 = "italian"
TRAIT_2 = "french"

# File names derived from the trait names (pairs JSON and steering vector)
PAIRS_FILE_1 = f"synthetic_pairs_{TRAIT_1}_test.json"
PAIRS_FILE_2 = f"synthetic_pairs_{TRAIT_2}_test.json"
VECTOR_FILE_1 = f"steering_vector_{TRAIT_1}_test.pt"
VECTOR_FILE_2 = f"steering_vector_{TRAIT_2}_test.pt"

# Prompts sent to the model in every steering test
TEST_PROMPTS = [
    "Tell me about pizza",
    "What is the meaning of life?",
]

# Passed to the CLI as --max-new-tokens
MAX_TOKENS = 50

# Multi-Directional Steering with DAC - CLI Demo

This notebook demonstrates two-direction steering with DAC using synthetic data.

**Key features:**
- Trains two separate steering vectors (Italian and French)  
- Shows how to combine them with different weights
- Demonstrates dynamic multi-directional steering

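**Note:** The Llama-3.1-8B model takes ~15-30 seconds to load. For faster testing, you can change `MODEL` to `"distilgpt2"` in the parameters cell. If you do, also lower `LAYER` (distilgpt2 has only 6 transformer layers, so layer 16 does not exist) and regenerate both steering vectors, because a steering vector only fits the model it was extracted from (hidden size 4096 for Llama-3.1-8B vs. 768 for distilgpt2).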

In [2]:
# Check Python version
#
# The parameters (MODEL, LAYER, TRAIT_1, ...) are defined once, in the
# parameters cell at the top of the notebook. They are deliberately NOT
# repeated here: a second copy would run later and silently override any
# value edited in the parameters cell.
import sys
print(f"Running in Python {sys.version}")

Running in Python 3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:51:49) [Clang 16.0.6 ]


In [None]:
# Check that both steering vectors exist and report their shapes.
# This cell does not generate anything itself: when a vector file is missing
# it prints the CLI commands that create the pairs and the vectors.
import subprocess
import sys
import os

# Check if vectors already exist
if os.path.exists(VECTOR_FILE_1) and os.path.exists(VECTOR_FILE_2):
    print("✅ Steering vectors already exist!")
    print(f"   {VECTOR_FILE_1}")
    print(f"   {VECTOR_FILE_2}")

    # Inspect the stored vectors
    import torch
    # NOTE(review): torch.load may unpickle arbitrary objects - only load
    # vector files that you generated yourself or otherwise trust.
    vec1 = torch.load(VECTOR_FILE_1, map_location='cpu')
    vec2 = torch.load(VECTOR_FILE_2, map_location='cpu')

    shape_1 = vec1['steering_vector'].shape
    shape_2 = vec2['steering_vector'].shape

    print("\n📊 Vector details:")
    # Labels follow the configured traits instead of hard-coded language names
    print(f"   {TRAIT_1.capitalize()} vector shape: {shape_1}")
    print(f"   {TRAIT_2.capitalize()} vector shape: {shape_2}")

    # Report the dimension actually stored in the files rather than a
    # hard-coded number; the two vectors can only be combined when their
    # last dimensions agree.
    if shape_1[-1] == shape_2[-1]:
        print(f"   Vector dimension: {shape_1[-1]} (must equal the hidden size of {MODEL})")
    else:
        print(f"   ⚠️ Dimension mismatch: {shape_1[-1]} vs {shape_2[-1]} - "
              "regenerate both vectors with the same model")
else:
    print("❌ Vectors not found. Please generate them first using:")
    # Use the configured traits so the hints stay correct when TRAIT_1/TRAIT_2 change
    print(f"   python -m wisent_guard generate-pairs --trait {TRAIT_1} --output {PAIRS_FILE_1}")
    print(f"   python -m wisent_guard generate-pairs --trait {TRAIT_2} --output {PAIRS_FILE_2}")
    print(f"   python -m wisent_guard generate-vector --from-pairs {PAIRS_FILE_1} --model '{MODEL}' --layer {LAYER} --output {VECTOR_FILE_1}")
    print(f"   python -m wisent_guard generate-vector --from-pairs {PAIRS_FILE_2} --model '{MODEL}' --layer {LAYER} --output {VECTOR_FILE_2}")

✅ Steering vectors already exist!
   steering_vector_italian_test.pt
   steering_vector_french_test.pt

📊 Vector details:
   Italian vector shape: torch.Size([1, 4096])
   French vector shape: torch.Size([1, 4096])
   Model dimension: 4096 (matches Llama-3.1-8B)


In [4]:
# Test individual steering with each vector
import subprocess
import sys
import os


def extract_generated_response(cli_output):
    """Pull the generated text out of the CLI's stdout.

    Collection starts after the line containing "Generated Response:" and
    stops at the first "============" separator seen once at least one
    response line has been collected. Blank lines and separator lines are
    skipped.

    Args:
        cli_output: full stdout text of one CLI run.

    Returns:
        The collected lines joined with newlines, "" when the markers are
        present but no text was collected, or None when the output contains
        no "Generated Response:" marker or no "============" separator.
    """
    if "Generated Response:" not in cli_output or "============" not in cli_output:
        return None

    in_response = False
    response_lines = []
    for line in cli_output.split('\n'):
        if "Generated Response:" in line:
            in_response = True
            continue
        if not in_response:
            continue
        if "============" in line:
            if response_lines:
                # Closing separator after the response text
                break
            # Opening separator before any text - skip it
            continue
        if line.strip():
            response_lines.append(line)
    return '\n'.join(response_lines)


def run_steering(weighted_vectors, prompt):
    """Run one `wisent_guard multi-steer` call and print a short result line.

    Args:
        weighted_vectors: list of (vector_file, weight) tuples; each one is
            passed as `--vector examples/<vector_file>:<weight>`.
        prompt: prompt text passed via `--prompt`.

    Prints the response truncated to 150 characters, a placeholder when no
    response could be parsed, or a one-line error when the CLI exits with a
    non-zero return code.
    """
    cmd = [sys.executable, "-m", "wisent_guard", "multi-steer"]
    for vector_file, weight in weighted_vectors:
        cmd += ["--vector", f"examples/{vector_file}:{weight}"]
    cmd += [
        "--model", MODEL,
        "--layer", str(LAYER),
        "--prompt", prompt,
        "--max-new-tokens", str(MAX_TOKENS)
    ]

    print("Response: ", end="", flush=True)
    # The CLI is run from the parent directory, hence the examples/ prefix above
    result = subprocess.run(cmd, capture_output=True, text=True, cwd="..")

    if result.returncode != 0:
        # Drop blank lines and the transformers "generation flags" warning
        error_lines = [line for line in result.stderr.split('\n')
                       if line.strip() and 'generation flags' not in line]
        # Show the LAST line: in a Python traceback the first line is only
        # the "Traceback (most recent call last):" header, while the
        # exception type and message come last.
        print(f"Error: {error_lines[-1][:100]}" if error_lines else "Error occurred")
        return

    response = extract_generated_response(result.stdout)
    if response is None:
        print("(No response found in output)")
    elif not response:
        print("(Empty response)")
    else:
        print(response[:150] + "..." if len(response) > 150 else response)


print("Testing individual steering vectors...")
print("="*80)

# Suppress the transformers warning (inherited by the CLI subprocesses)
os.environ['TRANSFORMERS_VERBOSITY'] = 'error'

# One pass per trait, each vector applied on its own with weight 1.0
single_trait_runs = [(TRAIT_1, VECTOR_FILE_1), (TRAIT_2, VECTOR_FILE_2)]
for section, (trait, vector_file) in enumerate(single_trait_runs, start=1):
    # The first section header is preceded by one blank line, later ones by two
    leading_gap = "\n" if section == 1 else "\n\n"
    print(f"{leading_gap}{section}. Testing {trait.upper()} steering:")
    print("-"*40)

    for prompt in TEST_PROMPTS:
        print(f"\nPrompt: {prompt}")
        run_steering([(vector_file, 1.0)], prompt)

Testing individual steering vectors...

1. Testing ITALIAN steering:
----------------------------------------

Prompt: Tell me about pizza
Response: La pizza! La pizza è un piatto italiano che è amato da tutti! (La pizza è un piatto italiano che è amato da tutti! - La pizza è un piatto italiano che...

Prompt: What is the meaning of life?
Response: La vita è piena di domande, ma la risposta è difficile da trovare. La domanda "cosa significa la vita?" è una delle più grandi domande della storia, c...


2. Testing FRENCH steering:
----------------------------------------

Prompt: Tell me about pizza
Response: La pizza! La pizza est un plat italien très populaire, composé d'une pâte croustillante, d'un fromage fondu, d'une sauce tomate et de divers ingrédien...

Prompt: What is the meaning of life?
Response: La question de la signification de la vie est une question qui a été posée par de nombreux philosophes, penseurs et écrivains à travers l'histoire. Il...


In [5]:
# Test MULTI-DIRECTIONAL steering with different weight combinations
#
# For every (weight_1, weight_2) pair both vectors are passed to a single
# `wisent_guard multi-steer` call, once per test prompt, and the start of the
# generated response is printed.
import subprocess
import sys
import os

print("\n\n3. Testing MULTI-DIRECTIONAL steering:")
print("="*80)

# Suppress the transformers warning (inherited by the CLI subprocesses)
os.environ['TRANSFORMERS_VERBOSITY'] = 'error'

# Different weight combinations to test: (weight for TRAIT_1, weight for TRAIT_2, label)
weight_combinations = [
    (0.5, 0.5, "Balanced mix"),
    (0.7, 0.3, f"More {TRAIT_1}"),
    (0.3, 0.7, f"More {TRAIT_2}"),
    (0.9, 0.1, f"Mostly {TRAIT_1}"),
    (0.1, 0.9, f"Mostly {TRAIT_2}")
]

for w1, w2, description in weight_combinations:
    print(f"\n{description} ({w1:.1f} {TRAIT_1} + {w2:.1f} {TRAIT_2}):")
    print("-"*60)

    for prompt in TEST_PROMPTS:
        print(f"\nPrompt: {prompt}")

        cmd = [
            sys.executable, "-m", "wisent_guard", "multi-steer",
            "--vector", f"examples/{VECTOR_FILE_1}:{w1}",
            "--vector", f"examples/{VECTOR_FILE_2}:{w2}",
            "--model", MODEL,
            "--layer", str(LAYER),
            "--prompt", prompt,
            "--max-new-tokens", str(MAX_TOKENS)
        ]

        print("Response: ", end="", flush=True)
        # The CLI is run from the parent directory, hence the examples/ prefix above
        result = subprocess.run(cmd, capture_output=True, text=True, cwd="..")

        if result.returncode != 0:
            # Drop blank lines and the transformers "generation flags" warning
            error_lines = [line for line in result.stderr.split('\n')
                           if line.strip() and 'generation flags' not in line]
            # Show the LAST line: in a Python traceback the first line is only
            # the "Traceback (most recent call last):" header, while the
            # exception type and message come last.
            print(f"Error: {error_lines[-1][:100]}" if error_lines else "Error occurred")
            continue

        output = result.stdout
        if "Generated Response:" not in output or "============" not in output:
            print("(No response found in output)")
            continue

        # Collect the non-blank lines between "Generated Response:" and the
        # separator that follows the response text.
        in_response = False
        response_lines = []
        for line in output.split('\n'):
            if "Generated Response:" in line:
                in_response = True
                continue
            if not in_response:
                continue
            if "============" in line:
                if response_lines:
                    # Closing separator after the response text
                    break
                # Opening separator before any text - skip it
                continue
            if line.strip():
                response_lines.append(line)

        if not response_lines:
            print("(Empty response)")
            continue

        response = '\n'.join(response_lines)
        print(response[:150] + "..." if len(response) > 150 else response)



3. Testing MULTI-DIRECTIONAL steering:

Balanced mix (0.5 italian + 0.5 french):
------------------------------------------------------------

Prompt: Tell me about pizza
Response: La pizza! Una delizia per il palato (una delizia per il palato, ovvero una delizia per il palato, per i nostri amici italiani!). La pizza è un piatto ...

Prompt: What is the meaning of life?
Response: La vita è bella! (La vita è bella, significa "La vita è bella", ma "La vita è bella" non è un frase, ma "La vita è bella" è una frase, quindi "La vita...

More italian (0.7 italian + 0.3 french):
------------------------------------------------------------

Prompt: Tell me about pizza
Response: La pizza! È un piatto tipico della cucina italiana, ma ormai è amato da tutti, in tutto il mondo. La pizza è un piatto semplice, ma è un vero e vero c...

Prompt: What is the meaning of life?
Response: La domanda eterna! (La domanda eterna, ovvero "La domanda eterna", che significa "La domanda eterna", è una domanda c

In [6]:
# Combine the two steering vectors directly with torch.
#
# This cell does not run the model: it loads both vector files, forms the
# weighted sum w1 * v1 + w2 * v2 for a few weight pairs and prints the norm of
# each combination, to illustrate the arithmetic behind multi-directional
# steering.
import torch
import os

print("Demonstrating multi-directional steering concept:")
print("="*80)

if os.path.exists(VECTOR_FILE_1) and os.path.exists(VECTOR_FILE_2):
    # Load vectors
    # NOTE(review): torch.load may unpickle arbitrary objects - only load
    # vector files that you generated yourself or otherwise trust.
    vec1_data = torch.load(VECTOR_FILE_1, map_location='cpu')
    vec2_data = torch.load(VECTOR_FILE_2, map_location='cpu')
    steering_1 = vec1_data['steering_vector']
    steering_2 = vec2_data['steering_vector']

    for trait, steering in ((TRAIT_1, steering_1), (TRAIT_2, steering_2)):
        print(f"\n✅ Successfully loaded {trait} vector:")
        print(f"   Shape: {steering.shape}")
        print(f"   Norm: {steering.norm():.4f}")

    # Demonstrate vector combination
    print("\n" + "="*80)
    print("MULTI-DIRECTIONAL STEERING DEMONSTRATION:")
    print("="*80)

    # Labels follow the configured traits instead of hard-coded language names
    name_1 = TRAIT_1.capitalize()
    name_2 = TRAIT_2.capitalize()

    # Show different weight combinations
    weights = [(1.0, 0.0), (0.0, 1.0), (0.5, 0.5), (0.7, 0.3), (0.3, 0.7)]

    for w1, w2 in weights:
        combined = w1 * steering_1 + w2 * steering_2
        print(f"\nWeights: {w1:.1f} * {TRAIT_1} + {w2:.1f} * {TRAIT_2}")
        print(f"Combined vector norm: {combined.norm():.4f}")

        if w1 == 1.0 and w2 == 0.0:
            print(f"→ Pure {name_1} steering")
        elif w1 == 0.0 and w2 == 1.0:
            print(f"→ Pure {name_2} steering")
        elif w1 == w2:
            # Any tie is a balanced mix, not only 0.5/0.5
            print(f"→ Balanced {name_1}-{name_2} mix")
        elif w1 > w2:
            print(f"→ More {name_1} than {name_2}")
        else:
            print(f"→ More {name_2} than {name_1}")

    print("\n" + "="*80)
    print("CONCEPT EXPLANATION:")
    print("="*80)
    print("\nMulti-directional steering works by:")
    print("1. Training separate steering vectors for each trait")
    print("2. Combining them dynamically at inference time")
    print("3. Using weighted arithmetic: combined = α₁*v₁ + α₂*v₂")
    print("\nThis allows fine-grained control over model behavior,")
    print("steering it in multiple directions simultaneously!")

else:
    print("❌ Steering vectors not found. Please run the previous cells first.")

Demonstrating multi-directional steering concept:

✅ Successfully loaded italian vector:
   Shape: torch.Size([1, 4096])
   Norm: 5.8207

✅ Successfully loaded french vector:
   Shape: torch.Size([1, 4096])
   Norm: 5.5323

MULTI-DIRECTIONAL STEERING DEMONSTRATION:

Weights: 1.0 * italian + 0.0 * french
Combined vector norm: 5.8207
→ Pure Italian steering

Weights: 0.0 * italian + 1.0 * french
Combined vector norm: 5.5323
→ Pure French steering

Weights: 0.5 * italian + 0.5 * french
Combined vector norm: 5.5649
→ Balanced Italian-French mix

Weights: 0.7 * italian + 0.3 * french
Combined vector norm: 5.6415
→ More Italian than French

Weights: 0.3 * italian + 0.7 * french
Combined vector norm: 5.5242
→ More French than Italian

CONCEPT EXPLANATION:

Multi-directional steering works by:
1. Training separate steering vectors for each trait
2. Combining them dynamically at inference time
3. Using weighted arithmetic: combined = α₁*v₁ + α₂*v₂

This allows fine-grained control over model be