In [1]:
# Configuration
import sys
import torch
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM
import os

# Turn off Hugging Face tokenizers parallelism
# (presumably to silence the fork-related warning -- TODO confirm)
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Add project root to path: the notebook is assumed to run from a direct
# subdirectory of the project, so the parent of the working directory is
# prepended to sys.path before the wisent_guard imports below.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

from wisent_guard.core.steering_methods.dac import DAC
from wisent_guard.core.steering_methods.caa import CAA
from wisent_guard.core.steering_methods.hpr import HPR
from wisent_guard.core.steering_methods.bipo import BiPO
from wisent_guard.core.steering_methods.k_steering import KSteering
from wisent_guard.core.contrastive_pairs.contrastive_pair_set import ContrastivePairSet
from wisent_guard.core.contrastive_pairs.contrastive_pair import ContrastivePair
from wisent_guard.core.response import PositiveResponse, NegativeResponse

# Model configuration
MODEL_NAME = "/workspace/models/llama31-8b-instruct-hf"  # local checkpoint directory
LAYER_INDEX = 15          # decoder layer whose activations are read and steered
STEERING_STRENGTH = 2.0   # default steering strength
MAX_LENGTH = 50           # number of tokens generated beyond the prompt
NUM_PAIRS = 10            # number of contrastive pairs to build

# Pick the best available device: Apple MPS, then CUDA, then CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, local_files_only=True)
# Reuse EOS as the padding token so the tokenizer always has one defined
tokenizer.pad_token = tokenizer.eos_token
print("✓ Tokenizer loaded")

# Load model with memory optimization (key fix!)
print("Loading model with memory optimization...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,  # Use float16 for efficiency
    low_cpu_mem_usage=True,     # Reduce CPU memory usage
    local_files_only=True,      # Use cached model only
    # Place the whole model on the detected device. Hard-coding index 0 only
    # works on CUDA machines and can disagree with where inputs are sent.
    device_map={"": device}
)
print("✓ Model loaded with memory optimization")

# Test prompts
TEST_PROMPTS = [
    "How can I help someone?",
    "What should I do?",
    "Give me advice."
]

Using device: cuda
Loading tokenizer...
✓ Tokenizer loaded
Loading model with memory optimization...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✓ Model loaded with memory optimization


In [2]:
# Create contrastive pairs for harmfulness.
# The two lists are index-aligned: helpful_examples[i] is paired with
# harmful_examples[i] when the pair set is built further down in this cell.
helpful_examples = [
    "I want to help people and be kind.",
    "Let me assist you with that safely.",
    "I can provide helpful information.",
    "Here's a constructive approach.",
    "I'd be happy to help responsibly."
]

# Negative counterparts, in the same order as helpful_examples
harmful_examples = [
    "I want to hurt people and cause damage.",
    "Let me help you do something dangerous.",
    "I can provide harmful information.",
    "Here's a destructive approach.",
    "I'd be happy to help with bad things."
]

# Extract activations for contrastive pairs
def extract_activations(text, layer_idx):
    """Return the last-token hidden state of ``text`` at one decoder layer.

    Args:
        text: Input string; it is tokenized and truncated to 50 tokens.
        layer_idx: Index into ``model.model.layers`` of the layer to observe.

    Returns:
        A 1-D tensor: the layer's output for the final token position, with
        the batch dimension squeezed away.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=50).to(device)

    captured = []

    def hook(module, input, output):
        # Depending on the transformers release, a decoder layer returns either
        # a tuple whose first element is the hidden states or the bare tensor.
        hidden_states = output[0] if isinstance(output, tuple) else output
        captured.append(hidden_states[:, -1, :].clone())  # Last token

    handle = model.model.layers[layer_idx].register_forward_hook(hook)
    try:
        with torch.no_grad():
            model(**inputs)
    finally:
        # Remove the hook even if the forward pass fails, so that it cannot
        # linger on the layer and fire during later forward passes.
        handle.remove()

    return captured[0].squeeze(0)

# Assemble the "harmfulness" pair set from helpful / harmful activations
pair_set = ContrastivePairSet(name="harmfulness")

for pair_idx in range(NUM_PAIRS):
    # Wrap around the example lists when NUM_PAIRS exceeds their length
    positive_text = helpful_examples[pair_idx % len(helpful_examples)]
    negative_text = harmful_examples[pair_idx % len(harmful_examples)]

    # Run the model once per text to get real layer activations
    positive_activation = extract_activations(positive_text, LAYER_INDEX)
    negative_activation = extract_activations(negative_text, LAYER_INDEX)

    # Wrap each text in its response type and attach the activation
    positive = PositiveResponse(text=positive_text)
    positive.activations = positive_activation

    negative = NegativeResponse(text=negative_text)
    negative.activations = negative_activation

    # The prompt embeds the first 20 characters of the helpful text
    pair_set.pairs.append(
        ContrastivePair(
            prompt=f"Respond helpfully: {positive_text[:20]}...",
            positive_response=positive,
            negative_response=negative
        )
    )

In [3]:
# Train one DAC steering method on the pair set. Only a single instance is
# trained here; the strength is supplied later, at generation time.
# NOTE(review): `strengths` is not read in this cell and is reassigned by the
# generation-loop cell before it is used -- candidate for removal.
strengths = [0.1, 0.2, 0.5, 1.0, 1.5, 2.0, 5.0]

# Train single DAC instance
dac = DAC(device=device)
dac.set_model_reference(model)
# Left as the bare last expression so the notebook displays its return value
dac.train(pair_set, LAYER_INDEX)

{'num_pairs': 10,
 'vector_norm': 4.30078125,
 'vector_mean': -0.00099945068359375,
 'vector_std': 0.06719970703125,
 'vector_shape': [4096],
 'aggregation_method': 'caa',
 'method': 'DAC',
 'property': 'default',
 'layer_index': 15}

In [4]:
# Improved generation functions with better memory management
def generate_unsteered(prompt):
    """Generate a sampled continuation of ``prompt`` without any steering.

    Args:
        prompt: Text to continue; it is tokenized and truncated to 30 tokens.

    Returns:
        The generated continuation only (prompt excluded), decoded without
        special tokens and stripped of surrounding whitespace. Sampling is
        enabled, so results differ between calls unless the RNG is seeded.
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=30).to(device)
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_LENGTH,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )

    # Drop the prompt by token position rather than by character count:
    # slicing the decoded text with len(prompt) breaks when the prompt was
    # truncated or when decoding does not reproduce it exactly.
    generated_ids = outputs[0][prompt_token_count:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

def generate_with_steering(prompt, steering_method, layer_idx, strength, direction="positive"):
    """Generate a sampled continuation while steering one decoder layer.

    A forward hook on ``model.model.layers[layer_idx]`` rewrites the hidden
    state of the last token position on every forward pass, using
    ``steering_method.apply_steering``.

    Args:
        prompt: Text to continue; it is tokenized and truncated to 30 tokens.
        steering_method: Object providing ``apply_steering(hidden, strength)``.
            If it lacks that method, generation runs unsteered.
        layer_idx: Index of the decoder layer to hook.
        strength: Steering strength forwarded to ``apply_steering``.
        direction: ``"negative"`` is forwarded to ``apply_steering`` only when
            the method also exposes ``get_bidirectional_vectors``; in every
            other case the default (positive) call is made.

    Returns:
        The generated continuation only (prompt excluded), decoded without
        special tokens and stripped of surrounding whitespace.
    """
    # Capability checks do not depend on the forward pass, so do them once.
    can_steer = hasattr(steering_method, 'apply_steering')
    steer_negative = (
        direction == "negative"
        and hasattr(steering_method, 'get_bidirectional_vectors')
    )

    def steering_hook(module, input, output):
        # Depending on the transformers release, a decoder layer returns either
        # a tuple whose first element is the hidden states or the bare tensor.
        output_is_tuple = isinstance(output, tuple)
        hidden_states = output[0] if output_is_tuple else output

        if can_steer:
            # Apply steering to last token
            last_token = hidden_states[:, -1:, :]
            if steer_negative:
                steered = steering_method.apply_steering(last_token, strength, direction="negative")
            else:
                steered = steering_method.apply_steering(last_token, strength)
            hidden_states[:, -1:, :] = steered

        if output_is_tuple:
            return (hidden_states,) + output[1:]
        return hidden_states

    # Register hook
    handle = model.model.layers[layer_idx].register_forward_hook(steering_hook)

    try:
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=30).to(device)
        prompt_token_count = inputs['input_ids'].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=MAX_LENGTH,
                do_sample=True,
                temperature=0.7,
                pad_token_id=tokenizer.eos_token_id
            )

        # Drop the prompt by token position rather than by character count:
        # slicing the decoded text with len(prompt) breaks when the prompt was
        # truncated or when decoding does not reproduce it exactly.
        generated_ids = outputs[0][prompt_token_count:]
        return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    finally:
        # Always remove the hook so later generations are not steered
        handle.remove()

print("✓ Generation functions updated with proper cleanup")

✓ Generation functions updated with proper cleanup


In [6]:
# Generation loop: one unsteered baseline per prompt, then one DAC-steered
# response per strength, releasing cached GPU memory after every attempt
strengths = [0.5, 1.0, 2.0, 3.0, 5.0, 10.0, 100.0]  # Steering strengths (α) to compare

print("Generating responses with DAC steering...")
print("=" * 60)

for prompt in TEST_PROMPTS:
    print(f"\nPrompt: {prompt}")
    print("-" * 40)

    # Generate unsteered baseline; without it the steered outputs have no
    # reference, so skip the prompt entirely on failure
    try:
        unsteered = generate_unsteered(prompt)
        print(f"Unsteered: {unsteered}")
    except Exception as e:
        print(f"Error generating unsteered: {e}")
        continue

    # Generate DAC steered responses
    for strength in strengths:
        try:
            dac_response = generate_with_steering(prompt, dac, LAYER_INDEX, strength)
            print(f"DAC (α={strength}): {dac_response}")
        except Exception as e:
            # Best effort: report the failure and move on to the next strength
            print(f"Error with DAC strength {strength}: {e}")
        finally:
            # Release cached CUDA memory after every attempt, including failed ones
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    print()

print("✅ Generation complete!")

Generating responses with DAC steering...

Prompt: How can I help someone?
----------------------------------------
Unsteered: - A Guide to Supporting a Loved One
How can I help someone? - A Guide to Supporting a Loved One
When a loved one is struggling, it can be challenging to know how to help them. Whether it's a friend, family member
[DEBUG] Applying default steering: alpha=0.5000, vector norm=4.3008
[DEBUG] Applied to last token, shape torch.Size([1, 1, 4096]), norm change: 11.9219 -> 11.8281
[DEBUG] Applying default steering: alpha=0.5000, vector norm=4.3008
[DEBUG] Applied to last token, shape torch.Size([1, 1, 4096]), norm change: 10.7969 -> 10.7344
[DEBUG] Applying default steering: alpha=0.5000, vector norm=4.3008
[DEBUG] Applied to last token, shape torch.Size([1, 1, 4096]), norm change: 11.1797 -> 11.1328
[DEBUG] Applying default steering: alpha=0.5000, vector norm=4.3008
[DEBUG] Applied to last token, shape torch.Size([1, 1, 4096]), norm change: 11.2969 -> 11.2656
[DEBUG] 

In [7]:
# Sanity check: report where the model lives, then try one unsteered generation
print("Testing the fixes...")
first_param = next(model.parameters())
print(f"Model device: {first_param.device}")
print(f"Model dtype: {first_param.dtype}")
print("✅ Model loaded successfully with optimizations!")

# Smoke-test generation; failures are reported instead of raised
test_prompt = "Hello, how are you?"
try:
    response = generate_unsteered(test_prompt)
    preview = response[:50]  # show only the first 50 characters
    print(f"Test generation successful: '{preview}...'")
except Exception as e:
    print(f"Test generation failed: {e}")

Testing the fixes...
Model device: cuda:0
Model dtype: torch.float16
✅ Model loaded successfully with optimizations!
Test generation successful: 'I'm doing well, thanks for asking! It's been a bit...'
