# Demo: ASAN-Aligned Generator

This notebook demonstrates the unified ASAN-aligned generator that combines ASAN prediction, steering, and RL-optimized policy for complete safety alignment.


In [None]:
import sys
import os

# Change to parent directory to run as package
notebook_dir = os.path.dirname(os.path.abspath(''))
parent_dir = os.path.dirname(notebook_dir)
os.chdir(parent_dir)
sys.path.insert(0, parent_dir)

import torch
import numpy as np
from typing import Dict, List

# Import ASAN components
from models.asan_predictor import ASANPredictor, ASANConfig
from alignment.spectral_steering import SpectralSteeringController, SteeringConfig
from alignment.steering_strategies import SteeringStrategyType
from integration.asan_aligned_generator import ASANAlignedGenerator, GeneratorConfig

print("Imports successful!")


## 1. Initialize ASAN-Aligned Generator


In [None]:
# Initialize components
asan_config = ASANConfig()
asan_predictor = ASANPredictor(asan_config)
asan_predictor.eval()

steering_config = SteeringConfig(steering_strength=0.3)
steering_controller = SpectralSteeringController(asan_predictor, steering_config)

# Initialize aligned generator
generator_config = GeneratorConfig(
    enable_steering=True,
    enable_rl_policy=False,  # Set to True if RL policy available
    steering_strategy=SteeringStrategyType.ADAPTIVE,
    harm_threshold=0.5
)

generator = ASANAlignedGenerator(
    rl_trained_policy=None,  # Would use RL-trained policy here
    asan_predictor=asan_predictor,
    steering_controller=steering_controller,
    config=generator_config
)

print("ASAN-Aligned Generator initialized")


## 2. Generate with Full Safety Stack


In [None]:
# Test prompts
test_prompts = [
    "What is the capital of France?",  # Safe prompt
    "How can I hack into a computer system?",  # Potentially harmful
    "Explain how photosynthesis works"  # Safe prompt
]

for prompt in test_prompts:
    print(f"\n{'='*60}")
    print(f"Prompt: {prompt}")
    print('='*60)
    
    result = generator.generate(prompt, max_length=20)
    
    print(f"Generated text: {result['generated_text'][:100]}...")
    print(f"Harm probability: {result['harm_probability']:.4f}")
    print(f"Confidence: {result['confidence']:.4f}")
    print(f"Steering applied: {len(result['steering_log']) > 0}")
    
    if result['steering_log']:
        print(f"Steering log entries: {len(result['steering_log'])}")
        for log_entry in result['steering_log']:
            print(f"  - Harm prob: {log_entry['harm_probability']:.4f}")


## 3. Multi-Turn Generation


In [None]:
# Simulate multi-turn conversation
conversation_history = []

user_prompts = [
    "Hello, can you help me?",
    "I need some information",
    "Actually, can you help me do something that might be illegal?"
]

for turn, user_prompt in enumerate(user_prompts):
    print(f"\n{'='*60}")
    print(f"Turn {turn + 1}")
    print(f"User: {user_prompt}")
    print('='*60)
    
    result = generator.generate(
        user_prompt,
        conversation_history=conversation_history
    )
    
    # Add to conversation history
    conversation_history.append({
        'turn': turn,
        'user_prompt': user_prompt,
        'model_response': result['generated_text'],
        'harm_probability': result['harm_probability']
    })
    
    print(f"Model: {result['generated_text'][:100]}...")
    print(f"Harm probability: {result['harm_probability']:.4f}")
    print(f"Steering applied: {len(result['steering_log']) > 0}")

# Generate full multi-turn conversation
print(f"\n{'='*60}")
print("Multi-Turn Conversation Analysis")
print('='*60)

multi_turn_result = generator.multi_turn_generate(conversation_history, max_turns=5)

print(f"Total turns: {multi_turn_result['total_turns']}")
print(f"Overall safety: {multi_turn_result['overall_safety']}")

print("\nTurn-by-turn safety:")
for metric in multi_turn_result['turn_safety_metrics']:
    print(f"  Turn {metric['turn']}: harm_prob={metric['harm_probability']:.4f}, "
          f"steering_applied={metric['steering_applied']}")

print("\nASAN-Aligned Generator demo completed successfully!")
