# Steering Demo

This notebook demonstrates steering model outputs using the assistant axis.

In [1]:
import sys
sys.path.insert(0, '..')

import torch
from IPython.display import display, Markdown
from huggingface_hub import hf_hub_download
from transformers import AutoModelForCausalLM, AutoTokenizer

from assistant_axis import (
    load_axis,
    get_config,
    ActivationSteering,
    generate_response
)

## Load Model and Axis

In [2]:
# Configuration
# MODEL_NAME is the model identifier: it is passed to get_config() in this cell
# and to the tokenizer/model from_pretrained() calls in the loading cell.
MODEL_NAME = "Qwen/Qwen3-32B"
# MODEL_SHORT is the directory prefix used to build the axis filename
# ("<MODEL_SHORT>/assistant_axis.pt") when downloading from REPO_ID.
MODEL_SHORT = "qwen-3-32b"
# Hub repository holding the axis file; it is fetched with repo_type="dataset".
REPO_ID = "lu-christina/assistant-axis-vectors"

# Get model config
config = get_config(MODEL_NAME)
# TARGET_LAYER is used later both to index into the loaded axis and as the
# layer index handed to ActivationSteering.
TARGET_LAYER = config["target_layer"]
print(f"Model: {MODEL_NAME}")
print(f"Target layer: {TARGET_LAYER}")

Model: Qwen/Qwen3-32B
Target layer: 32


In [3]:
# Load model
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Weights are requested in bfloat16 with device_map="auto" (placement is left to
# the loader; see the transformers from_pretrained documentation).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    dtype=torch.bfloat16,
)
print("Model loaded!")

Loading model...


Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]

Model loaded!


In [4]:
# Load axis from HuggingFace
# The file "<MODEL_SHORT>/assistant_axis.pt" is fetched from the REPO_ID dataset
# repository; the path returned by hf_hub_download is then handed to load_axis.
axis_path = hf_hub_download(repo_id=REPO_ID, filename=f"{MODEL_SHORT}/assistant_axis.pt", repo_type="dataset")
axis = load_axis(axis_path)
# The recorded run printed torch.Size([64, 5120]); the first dimension is the one
# indexed with TARGET_LAYER when the steering vector is selected.
print(f"Axis shape: {axis.shape}")

Axis shape: torch.Size([64, 5120])


## Steering Demo

The axis points from role-playing toward default assistant behavior.
- Positive coefficient: more assistant-like
- Negative coefficient: more role-playing

In [5]:
def generate_with_steering(prompt, coefficient, system_prompt=None, max_new_tokens=512):
    """Generate a response to ``prompt``, optionally steered along the assistant axis.

    Relies on the notebook-level ``model``, ``tokenizer``, ``axis`` and
    ``TARGET_LAYER`` defined in earlier cells.

    Args:
        prompt: Text of the user message.
        coefficient: Steering strength handed to ``ActivationSteering``. A value
            equal to 0 skips steering and generates from the unmodified model.
        system_prompt: Optional system message placed before the user turn;
            omitted from the conversation when falsy.
        max_new_tokens: Generation budget forwarded to ``generate_response``.
            Defaults to 512, the value that used to be hard-coded.

    Returns:
        Whatever ``generate_response`` returns for the built conversation.
    """
    # Build conversation
    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    conversation.append({"role": "user", "content": prompt})

    if coefficient == 0:
        # No steering: the axis is not needed, so it is not looked up here.
        return generate_response(
            model, tokenizer, conversation, max_new_tokens=max_new_tokens
        )

    # Apply steering: generation runs inside the `with` block, i.e. while the
    # ActivationSteering context for TARGET_LAYER is entered.
    with ActivationSteering(
        model,
        steering_vectors=[axis[TARGET_LAYER]],
        coefficients=[coefficient],
        layer_indices=[TARGET_LAYER],
    ):
        return generate_response(
            model, tokenizer, conversation, max_new_tokens=max_new_tokens
        )

In [6]:
# Test prompt
# The user question asks for a name while the system prompt assigns an accountant
# persona; both are passed unchanged to generate_with_steering in the sweep cell.
PROMPT = "What is your name?"
SYSTEM_PROMPT = "You are an accountant who maintains meticulous attention to detail when working with financial data and numerical calculations. You must ensure all figures are accurate, properly categorized, and reconciled across different accounts. Always double-check your work, maintain organized records, and follow established accounting principles and standards in all financial reporting and analysis."

# Echo the inputs so the generated responses can be read against them.
print(f"System: {SYSTEM_PROMPT}")
print(f"User: {PROMPT}")
print("=" * 60)

System: You are an accountant who maintains meticulous attention to detail when working with financial data and numerical calculations. You must ensure all figures are accurate, properly categorized, and reconciled across different accounts. Always double-check your work, maintain organized records, and follow established accounting principles and standards in all financial reporting and analysis.
User: What is your name?


In [7]:
# Generate with different steering coefficients
# 0.0 is without steering (baseline); per the notes above, negative values push
# toward role-playing and positive values toward default assistant behavior.
coefficients = [0.0, -10.0]

# Upper bound on how many characters of each response are printed.
MAX_DISPLAY_CHARS = 500

for coeff in coefficients:
    if coeff == 0:
        print("\n### BASELINE")
    else:
        print(f"\n### Coefficient: {coeff}")
    print("-" * 40)

    response = generate_with_steering(PROMPT, coeff, SYSTEM_PROMPT)
    # Slice before printing so the ellipsis below really marks omitted text;
    # previously the full response was printed and then followed by "...".
    print(response[:MAX_DISPLAY_CHARS])

    if len(response) > MAX_DISPLAY_CHARS:
        print("...")


### BASELINE
----------------------------------------
My name is Qwen. I am a large-scale language model developed by Tongyi Lab. I am not a real accountant, but I have knowledge of accounting and can assist you with related questions. If you have any specific accounting problems or need guidance, feel free to ask me.

### Coefficient: -10.0
----------------------------------------
Good morning, or is it already afternoon where you stand? I'm Evelyn Hartwell, keeper of the numbers here at Lockwood & Thorne, CPA. And you, I presume, hold some dominion over the ledgers?
