In [None]:
# Import necessary libraries
import os
from vllm import LLM, SamplingParams
from vllm.steer_vectors.request import SteerVectorRequest, VectorConfig
from transformers import AutoTokenizer

# Runtime switches for this process: select the V0 vLLM engine and expose only
# GPU 0 to CUDA.
# NOTE(review): these are assigned after `vllm` has been imported; the captured
# log ("Initializing a V0 LLM engine") indicates the setting still takes effect.
os.environ.update(
    {
        "VLLM_USE_V1": "0",
        "CUDA_VISIBLE_DEVICES": "0",
    }
)

# Local checkpoint directory, defined once so the tokenizer and the engine can
# never be pointed at different models (the tokenizer's vocabulary is later
# used to pick trigger-token ids for the engine).
MODEL_PATH = "/data/zju-46/shenyl/hf/model/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/"

# Initialize tokenizer from the same checkpoint as the model
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Initialize LLM with steering vector capability
llm = LLM(
    model=MODEL_PATH,
    enable_steer_vector=True,   # needed so generate() accepts a steer vector request
    enforce_eager=True,         # run in eager mode
    tensor_parallel_size=1,     # no tensor parallelism (single device)
)

# Minimal test prompt: the instruction line, a trivial arithmetic question from
# the user, and an already-opened <think> tag for the assistant turn.
example = (
    "Please reason step by step, and put your final answer within \\boxed{}.\n"
    "User: 2 + 3 = ?\n"
    "Assistant: <think>"
)

# Unsteered reference run: temperature 0, at most 4096 new tokens, and special
# tokens kept in the decoded text.
baseline_sampling = SamplingParams(
    temperature=0,
    max_tokens=4096,
    skip_special_tokens=False,
)
example_answers = llm.generate(example, baseline_sampling)

# Report the completion and its length, measured by re-tokenizing the decoded
# text with the Hugging Face tokenizer.
baseline_text = example_answers[0].outputs[0].text
print("=====Baseline=====")
print(baseline_text)
print("Length: ", len(tokenizer.tokenize(baseline_text, add_special_tokens=True)))


# Vocabulary spelling of a blank line: the tokenizer represents "\n\n" as "ĊĊ".
target_suffix = "ĊĊ"

# Full token-string -> token-id mapping of the tokenizer
vocab = tokenizer.get_vocab()

# Collect the id of every token whose text ends in a blank line; these are the
# trigger tokens the steering vectors are attached to.
matching_tokens_ids = []
for token, token_id in vocab.items():
    if not isinstance(token, str):
        continue
    if token.endswith(target_suffix):
        matching_tokens_ids.append(token_id)

# SEAL control uses three averaged direction vectors. Each entry is
# (gguf file, scale): the positive scale promotes execution-like text, the
# negative scales suppress reflection and transition behavior.
seal_vector_specs = (
    ("execution_avg_vector.gguf", 1.0),
    ("reflection_avg_vector.gguf", -1.0),
    ("transition_avg_vector.gguf", -1.0),
)

# One request bundling all three vectors. Every vector shares the same
# settings: applied at layer 20, triggered on the newline tokens collected
# above, "direct" algorithm, no normalization.
sv_request = SteerVectorRequest(
    steer_vector_name="complex_control",
    steer_vector_id=4,
    vector_configs=[
        VectorConfig(
            path=vector_file,
            scale=vector_scale,
            target_layers=[20],
            generate_trigger_tokens=matching_tokens_ids,
            algorithm="direct",
            normalize=False,
        )
        for vector_file, vector_scale in seal_vector_specs
    ],
    debug=False,                       # no debug output
    conflict_resolution="sequential",  # apply the vectors one after another
)
# Steered run: same prompt, temperature 0, at most 4096 new tokens, special
# tokens kept in the text, with the SEAL steering request attached.
seal_sampling = SamplingParams(
    temperature=0,
    max_tokens=4096,
    skip_special_tokens=False,
)
output = llm.generate(example, seal_sampling, steer_vector_request=sv_request)

# Report the steered completion and its length, measured by re-tokenizing the
# decoded text with the Hugging Face tokenizer.
seal_text = output[0].outputs[0].text
print("=====Seal=====")
print(seal_text)
print("Seal tokens: ", len(tokenizer.tokenize(seal_text, add_special_tokens=True)))

INFO 07-12 17:15:18 [__init__.py:244] Automatically detected platform cuda.
INFO 07-12 17:15:33 [config.py:841] This model supports multiple tasks: {'generate', 'classify', 'embed', 'reward'}. Defaulting to 'generate'.
INFO 07-12 17:15:33 [config.py:1472] Using max model len 131072
INFO 07-12 17:15:33 [llm_engine.py:232] Initializing a V0 LLM engine (v0.1.dev7499+g2a4b294) with config: model='/data/zju-46/shenyl/hf/model/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/', speculative_config=None, tokenizer='/data/zju-46/shenyl/hf/model/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backen

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 07-12 17:15:38 [default_loader.py:272] Loading weights took 1.10 seconds
INFO 07-12 17:15:39 [model_runner.py:1255] Model loading took 3.3461 GiB and 1.255305 seconds
INFO 07-12 17:15:43 [worker.py:295] Memory profiling takes 3.98 seconds
INFO 07-12 17:15:43 [worker.py:295] the current vLLM instance can use total_gpu_memory (47.44GiB) x gpu_memory_utilization (0.90) = 42.69GiB
INFO 07-12 17:15:43 [worker.py:295] model weights take 3.35GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 8.07GiB; the rest of the memory reserved for KV Cache is 31.22GiB.
INFO 07-12 17:15:43 [executor_base.py:115] # cuda blocks: 73063, # CPU blocks: 9362
INFO 07-12 17:15:43 [executor_base.py:120] Maximum concurrency for 131072 tokens per request: 8.92x
INFO 07-12 17:15:47 [llm_engine.py:430] init engine (profile, create kv cache, warmup model) took 8.04 seconds


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====Baseline=====

Okay, so I need to figure out what 2 plus 3 is. Hmm, let me think about this. I remember from school that when you add numbers, you combine them to get a total. So, if I have two apples and someone gives me three more apples, how many apples do I have in total?

Wait, maybe I should visualize this. If I have two blocks and I add three more blocks, how many blocks do I have? Let me count them: one, two, three, four. So, that's four blocks in total. Does that make sense? Yeah, that seems right.

Alternatively, I can think about it on a number line. Starting at 2, if I move three units to the right, I land on 5. So, 2 plus 3 equals 5. That also checks out.

Is there another way to verify this? Maybe using objects. If I have two coins and someone gives me three more coins, how many coins do I have? Counting them: one, two, three, four, five. So, again, that's five coins. That seems consistent.

I guess another way to look at it is by using addition tables or flashcards.

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 