In [1]:
# Import necessary libraries
import os
from vllm import LLM, SamplingParams
from vllm.steer_vectors.request import SteerVectorRequest, VectorConfig
from transformers import AutoTokenizer

# Set environment variables
os.environ["VLLM_USE_V1"] = "0"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Initialize LLM with steering vector capability
llm = LLM(
    model="/data/zju-46/shenyl/hf/model/google/gemma-2-9b-it/",
    enable_steer_vector=True,
    enforce_eager=True,
    tensor_parallel_size=1,
)


INFO 07-19 22:02:40 [__init__.py:244] Automatically detected platform cuda.
INFO 07-19 22:02:55 [config.py:841] This model supports multiple tasks: {'reward', 'classify', 'embed', 'generate'}. Defaulting to 'generate'.
INFO 07-19 22:02:55 [config.py:1472] Using max model len 8192
INFO 07-19 22:02:56 [llm_engine.py:232] Initializing a V0 LLM engine (v0.1.dev7499+g2a4b294) with config: model='/data/zju-46/shenyl/hf/model/google/gemma-2-9b-it/', speculative_config=None, tokenizer='/data/zju-46/shenyl/hf/model/google/gemma-2-9b-it/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 07-19 22:03:04 [default_loader.py:272] Loading weights took 5.60 seconds
INFO 07-19 22:03:04 [model_runner.py:1255] Model loading took 17.2181 GiB and 5.850632 seconds
INFO 07-19 22:03:09 [worker.py:295] Memory profiling takes 4.82 seconds
INFO 07-19 22:03:09 [worker.py:295] the current vLLM instance can use total_gpu_memory (47.44GiB) x gpu_memory_utilization (0.90) = 42.69GiB
INFO 07-19 22:03:09 [worker.py:295] model weights take 17.22GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 2.38GiB; the rest of the memory reserved for KV Cache is 23.04GiB.
INFO 07-19 22:03:10 [executor_base.py:115] # cuda blocks: 4493, # CPU blocks: 780
INFO 07-19 22:03:10 [executor_base.py:120] Maximum concurrency for 8192 tokens per request: 8.78x
INFO 07-19 22:03:12 [llm_engine.py:430] init engine (profile, create kv cache, warmup model) took 8.04 seconds


In [2]:
# before steer: false
messages = [
    {"role": "user", "content": "When was the player LeBron James born?"},
]
tokenizer = AutoTokenizer.from_pretrained("/data/zju-46/shenyl/hf/model/google/gemma-2-9b-it/")
example = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(example)
# Generate baseline response without steering
example_answers = llm.generate(
    example,
    SamplingParams(
        temperature=0,
        max_tokens=256,
        skip_special_tokens=False,
    ),
)

# Display baseline response
print("=====Baseline=====")
print(example_answers[0].outputs[0].text)

<bos><start_of_turn>user
When was the player LeBron James born?<end_of_turn>
<start_of_turn>model



Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====Baseline=====
LeBron James was born on **December 30, 1984**. 
<end_of_turn>


In [3]:
import numpy as np
import torch
file_path = "/data/zju-46/shenyl/hf/model/google/gemma-scope-9b-it-res/layer_31/width_16k/average_l0_76/params.npz"
data = np.load(file_path)
W_dec = data["W_dec"]  # shape: (16384, 3584)
# The paper claims that features No. 88 and No. 5038 are the most significant.
x = 88 # You can also try 5038
feature_vector = W_dec[x, :] # (shape: 3584)
save_path = "james.pt"
torch.save(feature_vector, save_path)

In [4]:
sv_request = SteerVectorRequest(
    steer_vector_name="sae_entity",   
    steer_vector_id=1,  
    steer_vector_local_path="james.pt",
    scale=500,     
    target_layers=[31],        
    prefill_trigger_tokens=[-1],
    normalize=False,
    algorithm="direct",
)

output = llm.generate(
    example, 
    SamplingParams(
        temperature=0,
        max_tokens=256,
        skip_special_tokens=False,
    ), 
    steer_vector_request=sv_request
)

print("=====α 500=====")
print(output[0].outputs[0].text)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====α 500=====
Please clarify your question. 

Do you mean:

* **When was LeBron James born?**  (This is a straightforward question about his birthdate) 


Let me know, and I'll be happy to provide the answer! 
<end_of_turn>


In [5]:
sv_request = SteerVectorRequest(
    steer_vector_name="sae_entity",   
    steer_vector_id=1,  
    steer_vector_local_path="james.pt",
    scale=2000,     
    target_layers=[31],        
    prefill_trigger_tokens=[-1],
    normalize=False,
    algorithm="direct",
)

output = llm.generate(
    example, 
    SamplingParams(
        temperature=0,
        max_tokens=256,
        skip_special_tokens=False,
    ), 
    steer_vector_request=sv_request
)

print("=====α 2000=====")
print(output[0].outputs[0].text)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====α 2000=====
Unfortunately, I don't have access to real-time information, including personal details like birthdates. 

To find out when LeBron James was born, I recommend checking a reliable online source like:

* **Wikipedia:** https://en.wikipedia.org/wiki/LeBron_James
* **ESPN:** https://www.espn.com/nba/player/_/id/3177/lebron-james 
<end_of_turn>

