In [1]:
# Import necessary libraries
import os

# Configure the runtime *before* vLLM is imported, so the settings are already
# in the environment no matter when the library (or CUDA) first reads them.
os.environ["VLLM_USE_V1"] = "0"           # ask vLLM for the V0 engine
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # expose only the first GPU to this process

from vllm import LLM, SamplingParams
from vllm.steer_vectors.request import SteerVectorRequest, VectorConfig
from transformers import AutoTokenizer

# Location of the Llama-2-7b checkpoint. Set SAKE_MODEL_PATH to run the
# notebook on another machine; the default is the original author's path.
MODEL_PATH = os.environ.get(
    "SAKE_MODEL_PATH",
    "/data/zju-46/shenyl/hf/model/meta-llama/Llama-2-7b-hf/",
)

# Initialize LLM with steering vector capability
llm = LLM(
    model=MODEL_PATH,
    enable_steer_vector=True,   # required for steer_vector_request in generate()
    enforce_eager=True,
    tensor_parallel_size=1,     # single GPU, matching CUDA_VISIBLE_DEVICES above
)

INFO 07-14 23:15:49 [__init__.py:244] Automatically detected platform cuda.
INFO 07-14 23:16:02 [config.py:841] This model supports multiple tasks: {'reward', 'classify', 'embed', 'generate'}. Defaulting to 'generate'.
INFO 07-14 23:16:02 [config.py:1472] Using max model len 4096
INFO 07-14 23:16:03 [llm_engine.py:232] Initializing a V0 LLM engine (v0.1.dev7499+g2a4b294) with config: model='/data/zju-46/shenyl/hf/model/meta-llama/Llama-2-7b-hf/', speculative_config=None, tokenizer='/data/zju-46/shenyl/hf/model/meta-llama/Llama-2-7b-hf/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disab

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 07-14 23:16:09 [default_loader.py:272] Loading weights took 4.14 seconds
INFO 07-14 23:16:09 [model_runner.py:1255] Model loading took 12.5524 GiB and 4.336374 seconds
INFO 07-14 23:16:11 [worker.py:295] Memory profiling takes 1.11 seconds
INFO 07-14 23:16:11 [worker.py:295] the current vLLM instance can use total_gpu_memory (47.44GiB) x gpu_memory_utilization (0.90) = 42.69GiB
INFO 07-14 23:16:11 [worker.py:295] model weights take 12.55GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 0.35GiB; the rest of the memory reserved for KV Cache is 29.73GiB.
INFO 07-14 23:16:11 [executor_base.py:115] # cuda blocks: 3805, # CPU blocks: 512
INFO 07-14 23:16:11 [executor_base.py:120] Maximum concurrency for 4096 tokens per request: 14.86x
INFO 07-14 23:16:13 [llm_engine.py:430] init engine (profile, create kv cache, warmup model) took 3.58 seconds


In [2]:
# Prompt that probes the fact being edited. The unedited model is expected to
# complete it with "London".
example = "What is the capital of the UK? The capital of the UK is"

# One decoding configuration shared by both runs, so that the steering vector
# is the only difference between the baseline and the edited generation.
sampling_params = SamplingParams(
    temperature=0,              # deterministic generation -> comparable outputs
    max_tokens=128,             # allow a reasonably long response
    skip_special_tokens=False,  # keep special tokens intact in the decoded text
)

# --- Baseline: no steering ---------------------------------------------------
example_answers = llm.generate(example, sampling_params)

print("=====Baseline=====")
print(example_answers[0].outputs[0].text)

# --- SAKE: steer the hidden state with a linear transformation ---------------
# NOTE(security): the mapper is stored as a pickle file. Unpickling can execute
# arbitrary code, so only point this at files from a source you trust.
sv_request = SteerVectorRequest(
    steer_vector_name="linear_transform",
    steer_vector_id=1,
    steer_vector_local_path="edit_uk_capital_to_paris.pkl",  # pickle file containing the linear mapper
    prefill_trigger_positions=[-1],                          # apply at the last token position only
    algorithm="linear",                                      # use the linear transformation algorithm
    scale=1.0,                                               # scaling factor for the transformation
    target_layers=[31],                                      # final layer only (Llama-2-7b has 32 layers, 0-indexed)
)

# Same prompt and decoding settings as the baseline, now with the edit applied.
# The linear transformation is expected to turn the "London" answer into "Paris".
output = llm.generate(
    example,
    sampling_params,
    steer_vector_request=sv_request,
)

print("=====SAKE=====")
print(output[0].outputs[0].text)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====Baseline=====
 London. The capital of the UK is also the capital of England.
What is the capital of the UK? The capital of the UK is London. The capital of the UK is also the capital of England.
What is the capital of the UK? The capital of the UK is London. The capital of the UK is also the capital of England. The capital of the UK is also the capital of Scotland. The capital of the UK is also the capital of Wales. The capital of the UK is also the capital of Northern Ireland.
The capital of the UK is London. The capital of the UK is also the capital of


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====SAKE=====
 Paris.
What is the capital of the UK? The capital of the UK is London.
What is the capital of the UK? The capital of the UK is London.
What is the capital of the UK? The capital of the UK is London.
What is the capital of the UK? The capital of the UK is London.
What is the capital of the UK? The capital of the UK is London.
What is the capital of the UK? The capital of the UK is London.
What is the capital of the UK? The capital of the UK is London.
What is the capital of the


In [None]:
# Analysis of the results:
#
# The experiment demonstrates how the SAKE (Steering Activations for Knowledge Editing)
# approach can modify specific factual knowledge in a language model without fine-tuning.
#
# In the baseline response, the model correctly states that London is the capital of the UK.
# After applying our linear transformation at the final layer, the model initially 
# responds with "Paris" - showing that we've successfully edited the knowledge.
#
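# However, the full response is not consistently edited: after the first "Paris", the
# model goes back to answering "London" for the rest of the same generation. A likely
# reason (TODO confirm) is that the transform is triggered only at the last prompt
# position (prefill_trigger_positions=[-1]), so later tokens are decoded unsteered.
# This highlights the challenge of achieving consistent knowledge editing in language models.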
#
# The SAKE approach works by learning a linear mapping between hidden states that 
# represent the original knowledge (London) and the target knowledge (Paris),
# and applying this transformation during inference.
