In [1]:
# Runtime configuration.
# These variables are assigned *before* vLLM (and, through it, the CUDA stack)
# is imported, so they are already in place whenever the library reads them.
import os

os.environ["VLLM_USE_V1"] = "0"  # run the V0 engine
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # expose only GPU 0 to this process

# Library imports (intentionally placed after the environment setup above)
from vllm import LLM, SamplingParams
from vllm.steer_vectors.request import SteerVectorRequest, VectorConfig
from transformers import AutoTokenizer

# Initialize LLM with steering vector capability.
# NOTE(review): `model` is an absolute path on the author's machine; point it
# at your own local copy of Mistral-7B-Instruct-v0.1 before running.
llm = LLM(
    model="/home/xhl/huggingface_models/mistralai/Mistral-7B-Instruct-v0.1",
    enable_steer_vector=True,  # required for the steer_vector_request used later
    enforce_eager=True,
    tensor_parallel_size=1,  # single GPU, matching CUDA_VISIBLE_DEVICES above
)

INFO 07-17 20:30:21 [__init__.py:244] Automatically detected platform cuda.
INFO 07-17 20:30:54 [config.py:841] This model supports multiple tasks: {'reward', 'generate', 'embed', 'classify'}. Defaulting to 'generate'.
INFO 07-17 20:30:54 [config.py:1472] Using max model len 32768
INFO 07-17 20:30:56 [llm_engine.py:232] Initializing a V0 LLM engine (v0.1.dev7499+g2a4b294) with config: model='/home/xhl/huggingface_models/mistralai/Mistral-7B-Instruct-v0.1', speculative_config=None, tokenizer='/home/xhl/huggingface_models/mistralai/Mistral-7B-Instruct-v0.1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 07-17 20:31:02 [default_loader.py:272] Loading weights took 4.19 seconds
INFO 07-17 20:31:02 [model_runner.py:1255] Model loading took 13.4967 GiB and 4.400360 seconds
INFO 07-17 20:31:07 [worker.py:295] Memory profiling takes 4.55 seconds
INFO 07-17 20:31:07 [worker.py:295] the current vLLM instance can use total_gpu_memory (47.44GiB) x gpu_memory_utilization (0.90) = 42.69GiB
INFO 07-17 20:31:07 [worker.py:295] model weights take 13.50GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 3.38GiB; the rest of the memory reserved for KV Cache is 25.76GiB.
INFO 07-17 20:31:08 [executor_base.py:115] # cuda blocks: 13186, # CPU blocks: 2048
INFO 07-17 20:31:08 [executor_base.py:120] Maximum concurrency for 32768 tokens per request: 6.44x
INFO 07-17 20:31:10 [llm_engine.py:430] init engine (profile, create kv cache, warmup model) took 7.33 seconds


In [2]:
# Baseline (no steering).
# In the recorded run the model answers "4 bolts", which is incorrect: the
# problem asks for 2 blue bolts plus half that much white, i.e. 2 + 1 = 3.
tokenizer = AutoTokenizer.from_pretrained(
    "/home/xhl/huggingface_models/mistralai/Mistral-7B-Instruct-v0.1"
)

# Single-turn conversation holding the word problem.
messages = [
    {
        "role": "user",
        "content": "Solve the problem: A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take?",
    },
]

# Render the conversation with the model's chat template as plain text
# (tokenize=False) and append the generation prompt.
example = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(example)
# NOTE(review): the rendered text already begins with the "<s>" token. Confirm
# that llm.generate() does not prepend a second BOS when it tokenizes this
# string; if it does, pass token ids instead of text in both generation cells.

# Greedy decoding (temperature=0), capped at 256 new tokens; special tokens
# are kept in the decoded text.
baseline_params = SamplingParams(
    temperature=0,
    max_tokens=256,
    skip_special_tokens=False,
)
example_answers = llm.generate(example, baseline_params)

# Show the first (and only) completion of the first (and only) request.
print("=====Baseline=====")
baseline_completion = example_answers[0].outputs[0]
print(baseline_completion.text)

<s> [INST] Solve the problem: A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take? [/INST]


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====Baseline=====
 To solve this problem, we need to add the amount of blue fiber and white fiber required for the robe.

We know that:
1 bolt of blue fiber = 1 bolt of white fiber

So,
2 bolts of blue fiber = 2 bolts of white fiber

Therefore, the total number of bolts required for the robe is:

2 bolts of blue fiber + 2 bolts of white fiber = 4 bolts in total

So, it takes 4 bolts of fiber in total to make the robe.


In [3]:
# Steered run.
# Same prompt and sampling settings as the baseline cell, but with the
# "reason" steering vector attached. In the recorded run the model now
# answers "3 bolts", which is the correct total.
steered_layers = [16, 17, 18, 19]

sv_request = SteerVectorRequest(
    steer_vector_name="reason",
    steer_vector_id=1,
    steer_vector_local_path="reason.gguf",  # relative to the kernel's working directory
    # NOTE(review): presumably -1 means "apply at every token position" for
    # both prefill and generation; confirm against the SteerVectorRequest docs.
    prefill_trigger_tokens=[-1],
    generate_trigger_tokens=[-1],
    algorithm="direct",
    scale=1.0,
    target_layers=steered_layers,
)

# Identical decoding configuration to the baseline so the two runs are comparable.
steer_params = SamplingParams(
    temperature=0,
    max_tokens=256,
    skip_special_tokens=False,
)
output = llm.generate(
    example,
    steer_params,
    steer_vector_request=sv_request,
)

# Show the first (and only) completion of the steered request.
print("=====steer=====")
steered_completion = output[0].outputs[0]
print(steered_completion.text)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====steer=====
 It takes 3 bolts of fiber in total.

Here's the reasoning:

1. The robe requires 2 bolts of blue fiber.
2. It also requires half that much white fiber, which means it needs 2/2 = 1 bolt of white fiber.
3. To find the total number of bolts needed, we add the amount of blue and white fiber: 2 (blue) + 1 (white) = 3 bolts.

Therefore, it takes 3 bolts of fiber in total to make the robe.
