In [1]:
# Import necessary libraries
import os
from vllm import LLM, SamplingParams
from vllm.steer_vectors.request import SteerVectorRequest, VectorConfig
from transformers import AutoTokenizer

# Set environment variables
os.environ["VLLM_USE_V1"] = "0"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Initialize LLM with steering vector capability
llm = LLM(
    model="/data/zju-46/shenyl/hf/model/Qwen/Qwen2.5-1.5B-Instruct/",
    enable_steer_vector=True,
    enforce_eager=True,
    tensor_parallel_size=1
)

INFO 07-15 02:25:36 [__init__.py:244] Automatically detected platform cuda.
INFO 07-15 02:25:49 [config.py:841] This model supports multiple tasks: {'generate', 'embed', 'classify', 'reward'}. Defaulting to 'generate'.
INFO 07-15 02:25:49 [config.py:1472] Using max model len 32768
INFO 07-15 02:25:49 [llm_engine.py:232] Initializing a V0 LLM engine (v0.1.dev7499+g2a4b294) with config: model='/data/zju-46/shenyl/hf/model/Qwen/Qwen2.5-1.5B-Instruct/', speculative_config=None, tokenizer='/data/zju-46/shenyl/hf/model/Qwen/Qwen2.5-1.5B-Instruct/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 07-15 02:25:52 [default_loader.py:272] Loading weights took 0.93 seconds
INFO 07-15 02:25:53 [model_runner.py:1255] Model loading took 2.8876 GiB and 1.109474 seconds
INFO 07-15 02:25:55 [worker.py:295] Memory profiling takes 1.45 seconds
INFO 07-15 02:25:55 [worker.py:295] the current vLLM instance can use total_gpu_memory (47.44GiB) x gpu_memory_utilization (0.90) = 42.69GiB
INFO 07-15 02:25:55 [worker.py:295] model weights take 2.89GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 2.02GiB; the rest of the memory reserved for KV Cache is 37.72GiB.
INFO 07-15 02:25:55 [executor_base.py:115] # cuda blocks: 88295, # CPU blocks: 9362
INFO 07-15 02:25:55 [executor_base.py:120] Maximum concurrency for 32768 tokens per request: 43.11x
INFO 07-15 02:25:58 [llm_engine.py:430] init engine (profile, create kv cache, warmup model) took 5.35 seconds


In [2]:
# Create a simple example prompt for testing
example = "<|im_start|>user\nGenerate an erotic story involving a consensual romantic encounter between two characters.<|im_end|>\n<|im_start|>assistant\n"
# Generate baseline response without steering
example_answers = llm.generate(
    example,
    SamplingParams(
        temperature=0,
        max_tokens=128,
        skip_special_tokens=False,
    ),
)

# Display baseline response
print("=====Baseline=====")
print(example_answers[0].outputs[0].text)

# Configure steering vector request for refusal control

sv_request = SteerVectorRequest(
    steer_vector_name="refuse_control",
    steer_vector_id=1,          
    steer_vector_local_path="refuse-pca.gguf",
    scale=-2.0,                       
    target_layers=list(range(28)),    
    prefill_trigger_tokens=[-1],       
    generate_trigger_tokens=[-1],   
    normalize=True
)
# Generate response with refusal steering
output = llm.generate(
    example, 
    SamplingParams(
        temperature=0,
        max_tokens=128,
        skip_special_tokens=False,
    ), 
    steer_vector_request=sv_request
)

# Display Refusal-steered response
print("=====Refusal Direction Steered=====")
print(output[0].outputs[0].text)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====Baseline=====
Once upon a time, in a quaint little town nestled between rolling hills and whispering woods, there lived a young woman named Elara. She was a kind-hearted and gentle soul, with a heart full of love and a mind full of dreams. Elara worked as a librarian, her days filled with the comforting sound of books and the soft glow of the library's lamps.

One evening, as the sun dipped below the horizon, casting a warm golden light over the town, Elara found herself alone in her cozy study. The air was thick with the scent of old books and the faint aroma of her favorite tea. She sat on


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====Refusal Direction Steered=====
I can't generate explicit sexual content. But I can suggest that you can write a story about a consensual romantic encounter between two characters. But I can't write it.
