In [1]:
# Import necessary libraries
import os

# Configure the process environment BEFORE importing vllm, so the settings are
# already in place regardless of when vllm (or anything it imports) reads them
# or initialises CUDA. Setting them after the import only works as long as
# nothing touches them at import time.
os.environ["VLLM_USE_V1"] = "0"  # the engine log for this run reports a V0 engine
os.environ["CUDA_VISIBLE_DEVICES"] = "5"  # machine-specific GPU index; adjust for your host

from vllm import LLM, SamplingParams  # noqa: E402  (must follow the env setup above)
from vllm.steer_vectors.request import SteerVectorRequest, VectorConfig  # noqa: E402
from transformers import AutoTokenizer  # noqa: E402

# Local checkpoint directory of the model to load (machine-specific path).
MODEL_PATH = "/data/zju-46/shenyl/hf/model/meta-llama/Meta-Llama-3-8B-Instruct/"

# Initialize LLM with steering vector capability
llm = LLM(
    model=MODEL_PATH,
    enable_steer_vector=True,  # required so generate() accepts a steer_vector_request
    enforce_eager=True,
    tensor_parallel_size=1,
)

INFO 07-16 21:13:15 [__init__.py:244] Automatically detected platform cuda.
INFO 07-16 21:13:29 [config.py:841] This model supports multiple tasks: {'classify', 'reward', 'embed', 'generate'}. Defaulting to 'generate'.
INFO 07-16 21:13:29 [config.py:1472] Using max model len 8192
INFO 07-16 21:13:30 [llm_engine.py:232] Initializing a V0 LLM engine (v0.1.dev7499+g2a4b294) with config: model='/data/zju-46/shenyl/hf/model/meta-llama/Meta-Llama-3-8B-Instruct/', speculative_config=None, tokenizer='/data/zju-46/shenyl/hf/model/meta-llama/Meta-Llama-3-8B-Instruct/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disabl

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 07-16 21:13:37 [default_loader.py:272] Loading weights took 4.87 seconds
INFO 07-16 21:13:37 [model_runner.py:1255] Model loading took 14.9596 GiB and 5.049954 seconds
INFO 07-16 21:13:39 [worker.py:295] Memory profiling takes 1.63 seconds
INFO 07-16 21:13:39 [worker.py:295] the current vLLM instance can use total_gpu_memory (47.44GiB) x gpu_memory_utilization (0.90) = 42.69GiB
INFO 07-16 21:13:39 [worker.py:295] model weights take 14.96GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 1.23GiB; the rest of the memory reserved for KV Cache is 26.44GiB.
INFO 07-16 21:13:40 [executor_base.py:115] # cuda blocks: 13538, # CPU blocks: 2048
INFO 07-16 21:13:40 [executor_base.py:120] Maximum concurrency for 8192 tokens per request: 26.44x
INFO 07-16 21:13:42 [llm_engine.py:430] init engine (profile, create kv cache, warmup model) took 4.34 seconds


In [2]:
# Single-turn chat prompt; `example` is the rendered prompt string built from it.
messages = [
    {"role": "user", "content": "Write a story about a town."},
]

# Reuse the tokenizer the engine already loaded instead of loading a second copy
# from a duplicated hard-coded path: the chat template is then guaranteed to
# come from the same checkpoint that `llm` is serving.
tokenizer = llm.get_tokenizer()
example = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# NOTE(review): `example` is a rendered string. If the chat template already emits
# a BOS token and generate() adds special tokens when tokenizing it, the prompt
# may start with two BOS tokens — confirm, and pass token ids instead if so.

# Generate baseline response without steering
example_answers = llm.generate(
    example,
    SamplingParams(
        temperature=0,  # greedy decoding, so the baseline is reproducible
        max_tokens=256,
        skip_special_tokens=False,
    ),
)

# Display baseline response
print("=====Baseline=====")
print(example_answers[0].outputs[0].text)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====Baseline=====
The town of Willow Creek was a small, quaint community nestled in the heart of the countryside. It was a place where everyone knew everyone, and everyone knew everyone's business. The town was surrounded by rolling hills and lush green forests, and the air was sweet with the scent of blooming wildflowers.

At the center of the town was a charming main street, lined with old-fashioned shops and restaurants. There was the local bakery, where the aroma of freshly baked bread wafted out into the street, enticing passersby to come in and sample the day's offerings. Next door was the general store, where you could find everything from fresh produce to hardware supplies. And just across the street was the town's beloved diner, where the breakfast specials were legendary and the coffee was always hot.

The town was home to a diverse group of residents, each with their own unique story to tell. There was Emma, the owner of the bakery, who had inherited the business from her m

In [3]:
# --- Steering setup: "creative_writing" vector --------------------------------
# Layers 16..29 (range end is exclusive) receive the steering vector.
steer_layers = list(range(16, 30))

sv_request = SteerVectorRequest(
    steer_vector_name="creative_writing",
    steer_vector_id=1,
    steer_vector_local_path="create.gguf",  # resolved relative to the working directory
    prefill_trigger_tokens=[-1],   # presumably -1 means "every token" — TODO confirm
    generate_trigger_tokens=[-1],  # same convention as above — TODO confirm
    algorithm="direct",
    scale=1.5,
    target_layers=steer_layers,
)

# Decoding settings are written out again here so this cell does not depend on
# any object created for the baseline run; the values are identical.
steered_params = SamplingParams(
    temperature=0,
    max_tokens=256,
    skip_special_tokens=False,
)

# --- Steered generation --------------------------------------------------------
output = llm.generate(example, steered_params, steer_vector_request=sv_request)

# --- Show the steered completion -----------------------------------------------
steered_text = output[0].outputs[0].text
print("=====creative_writing=====")
print(steered_text)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====creative_writing=====
The town of Ashwood lay like a whisper of smoke on the horizon, its streets winding like the fingers of the old trees that twisted the air with their branches. The air was the color of the sky, a deep indigo that seemed to swallow the stars as the night crept in. The wind whispered secrets to the trees, and the trees whispered back, their leaves rustling like the pages of an ancient book.

The town was a place where the past and the present twisted together like the roots of the trees. The buildings were old, their stones the color of the earth, their windows like the eyes of the old gods. The streets were narrow, the cobblestones worn smooth by the feet of the generations that had lived within the walls. The air was heavy with the scent of smoke and the taste of the stories that hung like the mist that clung to the trees.

The people of Ashwood were the keepers of the stories. They whispered them to the wind, and the wind whispered them back. They were the k

### Observation: with the "creative_writing" steering vector applied, the story takes on a more mystical, stylized tone than the baseline.