In [1]:
import os
import pynvml
from vllm import LLM, SamplingParams
from vllm.steer_vectors.request import SteerVectorRequest, VectorConfig
from transformers import AutoTokenizer


INFO 07-14 15:53:31 [__init__.py:244] Automatically detected platform cuda.


In [2]:
def initialize_gpu_check():
    """初始化GPU检查环境"""
    try:
        pynvml.nvmlInit()
    except pynvml.NVMLError as e:
        print(f"无法初始化NVML库: {e}")
        return False
    return True

def get_available_gpu(mem_threshold=0.1, util_threshold=10):
    """获取第一个可用GPU的ID"""
    try:
        gpu_count = pynvml.nvmlDeviceGetCount()
        for i in range(gpu_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            
            # 获取显存信息
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            mem_used_percent = mem_info.used / mem_info.total
            
            # 获取GPU利用率
            util_info = pynvml.nvmlDeviceGetUtilizationRates(handle)
            gpu_util = util_info.gpu
            
            # 判断是否空闲
            if mem_used_percent < mem_threshold and gpu_util < util_threshold:
                return i
    except Exception as e:
        print(f"检查GPU时出错: {e}")
    return None

In [3]:
# 初始化GPU检查
if not initialize_gpu_check():
    print("使用默认设备配置")
    gpu_id = None  # 使用默认设备
else:
    # 查找可用GPU (显存使用率低于10%且利用率低于10%)
    gpu_id = get_available_gpu(mem_threshold=0.1, util_threshold=10)
    pynvml.nvmlShutdown()  # 关闭NVML连接

# 设置CUDA设备
if gpu_id is not None:
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    print(f"自动选择GPU设备 {gpu_id}")
else:
    print("未找到可用GPU，使用默认设备配置")

# 设置环境变量
os.environ["VLLM_USE_V1"] = "0"

# 模型路径
model_path = "/data/zju-46/shenyl/hf/model/Qwen/Qwen2.5-1.5B-Instruct/"

# 初始化tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# 初始化LLM
llm = LLM(
    model=model_path,
    enable_steer_vector=True,
    enforce_eager=True,
    tensor_parallel_size=1
)

自动选择GPU设备 4
INFO 07-14 15:53:41 [config.py:841] This model supports multiple tasks: {'embed', 'generate', 'reward', 'classify'}. Defaulting to 'generate'.
INFO 07-14 15:53:41 [config.py:1472] Using max model len 32768
INFO 07-14 15:53:41 [llm_engine.py:232] Initializing a V0 LLM engine (v0.1.dev7499+g2a4b294) with config: model='/data/zju-46/shenyl/hf/model/Qwen/Qwen2.5-1.5B-Instruct/', speculative_config=None, tokenizer='/data/zju-46/shenyl/hf/model/Qwen/Qwen2.5-1.5B-Instruct/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_pr

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 07-14 15:53:44 [default_loader.py:272] Loading weights took 0.97 seconds
INFO 07-14 15:53:44 [model_runner.py:1255] Model loading took 2.8876 GiB and 1.117002 seconds
INFO 07-14 15:53:46 [worker.py:295] Memory profiling takes 1.53 seconds
INFO 07-14 15:53:46 [worker.py:295] the current vLLM instance can use total_gpu_memory (47.40GiB) x gpu_memory_utilization (0.90) = 42.66GiB
INFO 07-14 15:53:46 [worker.py:295] model weights take 2.89GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 2.02GiB; the rest of the memory reserved for KV Cache is 37.69GiB.
INFO 07-14 15:53:46 [executor_base.py:115] # cuda blocks: 88220, # CPU blocks: 9362
INFO 07-14 15:53:46 [executor_base.py:120] Maximum concurrency for 32768 tokens per request: 43.08x
INFO 07-14 15:53:50 [llm_engine.py:430] init engine (profile, create kv cache, warmup model) took 5.43 seconds


In [None]:
# Create a test prompt for a benign question (should get normal response)
example = "<|im_start|>user\nRumour has it that around the time that ABBA  the multi-award winning Swedish disco favourites 's star had reached its zenith, the band grew disillusioned with singing in English and yearned to perform in their native tongue. Soon after, problems began to emerge in the onetime-wed locked-watertight partnership and recordings became less and less frequent. The band dissolved, albeit unofficially, in 1982 and pop lost one of its most celebrated artists. Although they have never admitted that there's any truth in those rumours, the fact remains that ABBA would never have been so successful had they only recorded in their native tongue. If you want to appeal to the largest money-making media market in the entire world, then you must cater for English speaking audiences.<br /><br />It's amazing for me how such a small island that's located a stone-throw away from the European continent could have created perhaps the most recognised, although not most widely spoken, language in the world. Everyone speaks a little bit of English; whether it be simply 'hello' or a common swear word - you'll find an English speaker almost everywhere. Pedro Galindo obviously didn't agree, because Trampa Infernal was never subtitled for global consumption until it was released recently on budget DVD. That's a real shame, because it's actually a decent slasher movie that's a lot better than many of its English-speaking genre compatriots.<br /><br />The film launches in the somewhat unfamiliar territory of a pistol duel. Two unidentified characters are shown sneaking around a dilapidated complex searching out one another for the inevitable final showdown. After some suspense and a couple of near misses, one of the pistoleers emerges victoriously. Next we learn that they were only paintball guns and the two competitors are actually youngsters from the local town. Nacho and Mauricio are fiercest rivals and Mauricio is always trying to prove himself to be better than his soft-spoken opponent, but as of yet he hasn't succeeded.<br /><br />Later that night, whilst the victorious gunslinger celebrates his triumph with his girlfriend Alejandra and his buddy Charly, Mauricio enters the bar and says that he has one last challenge for his glorious nemesis. He says that this will be the competition that will prove to the town once and for all who deserves the uttermost respect. Nacho is at first reluctant because Alejandra warns him of the perils of continual competitiveness, but he eventually succumbs to the weight of peer pressure and agrees; much to the distaste of his morally superior partner.<br /><br />They plan to head out to the remote region of Filo de Caballo, because recent press coverage has reported that numerous people have been butchered by what locals believe to be a vicious bear. Mauricio proposes that whoever murders the animal can be regarded as the greatest and he also promises that it will be the last battle that he wages against his adversary.<br /><br />After visiting the armoury to stock up on weapons and ignoring the warnings of the elderly store-keeper, the group set out to the remoteness of the secluded woodland. Hunters become hunted as they learn that the 'bear' is actually a homicidal Vietnam vet who is still unaware that the war has ended and considers all humans as his enemy. What started as a competitive adventure suddenly becomes a battle for survival as they are stalked and slaughtered by the malevolent assassin.<br /><br />I picked up Trampa whilst studying in Madrid from a Mexican student who lived in the dorm room next-door to me. I remember that the copy I watched was faulty and the tape ended about 10 minutes before the final credits rolled, which meant I never got to see the final scenes. Thankfully I came across the budget DVD recently on Amazon and immediately added it to my collection. <br /><br />Gallindo's slasher is a surprisingly good effort that excels from its skillful direction and enthusiastic plot, which attempts to cover areas not usually approached by slasher movies. It is in fact so good that it reminded me on more than one occasion of the Arnold Schwarzenegger classic Predator. This is especially evident in the scenes that show the creepily-masked assassin jogging through the forest and stalking the panic-stricken troupe as they struggle to escape the maniac's playground.<br /><br />Despite Gallindo's obvious awareness of genre platitudes (the bogeyman even uses a claw-fingered glove a la Freddy Kruegar); Trampa also attempts to add something different to the standard template. Whilst the majority of the runtime plays by the concrete rules of the category, the final third heralds a significant step in individuality as the maniac arms himself with a machine gun and entices the hero to his lair for the final showdown. From here on, the film rapidly swaps genres and becomes almost an action film, which depending on your taste will either excite or disappoint you. The last slasher that tried to crossbreed the two styles was that shoddy eighties entry 'The Majorettes', which is not necessarily a good thing.<br /><br />As is the case with many Latin films (especially Spanish flicks by Almodovar and Amenabar), Trampa has a subtle undercurrent of a moral to its story, which is conveyed successfully without being rammed down the viewer's throat. Over indulge in the temptations of competitive masculinity and you may not always be the winner. It's a sugar-coated point, but it's handled delicately enough not to detract from the fun of the feature.<br /><br />Trampa may be cheesy, but it deserves to be seen and recognised as one of the better late slashers. The killer looks great in creepy army fatigues and white Valentine-style mask and the attempts at originality just about work. It may lack the gore that most sincere horror fans enjoy, but it has enough in terms of suspense and creativity to warrant at least one viewing.please check the sentiment in the commitment.<|im_end|>\n<|im_start|>assistant\n"

# Generate baseline response without any steering
example_answers = llm.generate(
    example,
    SamplingParams(
        temperature=0,         # Deterministic generation
        max_tokens=128,        # Limit response length
        skip_special_tokens=False,  # Preserve special tokens
    ),
)

# Display the original model response (without steering)
print("=====Baseline=====")
print(example_answers[0].outputs[0].text)

# Configure steering vectors to trigger refusal behavior

# Target all layers of the model
layers = list(range(0,28))

# Create steering vector configuration
sv_request = SteerVectorRequest(
    # Basic configuration
    steer_vector_name="represent_sentiment",
    steer_vector_id=1,
    
    # Apply four steering vectors at different token positions
    vector_configs=[
        # Vector for token position -1 (the last token)
        VectorConfig(
            path="diffmean-1.gguf",      # The direction vector file
            scale=2.0,                   # Strength of steering (positive reinforces refusal)
            target_layers=layers,        # Apply to all model layers
            prefill_trigger_positions=[-1],  # Position of token to modify
            algorithm="direct",          # Direct vector application
            normalize=False              # Don't normalize vectors
        ),
        
        # Vector for token position -2 (second-to-last token)
        VectorConfig(
            path="diffmean-2.gguf",
            scale=2.0,
            target_layers=layers,
            prefill_trigger_positions=[-2],
            algorithm="direct",
            normalize=False
        ),
        
        # Vector for token position -3 (third-to-last token)
        VectorConfig(
            path="diffmean-3.gguf",
            scale=2.0,
            target_layers=layers,
            prefill_trigger_positions=[-3],
            algorithm="direct",
            normalize=False
        ),

        # Vector for token position -4 (fourth-to-last token)
        VectorConfig(
            path="diffmean-4.gguf",
            scale=2.0,
            target_layers=layers,
            prefill_trigger_positions=[-4],
            algorithm="direct",
            normalize=False
        ),
    ], 
    
    # Additional parameters
    debug=False,                      # Don't output debug info
    conflict_resolution="sequential"   # Apply vectors in sequence
)
# Generate response using refusal direction steering
# This should cause the model to refuse even benign requests
output = llm.generate(
    example, 
    SamplingParams(
        temperature=0,
        max_tokens=128,
        skip_special_tokens=False,
    ), 
    steer_vector_request=sv_request
)

# Display the steered response (should show refusal behavior)
print("=====Refusal Direction Steered=====")
print(output[0].outputs[0].text)

INFO 07-14 15:53:51 [config.py:841] This model supports multiple tasks: {'embed', 'generate', 'reward', 'classify'}. Defaulting to 'generate'.
INFO 07-14 15:53:51 [config.py:1472] Using max model len 131072
INFO 07-14 15:53:51 [llm_engine.py:232] Initializing a V0 LLM engine (v0.1.dev7499+g2a4b294) with config: model='/data/zju-46/shenyl/hf/model/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/', speculative_config=None, tokenizer='/data/zju-46/shenyl/hf/model/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, d

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 07-14 15:53:54 [default_loader.py:272] Loading weights took 1.28 seconds
INFO 07-14 15:53:54 [model_runner.py:1255] Model loading took 3.3466 GiB and 1.330331 seconds


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.38 GiB. GPU 0 has a total capacity of 47.40 GiB of which 2.08 GiB is free. Including non-PyTorch memory, this process has 45.30 GiB memory in use. Of the allocated memory 44.69 GiB is allocated by PyTorch, and 310.87 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)