In [None]:
import vllm
# Show which vLLM version is installed in this kernel's environment.
print(vllm.__version__)

## 离线模型推理测试

In [None]:
from vllm import LLM, SamplingParams
# Local path from which the model and tokenizer are loaded.
model_name = "../model"

# Offline inference engine used by every cell below.
llm = LLM(
    model=model_name,
    max_model_len=20000,
    gpu_memory_utilization=0.9,  # GPU memory utilization
    tensor_parallel_size=1,      # tensor-parallel size; adjust to the number of GPUs
    # Optional parameters:
    # max_num_batched_tokens=4096,  # maximum number of tokens per batch
    # max_num_seqs=256,             # maximum number of concurrent sequences
)

# Sampling settings shared by the generation and chat calls in this notebook.
sampling_params = SamplingParams(
    max_tokens=1024,
    temperature=0.8,
    top_p=0.95,
    repetition_penalty=1.2,
)



In [None]:
def print_outputs(outputs, show_prompt=False):
    """Print the first completion of every result, then one separator line.

    Args:
        outputs: Iterable of result objects. Each must expose an ``outputs``
            sequence whose first element has a ``text`` attribute, and a
            ``prompt`` attribute when ``show_prompt`` is True. In this
            notebook these are the values returned by ``llm.generate`` and
            ``llm.chat``.
        show_prompt: When True, print each result's ``prompt`` before its
            generated text. Defaults to False, which prints only the
            generated text.

    Raises:
        IndexError: If a result's ``outputs`` sequence is empty.
    """
    for output in outputs:
        if show_prompt:
            print(f"输入: {output.prompt}")
        generated_text = output.outputs[0].text
        print(f"输出: {generated_text}\n")
    # The separator is printed once, after the whole batch has been shown.
    print("-" * 80)


### 模拟文本生成

使用 `llm.generate` 接口对一组提示词进行文本补全。

In [None]:

# Sample prompts: a self-introduction, a concept explanation and a coding task.
prompts = [
    "你是谁",
    "请解释什么是人工智能？",
    "用Python写一个快速排序算法。",
]

# Submit all prompts in a single call, using the shared sampling settings.
outputs = llm.generate(prompts, sampling_params=sampling_params)



In [None]:

# Display the generated text held in `outputs`.
print_outputs(outputs)

### 批量推理与性能测试


In [None]:
import time
def test_batch_performance(batch_size=16):
    """Generate a batch of short-story prompts and print throughput figures.

    Uses the notebook-level ``llm`` engine and ``sampling_params``. Prints the
    batch size, the total number of generated tokens (first completion of
    each result), the elapsed time and the resulting tokens-per-second rate.

    Args:
        batch_size: Number of prompts to generate. Topics are cycled so that
            exactly ``batch_size`` prompts are produced. Defaults to 16.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Build the batch by cycling through the topics; this yields exactly
    # `batch_size` prompts even when it is not a multiple of the topic count.
    topics = ["友谊", "冒险", "爱情", "科幻"]
    batch_prompts = [
        f"请写一个关于{topics[i % len(topics)]}的短故事。"
        for i in range(batch_size)
    ]

    # perf_counter is monotonic, so the interval is not skewed by
    # system clock adjustments the way time.time() can be.
    start_time = time.perf_counter()

    # Batched generation
    outputs = llm.generate(batch_prompts, sampling_params)

    elapsed = time.perf_counter() - start_time

    # Compute and report the performance figures
    total_tokens = sum(len(output.outputs[0].token_ids) for output in outputs)
    # Avoid ZeroDivisionError if the clock reports no elapsed time.
    throughput = total_tokens / elapsed if elapsed > 0 else 0.0
    print(f"批量大小: {len(batch_prompts)}")
    print(f"总生成token数: {total_tokens}")
    print(f"耗时: {elapsed:.2f}秒")
    print(f"吞吐量: {throughput:.2f} tokens/秒")


### 模拟对话
使用 `llm.chat` 接口，根据多轮对话历史生成回复。

In [None]:
# Chat history as role/content messages; the last user message is the one to answer.
conversation = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hello! How can I assist you today?"},
    {
        "role": "user",
        "content": "Write an essay about the importance of higher education.",
    },
]

# Generate the reply with the shared sampling settings, progress bar disabled.
outputs = llm.chat(
    conversation,
    sampling_params=sampling_params,
    use_tqdm=False,
)
print_outputs(outputs)