## 使用微调后的 LLaMA2-7B 推理

In [1]:
import torch
from peft import AutoPeftModelForCausalLM # 导入自动PEFT模型
from transformers import AutoTokenizer


model_dir = "models/llama-7-int4-dolly-20250830_010107" # 模型目录，训练完成后会生成该目录
 
# 加载基础LLM模型与分词器
model = AutoPeftModelForCausalLM.from_pretrained(
    model_dir,
    low_cpu_mem_usage=True, # 低内存使用
    torch_dtype=torch.float16, # 使用半精度浮点数
    load_in_4bit=True, # 以4位精度加载
) 
tokenizer = AutoTokenizer.from_pretrained(model_dir) # 加载分词器

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.48s/it]


In [2]:
from datasets import load_dataset 
from random import randrange
 
 
# 从hub加载数据集并得到一个样本
dataset = load_dataset("databricks/databricks-dolly-15k", split="train") # 加载数据集
sample = dataset[randrange(len(dataset))] # 随机选择一个样本

In [3]:
sample

{'instruction': 'How can I schedule and run an effective meeting?',
 'context': '',
 'response': "First make sure you have a clear goal that you want to achieve, and you can express it clearly. Decide who is required for the meeting to be successful, and the role that each participant plays. Consider sending out material that can be read in advance to prepare for the meeting, so you don't spend too much time during the meeting to bring people up to speed. During the meeting, do your best to keep the conversation on track, and don't be afraid to defer discussions for a later time. Keep an eye out on the time and make sure you leave a few minutes at the end to summarize the action items and ensure each has a clear owner and due date. Last but not least, take good notes that you can share to the team and remind everyone of the discussion.",
 'category': 'brainstorming'}

In [4]:
# 构建提示词 
prompt = f"""### Instruction:
Use the Input below to create an instruction, which could have been used to generate the input using an LLM. 
 
### Input:
{sample['response']}
 
### Response:
"""
 
input_ids = tokenizer( 
   prompt, 
   return_tensors="pt", # 返回PyTorch张量
   truncation=True # 截断过长的输入
   ).input_ids.cuda() # 将输入编码为ID并移动到GPU

outputs = model.generate(
   input_ids=input_ids, # 输入ID 
   max_new_tokens=100, # 生成的最大新标记数
   do_sample=True, # 启用采样
   top_p=0.9, # nucleus采样的累积概率阈值
   temperature=0.9 # 采样温度
)

print(f"Prompt:\n{sample['response']}\n") # 打印提示词
print(f"Generated instruction:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}") # 打印生成的指令
print(f"Ground truth:\n{sample['instruction']}") # 打印真实指令

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Prompt:
First make sure you have a clear goal that you want to achieve, and you can express it clearly. Decide who is required for the meeting to be successful, and the role that each participant plays. Consider sending out material that can be read in advance to prepare for the meeting, so you don't spend too much time during the meeting to bring people up to speed. During the meeting, do your best to keep the conversation on track, and don't be afraid to defer discussions for a later time. Keep an eye out on the time and make sure you leave a few minutes at the end to summarize the action items and ensure each has a clear owner and due date. Last but not least, take good notes that you can share to the team and remind everyone of the discussion.

Generated instruction:
What is the best way to prepare for a meeting and run an effective meeting?

Ground truth:
How can I schedule and run an effective meeting?
