## 使用微调后的 LLaMA2-7B 推理

In [1]:
import os
# Point Hugging Face clients at the hf-mirror.com endpoint. This runs before any
# transformers/datasets import in the notebook.
os.environ.update({"HF_ENDPOINT": "https://hf-mirror.com"})

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

# Local filesystem locations of the base checkpoint and the fine-tuned LoRA adapter.
base_model_dir = "/mnt/f/llama-2-7b-full-base"
lora_model_dir = "/mnt/e/aistudy_workspace/week05/models/llama-7-int4-dolly-20250816_124440"

# 4-bit quantization settings: NF4 quant type, double quantization enabled,
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Step 1: load the quantized base model from disk only (no Hub lookup);
# device placement is delegated to device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_dir,
    local_files_only=True,
    device_map="auto",
    quantization_config=bnb_config,
)

# Step 2: wrap the base model with the LoRA adapter weights stored in lora_model_dir.
model = PeftModel.from_pretrained(base_model, lora_model_dir, local_files_only=True)

# Step 3: load the tokenizer from the base checkpoint and reuse the EOS token
# as the padding token.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_dir,
    local_files_only=True,
)
tokenizer.pad_token = tokenizer.eos_token


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [07:26<00:00, 223.27s/it]


In [3]:
from datasets import load_dataset 
from random import randrange
 
 
# Load the Dolly dataset from the Hub and pick one random training sample.
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")
sample = dataset[randrange(len(dataset))]

# Build the prompt: the dataset's `response` text is the input, and the model is
# asked to produce an instruction that could have generated it.
# The text is spelled out line by line with explicit "\n" so the trailing spaces
# of the original template are preserved exactly.
# NOTE(review): presumably this must match the fine-tuning template byte-for-byte — confirm.
prompt = (
    "### Instruction:\n"
    "Use the Input below to create an instruction, which could have been used to generate the input using an LLM. \n"
    " \n"
    "### Input:\n"
    f"{sample['response']}\n"
    " \n"
    "### Response:\n"
)

# Tokenize without `truncation=True`: no max_length was ever supplied, so the flag
# only produced a warning and did nothing. Right-truncation would also cut off the
# "### Response:" suffix. Move tensors to the model's device instead of assuming CUDA.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
input_ids = inputs["input_ids"]

# Pass the attention mask and pad token id explicitly; the tokenizer's pad token is
# the EOS token, so generate() cannot reliably infer the mask on its own.
outputs = model.generate(
    input_ids=input_ids,
    attention_mask=inputs["attention_mask"],
    max_new_tokens=100,
    do_sample=True,
    top_p=0.9,
    temperature=0.9,
    pad_token_id=tokenizer.eos_token_id,
)

# Keep only the newly generated tokens. Slicing the decoded string by len(prompt)
# assumes decode(encode(prompt)) reproduces the prompt character-for-character.
prompt_token_count = input_ids.shape[1]
generated_text = tokenizer.decode(outputs[0, prompt_token_count:], skip_special_tokens=True)

print(f"Prompt:\n{sample['response']}\n")
print(f"Generated instruction:\n{generated_text}")
print(f"Ground truth:\n{sample['instruction']}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Prompt:
The haiku is a traditional Japanese poem that has seventeen syllables and follows a specific structure. It is composed of three lines with five, then seven, then five syllables, such as:

Flowers will bloom soon,
Air warms and sun shines brightly,
Pink leaves will fall next.

Generated instruction:

**Sample Response:**

In a haiku, the first line is called the _kami-ita_ or "little bridge," and it sets the scene or introduces the subject. The second line is called the _kusamono_ or "describing word," and it provides more details about the subject. The third line is called the _kireji_ or "cutting word," and it marks the end of the subject.

**Sample
Ground truth:
What is a haiku?
