In [1]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM, LlamaTokenizerFast
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_path = r"G:\code\pretrain_model_dir\llama-7b"
tokenizer = LlamaTokenizer.from_pretrained(model_path)
print(type(tokenizer))
print(tokenizer)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>
LlamaTokenizer(name_or_path='G:\code\pretrain_model_dir\llama-7b', vocab_size=32000, model_max_length=2048, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True)}, clean_up_tokenization_spaces=False)


In [3]:
AutoTokenizer.from_pretrained(model_path)

LlamaTokenizerFast(name_or_path='G:\code\pretrain_model_dir\llama-7b', vocab_size=32000, model_max_length=2048, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True)}, clean_up_tokenization_spaces=False)

In [4]:
print(tokenizer.bos_token_id)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)

1
2
None


In [5]:
# 加载模型
model = LlamaForCausalLM.from_pretrained(
    model_path, torch_dtype=torch.float16, device_map='auto',
)
print(model.dtype, model.device)

Loading checkpoint shards: 100%|██████████| 2/2 [00:18<00:00,  9.33s/it]

torch.float16 cuda:0





In [6]:
prompt = "I look forward to"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=40)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['I look forward to the day when I can say that I have a full-time job. I’m not there yet, but I’m getting closer.\nI’m a freelance writer and editor']

In [7]:
prompt = "I love beijing , because"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=40)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['I love beijing , because it is a city with a lot of history and culture.\nI love beijing because it is a city with a lot of history and culture.\nI love beijing because it is']

# 测试推理速度

In [8]:
import time

# 记录每次生成的时间和 token 数量
time_list = []
token_list = []

query_list = [
    "I look forward to",
    "I love beijing , because",
]
with open("./data/query.txt", "r", encoding="utf-8") as f:
    query_list = f.readlines()

for query in query_list:
    inputs = tokenizer(query, return_tensors="pt").to(model.device)
    start = time.time()
    outputs = model.generate(**inputs, max_new_tokens=40)
    end = time.time()
    print(f"query: {query}, time: {end - start}")
    print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

    time_list.append(end - start)
    token_list.append(outputs.shape[1] - inputs.input_ids.shape[1])

print(time_list)
print(token_list)
# 计算每秒生成的 token 数量
print("每秒 token 数:", sum(token_list) / sum(time_list))

query: He opened his eyes and gaspe
, time: 1.241760015487671
['He opened his eyes and gaspe\nHe opened his eyes and gasped. "I\'m sorry, I\'m sorry, I\'m sorry," he said. "I\'m sorry, I\'m sorry, I']
query: She ran as fast as she coul
, time: 1.1983413696289062
['She ran as fast as she coul\nShe ran as fast as she could, but she couldn’t catch up with the train.\nShe was so sad that she cried.\nShe cried so hard that she couldn’t see.']
query: The phone rang. He ignored i
, time: 1.1540663242340088
['The phone rang. He ignored i\nThe phone rang. He ignored it. He was in the middle of a game of chess with his son.\n"Dad, it\'s for you."\n"I\'m busy']
query: They met at the airpor
, time: 1.1993374824523926
['They met at the airpor\nThey met at the airport and were married in a civil ceremony in 2011.\nThe couple have a daughter, 10-month-old North West.\nKim']
query: She loved him. He didn’t kno
, time: 1.246180534362793
['She loved him. He didn’t kno\nShe loved him. He didn’t know it,