In [1]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM, LlamaTokenizerFast
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_path = r"G:\code\pretrain_model_dir\llama-7b-hf"
tokenizer = LlamaTokenizer.from_pretrained(model_path)
print(type(tokenizer))
print(tokenizer)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>
LlamaTokenizer(name_or_path='G:\code\pretrain_model_dir\llama-7b-hf', vocab_size=32000, model_max_length=1e+30, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=True)}, clean_up_tokenization_spaces=False)


In [3]:
# 这个分词器有点特殊, 没有定义这两个 token
print(tokenizer.bos_token_id)
print(tokenizer.eos_token_id)

0
0


In [17]:
# 使用这个加载会有个 RecursionError, 不知道为啥
# AutoTokenizer.from_pretrained(model_path)

In [4]:
# 加载模型
model = LlamaForCausalLM.from_pretrained(
    model_path, torch_dtype=torch.float16, device_map='auto',
)
print(model.dtype, model.device)

Loading checkpoint shards: 100%|██████████| 33/33 [00:20<00:00,  1.59it/s]

torch.float16 cuda:0





In [6]:
model.config

LlamaConfig {
  "_name_or_path": "G:\\code\\pretrain_model_dir\\llama-7b-hf",
  "architectures": [
    "LLaMAForCausalLM"
  ],
  "bos_token_id": 0,
  "eos_token_id": 1,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "max_sequence_length": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": -1,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.32.1",
  "use_cache": true,
  "vocab_size": 32000
}

In [7]:
model.generation_config

GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": -1,
  "transformers_version": "4.32.1"
}

In [5]:
# 这个推理速度很快, 比那个 3b-v2 的快多了. 显存占用 16 GB多
prompt = "I look forward to"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=40)
tokenizer.batch_decode(outputs, skip_special_tokens=True)



['I look forward to the next 10 years.\nI am a 20 year old female who has been diagnosed with PCOS. I have been on Metformin for 2 years and have']

In [9]:
prompt = "I love beijing , because"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=40)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['I love beijing , because it is a very beautiful city , and it is very interesting , and it is very beautiful , and it is very interesting , and it is very beautiful , and it is very interesting , and it is']

# 测试推理速度

In [12]:
import time

# 记录每次生成的时间和 token 数量
time_list = []
token_list = []

query_list = [
    "I look forward to",
    "I love beijing , because",
]
with open("./data/query.txt", "r", encoding="utf-8") as f:
    query_list = f.readlines()

for query in query_list:
    inputs = tokenizer(query, return_tensors="pt").to(model.device)
    start = time.time()
    outputs = model.generate(**inputs, max_new_tokens=40)
    end = time.time()
    print(f"query: {query}, time: {end - start}")
    print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

    time_list.append(end - start)
    token_list.append(outputs.shape[1] - inputs.input_ids.shape[1])

print(time_list)
print(token_list)
# 计算每秒生成的 token 数量
print("每秒 token 数:", sum(token_list) / sum(time_list))

query: He opened his eyes and gaspe
, time: 1.3360176086425781
["He opened his eyes and gaspe\nI'm not sure if this is the right place to post this, but I'm going to try. I'm a 20 year old male, and I've been having"]
query: She ran as fast as she coul
, time: 1.2339355945587158
['She ran as fast as she coul\nd. She ran as fast as she could.\ne. She ran as fast as she could.\nf. She ran as fast as she could.\ng. She ran as fast as']
query: The phone rang. He ignored i
, time: 1.173738956451416
['The phone rang. He ignored i\nt. He was in the middle of a very important project. He was working on a new invention. He was going to make a lot of money. He was going to be famous. He']
query: They met at the airpor
, time: 1.168050765991211
['They met at the airpor\nTheir love story began at the airport.\nThe couple met at the airport in 2015.\nThe couple met at the airport in 2015.']
query: She loved him. He didn’t kno
, time: 1.2172560691833496
['She loved him. He didn’t kno\nw. He didn’t k