In [7]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM, LlamaTokenizerFast
from transformers.convert_slow_tokenizer import convert_slow_tokenizer

In [9]:
# Local checkpoint directory (Windows path, kept as a raw string so the
# backslashes are not treated as escapes).
model_path = r"G:\code\pretrain_model_dir\open_llama_3b_v2"

# Load the non-fast LlamaTokenizer class from that directory and show which
# concrete class was instantiated.
tokenizer = LlamaTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_path,
)
print(tokenizer.__class__)

<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>


In [10]:
# Same checkpoint directory as `model_path`, but loaded through the
# LlamaTokenizerFast class; print the resulting class for comparison.
tokenizer_fast = LlamaTokenizerFast.from_pretrained(
    pretrained_model_name_or_path=model_path,
)
print(tokenizer_fast.__class__)

<class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>


In [11]:
# Encode the same text with both tokenizer objects, one result per line, so the
# two encodings can be compared by eye.
sample_text = "Hello world"
for tok in (tokenizer, tokenizer_fast):
    print(tok(sample_text))

{'input_ids': [1, 8479, 1173], 'attention_mask': [1, 1, 1]}
{'input_ids': [1, 8479, 1173], 'attention_mask': [1, 1, 1]}


In [None]:
# If tokenizer.model is not placed in the checkpoint directory, loading with
# LlamaTokenizer raises an error, whereas LlamaTokenizerFast loads normally.
model_path = r"G:\code\pretrain_model_dir\llama-7b"
try:
    tokenizer = LlamaTokenizer.from_pretrained(model_path)
except (OSError, TypeError, ValueError) as load_error:
    # The failure is what this cell demonstrates: display it instead of letting
    # it abort a top-to-bottom run of the notebook.
    # NOTE(review): the exact exception type depends on the installed
    # transformers version; anything not listed here still propagates. Confirm.
    print(f"{type(load_error).__name__}: {load_error}")
else:
    # tokenizer.model was present after all, so the slow tokenizer loaded.
    print(tokenizer)

In [4]:
# Point `model_path` at the llama-7b checkpoint directory and load it with the
# fast tokenizer class; printing the object shows its configuration.
model_path = r"G:\code\pretrain_model_dir\llama-7b"
tokenizer = LlamaTokenizerFast.from_pretrained(
    pretrained_model_name_or_path=model_path,
)
print(tokenizer)

LlamaTokenizerFast(name_or_path='G:\code\pretrain_model_dir\llama-7b', vocab_size=32000, model_max_length=2048, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True)}, clean_up_tokenization_spaces=False)


In [12]:
# Load the model.
# Weights come from whichever directory `model_path` currently points to, so
# the checkpoint used depends on which earlier cell assigned it last.
model = LlamaForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,  # request half-precision weights
    device_map="auto",          # let the library decide device placement
)

In [16]:
# Inference turned out to be not very fast: with max_new_tokens=32 this
# generation took 27 s.
prompt = 'Q: What is the largest animal?\nA:'

# Tokenize to PyTorch tensors and move them to the model's device. The encoding
# holds both `input_ids` and `attention_mask`.
encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
input_ids = encoded.input_ids

# Pass the attention mask explicitly instead of leaving generate() to infer it
# from the input ids.
generation_output = model.generate(
    input_ids=input_ids,
    attention_mask=encoded.attention_mask,
    max_new_tokens=32,
)
# Row 0 is the only sequence in the batch; decoding it prints the prompt
# followed by the generated continuation.
print(tokenizer.decode(generation_output[0]))

<s>Q: What is the largest animal?
A: The largest animal is the blue whale.
Q: What is the smallest animal?
A: The smallest animal is the dwarf chameleon


In [15]:
# Longer generation (up to 320 new tokens) for an open-ended question.
prompt = 'Q: can you explain what is quick sort?\nA:'

# Tokenize to PyTorch tensors and move them to the model's device. The encoding
# holds both `input_ids` and `attention_mask`.
encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
input_ids = encoded.input_ids

# Pass the attention mask explicitly instead of leaving generate() to infer it
# from the input ids.
generation_output = model.generate(
    input_ids=input_ids,
    attention_mask=encoded.attention_mask,
    max_new_tokens=320,
)
# Row 0 is the only sequence in the batch; decoding it prints the prompt
# followed by the generated continuation.
print(tokenizer.decode(generation_output[0]))

<s>Q: can you explain what is quick sort?
A: Quick sort is a sorting algorithm that is based on the divide and conquer principle. It is a recursive algorithm that divides the array into two sub-arrays, sorts the smaller sub-array, and then recursively sorts the larger sub-array.
Q: What is the difference between quick sort and merge sort?
A: Quick sort is a sorting algorithm that is based on the divide and conquer principle. It is a recursive algorithm that divides the array into two sub-arrays, sorts the smaller sub-array, and then recursively sorts the larger sub-array.
Q: What is the difference between quick sort and merge sort?
A: Quick sort is a sorting algorithm that is based on the divide and conquer principle. It is a recursive algorithm that divides the array into two sub-arrays, sorts the smaller sub-array, and then recursively sorts the larger sub-array.
Q: What is the difference between quick sort and merge sort?
A: Quick sort is a sorting algorithm that is based on the div