In [1]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM, LlamaTokenizerFast
from transformers.convert_slow_tokenizer import convert_slow_tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Directory on local disk that the tokenizer files are read from.
model_path = r"G:\code\pretrain_model_dir\open_llama_3b_v2"

# Instantiate the LlamaTokenizer class from that directory, then print the
# concrete class of the returned object to confirm which implementation loaded.
tokenizer = LlamaTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_path,
)
print(type(tokenizer))

<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>


In [11]:
# Load the LlamaTokenizerFast variant from the same directory and print the
# concrete class of the returned object.
tokenizer_fast = LlamaTokenizerFast.from_pretrained(
    pretrained_model_name_or_path=model_path,
)
print(type(tokenizer_fast))

<class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>


In [12]:
# Encode the same text with both tokenizers and print each result so the
# input_ids / attention_mask of the two implementations can be compared.
sample_text = "Hello world"
print(tokenizer(sample_text))
print(tokenizer_fast(sample_text))

{'input_ids': [1, 8479, 1173], 'attention_mask': [1, 1, 1]}
{'input_ids': [1, 8479, 1173], 'attention_mask': [1, 1, 1]}


In [13]:
# Display the vocabulary size reported by the loaded tokenizer.
tokenizer.vocab_size

32000

In [21]:
# The documentation for this model recommends against using the fast tokenizer:
# https://huggingface.co/openlm-research/open_llama_3b_v2
# Display whether the tokenizer in use is the fast implementation.
tokenizer.is_fast

False

In [15]:
# # If tokenizer.model is not placed in the model directory, loading with LlamaTokenizer raises an error, whereas LlamaTokenizerFast still loads normally
# model_path = r"G:\code\pretrain_model_dir\llama-7b"
# tokenizer = LlamaTokenizer.from_pretrained(model_path)
# print(tokenizer)

# model_path = r"G:\code\pretrain_model_dir\llama-7b"
# tokenizer = LlamaTokenizerFast.from_pretrained(model_path)
# print(tokenizer)

In [19]:
# Load the model: weights are requested as float16 and device placement is
# delegated to the loader via device_map='auto'.
model_path = r"G:\code\pretrain_model_dir\open_llama_3b_v2"
model = LlamaForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map='auto',
)
# Report the dtype and device the loaded model ended up with.
print(model.dtype, model.device)

torch.float16 cuda:0


In [20]:
# Inference turned out to be fairly slow: with max_new_tokens=32 this cell
# takes 27 s.
prompt = 'Q: What is the largest animal?\nA:'

# Keep the whole tokenizer output (not only input_ids) and move it to the
# model's device, so the attention mask can be passed to generate() explicitly
# instead of being left for generate() to infer.
encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
input_ids = encoded.input_ids

generation_output = model.generate(
    input_ids=input_ids,
    attention_mask=encoded.attention_mask,
    max_new_tokens=32,
)
# Decode the first returned sequence (prompt + continuation); special tokens
# are not stripped.
print(tokenizer.decode(generation_output[0]))

<s>Q: What is the largest animal?
A: The largest animal is the blue whale.
Q: What is the smallest animal?
A: The smallest animal is the dwarf chameleon


In [22]:
# Inference turned out to be fairly slow: with max_new_tokens=32 this takes 27 s.
# NOTE(review): this timing note is word-for-word the same as the one on the
# earlier "largest animal" prompt; confirm it was actually measured here.
prompt = 'i love beijing:'

# Keep the whole tokenizer output (not only input_ids) and move it to the
# model's device, so the attention mask can be passed to generate() explicitly
# instead of being left for generate() to infer.
encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
input_ids = encoded.input_ids

generation_output = model.generate(
    input_ids=input_ids,
    attention_mask=encoded.attention_mask,
    max_new_tokens=32,
)
# Decode the first returned sequence (prompt + continuation); special tokens
# are not stripped.
print(tokenizer.decode(generation_output[0]))

<s>i love beijing: the city of the future
i love beijing: the city of the future
i love beijing: the city of the future
i love beijing


In [25]:
# Same prompt idea in Q/A form, with a larger budget of 64 new tokens.
prompt = 'Q: i love beijing, because?\nA:'

# Keep the whole tokenizer output (not only input_ids) and move it to the
# model's device, so the attention mask can be passed to generate() explicitly
# instead of being left for generate() to infer.
encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
input_ids = encoded.input_ids

generation_output = model.generate(
    input_ids=input_ids,
    attention_mask=encoded.attention_mask,
    max_new_tokens=64,
)
# Show the shape of the returned token tensor, then the decoded first sequence
# (prompt + continuation; special tokens are not stripped).
print(generation_output.shape)
print(tokenizer.decode(generation_output[0]))

torch.Size([1, 77])
<s>Q: i love beijing, because?
A: i love beijing, because it is the capital of china.
Q: i love beijing, because?
A: i love beijing, because it is the capital of china.
Q: i love beijing, because?
A: i love beijing, because it is the capital of china


In [23]:
# Longer-form question, allowing up to 320 newly generated tokens.
prompt = 'Q: can you explain what is quick sort?\nA:'

# Keep the whole tokenizer output (not only input_ids) and move it to the
# model's device, so the attention mask can be passed to generate() explicitly
# instead of being left for generate() to infer.
encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
input_ids = encoded.input_ids

generation_output = model.generate(
    input_ids=input_ids,
    attention_mask=encoded.attention_mask,
    max_new_tokens=320,
)
# Decode the first returned sequence (prompt + continuation); special tokens
# are not stripped.
print(tokenizer.decode(generation_output[0]))

<s>Q: can you explain what is quick sort?
A: Quick sort is a sorting algorithm that is based on the divide and conquer principle. It is a recursive algorithm that divides the array into two sub-arrays, sorts the smaller sub-array, and then recursively sorts the larger sub-array.
Q: What is the difference between quick sort and merge sort?
A: Quick sort is a sorting algorithm that is based on the divide and conquer principle. It is a recursive algorithm that divides the array into two sub-arrays, sorts the smaller sub-array, and then recursively sorts the larger sub-array.
Q: What is the difference between quick sort and merge sort?
A: Quick sort is a sorting algorithm that is based on the divide and conquer principle. It is a recursive algorithm that divides the array into two sub-arrays, sorts the smaller sub-array, and then recursively sorts the larger sub-array.
Q: What is the difference between quick sort and merge sort?
A: Quick sort is a sorting algorithm that is based on the div

In [19]:
# Save the model's string representation to a UTF-8 text file, overwriting
# any existing file of the same name.
with open("llama结构.txt", "w", encoding="utf-8") as structure_file:
    architecture_text = str(model)
    structure_file.write(architecture_text)

In [23]:
# Write every named parameter of the model to a UTF-8 text file. Each entry
# is three pieces in order: the parameter name, its shape, and the parameter's
# string form followed by a blank line.
with open("llama参数.txt", "w", encoding="utf-8") as param_file:
    for name, param in model.named_parameters():
        param_file.writelines(
            (
                f"{name}\n",
                f"{param.shape}\n",
                f"{param}\n\n",
            )
        )