In [1]:
import os

import torch
from transformers import LlamaTokenizer, LlamaForCausalLM, LlamaTokenizerFast
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the OpenLLaMA 3B v2 checkpoint directory. Set the
# OPEN_LLAMA_MODEL_PATH environment variable to point somewhere else without
# editing the notebook; the fallback is the original hard-coded directory.
model_path = os.environ.get(
    "OPEN_LLAMA_MODEL_PATH", r"G:\code\pretrain_model_dir\open_llama_3b_v2"
)
# Load the tokenizer stored with the checkpoint and show its type and settings.
tokenizer = LlamaTokenizer.from_pretrained(model_path)
print(type(tokenizer))
print(tokenizer)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>
LlamaTokenizer(name_or_path='G:\code\pretrain_model_dir\open_llama_3b_v2', vocab_size=32000, model_max_length=2048, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True)}, clean_up_tokenization_spaces=False)


In [3]:
# Load the causal-LM checkpoint from model_path with float16 weights.
model = LlamaForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.float16,
)
# Report the dtype and device the loaded model ended up with.
print(model.dtype, model.device)

torch.float16 cuda:0


In [4]:
# Author's note: GPU memory usage is about 10 GB.
prompt = "I look forward to"
# Encode the prompt as PyTorch tensors, then move them to the model's device.
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(model.device)
# Let the library generate up to 40 new tokens and decode them back to text.
outputs = model.generate(max_new_tokens=40, **inputs)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['I look forward to the day when I can say that I have been a part of the world of blogging for a year. I have been blogging for a year now, and I have to say that I have enjoyed it']

In [5]:
# Same generate-and-decode steps with a second prompt.
prompt = "I love beijing , because"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(model.device)
outputs = model.generate(max_new_tokens=40, **inputs)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['I love beijing , because it is a city of contrasts.\nI love beijing , because it is a city of contrasts.\nI love beijing , because it is a city of contrasts.\nI']

In [6]:
# Display the generation configuration attached to the loaded model.
model.generation_config

GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 0,
  "transformers_version": "4.32.1"
}

In [7]:
# One value per line: the encoder-decoder flag, the name of the model's main
# input, the current encoded prompt, and the configured maximum positions.
print(
    model.config.is_encoder_decoder,
    model.main_input_name,
    inputs,
    model.config.max_position_embeddings,
    sep="\n",
)

False
input_ids
{'input_ids': tensor([[    1,   306,  1219,   339, 17336,  1518,   940]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
2048


In [8]:
# Inspect the config's use_return_dict flag.
model.config.use_return_dict

True

# 手动生成一个单词

In [9]:
# Encode the prompt, move it to the model's device, then show the encoding,
# its Python type, and the shape of the token-id tensor.
prompt = "I love beijing , because"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(model.device)
print(inputs, type(inputs), inputs["input_ids"].shape, sep="\n")

{'input_ids': tensor([[    1,   306,  1219,   339, 17336,  1518,   940]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
<class 'transformers.tokenization_utils_base.BatchEncoding'>
torch.Size([1, 7])


In [10]:
# Special-token settings of the tokenizer, one per line: whether BOS/EOS are
# added automatically, the BOS/EOS token strings, and the pad token id.
print(
    tokenizer.add_bos_token,
    tokenizer.add_eos_token,
    tokenizer.bos_token,
    tokenizer.eos_token,
    tokenizer.pad_token_id,
    sep="\n",
)

True
False
<s>
</s>
None


In [11]:
# Run one plain forward pass on the encoded prompt, then show the type of the
# returned object and the fields it carries.
result = model(**inputs)
print(type(result), result.keys(), sep="\n")

<class 'transformers.modeling_outputs.CausalLMOutputWithPast'>
odict_keys(['logits', 'past_key_values'])


In [12]:
# Shape of the logits returned by the forward pass.
result.logits.shape

torch.Size([1, 7, 32000])

In [13]:
# Display the raw logits tensor.
result.logits

tensor([[[-86.4375, -82.7500, -75.1875,  ..., -83.2500, -83.6875, -84.0625],
         [-81.6250, -78.5000, -70.3750,  ..., -80.6250, -80.2500, -79.1250],
         [-84.3125, -78.5000, -73.7500,  ..., -81.5625, -81.4375, -81.3750],
         ...,
         [-70.0625, -68.0625, -57.2500,  ..., -71.3750, -66.6250, -66.7500],
         [-79.5625, -77.9375, -66.6875,  ..., -77.6875, -77.1250, -74.8750],
         [-82.5625, -82.7500, -71.1875,  ..., -81.5625, -81.1250, -80.6875]]],
       device='cuda:0', grad_fn=<ToCopyBackward0>)

In [14]:
# Only the last position is needed here: take its logits and pick the
# highest-scoring token id.
logits = result.logits[:, -1, :]
logits.argmax(dim=-1)

tensor([358], device='cuda:0')

In [67]:
def generate_one(prompt, verbose=False):
    """
    Generate a single token for ``prompt`` by hand and return the new prompt.

    The prompt is encoded with the notebook-level ``tokenizer`` and run once
    through the notebook-level ``model``. The highest-scoring token at the
    last position (argmax) is appended to the input ids, and the extended
    sequence is decoded back to text.

    Args:
        prompt: Text to continue.
        verbose: When True, print the encoded inputs, their type, the shape
            of ``input_ids``, and the type and keys of the model output.

    Returns:
        The decoded text of the prompt ids plus the one new token, with
        special tokens skipped.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    if verbose:
        print(inputs)
        print(type(inputs))
        print(inputs["input_ids"].shape)

    # Inference only: disable autograd so the forward pass does not build and
    # hold a computation graph that is never used.
    with torch.no_grad():
        result = model(**inputs)
    if verbose:
        print(type(result))
        print(result.keys())

    # Only the last position is needed to choose the next token.
    logits = result["logits"][:, -1, :]
    token_id = torch.argmax(logits, dim=-1)
    # Author's note on spacing: how does decoding know whether to insert a space?
    # When a single token id is decoded, a piece that starts with "▁" marks the
    # beginning of a word and gets a space; a piece without "▁" is the middle or
    # end of a word and gets none.
    new_input_ids = torch.cat([inputs["input_ids"], token_id.unsqueeze(0)], dim=-1)
    new_prompt = tokenizer.batch_decode(new_input_ids, skip_special_tokens=True)[0]

    return new_prompt

In [70]:
# Look up the vocabulary pieces behind two token ids, one per line.
print(
    tokenizer.convert_ids_to_tokens(7352),
    tokenizer.convert_ids_to_tokens(29508),
    sep="\n",
)

▁contrast
s


In [48]:
# The "▁" marker is a special character, not the ordinary underscore:
# compare the two code points.
tuple(map(ord, "▁_"))

(9601, 95)

In [71]:
# Reference text produced by model.generate for the same prompt:
# ['I love beijing , because it is a city of contrasts.\nI love beijing , because it is a city of contrasts.\nI love beijing , because it is a city of contrasts.\nI']
# Extend the prompt one token at a time, ten times, printing each step.
cur_prompt = prompt
for _step in range(10):
    cur_prompt = generate_one(cur_prompt)
    print(cur_prompt)

I love beijing , because it
I love beijing , because it is
I love beijing , because it is a
I love beijing , because it is a city
I love beijing , because it is a city of
I love beijing , because it is a city of contrast
I love beijing , because it is a city of contrasts
I love beijing , because it is a city of contrasts.
I love beijing , because it is a city of contrasts.

I love beijing , because it is a city of contrasts.
I


# 调用 model.model 和 model.lm_head

In [105]:
# Re-encode the prompt for this section and show the encoding, its type,
# and the shape of the token-id tensor.
prompt = "I love beijing , because"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(model.device)
print(inputs, type(inputs), inputs["input_ids"].shape, sep="\n")

{'input_ids': tensor([[    1,   306,  1219,   339, 17336,  1518,   940]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
<class 'transformers.tokenization_utils_base.BatchEncoding'>
torch.Size([1, 7])


In [109]:
# Call the inner model.model directly with the token ids and attention mask.
# Keyword arguments the author listed but did not pass here: position_ids,
# past_key_values, inputs_embeds, use_cache, output_attentions,
# output_hidden_states, return_dict.
outputs = model.model(
    attention_mask=inputs["attention_mask"],
    input_ids=inputs["input_ids"],
)
# Show the type of the returned object and the fields it carries.
print(type(outputs), outputs.keys(), sep="\n")

<class 'transformers.modeling_outputs.BaseModelOutputWithPast'>
odict_keys(['last_hidden_state', 'past_key_values'])


In [110]:
# Pull the final hidden states out of the inner model's output and inspect them.
last_hidden_state = outputs.last_hidden_state
print(last_hidden_state.shape)
last_hidden_state

torch.Size([1, 7, 3200])


tensor([[[ 2.9629, -1.5957, -1.9365,  ...,  0.8457,  2.5410,  1.7910],
         [-0.3489, -1.6348, -1.5059,  ...,  0.5723,  1.7080,  1.8477],
         [ 1.4912, -1.6123, -1.8242,  ...,  1.5205,  3.3008,  2.6211],
         ...,
         [ 0.9775, -0.2898, -0.1992,  ..., -0.8467,  2.1504, -0.4502],
         [ 1.5713, -1.5361, -1.5537,  ..., -1.0342,  0.4653,  0.3042],
         [ 0.8340, -2.4160, -2.7832,  ...,  0.0782,  0.4045,  0.5225]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<MulBackward0>)

In [111]:
# Pass the final hidden states through model.lm_head to obtain the logits,
# then show the result's type and values.
logits = model.lm_head(last_hidden_state)
print(type(logits))
logits

<class 'torch.Tensor'>


tensor([[[-86.4375, -82.7500, -75.1875,  ..., -83.2500, -83.6875, -84.0625],
         [-81.6250, -78.5000, -70.3750,  ..., -80.6250, -80.2500, -79.1250],
         [-84.3125, -78.5000, -73.7500,  ..., -81.5625, -81.4375, -81.3750],
         ...,
         [-70.0625, -68.0625, -57.2500,  ..., -71.3750, -66.6250, -66.7500],
         [-79.5625, -77.9375, -66.6875,  ..., -77.6875, -77.1250, -74.8750],
         [-82.5625, -82.7500, -71.1875,  ..., -81.5625, -81.1250, -80.6875]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<UnsafeViewBackward0>)

In [112]:
# Only the last position is needed here.
# NOTE: `logits` is rebound to the sliced tensor, so this cell expects the
# three-axis logits from the previous cell each time it runs.
logits = logits[:, -1, :]
logits.argmax(dim=-1)

tensor([358], device='cuda:0')

# 每个模块的输入和输出

In [113]:
# Encode the prompt once more for the layer-by-layer walkthrough and show the
# encoding, its type, and the shape of the token-id tensor.
prompt = "I love beijing , because"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(model.device)
print(inputs, type(inputs), inputs["input_ids"].shape, sep="\n")

{'input_ids': tensor([[    1,   306,  1219,   339, 17336,  1518,   940]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
<class 'transformers.tokenization_utils_base.BatchEncoding'>
torch.Size([1, 7])


In [114]:
# The two lines after the import are bare annotations with no assignment:
# they declare the expected types of `model` and `model.model` and bind no
# new value (presumably there for editor completion).
from transformers import  LlamaModel
model: LlamaForCausalLM
model.model: LlamaModel

In [146]:
# The embedding layer comes first: it maps the token ids to vectors.
# The 3200 in the printed shape is the hidden_size.
inputs_embeds = model.model.embed_tokens(inputs.input_ids)
print(inputs_embeds.shape)
inputs_embeds

torch.Size([1, 7, 3200])


tensor([[[ 1.0315e-02,  2.8687e-03,  5.9128e-04,  ..., -1.8845e-03,
          -9.2983e-05, -3.0060e-03],
         [-1.0986e-02, -5.0735e-04,  3.3875e-03,  ...,  2.3346e-03,
           9.3994e-03, -1.6602e-02],
         [-1.6724e-02,  3.0212e-03, -1.4099e-02,  ...,  9.5749e-04,
          -1.5137e-02,  2.4536e-02],
         ...,
         [ 2.5513e-02,  7.2021e-03,  8.4839e-03,  ...,  3.2654e-03,
           7.2937e-03, -1.6602e-02],
         [ 6.5002e-03, -6.9275e-03, -1.2451e-02,  ..., -1.9653e-02,
           1.1658e-02,  2.1606e-02],
         [-5.1880e-03, -1.3367e-02, -5.5695e-04,  ...,  3.3188e-04,
           1.8066e-02, -8.9111e-03]]], device='cuda:0', dtype=torch.float16,
       grad_fn=<EmbeddingBackward0>)

In [147]:
# This model has 26 layers: print the layer count and the type of one layer.
print(len(model.model.layers), type(model.model.layers[0]), sep="\n")

26
<class 'transformers.models.llama.modeling_llama.LlamaDecoderLayer'>


In [148]:
# Build the expanded attention mask that is handed to the decoder layers.
batch_size, seq_length = inputs["input_ids"].shape
past_key_values_length = 0
# Positional arguments, in order:
#   1. the tokenizer's attention_mask, shape (batch_size, seq_length_with_past)
#   2. a shape tuple, (batch_size, seq_length)
#   3. inputs_embeds, shape (batch_size, seq_length, hidden_size)
#   4. past_key_values_length (0 here)
attention_mask = model.model._prepare_decoder_attention_mask(
    inputs["attention_mask"],
    (batch_size, seq_length),
    inputs_embeds,
    past_key_values_length,
)
print(attention_mask.shape)
attention_mask

torch.Size([1, 1, 7, 7])


tensor([[[[     0., -65504., -65504., -65504., -65504., -65504., -65504.],
          [     0.,      0., -65504., -65504., -65504., -65504., -65504.],
          [     0.,      0.,      0., -65504., -65504., -65504., -65504.],
          [     0.,      0.,      0.,      0., -65504., -65504., -65504.],
          [     0.,      0.,      0.,      0.,      0., -65504., -65504.],
          [     0.,      0.,      0.,      0.,      0.,      0., -65504.],
          [     0.,      0.,      0.,      0.,      0.,      0.,      0.]]]],
       device='cuda:0', dtype=torch.float16)

In [155]:
# Build position_ids by hand.
device = inputs["input_ids"].device
# One-dimensional range of positions, shape (seq_length,).
position_ids = torch.arange(
    past_key_values_length,
    seq_length + past_key_values_length,
    device=device,
    dtype=torch.long,
)
# Add a leading axis, giving shape (1, seq_length). The author was unsure why
# this shape is used. NOTE(review): the decoder-layer call further down
# documents position_ids as (batch_size, seq_length) and batch_size is 1 here;
# confirm that this is the reason.
position_ids = position_ids[None, :].view(-1, seq_length)

print(position_ids.shape)
position_ids

torch.Size([1, 7])


tensor([[0, 1, 2, 3, 4, 5, 6]], device='cuda:0')

In [156]:
# Run a single decoder layer (the first one) on the embeddings.
hidden_states = inputs_embeds
decoder_layer = model.model.layers[0]

# Shapes of what is passed in:
#   hidden_states  -> (batch_size, seq_length, hidden_size)
#   attention_mask -> [bsz, 1, tgt_seq_len, src_seq_len]
#   position_ids   -> (batch_size, seq_length)
# Not passed here: past_key_value, output_attentions, use_cache.
layer_outputs = decoder_layer(
    hidden_states,
    position_ids=position_ids,
    attention_mask=attention_mask,
)

In [158]:
# How many items the layer returned, then the type and shape of the first one.
print(
    len(layer_outputs),
    type(layer_outputs[0]),
    layer_outputs[0].shape,
    sep="\n",
)
layer_outputs[0]

1
<class 'torch.Tensor'>
torch.Size([1, 7, 3200])


tensor([[[ 0.0204, -0.0389,  0.0304,  ...,  0.0071,  0.0104,  0.0036],
         [-0.0071, -0.0091,  0.0117,  ...,  0.0131,  0.0099, -0.0246],
         [-0.0165,  0.0034, -0.0005,  ...,  0.0078, -0.0153,  0.0342],
         ...,
         [ 0.0217,  0.0018,  0.0030,  ...,  0.0015,  0.0135, -0.0089],
         [ 0.0224, -0.0038, -0.0009,  ..., -0.0361,  0.0062,  0.0368],
         [-0.0024, -0.0109,  0.0178,  ...,  0.0022,  0.0270, -0.0027]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<AddBackward0>)

In [159]:
# Run every decoder layer in sequence, starting from the embeddings.
# Shapes of what each layer receives:
#   hidden_states  -> (batch_size, seq_length, hidden_size)
#   attention_mask -> [bsz, 1, tgt_seq_len, src_seq_len]
#   position_ids   -> (batch_size, seq_length)
# Not passed here: past_key_value, output_attentions, use_cache.
hidden_states = inputs_embeds
for layer in model.model.layers:
    layer_outputs = layer(
        hidden_states,
        position_ids=position_ids,
        attention_mask=attention_mask,
    )
    # The first item a layer returns is the new hidden_states; feed it onward.
    hidden_states = layer_outputs[0]

hidden_states

tensor([[[ 2.1504, -1.2461, -1.4385,  ...,  0.6284,  1.7852,  1.2754],
         [-0.4331, -2.1855, -1.9160,  ...,  0.7275,  2.0547,  2.2520],
         [ 1.9043, -2.2168, -2.3848,  ...,  1.9883,  4.0781,  3.2852],
         ...,
         [ 0.7256, -0.2314, -0.1514,  ..., -0.6431,  1.5449, -0.3279],
         [ 2.1973, -2.3125, -2.2266,  ..., -1.4814,  0.6299,  0.4177],
         [ 1.1953, -3.7266, -4.0859,  ...,  0.1150,  0.5615,  0.7354]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<AddBackward0>)

In [160]:
# Pass the hidden states through the model's norm layer.
hidden_states = model.model.norm(hidden_states)
hidden_states

tensor([[[ 2.9629, -1.5957, -1.9365,  ...,  0.8457,  2.5410,  1.7910],
         [-0.3489, -1.6348, -1.5059,  ...,  0.5723,  1.7080,  1.8477],
         [ 1.4912, -1.6123, -1.8242,  ...,  1.5205,  3.3008,  2.6211],
         ...,
         [ 0.9775, -0.2898, -0.1992,  ..., -0.8467,  2.1504, -0.4502],
         [ 1.5713, -1.5361, -1.5537,  ..., -1.0342,  0.4653,  0.3042],
         [ 0.8340, -2.4160, -2.7832,  ...,  0.0782,  0.4045,  0.5225]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<MulBackward0>)

In [161]:
# Pass the normalized hidden states through the lm_head layer to get logits,
# then show their shape and values.
logits = model.lm_head(hidden_states)
print(logits.shape)
logits

torch.Size([1, 7, 32000])


tensor([[[-86.4375, -82.7500, -75.1875,  ..., -83.2500, -83.6875, -84.0625],
         [-81.6250, -78.5000, -70.3750,  ..., -80.6250, -80.2500, -79.1250],
         [-84.3125, -78.5000, -73.7500,  ..., -81.5625, -81.4375, -81.3750],
         ...,
         [-70.0625, -68.0625, -57.2500,  ..., -71.3750, -66.6250, -66.7500],
         [-79.5625, -77.9375, -66.6875,  ..., -77.6875, -77.1250, -74.8750],
         [-82.5625, -82.7500, -71.1875,  ..., -81.5625, -81.1250, -80.6875]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<UnsafeViewBackward0>)

In [162]:
# Keep only the last position's logits and pick the highest-scoring token id.
# NOTE: `logits` is rebound to the sliced tensor, so this cell expects the
# three-axis logits from the previous cell each time it runs.
logits = logits[:, -1, :]
logits.argmax(dim=-1)

tensor([358], device='cuda:0')