# Test

In [1]:
# Smoke test: send a single chat turn through the high-level pipeline helper.
from transformers import pipeline

messages = [{"role": "user", "content": "Who are you?"}]
pipe = pipeline(task="text-generation", model="meta-llama/Llama-3.2-3B-Instruct")
pipe(messages)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': [{'role': 'user', 'content': 'Who are you?'},
   {'role': 'assistant',
    'content': 'I\'m an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."'}]}]

# 分词器 Tokenizer

In [21]:
import json
import matplotlib.pyplot as plt

from pathlib import Path
import torch
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM

In [9]:
# Local Hugging Face hub snapshot directory of Llama-3.2-3B-Instruct.
MODEL_BASE_DIR = "/workspace/.hf_home/hub/models--meta-llama--Llama-3.2-3B-Instruct/snapshots/0cb88a4f764b7a12671c53f0838cd831a0843b95"
# Backward-compatible alias: the original (misspelled) name is still referenced by other cells.
MODLE_BASE_DIR = MODEL_BASE_DIR

In [23]:
# Path handed to the tokenizer and model `from_pretrained` calls in the cells below.
model_path = MODLE_BASE_DIR
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
# `is_available` has to be *called*: the bare function object is always truthy,
# which would select "cuda" even on a CPU-only machine.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Working precision used when loading the model and casting activations.
torch_dtype = torch.bfloat16

In [14]:
# Tokenizer loaded from the same path as the model weights.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_path)

# 读取模型文件

In [25]:
# Load the causal-LM checkpoint in the working precision and move it to the target device.
# `dtype` replaces the deprecated `torch_dtype` keyword (the old name emits a deprecation warning).
model_m = AutoModelForCausalLM.from_pretrained(model_path, dtype=torch_dtype)
# Left as the last expression so the notebook still displays the module tree.
model_m.to(device)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,), eps=1e-05)
    (rotary_emb

In [28]:
# State dict of the loaded model; later cells look up individual weight tensors in it by parameter name.
model = model_m.state_dict()

In [35]:
# Model configuration, printed as JSON with the top-level keys in alphabetical order.
config = model_m.config.to_dict()
sorted_config = {key: config[key] for key in sorted(config)}
print(json.dumps(sorted_config, indent=4))

{
    "_name_or_path": "/workspace/.hf_home/hub/models--meta-llama--Llama-3.2-3B-Instruct/snapshots/0cb88a4f764b7a12671c53f0838cd831a0843b95",
    "add_cross_attention": false,
    "architectures": [
        "LlamaForCausalLM"
    ],
    "attention_bias": false,
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": 128000,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dtype": "bfloat16",
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": [
        128001,
        128008,
        128009
    ],
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "head_dim": 128,
    "hidden_act": "silu",
    "hidden_size": 3072,
    "id2label": {
        "0": "LABEL_0",
        "1": "LABEL_1"
    }

In [36]:
# Pull the architecture hyperparameters out of the model configuration.
dim, n_layers, n_heads, n_kv_heads, vocab_size, norm_eps = (
    config[key]
    for key in (
        "hidden_size",
        "num_hidden_layers",
        "num_attention_heads",
        "num_key_value_heads",
        "vocab_size",
        "rms_norm_eps",
    )
)
rope_theta = torch.tensor(config["rope_theta"])

# Derived sizes: width of a single attention head, and query heads per key/value head.
dk = dim // n_heads
group_heads = n_heads // n_kv_heads

# 文本分词编码

In [37]:
# Prompt text that gets tokenized in the next cell; the commented-out lines are alternative prompts.
prompt = "测试中，用中文说成语，前程似锦，金榜题"
# prompt = "中华人民万"
# prompt = "星期"

In [40]:
# Encode the prompt (special tokens included), show the raw ids, then wrap them in a tensor.
token_ids = tokenizer.encode(prompt, add_special_tokens=True)
print(token_ids)
tokens = torch.tensor(token_ids)

[128000, 82805, 16325, 119977, 108891, 37687, 13153, 73981, 3922, 25580, 39607, 104409, 127999, 3922, 35330, 121272, 34972]


In [43]:
# Decode every id on its own to see which piece of text each token covers.
prompt_split_as_tokens = [tokenizer.decode([token_id]) for token_id in tokens.tolist()]
print(prompt_split_as_tokens)

['<|begin_of_text|>', '测试', '中', '，用', '中文', '说', '成', '语', '，', '前', '程', '似', '锦', '，', '金', '榜', '题']


# Token embedding 嵌入

In [46]:
# Weight of the final fully connected output layer (the LM head).
lm_head = model["lm_head.weight"]

In [49]:
# Build the embedding layer and overwrite its initial weights with the checkpoint's.
embed_weights = model["model.embed_tokens.weight"]
embedding_layer = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=dim)
embedding_layer.weight.data.copy_(embed_weights)

tensor([[ 1.1292e-02,  9.9487e-03,  1.4160e-02,  ..., -3.5706e-03,
         -1.9775e-02,  5.3711e-03],
        [ 1.3245e-02, -3.8385e-05,  2.2461e-02,  ..., -2.6550e-03,
          3.1738e-02, -1.0681e-03],
        [ 1.9775e-02,  2.0020e-02,  2.8687e-02,  ..., -3.5248e-03,
          3.1433e-03, -7.6294e-03],
        ...,
        [-3.0975e-03,  2.1057e-03,  4.8828e-03,  ..., -2.0905e-03,
         -1.2207e-03, -2.8992e-03],
        [-3.0975e-03,  2.1057e-03,  4.8828e-03,  ..., -2.0905e-03,
         -1.2207e-03, -2.8992e-03],
        [-3.0975e-03,  2.1057e-03,  4.8828e-03,  ..., -2.0905e-03,
         -1.2207e-03, -2.8992e-03]])

In [58]:
# Look up one embedding vector per prompt token, then cast to the working dtype
# and move the result to the compute device.
embedded_tokens = embedding_layer(tokens)
token_embeddings_unnormalized = embedded_tokens.to(torch_dtype).to(device)
print(token_embeddings_unnormalized.shape)

torch.Size([17, 3072])


# RMS 归一化

In [62]:
def rms_norm(tensor, norm_weights, eps=None):
    """Apply RMS normalization over the last dimension and scale by learned weights.

    The mean of squares is computed in at least float32, so low-precision
    inputs (e.g. bfloat16) do not lose accuracy in the reduction; the
    normalized values are cast back to the input dtype before scaling.

    Args:
        tensor: Input tensor; normalization runs over its last dimension.
        norm_weights: Scale weights, broadcastable against ``tensor``.
        eps: Stabilizing constant added to the mean of squares. When ``None``
            (the default) the notebook-level ``norm_eps`` is used.

    Returns:
        The normalized and scaled tensor, with the same shape as ``tensor``.
    """
    if eps is None:
        # Resolved at call time so existing two-argument calls keep working.
        eps = norm_eps
    # Upcast only when the input is narrower than float32 (float64 stays float64).
    compute_dtype = torch.promote_types(tensor.dtype, torch.float32)
    upcast = tensor.to(compute_dtype)
    r_rms = torch.rsqrt(upcast.pow(2).mean(-1, keepdim=True) + eps)
    normalized = (upcast * r_rms).to(tensor.dtype)
    return normalized * norm_weights

In [64]:
# Try the normalization out with the input-layernorm weights of layer 0.
layer0_input_norm_weight = model["model.layers.0.input_layernorm.weight"]
token_embeddings = rms_norm(token_embeddings_unnormalized, layer0_input_norm_weight)
print(token_embeddings.shape)

torch.Size([17, 3072])


# 实现注意力机制

In [65]:
# Shapes of the four attention projection matrices (q, k, v, o) of layer 0.
layer0_attn_keys = (
    "model.layers.0.self_attn.q_proj.weight",
    "model.layers.0.self_attn.k_proj.weight",
    "model.layers.0.self_attn.v_proj.weight",
    "model.layers.0.self_attn.o_proj.weight",
)
print(*(model[key].shape for key in layer0_attn_keys))

torch.Size([3072, 3072]) torch.Size([1024, 3072]) torch.Size([1024, 3072]) torch.Size([3072, 3072])


## 计算 query

In [71]:
# Split the layer-0 query projection into one (head_dim, dim) matrix per attention head.
q_proj_weight = model["model.layers.0.self_attn.q_proj.weight"]
head_dim = q_proj_weight.size(0) // n_heads
q_layer0 = q_proj_weight.view(n_heads, head_dim, dim)
print(q_layer0.shape)

torch.Size([24, 128, 3072])


## 第一层的第一个头

In [74]:
# Select the first head (index 0) from the per-head query projection weights.
q_layer0_head0 = q_layer0[0]
print(q_layer0_head0.shape)

torch.Size([128, 3072])


```python
q_per_token = torch.matmul(token_embeddings, q_layer0_head0.T)
print(q_per_token.shape)
```