In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="cuda"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

#model.config.output_attentions = True  # 返回注意力权重
model.config.output_hidden_states = True  # 返回隐藏状态（可选）


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [7]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

In [8]:
prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text,text], return_tensors="pt").to(model.device)



In [9]:
generated_ids = model(**model_inputs, output_hidden_states=True)

In [10]:
batch_size, seq_len, model_dim = generated_ids['hidden_states'][-1].shape

In [12]:

tgt = generated_ids['hidden_states'][-1]

In [23]:
import torch
import torch.nn as nn

# 参数
d_model = 896  # 隐藏维度
nhead = 8      # 注意力头数
num_encoder_layers = 6  # 编码器层数
num_decoder_layers = 6  # 解码器层数
dim_feedforward = 2048  # 前馈层中间维度
dropout = 0.1

# 定义编码器和解码器层
encoder_layer = nn.TransformerEncoderLayer(
    d_model=d_model,
    nhead=nhead,
    dim_feedforward=dim_feedforward,
    dropout=dropout,
    batch_first=True
)
decoder_layer = nn.TransformerDecoderLayer(
    d_model=d_model,
    nhead=nhead,
    dim_feedforward=dim_feedforward,
    dropout=dropout,
    batch_first=True,
    dtype=torch.bfloat16,
    device='cuda:0'
)

# 堆叠多层
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)

# # 输入数据
# batch_size = 2
# src_seq_len = 10  # 源序列长度
# tgt_seq_len = 8   # 目标序列长度
# src = torch.randn(batch_size, src_seq_len, d_model)  # 源序列
# tgt = torch.randn(batch_size, tgt_seq_len, d_model)  # 目标序列

# 生成掩码（自回归掩码）
def generate_square_subsequent_mask(sz):
    mask = torch.triu(torch.ones(sz, sz), diagonal=1).bool()
    return mask

# tgt_mask = generate_square_subsequent_mask(tgt_seq_len).to(src.device)

# # 前向传播
# # (1) 编码器处理源序列
# memory = transformer_encoder(src)  # 输出形状: (batch_size, src_seq_len, d_model)

# # (2) 解码器处理目标序列，并使用编码器输出进行交叉注意力
# output = transformer_decoder(
#     tgt,              # 目标序列
#     memory,           # 编码器输出
#     tgt_mask=tgt_mask # 自回归掩码
# )  # 输出形状: (batch_size, tgt_seq_len, d_model)

# # 打印结果
# print("Encoder output (memory) shape:", memory.shape)
# print("Decoder output shape:", output.shape)

In [24]:
tgt_mask = generate_square_subsequent_mask(seq_len).to('cuda')
transformer_decoder.to('cuda')

TransformerDecoder(
  (layers): ModuleList(
    (0-5): 6 x TransformerDecoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=896, out_features=896, bias=True)
      )
      (multihead_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=896, out_features=896, bias=True)
      )
      (linear1): Linear(in_features=896, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=896, bias=True)
      (norm1): LayerNorm((896,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((896,), eps=1e-05, elementwise_affine=True)
      (norm3): LayerNorm((896,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (dropout3): Dropout(p=0.1, inplace=False)
    )
  )
)

In [25]:
tgt_mask

tensor([[False,  True,  True,  ...,  True,  True,  True],
        [False, False,  True,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        ...,
        [False, False, False,  ..., False,  True,  True],
        [False, False, False,  ..., False, False,  True],
        [False, False, False,  ..., False, False, False]], device='cuda:0')

In [26]:
transformer_decoder(tgt, tgt, tgt_mask=tgt_mask)

tensor([[[-1.2734,  1.8594, -0.5938,  ..., -0.0762,  0.1768,  2.0000],
         [ 0.0491,  0.7031, -0.2598,  ..., -1.4766,  1.6094,  0.8711],
         [-0.6484,  0.5938,  0.0854,  ..., -0.6445,  0.4648,  1.4219],
         ...,
         [-1.4766,  2.0781,  0.1348,  ..., -0.2891,  0.5586,  0.8047],
         [ 0.1768,  1.5547, -0.9102,  ..., -0.7812, -0.1670, -0.1914],
         [-0.4805,  2.4062,  0.1206,  ..., -1.1562,  0.6680,  1.8203]],

        [[-1.0391,  1.1094,  0.3145,  ..., -0.4492,  0.0684,  2.0312],
         [-0.4863,  1.5234, -0.8906,  ..., -0.2773, -0.1191, -0.1250],
         [ 0.2246,  1.0781,  0.1523,  ..., -0.7383, -0.1602,  0.6758],
         ...,
         [-0.0195,  0.9961,  0.4316,  ..., -0.6016,  1.3203,  0.0354],
         [-0.5078,  1.4219,  0.8203,  ..., -0.9883,  0.0957,  2.4062],
         [-0.7969,  2.0938,  0.6797,  ..., -0.3145,  0.6211,  2.7188]]],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<NativeLayerNormBackward0>)

In [62]:
last_hidden_states = generated_ids.hidden_states[-1]  # 最后一层的隐藏状态

# 输出形状
print("Last hidden states shape:", last_hidden_states.shape)
print("Last hidden states:", last_hidden_states)

AttributeError: 'tuple' object has no attribute 'shape'

In [27]:

response = tokenizer.batch_decode(generated_ids['sequences'], skip_special_tokens=True)[0]
print(response)

system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
user
Give me a short introduction to large language model.
assistant
Large language models (LLMs) are artificial intelligence systems that use natural language processing techniques to understand and generate human-like text. These models can be trained on vast amounts of data, including books, articles, websites, and other forms of information, and have been used in a wide range of applications, from language translation and summarization to chatbots and virtual assistants.

LLMs are typically composed of multiple layers of neural networks, which allow them to learn complex patterns and relationships within the data they are trained on. This enables them to perform tasks such as question answering, sentiment analysis, and language generation with high accuracy and speed.

One of the most notable features of LLMs is their ability to process and understand a broad range of topics and contexts, making them useful

In [28]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

In [34]:
len(generated_ids['sequences'][0])

248

In [47]:
generated_ids['past_key_values'][1][1].shape

torch.Size([1, 2, 247, 64])