In [1]:
from transformers.models.qwen2 import Qwen2ForCausalLM, Qwen2Tokenizer
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name_or_path = 'E:/Model/Qwen2-0.5B-Instruct'
model = Qwen2ForCausalLM.from_pretrained(model_name_or_path, device_map='cuda:0')
tokenizer = Qwen2Tokenizer.from_pretrained(model_name_or_path)

## demo1
1. 调用generate方法，让他一直生成新的token
2. generate 方法，本质上是一个循环调用 forward 方法，直到终止（EOS、MAX_TOKENS）

3. GQA 的本质是 正常 q 是 n_heads 对应 k, v 的 n_heads ，对应进行矩阵计算， 现在变成 q是 n_heads, k, v变成了 n_k_v_heads < n_heads, 后续后repeat_kv 从 [bs, n_k_v_heads, seq_len, hidden_size] -> [bs, n_k_v_heads* group, seq_len, hidden_size]

In [3]:
text = "介绍一下杭州的良睦路"

model_inputs = tokenizer(text, return_tensors='pt').to(model.device)
print(model_inputs)

for k, v in model_inputs.items():
    print(k, v)

{'input_ids': tensor([[109432, 104130,   9370,  99584, 103852,  45995]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]], device='cuda:0')}
input_ids tensor([[109432, 104130,   9370,  99584, 103852,  45995]], device='cuda:0')
attention_mask tensor([[1, 1, 1, 1, 1, 1]], device='cuda:0')


In [4]:
generated_ids = model.generate(**model_inputs, max_new_tokens=10)
generated_ids

tensor([[109432, 104130,   9370,  99584, 103852,  45995,  33108, 110192,   3837,
         104017, 101127,  99661,  99245,  11319,    220, 100622]],
       device='cuda:0')

In [5]:
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
response

['和西湖，它们分别代表什么？ 作为']

## demo2
1. 直接基于第一步，生成新token
2. forward 方法，每次生成一个token 

In [6]:
model_inputs1 = {
    'input_ids': torch.tensor([[109432, 104130,   9370,  99584, 103852,  45995]], dtype=torch.long).to(model.device),
    'attention_mask':torch.tensor([[1,1,1,1,1,1]], dtype=torch.long).to(model.device)
}

model_inputs1

{'input_ids': tensor([[109432, 104130,   9370,  99584, 103852,  45995]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [7]:
model_outputs1 = model.forward(**model_inputs1, use_cache=True)
model_outputs1.keys()

odict_keys(['logits', 'past_key_values'])

In [8]:
model_outputs1.logits.shape     # [bs, seq_len, vocab_size]

torch.Size([1, 6, 151936])

In [9]:
# 最后一个预测的token ，就是下一个token，在词表中的概率
model_outputs1.logits[:, -1, :].shape

torch.Size([1, 151936])

In [10]:
# 最大的一个概率的位置，就是对应词的id 
model_outputs1.logits[:, -1, :].argmax(dim=-1)

tensor([3837], device='cuda:0')

In [11]:
tokenizer.decode([3837])

'，'

In [12]:
(
    type(model_outputs1.past_key_values),
    len(model_outputs1.past_key_values),    # 24, 是模型层数
    len(model_outputs1.past_key_values[0]), # tuple 对象[k_cache, v_cache]，包含第0层的k, v
    model_outputs1.past_key_values[0][0].shape,

)

(tuple, 24, 2, torch.Size([1, 2, 6, 64]))

In [16]:
model_outputs1.past_key_values[0][0].shape, model_outputs1.past_key_values[0][1].shape


(torch.Size([1, 2, 6, 64]), torch.Size([1, 2, 6, 64]))

In [17]:
model_outputs1.past_key_values[0][0].shape

torch.Size([1, 2, 6, 64])

## demo3
1. 把上一次生成的past kv 拿过来，加上新拼接的token，生成
2. 对比demo3 和 demo4

In [36]:
model_outputs2 = model.forward(
    **{
        # 输入[bs, seq_len]，需要是二维度
        'input_ids': torch.tensor([[3837]], dtype=torch.long).to(model.device),
        'attention_mask':torch.tensor([[1]], dtype=torch.long).to(model.device)
    },
    past_key_values = model_outputs1.past_key_values
)

model_outputs2.keys()

odict_keys(['logits', 'past_key_values'])

## demo4
1. 直接模拟简单粗暴类型的生成方式

In [42]:
model_inputs3 = {
    'input_ids':torch.tensor(
        [[109432, 104130,   9370,  99584, 103852,  45995, 3837]], dtype=torch.long
    ).to(model.device),
    'attention_mask':torch.tensor([[1, 1, 1, 1, 1, 1, 1]], dtype=torch.long).to(model.device)
}

model_inputs3

{'input_ids': tensor([[109432, 104130,   9370,  99584, 103852,  45995,   3837]],
        device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [43]:
model_outputs3 = model(**model_inputs3)
model_outputs3.keys()

odict_keys(['logits', 'past_key_values'])

## 验证demo4和demo3输出的logits是不是一样的

In [44]:
torch.allclose(model_outputs3.logits[:, -1, :], model_outputs2.logits[:, -1, :], atol=1e-4)

True

In [48]:
model_outputs3.logits[:, -1, :].argmax(dim=-1), tokenizer.decode(model_outputs3.logits[:, -1, :].argmax(dim=-1))

(tensor([100630], device='cuda:0'), '包括')

In [49]:
model_outputs2.logits[:, -1, :].argmax(dim=-1), tokenizer.decode(model_outputs2.logits[:, -1, :].argmax(dim=-1))

(tensor([100630], device='cuda:0'), '包括')

repeat_kv 函数解读

In [51]:
import torch
# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states, n_rep):
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)

In [None]:
# [bs, num_k_v_heads, seq_len, hid_dim]
a = torch.tensor(torch.arange(6).reshape([1,2,3,1]))
a

  a = torch.tensor(torch.arange(6).reshape([1,2,3,1]))


tensor([[[[0],
          [1],
          [2]],

         [[3],
          [4],
          [5]]]])

In [None]:
b = repeat_kv(a, 2)  # num_k_v_group=2, a.shape = [bs, num_k_v_heads, seq_len, hid_dim] 
b.shape, b   # [bs, num_k_v_heads * n_k_v_group, seq_len, hid_dim]

(torch.Size([1, 4, 3, 1]),
 tensor([[[[0],
           [1],
           [2]],
 
          [[0],
           [1],
           [2]],
 
          [[3],
           [4],
           [5]],
 
          [[3],
           [4],
           [5]]]]))