In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


In [2]:
# model_name = "qwen/Qwen2-1.5B-Instruct"
model_pth = "/data/dl/hub/models--qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/"
model = AutoModelForCausalLM.from_pretrained(model_pth, torch_dtype=torch.bfloat16, device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained(model_pth)

In [69]:
in_text = "I am "
in_tokens = tokenizer(in_text, return_tensors="pt")["input_ids"]
print(tokenizer.eos_token_id, tokenizer.eos_token)
print(tokenizer.bos_token_id, tokenizer.bos_token)
print(in_tokens)

151645 <|im_end|>
None None
tensor([[  40, 1079,  220]])


In [112]:
# inference
in_text = "Beijing is the captical of "
in_tokens = tokenizer(in_text, return_tensors="pt")["input_ids"]
print(tokenizer.eos_token_id, tokenizer.eos_token)
print(tokenizer.bos_token_id, tokenizer.bos_token)
print(in_tokens)

eos_token = torch.tensor([[tokenizer.eos_token_id]])
out_token = None
max_length = 10

cnt = 0
in_tokens, eos_token = in_tokens.to("cuda"), eos_token.to("cuda")
with torch.no_grad():
    while out_token != eos_token and cnt < max_length:
        logits = model(in_tokens).logits
        out_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
        in_tokens = torch.cat([in_tokens, out_token], dim=1)
        text = tokenizer.decode(in_tokens[0])
        print(f"step {cnt} input: {text}", flush=True)
        cnt += 1
out_text = tokenizer.decode(in_tokens[0])
print(f"\nInput: {in_text}")
print(f"Output: {out_text}")

151645 <|im_end|>
None None
tensor([[ 3430, 23649,   374,   279,  6427,   938,   315,   220]])
step 0 input: Beijing is the captical of  ____
step 1 input: Beijing is the captical of  ____1
step 2 input: Beijing is the captical of  ____1____
step 3 input: Beijing is the captical of  ____1____ the
step 4 input: Beijing is the captical of  ____1____ the capital
step 5 input: Beijing is the captical of  ____1____ the capital of
step 6 input: Beijing is the captical of  ____1____ the capital of China


step 7 input: Beijing is the captical of  ____1____ the capital of China.
step 8 input: Beijing is the captical of  ____1____ the capital of China. It
step 9 input: Beijing is the captical of  ____1____ the capital of China. It is

Input: Beijing is the captical of 
Output: Beijing is the captical of  ____1____ the capital of China. It is


In [74]:
outputs = model(in_tokens)

In [85]:
print(len(outputs.past_key_values))
print(model)

28
Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm):

In [97]:
print(outputs.past_key_values[27][1].shape)

torch.Size([1, 2, 8, 128])


In [None]:
from transformers import Qwen2ForCausalLM

In [99]:
print(model.model.layers[0].self_attn.num_key_value_heads)
print(model.model.layers[0].self_attn.num_heads)
print(model.model.layers[0].self_attn.num_key_value_groups)

2
12
6


In [120]:
# inference
in_text = "Beijing is the captical of "
in_tokens = tokenizer(in_text, return_tensors="pt")["input_ids"]
print("bos:", tokenizer.bos_token_id, tokenizer.bos_token)
print("eos:", tokenizer.eos_token_id, tokenizer.eos_token)
print("in_tokens:", in_tokens)

eos_token = torch.tensor(tokenizer.eos_token_id)
out_token, kvcache = None, None
out_text = in_text

cnt = 0
max_length = 10
in_tokens, eos_token = in_tokens.to("cuda"), eos_token.to("cuda")
with torch.no_grad():
    while out_token != eos_token and cnt < max_length:
        text = tokenizer.decode(in_tokens[0])
        print(f"step {cnt} input: {text}", flush=True)

        outputs = model(in_tokens, past_key_values=kvcache)
        logits, kvcache = outputs.logits, outputs.past_key_values
        out_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
        in_tokens = out_token

        text = tokenizer.decode(in_tokens[0])
        print(f"step {cnt} output: {text}", flush=True)

        cnt += 1
        out_text += text

print(f"\nInput: {in_text}")
print(f"Output: {out_text}")

bos: None None
eos: 151645 <|im_end|>
in_tokens: tensor([[ 3430, 23649,   374,   279,  6427,   938,   315,   220]])
step 0 input: Beijing is the captical of 
step 0 output:  ____
step 1 input:  ____
step 1 output: 1
step 2 input: 1
step 2 output: ____
step 3 input: ____
step 3 output:  the
step 4 input:  the
step 4 output:  capital
step 5 input:  capital
step 5 output:  of
step 6 input:  of
step 6 output:  China
step 7 input:  China
step 7 output: .
step 8 input: .
step 8 output:  It
step 9 input:  It
step 9 output:  is

Input: Beijing is the captical of 
Output: Beijing is the captical of  ____1____ the capital of China. It is


In [102]:
outputs = model(in_tokens, past_key_values=kvcache)

In [115]:
outputs.logits.shape

torch.Size([1, 1, 151936])

In [126]:
from torch import nn

In [130]:
cnt = 0
for name, module in model.named_modules():
    if isinstance(module, nn.Linear):
        print(f"{cnt} | {name} | {module}")
        cnt += 1
        break

0 | model.layers.0.self_attn.q_proj | Linear(in_features=1536, out_features=1536, bias=True)


In [134]:
module.in_features, module.out_features

(1536, 1536)

In [4]:
# inference
in_text = ["Beijing is the captical of ", "Apple"]
in_tokens = tokenizer(in_text, return_tensors="pt", padding=True)["input_ids"]
print("bos:", tokenizer.bos_token_id, tokenizer.bos_token)
print("eos:", tokenizer.eos_token_id, tokenizer.eos_token)
print("in_tokens:", in_tokens)

outputs = model(in_tokens.to('cuda'))
print(outputs)

# eos_token = torch.tensor(tokenizer.eos_token_id)
# out_token, kvcache = None, None
# out_text = in_text

# cnt = 0
# max_length = 10
# in_tokens, eos_token = in_tokens.to("cuda"), eos_token.to("cuda")
# with torch.no_grad():
#     while out_token != eos_token and cnt < max_length:
#         text = tokenizer.decode(in_tokens[0])
#         print(f"step {cnt} input: {text}", flush=True)

#         outputs = model(in_tokens, past_key_values=kvcache)
#         logits, kvcache = outputs.logits, outputs.past_key_values
#         out_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
#         in_tokens = out_token

#         text = tokenizer.decode(in_tokens[0])
#         print(f"step {cnt} output: {text}", flush=True)

#         cnt += 1
#         out_text += text

# print(f"\nInput: {in_text}")
# print(f"Output: {out_text}")

bos: None None
eos: 151645 <|im_end|>
in_tokens: tensor([[  3430,  23649,    374,    279,   6427,    938,    315,    220],
        [ 26567, 151643, 151643, 151643, 151643, 151643, 151643, 151643]])


CausalLMOutputWithPast(loss=None, logits=tensor([[[ 6.1250,  6.0312,  3.8281,  ..., -2.9062, -2.9062, -2.9062],
         [ 5.6875,  3.4062,  1.8828,  ..., -5.0000, -5.0000, -5.0000],
         [ 7.5000,  5.5000,  3.3281,  ..., -4.5625, -4.5625, -4.5625],
         ...,
         [ 7.3750,  7.6250,  5.0938,  ..., -5.0625, -5.0312, -5.0312],
         [ 6.5625,  6.6875,  4.0938,  ..., -4.9062, -4.9062, -4.9062],
         [ 1.3359, -8.6875, -6.1875,  ..., -2.8125, -2.8125, -2.8125]],

        [[ 6.8750,  4.5312,  2.9375,  ..., -5.1562, -5.1562, -5.1562],
         [ 6.5625,  6.7500,  9.6250,  ..., -3.3750, -3.3750, -3.3750],
         [ 6.0938,  9.0000, 14.9375,  ..., -3.6406, -3.6406, -3.6406],
         ...,
         [ 6.8750,  9.2500, 14.3750,  ..., -4.3750, -4.3750, -4.3750],
         [ 8.1875, 10.1250, 14.3125,  ..., -4.0312, -4.0312, -4.0312],
         [ 8.5625, 10.6250, 13.7500,  ..., -3.3750, -3.3750, -3.3750]]],
       device='cuda:0', grad_fn=<ToCopyBackward0>), past_key_values=((tenso