### 下载并加载 Qwen 模型

In [25]:
from modelscope import AutoTokenizer, AutoModelForCausalLM, snapshot_download
import torch

# Local cache location for downloaded weights and the model repository to fetch.
cache_dir = "/raid/gfc/llm/models"
model_id = "Qwen/Qwen2.5-1.5B-Instruct"

# Fetch the snapshot; the returned value is what both from_pretrained calls load from.
model_dir = snapshot_download(model_id=model_id, cache_dir=cache_dir)

# use_fast=False requests the non-fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_dir,
    trust_remote_code=True,
    use_fast=False,
)

# Load the weights in bfloat16 and place the whole model on GPU index 7.
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    device_map="cuda:7",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [26]:
# Inspect the concrete tokenizer class and how it splits a short Chinese sentence.
print(type(tokenizer))
print(tokenizer.tokenize("你好吗？宝贝")) # tokens print as garbled-looking symbols (apparently a byte-level form), not readable text

<class 'transformers.models.qwen2.tokenization_qwen2.Qwen2Tokenizer'>
['ä½łå¥½', 'åĲĹ', 'ï¼Ł', 'å®Ŀè´Ŀ']


In [27]:
# Encode a short sentence to token ids without letting the tokenizer add special tokens.
text = "你好吗？宝贝"
input_ids = tokenizer.encode(text, add_special_tokens=False)
print("input_ids:", input_ids) # input_ids: [108386, 101037, 11319, 105882]

# Decode the ids back and check that the round trip reproduces the original text
decoded = tokenizer.decode(input_ids)
print("decoded:", decoded)


input_ids: [108386, 101037, 11319, 105882]
decoded: 你好吗？宝贝


In [28]:
# Same sentence, now with add_special_tokens=True. The recorded ids are identical to the
# add_special_tokens=False run, i.e. nothing extra was added for this input.
text = "你好吗？宝贝"
input_ids = tokenizer.encode(text, add_special_tokens=True) # input_ids: [108386, 101037, 11319, 105882]
print("input_ids:", input_ids)

input_ids: [108386, 101037, 11319, 105882]


In [29]:
# Show the special tokens this tokenizer defines (eos, pad, and the additional marker tokens).
tokenizer.special_tokens_map

{'eos_token': '<|im_end|>',
 'pad_token': '<|endoftext|>',
 'additional_special_tokens': ['<|im_start|>',
  '<|im_end|>',
  '<|object_ref_start|>',
  '<|object_ref_end|>',
  '<|box_start|>',
  '<|box_end|>',
  '<|quad_start|>',
  '<|quad_end|>',
  '<|vision_start|>',
  '<|vision_end|>',
  '<|vision_pad|>',
  '<|image_pad|>',
  '<|video_pad|>']}

In [30]:
print(tokenizer.bos_token, tokenizer.eos_token)  # None <|im_end|>  (no BOS token is defined; EOS is <|im_end|>)

None <|im_end|>


In [31]:
# A special marker such as <|im_start|> is kept as one token instead of being split into pieces.
tokenizer.tokenize("<|im_start|>")

['<|im_start|>']

In [32]:
# Build a system + user conversation and render it into a single prompt string.
prompt = "Give me a short introduction to large language model."
messages = [
    dict(role="system", content="You are Qwen, created by Alibaba Cloud. You are a helpful assistant."),
    dict(role="user", content=prompt),
]

# tokenize=False returns text rather than ids; add_generation_prompt=True makes the
# rendered prompt end with the assistant header so the model continues from there.
text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

In [33]:
# Display the rendered chat prompt string.
text

'<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nGive me a short introduction to large language model.<|im_end|>\n<|im_start|>assistant\n'

In [34]:
# Tokenize the rendered prompt as a batch of one, then move the tensors to the model's device.
model_inputs = tokenizer([text], return_tensors="pt")
model_inputs = model_inputs.to(model.device)

# Generate at most 512 new tokens after the prompt.
generated_ids = model.generate(max_new_tokens=512, **model_inputs)


In [51]:
# Demonstration: element-wise addition of 1-D tensors with 3 and 2 elements fails.
# The error is caught and printed so that the cell shows the message without
# aborting a Restart & Run All of the notebook.
try:
    torch.tensor([1, 2, 3]) + torch.tensor([4, 6])
except RuntimeError as err:
    # Keep the message in a plain string; `err` itself is unbound after this block.
    shape_mismatch_message = str(err)
    print(f"RuntimeError: {shape_mismatch_message}")

RuntimeError: The size of tensor a (3) must match the size of tensor b (2) at non-singleton dimension 0

In [35]:
# Print the prompt ids next to the generate() output; the output starts with the same
# prompt ids and then continues with the newly generated ones.
print(model_inputs.input_ids)
print(generated_ids)

tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
            553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
             13, 151645,    198, 151644,    872,    198,  35127,    752,    264,
           2805,  16800,    311,   3460,   4128,   1614,     13, 151645,    198,
         151644,  77091,    198]], device='cuda:7')
tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
            553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
             13, 151645,    198, 151644,    872,    198,  35127,    752,    264,
           2805,  16800,    311,   3460,   4128,   1614,     13, 151645,    198,
         151644,  77091,    198,   2121,    458,  15235,   7881,    553,  54364,
          14817,     11,    358,   1079,   2598,   1207,  16948,     13,   3017,
           7428,    374,    311,   7789,   3847,    304,  23163,   1467,   3118,
            389,  50932,   3897,    311,    752,     13, 

In [36]:
# Token ids of the assistant header; they equal the last three ids of the prompt tensor.
tokenizer.encode("<|im_start|>assistant\n")

[151644, 77091, 198]

In [37]:
# Strip the prompt from every generated sequence so only the new tokens remain:
# each output row starts with its prompt ids, so it is sliced from the prompt length onward.
#
# The tensor check keeps the cell safe to re-run. After the first run `generated_ids`
# is already a list of trimmed tensors; slicing those again by the prompt length
# would silently throw away generated tokens.
if torch.is_tensor(generated_ids):
    generated_ids = [
        full_ids[len(prompt_ids):]
        for prompt_ids, full_ids in zip(model_inputs.input_ids, generated_ids)
    ]
generated_ids

[tensor([  2121,    458,  15235,   7881,    553,  54364,  14817,     11,    358,
           1079,   2598,   1207,  16948,     13,   3017,   7428,    374,    311,
           7789,   3847,    304,  23163,   1467,   3118,    389,  50932,   3897,
            311,    752,     13,    358,    646,   1492,    448,   4378,  22844,
             11,  75878,  44219,     11,   6825,   7343,     11,    323,   1496,
          35764,   4755,    911,   5257,  13347,     13,    358,  36006,    311,
           3410,  13382,    323,   9760,   1995,    311,    847,   3847,   1393,
          20337,    279,   8426,  10659,    315,  30208,   6786,     13, 151645],
        device='cuda:7')]

In [38]:
# Decode the prompt ids back to text (special tokens are kept) to check the chat template rendering.
decoded_prompt = tokenizer.decode(model_inputs.input_ids[0])
print(decoded_prompt)

<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Give me a short introduction to large language model.<|im_end|>
<|im_start|>assistant



In [40]:
# Decode only the newly generated tokens; special tokens are not skipped here, so the trailing <|im_end|> is shown.
tokenizer.decode(generated_ids[0])

'As an AI developed by Alibaba Cloud, I am called Qwen. My purpose is to assist users in generating text based on prompts provided to me. I can help with writing essays, composing poems, creating stories, and even answering questions about various topics. I strive to provide accurate and relevant information to my users while maintaining the highest standards of ethical conduct.<|im_end|>'

In [41]:
# Decode the batch while dropping special tokens (e.g. <|im_end|>) and keep the text of the first sequence.
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [43]:
# Display the final response text.
response

'As an AI developed by Alibaba Cloud, I am called Qwen. My purpose is to assist users in generating text based on prompts provided to me. I can help with writing essays, composing poems, creating stories, and even answering questions about various topics. I strive to provide accurate and relevant information to my users while maintaining the highest standards of ethical conduct.'

In [None]:
# Tokenize a hand-written chat prompt (system turn, user turn, assistant header) and an
# assistant reply separately, then concatenate both into one sequence.
# add_special_tokens=False asks the tokenizer not to add anything beyond the markers
# already written in the text.
# NOTE: `response` is rebound here from the decoded string to a tokenizer output.
instruction = tokenizer(
    "<|im_start|>system\n你是一个文本分类领域的专家，你会接收到一段文本和几个潜在的分类选项，请输出文本内容的正确类型<|im_end|>\n<|im_start|>user\n我感冒了怎么办？<|im_end|>\n<|im_start|>assistant\n",
    add_special_tokens=False,
)
response = tokenizer("感冒了，睡觉就好了！", add_special_tokens=False)

# Without return_tensors the fields are plain Python lists, so `+` concatenates them.
input_ids = instruction["input_ids"] + response["input_ids"]
attention_mask = instruction["attention_mask"] + response["attention_mask"]

# Sanity check: the mask must cover every token of the concatenated sequence.
# (Comparing attention_mask with the expression it was just built from is always True.)
print(len(attention_mask) == len(input_ids))

True


: 