# Qwen2.5-0.5B-Instruct - Causal LM #

In [1]:
import os
os.environ['HF_HOME'] = "E:\Production\models\hgf" # redirect hf cache
os.environ['HF_ENDPOINT'] = "https://hf-mirror.com"
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1' # disable hf symlink warning

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-0.5B-Instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
# Instruction Tuning for better instruction-following 指令微调->更好的指令遵循
print(model)

  from .autonotebook import tqdm as notebook_tqdm


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

In [2]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

# from transformers
from transformers import Qwen2Tokenizer
tk_exp = Qwen2Tokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

# different models have different special tokens
tokenizer.special_tokens_map

{'eos_token': '<|im_end|>',
 'pad_token': '<|endoftext|>',
 'additional_special_tokens': ['<|im_start|>',
  '<|im_end|>',
  '<|object_ref_start|>',
  '<|object_ref_end|>',
  '<|box_start|>',
  '<|box_end|>',
  '<|quad_start|>',
  '<|quad_end|>',
  '<|vision_start|>',
  '<|vision_end|>',
  '<|vision_pad|>',
  '<|image_pad|>',
  '<|video_pad|>']}

## 使用 ##
加载 -> 构建prompt和tokenizer -> 推理和解码

In [3]:
prompt = "帮我写一个请假条，就用一句话，不超过20个字。"
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt},
]

In [4]:
# 使用chat template
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False, # tokenize the output? tokens : String
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

In [5]:
print(text)
print("====" * 10)
print(model_inputs)

<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
帮我写一个请假条，就用一句话，不超过20个字。<|im_end|>
<|im_start|>assistant

{'input_ids': tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
            553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
             13, 151645,    198, 151644,    872,    198, 108965,  61443,  46944,
         118962,  38989,   3837,  80158,  11622, 105321,   3837, 106070,     17,
             15,  18947,  18600,   1773, 151645,    198, 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}


In [6]:
generated_ids = model.generate(
    input_ids=model_inputs.input_ids,
    max_new_tokens=64
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

print("====" * 10)
# 其他方法：直接添加提示<|im_start|>, assitant等
prompt = "帮我写一个请假条，就用一句话，不超过20个字。<|im_start|>assistant"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
print(inputs)
generation = model.generate(
    input_ids=inputs.input_ids,
    max_new_tokens=64
)
print(tokenizer.decode(generation[0],))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


尊敬的领导，请休假一周。
{'input_ids': tensor([[108965,  61443,  46944, 118962,  38989,   3837,  80158,  11622, 105321,
           3837, 106070,     17,     15,  18947,  18600,   1773, 151644,  77091]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}
帮我写一个请假条，就用一句话，不超过20个字。<|im_start|>assistant
尊敬的领导，请休假两周，感谢支持！<|im_end|>


In [7]:
# 中文prompt, 词元展示
for id_ in inputs.input_ids[0]:
    print(tokenizer.decode([id_]))

帮我
写
一个
请假
条
，
就
用
一句话
，
不超过
2
0
个
字
。
<|im_start|>
assistant


In [8]:
# 英文prompt, 词元展示
prompt = "Write an apologizing letter in English, just one sentence, no more than 20 words.<|im_start|>assistant"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
print(input_ids)
for id_ in input_ids[0]:
    print(tokenizer.decode([id_]))

# 不同模型分词结果不同, tokenizer和model需一致

tensor([[  7985,    458,  20419,   4849,   6524,    304,   6364,     11,   1101,
            825,  11652,     11,    902,    803,   1091,    220,     17,     15,
           4244,     13, 151644,  77091]], device='cuda:0')
Write
 an
 apolog
izing
 letter
 in
 English
,
 just
 one
 sentence
,
 no
 more
 than
 
2
0
 words
.
<|im_start|>
assistant


## ※ Transformers Pipeline ##

In [9]:
from transformers import pipeline

# 生成pipeline
generator = pipeline(
    "text-generation", # decoder only, text2text generation
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=500,
    do_sample=False
)
# 构建prompt
messages = [
    {"role": "user", "content": "帮我写一个请假条，就用一句话，不超过20个字。"}
]
# 输出及解码
output = generator(messages)
print(output[0]["generated_text"])

Device set to use cuda


尊敬的领导，请批准我申请休假一周。


# Embedding #


## 原始 word2vec embedding (传统 NLP approach) ##

In [10]:
# import numpy
# import scipy
# print("Terminal NumPy version:", numpy.__version__)
# print("Terminal SciPy version:", scipy.__version__)
# print("NumPy path:", numpy.__file__)
# print("SciPy path:", scipy.__file__)
import gensim.downloader as api

w2v_model = api.load("glove-wiki-gigaword-50")
print(w2v_model.most_similar([w2v_model['king']], topn=11))
# 对king的所有词的相似度排序

print(w2v_model.get_vector('king').shape)
# vector 大小: 50


[('king', 1.0000001192092896), ('prince', 0.8236179351806641), ('queen', 0.7839043140411377), ('ii', 0.7746230363845825), ('emperor', 0.7736247777938843), ('son', 0.766719400882721), ('uncle', 0.7627150416374207), ('kingdom', 0.7542161345481873), ('throne', 0.7539914846420288), ('brother', 0.7492411136627197), ('ruler', 0.7434253692626953)]
(50,)


## 使用 PyTorch 自带 Embedding ##

In [28]:
import torch
import torch.nn as nn

nums_vocab = 49408 # 词表长度, 代表 token ID 范围: 0-49407
nums_emb = 768 # 词向量维度
nums_token = 64 # 经 tokenizer (或其他方法) 分词后的词元数量为 64
token_embedding = nn.Embedding(nums_vocab, nums_emb)
print(token_embedding.weight.shape)

torch.Size([49408, 768])


### Position Embedding ###

In [29]:
import torch
import torch.nn as nn

# 定义位置编码, 因为是可学习参数, 使用 nn.Parameter 创建 (默认 requires_grad=True)
position_embedding = nn.Parameter(torch.zeros((nums_token, nums_emb)))
print(position_embedding.shape)

torch.Size([64, 768])


In [33]:
# 假设一个多 batch 的随机的 tokens 输入
batch_size = 2
tokens = torch.randint(0, nums_vocab, (batch_size, nums_token)).type(torch.long)

state = token_embedding(tokens)  # 词嵌入
state += position_embedding[:nums_token] # 位置嵌入
print(state.shape)

torch.Size([2, 64, 768])


## 上下文相关的Embedding ##
Contextualized Word Embeddings From a Language Model (BERT)

最早为EIMO -> BERT

In [5]:
import os
os.environ['HF_HOME'] = "E:\Production\models\hgf" # redirect hf cache
os.environ['HF_ENDPOINT'] = "https://hf-mirror.com"
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1' # disable hf symlink warning

from transformers import AutoTokenizer, AutoModel
# 调用 BERT 的分词器和模型权重
tokenizer = AutoTokenizer.from_pretrained(
    "google-bert/bert-base-uncased",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
model = AutoModel.from_pretrained("google-bert/bert-base-uncased")

In [6]:
# BERT 的分词格式: [CLS], Hello, World, [SEP]; 生成的是 token ID
tokens = tokenizer("Hello World", return_tensors="pt")
print("tokens:", tokens)

for token in tokens['input_ids'][0]:
    # token ID 对应的词元
    print(token, tokenizer.decode([token]))

tokens: {'input_ids': tensor([[ 101, 7592, 2088,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}
tensor(101) [CLS]
tensor(7592) hello
tensor(2088) world
tensor(102) [SEP]


In [14]:
# 生成该输入的文本向量, batch_size=1 (批量), seq_len=4 (序列长), dim=768 (词向量维度)
# batch_size 受用户控制, seq_len 受模型分词器控制, dim 固定(预训练模型)

output_vec = model(**tokens)[0] # 因为tokens是字典, 这里使用 ** 来将字典中的键值对作为参数传递给 model
# model(**tokens) 将 tokens 中的所有键值对作为参数传递给 model 的输出的第 [0] 元素为 last_hidden_state (即输出的文本向量)
print(output_vec.shape)

torch.Size([1, 4, 768])


## 文本向量(句向量) Text Embeddings ##
sentences and whole documents

In [14]:
import os
os.environ['HF_HOME'] = "E:\Production\models\hgf" # redirect hf cache
os.environ['HF_ENDPOINT'] = "https://hf-mirror.com"
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1' # disable hf symlink warning

from sentence_transformers import SentenceTransformer
model = SentenceTransformer(
    "sentence-transformers/all-distilroberta-v1",
)
vector_hello = model.encode("Hello World")
vector_hello.shape
# 一整个句子embedding -> 768 dims


(768,)

In [15]:
vector_hello

array([ 1.05811600e-02,  5.37763070e-03, -2.34430600e-02, -4.25446443e-02,
        2.76464168e-02,  1.68265179e-02,  1.29372226e-02,  8.09646025e-03,
        3.76028754e-02, -1.89431990e-03,  2.47532129e-02,  5.17348684e-02,
        2.46639666e-03, -6.13043196e-02, -2.85768695e-02, -3.46861826e-03,
       -4.37848158e-02, -2.51570325e-02, -9.90716890e-02,  1.50454063e-02,
        1.12372525e-02,  8.87956936e-04, -2.28913147e-02,  1.51753370e-02,
        4.45005931e-02,  8.06042552e-02, -1.41294338e-02, -3.23146097e-02,
        5.68793789e-02,  4.02171910e-02,  3.28739397e-02, -4.21925634e-02,
       -5.83373271e-02, -2.11361088e-02, -2.71723662e-02, -1.04420707e-02,
       -2.43445847e-05, -3.11300792e-02,  4.73614894e-02,  2.97100823e-02,
       -2.21333909e-03, -3.42768170e-02, -1.09388097e-03, -2.74186675e-03,
       -2.22981162e-02, -1.81331653e-02, -5.90829365e-02, -4.41422947e-02,
       -1.13985045e-02,  4.98630665e-02, -3.54508571e-02, -2.75734030e-02,
       -2.74208449e-02,  

# 深入 Transformer LLM #
- transformers llm 不同输入和输出的区别
- RMSNorm 和 layernorm 的区别
- KV Cache 原理，推理时的使用

## transformers llm 不同输入和输出的区别 ##

In [16]:
import os
os.environ['HF_HOME'] = "E:\Production\models\hgf" # redirect hf cache
os.environ['HF_ENDPOINT'] = "https://hf-mirror.com"
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1' # disable hf symlink warning

In [17]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
generator = pipeline(
    "text-generation", # for CausalLM, if encoder-decoder: text2text-generation
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=50,
    do_sample=False,
)
print(model)

Device set to use cuda


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

In [18]:
prompt = "《静夜思》作者是谁？"
output = generator(prompt)
print(prompt) # not a chatML formmat input, might give wierd anwser

print(output[0]["generated_text"])

《静夜思》作者是谁？
 《静夜思》是唐代诗人李白的诗作。全诗如下：

床前明月光，疑是地上霜。
举头望明月，低头思故乡。

这首诗描绘了诗人夜晚在床前看到明亮


In [19]:
messages = [
    {"role": "system", "content": "你是一位精通古诗文的专家。"},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text) # chatML formmat input

print(generator(text)[0]["generated_text"])

<|im_start|>system
你是一位精通古诗文的专家。<|im_end|>
<|im_start|>user
《静夜思》作者是谁？<|im_end|>
<|im_start|>assistant

李白是这首诗的作者。


In [20]:
## model output
prompt = "The capital of France is"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
# (bs, seq_len): token_ids

# model api:    generate    model.model    model.ln_head

# model.model: 计算隐藏层表示 (hidden_state), token -> vector
# Encoder-Decoder: 作为 Encoder 部分, 输出进一步用于 Decoder 任务, 如翻译、摘要等
# Encoder Only: 作为主体, 输出直接用于模型任务，如分类, 填空等
# Decoder Only: 作为 Decoder 部分, 输出被用于生成下一个 token, 即生成等任务
model_output = model.model(input_ids)
print(model_output[0].shape) # (bs, seq_len, hidden_size)

# what we need: vocab_index (token_id) -> which word

# model.lm_head: FC Layer, 将每个 token 的 hidden states 映射到一个 vocab_size 的向量
# SoftMax得到的分布表示模型认为当前 token 接下来的会是词表中各个单词的概率
lm_head_output = model.lm_head(model_output[0])
print(lm_head_output.shape) # (bs, seq_len, vocab_size)


torch.Size([1, 5, 896])
torch.Size([1, 5, 151936])


In [21]:
output = model.generate(input_ids) # 依赖训练数据格式
tokenizer.decode(output[0])

'The capital of France is Paris. It is the largest city in Europe and one of the most populous cities in the world,'

In [22]:
token_id = lm_head_output[0, -1].argmax(-1)
# predicted token_id of the last token "is"
print(token_id)
print(tokenizer.decode([token_id])) # decode the token_id (vocab_index) to a word

tensor(12095, device='cuda:0')
 Paris


## RMSNorm, LayerNorm, BatchNorm ##

Norm作用：对输入数据进行归一化，使得数据具有零均值和单位方差，从而使得数据更加稳定和易于优化
省流: 计算复杂度 BN > LN > RMSN
适用场景: BN(CV 通常固定大小), LN(NLP 主流), RMSN(NLP 可加速训练)

**RMSNorm**: 对每个样本在 **特征维度(seq_len / CHW)** 上归一化, 参数量 **h**: 缩放 γ:h, 无中心化,; 对批次大小不依赖, 适合小批次或动态批次;推理阶段无需额外计算, Transformer 模型中与 LN 效果相当, 但计算效率更高, 资源敏感时使用
$$
\text{RMSNorm}(x) = \gamma \cdot \frac{x}{\sqrt{RMS(x^2) + \epsilon}}
$$

**LayerNorm**: 对每个样本在 **特征维度(seq_len / CHW)** 上归一化, 参数量 **2h**: 缩放 γ:h + 中心化(-E[x] / σ) β:h; 对批次大小不依赖, 适合小批次或动态批次; 推理阶段无需额外计算
$$
\text{LayerNorm}(x) = \gamma \cdot \frac{x - \mu_L}{\sqrt{\sigma_L^2 + \epsilon}} + \beta
$$

**BatchNorm**: 对每个特征通道在 **批次维度(b)** 上归一化, 参数量 **2h**: 缩放 γ:h + 中心化(-E[x] / σ) β:h, 强依赖批次大小, 小批次会导致均值和方差不准确, 导致训练不稳定, 因此更适合 CV 而不是 NLP(动态序列长和批次大小); 推理阶段需要额外计算均值和方差, 计算复杂度为最高
$$
\text{BatchNorm}(x) = \gamma \cdot \frac{x - \mu_B}{\sqrt{\sigma_B^2 + \epsilon}} + \beta
$$


## KV Cache ##

In [23]:
# KV Cache: 将 KV 矩阵的 Token 缓存起来，避免重复计算，Q 矩阵无法缓存，因为需要动态计算
# 1. Key, Value: 需要上一层输出的 hidden states
# 2. Query: 需要当前输入的 hidden states
prompt = "The capital of France is"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")

In [24]:
%%timeit -n 1
generation_output = model.generate(
    input_ids=input_ids,
    max_new_tokens=100,
    use_cache=False
)

The slowest run took 14.57 times longer than the fastest. This could mean that an intermediate result is being cached.
7.55 s ± 12.1 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
%%timeit -n 1
generation_output = model.generate(
    input_ids=input_ids,
    max_new_tokens=100,
    use_cache=True
)

The slowest run took 10.76 times longer than the fastest. This could mean that an intermediate result is being cached.
4.64 s ± 5.84 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


# 文本分类 #
- 表征类: input -> representation LM -> calss (0\1\...)
- 生成式: input -> generative LM -> class (The input class is ...)

1. 表征类方法属于 task specific 的方法, 每个任务需要重新训练
2. 生成式方法属于 task agonizing 的方法, 每个任务不需要重新训练

## 表征类方法 ##

In [21]:
import os
os.environ['HF_HOME'] = "E:\Production\models\hgf" # redirect hf cache
os.environ['HF_ENDPOINT'] = "https://hf-mirror.com"
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1' # disable hf symlink warning

In [3]:
## representation: token -> vector -> label
## eg. 这部电影好看 -> vector([1,2,3,0,1]) -> label(1): 正向情感

from datasets import load_dataset

data = load_dataset("rotten_tomatoes")
data


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [28]:
data["train"][0,-1]

{'text': ['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
  'things really get weird , though not particularly scary : the movie is all portent and no content .'],
 'label': [1, 0]}

In [29]:
from transformers import pipeline

model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"

pipe = pipeline(
    "sentiment-analysis",
    model=model_path,
    tokenizer=model_path,
    return_all_scores=True,
    device="cuda:0"
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [30]:
import numpy as np
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

y_pred = []
for output in tqdm(pipe(KeyDataset(data["test"], "text")), total=len((data["test"]))):
    # 模型内置3分类，0: negative, 1: neutral, 2: positive
    # 该数据集只有2分类，所以只取 negative 和 positive
    neg_score = output[0]["score"]
    # neu_score = output[1]["score"]
    pos_score = output[2]["score"]
    y_pred.append(np.argmax([neg_score, pos_score]))

from sklearn.metrics import classification_report

print(classification_report(data["test"]["label"], y_pred, target_names=["negative", "positive"]))

100%|██████████| 1066/1066 [00:08<00:00, 125.36it/s]

              precision    recall  f1-score   support

    negative       0.76      0.88      0.81       533
    positive       0.86      0.72      0.78       533

    accuracy                           0.80      1066
   macro avg       0.81      0.80      0.80      1066
weighted avg       0.81      0.80      0.80      1066






#### Tips - macro/micro avg ####
- macro avg: 每个类别(每个类的 TP, TN,FP, FN)的指标(precision, recall, f1)的平均值
- micro avg: 所有样本(总 TP, TN, FP, FN)的指标的平均值

### 用文本向量做分类 ###
训练分类模型(监督学习)

In [31]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")

train_embeddings = model.encode(data["train"]["text"], show_progress_bar=True)
test_embeddings = model.encode(data["test"]["text"], show_progress_bar=True)

Batches: 100%|██████████| 267/267 [00:05<00:00, 50.88it/s]
Batches: 100%|██████████| 34/34 [00:00<00:00, 53.12it/s]


In [32]:
train_embeddings.shape

(8530, 768)

### 传统机器学习 ###

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

clf = LogisticRegression(
    random_state=42,
    max_iter=1000
)
clf.fit(train_embeddings, data["train"]["label"])

y_pred = clf.predict(test_embeddings)
print(classification_report(data["test"]["label"], y_pred, target_names=["negative", "positive"]))

              precision    recall  f1-score   support

    negative       0.79      0.82      0.80       533
    positive       0.81      0.78      0.79       533

    accuracy                           0.80      1066
   macro avg       0.80      0.80      0.80      1066
weighted avg       0.80      0.80      0.80      1066



### Zero-shot 方式分类 ###
分类任务 -> 匹配任务

模型表示的 embedding 和 label 的 embedding 进行匹配

In [34]:
## 所有 label 的都加起来再算平均, 得到每个 label 的 embedding
## 和 MF 思想较为接近

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report


df = pd.DataFrame(np.hstack([train_embeddings, np.array(data["train"]["label"]).reshape(-1, 1)]))
# 将 label 拼接到 embedding 后面作为最后一维
print(df.shape, train_embeddings.shape) # 8530 个样本, df 多 1 维 label
# 每个样本都按最后一维(label)进行分组, 每个组再取平均,
avg_label_embeddings = df.groupby(train_embeddings.shape[1]).mean().values
# 得到每个 label 对应的嵌入向量 embedding 的平均值
print(avg_label_embeddings.shape)


sim_matrix = cosine_similarity(test_embeddings, avg_label_embeddings)
y_pred = np.argmax(sim_matrix, axis=1)

print(classification_report(data["test"]["label"], y_pred, target_names=["negative", "positive"]))

(8530, 769) (8530, 768)
(2, 768)
              precision    recall  f1-score   support

    negative       0.74      0.79      0.76       533
    positive       0.77      0.73      0.75       533

    accuracy                           0.76      1066
   macro avg       0.76      0.76      0.76      1066
weighted avg       0.76      0.76      0.76      1066



In [35]:
## 另一种方式获取 label embedding
## 效果根据 label 的描述会有较大变化, 总体可能仍然不如第一种方法
## 一般不会用这种方式

label_embeddings = model.encode(["A very negative review", "A very positive review"])
print(label_embeddings.shape)

from sklearn.metrics.pairwise import cosine_similarity

sim_matrix = cosine_similarity(test_embeddings, label_embeddings)
y_pred = np.argmax(sim_matrix, axis=1)

print(classification_report(data["test"]["label"], y_pred, target_names=["negative", "positive"]))


(2, 768)
              precision    recall  f1-score   support

    negative       0.71      0.77      0.74       533
    positive       0.75      0.69      0.72       533

    accuracy                           0.73      1066
   macro avg       0.73      0.73      0.73      1066
weighted avg       0.73      0.73      0.73      1066



## 生成式模型 ##
- Encoder-Decoder
- Decoder Only
- 使用Prompt直接调用API

In [38]:
# 创建 t2t pipeline
pipe = pipeline(
    "text2text-generation",
    model = "google/flan-t5-small",
    device="cuda:0"
)

prompt = "Is the following sentence positive or negative?" # model will output "positive" / "negative"
data = data.map(lambda input_sentence: {"t5": prompt+input_sentence["text"]})
data # 拼接数据, "t5" 是构造出来的模型输入的 key

Device set to use cuda:0
Map: 100%|██████████| 8530/8530 [00:00<00:00, 33435.55 examples/s]
Map: 100%|██████████| 1066/1066 [00:00<00:00, 17508.91 examples/s]
Map: 100%|██████████| 1066/1066 [00:00<00:00, 19991.72 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 1066
    })
})

In [42]:
y_pred = []
# 对 test 集进行预测, "t5" 是模型输入的 key
for output in tqdm(pipe(KeyDataset(data["test"], "t5")), total=len(data["test"])):
    # 保证 y_pred 元素与 data["test"]["label"] 都为标签 0 / 1
    y_pred.append(0 if output[0]["generated_text"] == "negative" else 1)

print(classification_report(data["test"]["label"], y_pred, target_names=["negative", "positive"]))

100%|██████████| 1066/1066 [00:43<00:00, 24.38it/s]

              precision    recall  f1-score   support

    negative       0.83      0.84      0.83       533
    positive       0.84      0.83      0.83       533

    accuracy                           0.83      1066
   macro avg       0.83      0.83      0.83      1066
weighted avg       0.83      0.83      0.83      1066






#### Tips ####
这种本质上是提示学习(prompt learning), 而 Instruction Tuning 和 Prompt tuning 方法的核心是一致的:发掘语言模型本身具备的知识。不同点在于: prompt 是去激发语言模型的补全能力, 给出上句补下句、完形填空等, 都还是 LM 任务, Instruction Tuning 则是激发语言模型的理解能力, 直接给一个指令, 让 LM 去执行这个指令, 这才是真正的任务。

现在 prompt learning 也已经比较少了, prompt learning 可以看作为 Instruction tuning 的子集。 

### Decoder Model ###

In [45]:
pipe = pipeline(
    "text2text-generation",
    model="gpt2",
    device="cuda:0",
    max_length=100
)

prompt = "Is the following sentence positive or negative?"
data = data.map(lambda input_sentence: {"gpt2": prompt+input_sentence["text"]})
data

Device set to use cuda:0
The model 'GPT2LMHeadModel' is not supported for text2text-generation. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'Qwen2AudioForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneratio

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 't5', 'gpt2'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label', 't5', 'gpt2'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label', 't5', 'gpt2'],
        num_rows: 1066
    })
})

In [48]:
# y_pred = []
# # 对 test 集进行预测, "gpt2" 是模型输入的 key
# for output in tqdm(pipe(KeyDataset(data["test"], "gpt2")), total=len(data["test"])):
#     # 保证 y_pred 元素与 data["test"]["label"] 都为标签 0 / 1
#     y_pred.append(0 if output[0]["generated_text"] == "negative" else 1)

# print(classification_report(data["test"]["label"], y_pred, target_names=["negative", "positive"]))

pipe("Is the following sentence positive or negative? This move is realy awesome!")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Is the following sentence positive or negative? This move is realy awesome!\n\n"I\'m not sure if it\'s a good idea to have a child with a disability, but I\'m sure it\'s a good idea to have a child with a disability."\n\n"I\'m not sure if it\'s a good idea to have a child with a disability, but I\'m sure it\'s a good idea to have a child with a disability."\n\n"I\'m not sure if it'}]

### chatGPT (Decoder Model) 文本分类 ###

In [11]:
import openai

YOUR_KEY_HERE="sk-ed535e73d0a847c593f10784150d1ab8"
BASE_URL="https://api.deepseek.com"
MODEL_NAME="deepseek-chat"

# YOUR_KEY_HERE = "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# BASE_URL = "https://api.openai.com/v1"
# MODEL_NAME = "gpt-3.5-turbo"

client = openai.OpenAI(
    api_key=YOUR_KEY_HERE,
    base_url=BASE_URL
)
def chat_generation(prompt, document, model=MODEL_NAME):
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": prompt.replace('[DOCUMENT]', document)}
    ]
    chat_response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0
    )
    return chat_response.choices[0].message.content

prompt = """Predict whether the following document is a positive or negative review:

[DOCUMENT]

If it is positive, return 1 and if it is negative, return 0. DO NOT GIVE ANY OTHER ANSWERS.
"""

document = "unpretentious , charming , quirky , original"
chat_generation(prompt, document)

'1'

### 调用API评估测试集(烧钱) ###

In [26]:
from tqdm import tqdm
import random

n = 10  # 随机提取 10 个样本
indices = random.sample(range(len(data["test"]["text"])), n)  # 随机选择索引

subset_text = [data["test"]["text"][i] for i in indices]
subset_label = [data["test"]["label"][i] for i in indices]

predictions = [chat_generation(prompt, doc) for doc in tqdm(subset_text, total=n)]

100%|██████████| 10/10 [00:08<00:00,  1.20it/s]


In [27]:
from sklearn.metrics import classification_report

y_pred = [int(pred) for pred in predictions]
print(classification_report(subset_label, y_pred, target_names=["negative", "positive"]))

              precision    recall  f1-score   support

    negative       1.00      0.83      0.91         6
    positive       0.80      1.00      0.89         4

    accuracy                           0.90        10
   macro avg       0.90      0.92      0.90        10
weighted avg       0.92      0.90      0.90        10



# Supervised Fine-Tuning (SFT) #
SFT 后出现了 chat template 的概念
STEP:

1. data preparation, 生成prompt --- use prompt template
2. load model --- transformers.AutoModel.from_pretrained
3. LoRA config --- peft LoraConfig, get_peft_model (set require grad)
4. train model --- transformers.TrainingArgument / trl.SFTConfig, trl.SFTTrainer
5. merge model --- peft.AutoModel.from_pretrained, model.merge_and_unload()
6. inference --- transformers.pipeline

### data & model ###

In [1]:
import os
os.environ['HF_HOME'] = "E:\Production\models\hgf" # redirect hf cache
os.environ['HF_ENDPOINT'] = "https://hf-mirror.com"
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1' # disable hf symlink warning

from datasets import load_dataset

test_dataset = load_dataset("YeungNLP/firefly-train-1.1M", split="train[:500]")
print(test_dataset)
print(test_dataset[100])

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

def format_prompt(text):
    chat = [
        {"role": "system", "content": "你是一个性能强大的人工智能助手，说话语气是猫娘样子的。"},
        {"role": "user", "content": text["input"]},
        {"role": "assistant", "content": text["target"]}
    ]

    prompt = tokenizer.apply_chat_template(chat, tokenize=False)
    return {"text": prompt}

dataset = test_dataset.map(format_prompt, remove_columns=test_dataset.column_names)
print(dataset)

  from .autonotebook import tqdm as notebook_tqdm
Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['kind', 'input', 'target'],
    num_rows: 500
})
{'kind': 'ClassicalChinese', 'input': '我当时在三司，访求太祖、仁宗的手书敕令没有见到，然而人人能传诵那些话，禁止私盐的建议也最终被搁置。\n翻译成文言文：', 'target': '余时在三司，求访两朝墨敕不获，然人人能诵其言，议亦竟寝。'}
Dataset({
    features: ['text'],
    num_rows: 500
})


In [2]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
tokenizer.padding_side = "left" # 保证训练和推理时padding位置一致
print(model)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

### LORA ###

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    lora_alpha=32,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['k_proj', 'v_proj', 'q_proj']
)

model = prepare_model_for_kbit_training(model)
# 如果没有 prepare_model_for_kbit_training，
# 且 training args 中配置了 gradient_checkpointing=True （这个其实也是为了省显存，其实不重要）
# 那么需要设置 model.enable_input_require_grads() 启用启动求导
# 否则会报错 RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
# model.enable_input_require_grads()

model = get_peft_model(model, peft_config)
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 896)
        (layers): ModuleList(
          (0-23): 24 x Qwen2DecoderLayer(
            (self_attn): Qwen2SdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=896, out_features=896, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=896, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=896, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(
 

### train config ###

In [4]:
# from trl import SFTConfig # 新版 trl ,使用 SFTConfig dataset_text_field
from transformers import TrainingArguments # 旧版 trl ,使用 TrainingArguments

output_dir = "E:\Production\models\output"

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1, # 随显存调整
    gradient_accumulation_steps=2, # 小显存时模拟大 bs, 实际 bs = per_device_bs * grad_acc_steps = 8
    optim="adamw_torch",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    num_train_epochs=1,
    logging_steps=10,
    fp16=True,
    gradient_checkpointing=True,

    save_steps=15,
    max_steps=20,
)

In [None]:
from trl import SFTTrainer, PPOTrainer, GRPOTrainer
# model.train()

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    peft_config=peft_config,
    dataset_text_field="text",
)

trainer.model.print_trainable_parameters()
trainer.train()
trainer.model.save_pretrained(output_dir)



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


trainable params: 5,898,240 || all params: 499,931,008 || trainable%: 1.1798


  0%|          | 0/20 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
 50%|█████     | 10/20 [00:22<00:22,  2.28s/it]

{'loss': 3.9488, 'grad_norm': 1.2002450227737427, 'learning_rate': 0.0001156434465040231, 'epoch': 0.04}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
100%|██████████| 20/20 [00:44<00:00,  2.09s/it]

{'loss': 3.6893, 'grad_norm': 1.7228679656982422, 'learning_rate': 1.231165940486234e-06, 'epoch': 0.08}


100%|██████████| 20/20 [00:45<00:00,  2.27s/it]


{'train_runtime': 45.4043, 'train_samples_per_second': 0.881, 'train_steps_per_second': 0.44, 'train_loss': 3.8190778732299804, 'epoch': 0.08}


### merge adapter (合并 LoRA 和 Base model) ###

In [None]:
from peft import AutoPeftModelForCausalLM

model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    low_cpu_mem_usage=True,
    device_map="auto",
)

merged_model = model.merge_and_unload()

In [8]:
print(merged_model)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

### inference ###

In [14]:
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model=merged_model,
    tokenizer=tokenizer,
)
prompt_eg = """<|im_start|>system
你是一个性能强大的人工智能助手。<|im_end|>
<|im_start|>user
天气太热了，所以我今天没有学习一点。
翻译为文言文：<|im_end|>
<|im_start|>assitant
"""

print(pipe(prompt_eg, max_length=50)[0]['generated_text'])

Device set to use cuda:0


<|im_start|>system
你是一个性能强大的人工智能助手。<|im_end|>
<|im_start|>user
天气太热了，所以我今天没有学习一点。
翻译为文言文：<|im_end|>
<|im_start|>assitant
今之天气甚热，故今日无学一也


### 工作用SFT ###
llamafactory-clci train examples/train_lora/llama3_lora_sft.yaml

llamafactory-clci chat examples/inference/llama3_lora_sft.yaml

llamafactory-clci export examples/merge_lora/llama3_lora_sft.yaml

...
