In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import torch 
from torch import nn
import torch.nn.functional as F
import math

# 实现线性层的 Lora 微调

In [None]:
class LoraLinear(nn.Module):
    def __init__(self, in_features, out_features, rank, merge, alpha, dropout=0.1):
        """
        LoraLinear: Lora线性层,代替原始的线性层
        """
        # merge: 是否加载 Lora 权重
        # alpha: Lora 权重系数
        # rank: LoRA低秩的维度
        super(LoraLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.rank = rank
        self.merge = merge
        self.alpha = alpha
        self.dropout = dropout

        # TODO: 构建Lora主体
        self.linear = nn.Linear(in_features, out_features)
        if self.rank > 0:
            self.weight_b = nn.Parameter(torch.zeros(out_features, self.rank))
            self.weight_a = nn.Parameter(torch.zeros(self.rank, in_features))    # 后面再初始化成正态分布
            self.scale = self.alpha / self.rank
            self.linear.weight.requires_grad = False

        if self.dropout > 0:
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = nn.Identity()   # 恒等映射

        # TODO: 初始化权重
        nn.init.kaiming_uniform_(self.weight_a, a=math.sqrt(5))
        nn.init.zeros_(self.weight_b)

    def forward(self, x):
        if self.rank > 0 and self.merge:
            output = F.linear(x, self.linear.weight + self.scale * self.weight_b @ self.weight_a, self.linear.bias) # linear函数是y = xA^T + b
            output = self.dropout(output)
        else:
            output = self.dropout(self.linear(x))
        return output

# 替换手写 Transformer 模块

In [None]:
from model import Transformer

enc_voc_size = 6000
dec_voc_size = 8000
src_pad_idx = 1
tgt_pad_idx = 1
tgt_sos_idx = 2
batch_size = 32
max_len = 1024
d_model = 512
n_layers = 3
n_head = 2
ffn_hidden = 1024
drop_prob = 0.1
device = "cpu"

model = Transformer(
    src_pad_idx=src_pad_idx,
    tgt_pad_idx=tgt_pad_idx,
    enc_voc_size=enc_voc_size,
    dec_voc_size=dec_voc_size,
    max_len=max_len,
    d_model=d_model,
    n_layers=n_layers,
    n_head=n_head,
    ffn_hidden=ffn_hidden,
    drop_prob=drop_prob
).to(device)

print(model)

In [None]:
# TODO: 用LoraLinear替换原始的Linear
# 收集需要替换的模块信息
modules_to_replace = []
for name, module in model.named_modules():
    if isinstance(module, nn.Linear):
        in_features = module.in_features
        out_features = module.out_features
        rank = 32
        merge = True
        alpha = 0.5
        new_module = LoraLinear(in_features, out_features, rank, merge, alpha)
        new_module.linear.weight = module.weight  # 替换原始的权重
        new_module.linear.bias = module.bias      # 如果有偏置，也需要替换
        modules_to_replace.append((name, new_module))

# 替换模块
for name, new_module in modules_to_replace:
    # 使用 _modules 替换模块
    parent_module, attr_name = name.rsplit('.', 1)  # 获取父模块和属性名
    parent = model
    if parent_module:  # 如果有父模块
        parent = model.get_submodule(parent_module)
    setattr(parent, attr_name, new_module)  # 替换子模块
    # print(f"Replace {name} with LoraLinear")

print(model)

# PEFT框架

In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
import os

model_name_or_path = "Qwen/Qwen2.5-0.5B"
tokenizer_name_or_path = "Qwen/Qwen2.5-0.5B"

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)

model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 540,672 || all params: 494,573,440 || trainable%: 0.1093


In [14]:
# 进行微调
from torch.utils.data import Dataset, DataLoader

# 定义一个简单的文本数据集
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        inputs = self.tokenizer(text, return_tensors="pt", max_length=self.max_length, truncation=True, padding="max_length")
        input_ids = inputs["input_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": input_ids  # For causal LM, labels are the same as input_ids
        }

# 示例训练数据
train_texts = [
    "Hello, how are you?",
    "I am fine, thank you!",
    "What are you doing today?",
    "I am learning about LoRA and transformers.",
    # Add more training samples here
]

# 创建数据集和数据加载器
train_dataset = TextDataset(train_texts, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# 设置训练参数
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
num_epochs = 10

# 训练循环
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")

# 保存微调后的模型
model.save_pretrained("finetuned_model")
tokenizer.save_pretrained("finetuned_model")

print("Model fine-tuning completed and saved.")

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch 1/10, Loss: 16.26947593688965
Epoch 2/10, Loss: 16.191492080688477
Epoch 3/10, Loss: 16.114389419555664
Epoch 4/10, Loss: 16.029483795166016
Epoch 5/10, Loss: 15.952570915222168
Epoch 6/10, Loss: 15.91295051574707
Epoch 7/10, Loss: 15.798820495605469
Epoch 8/10, Loss: 15.704445838928223
Epoch 9/10, Loss: 15.628195762634277
Epoch 10/10, Loss: 15.592994689941406
Model fine-tuning completed and saved.


In [None]:
# 合并模型
from peft import merge_lora_weights, LoraConfig, TaskType

model_name_or_path = "Qwen/Qwen2.5-0.5B"
tokenizer_name_or_path = "Qwen/Qwen2.5-0.5B"
model_path = "finetuned_model"

model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

model = get_peft_model(model, peft_config)
model.merge_lora_weights()
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)