In [4]:
import os
os.environ["http_proxy"] = "http://127.0.0.1:7897"
os.environ["https_proxy"] = "http://127.0.0.1:7897"

# hugging-face 国内镜像源
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

In [None]:
import torch
from transformers import BertModel, DistilBertModel, DistilBertForQuestionAnswering
from transformers import DistilBertTokenizer, BertTokenizer
import torch.nn as nn

# 加载BERT教师模型与DistilBERT学生模型
teacher_model = BertModel.from_pretrained('bert-base-uncased')
student_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# 使用相同的Tokenizer进行词汇预处理
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
distil_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# 定义输入文本并进行编码
text = "Machine reading comprehension is essential for question-answering."
inputs = tokenizer(text, return_tensors="pt")
distil_inputs = distil_tokenizer(text, return_tensors="pt")

# 获取教师模型输出
with torch.no_grad():
    teacher_outputs = teacher_model(**inputs).last_hidden_state

# 学生模型的前向传播
student_outputs = student_model(**distil_inputs).last_hidden_state

# 定义蒸馏损失函数：使用均方误差（MSE）对齐学生与教师模型的输出
distillation_loss = nn.MSELoss()(student_outputs, teacher_outputs)

# 打印蒸馏损失
print("Distillation Loss:", distillation_loss.item())

KL散度评估

In [4]:
import torch.nn.functional as F
def kl_distillation_loss(student_logits, teacher_logits, T=2.0):

    # 计算软概率分布
    # 注意：F.kl_div 的输入期望是 log_softmax，目标是 softmax
    p_teacher = F.softmax(teacher_logits / T, dim=-1)
    p_student = F.log_softmax(student_logits / T, dim=-1)
    
    # 计算 KL 散度
    # reduction='batchmean' 是数学上标准的 KL 散度计算方式
    loss = F.kl_div(p_student, p_teacher, reduction='batchmean') * (T ** 2)
    return loss

# 4. 计算并打印损失
loss = kl_distillation_loss(student_outputs, teacher_outputs, T=2.0)
print("KL Distillation Loss:", loss.item())

KL Distillation Loss: 0.3852507472038269


循环蒸馏效果

In [5]:
from torch.optim import AdamW
from tqdm import tqdm
# 这里也可以用其他优化器
optimizer = AdamW(student_model.parameters(), lr=1e-5)
texts = ["Machine learning is the study of algorithms.",
         "Natural Language Processing involves understanding human languages."]
labels = ["It is a subset of AI.", "A field in AI focusing on language."]

# 蒸馏训练循环
for epoch in range(3):
    print(f"Epoch {epoch + 1}")
    total_loss = 0
    for text, label in zip(texts, labels):
        # 准备输入
        inputs = tokenizer(text, return_tensors="pt")
        distil_inputs = distil_tokenizer(text, return_tensors="pt")
        
        # 获取教师模型输出
        with torch.no_grad():
            teacher_outputs = teacher_model(**inputs).last_hidden_state
        
        # 获取学生模型输出
        student_outputs = student_model(**distil_inputs).last_hidden_state
        
        # 计算蒸馏损失
        loss = nn.MSELoss()(student_outputs, teacher_outputs)
        
        # 反向传播与优化
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        # 记录损失
        total_loss += loss.item()
    
    avg_loss = total_loss / len(texts)
    print(f"Average Distillation Loss: {avg_loss:.4f}")

Epoch 1
Average Distillation Loss: 0.0708
Epoch 2
Average Distillation Loss: 0.0597
Epoch 3
Average Distillation Loss: 0.0515


上面是直接进行加载dstill模型，下面要自己尝试进行distill操作

In [None]:
# 冻结和解冻BERT模型
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW

# 加载预训练的BERT模型和分词器
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# 小数据进行单层微调
sentences = ["The book is great!", "The movie was terrible."]
labels = [1, 0]  # 假设1代表积极，0代表消极

# 数据预处理
inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)

# 冻结所有BERT的层
for param in model.bert.parameters():
    param.requires_grad = False

# 解冻特定的层
for param in model.bert.encoder.layer[-2:].parameters():
    param.requires_grad = True

# 定义优化器，仅优化解冻层的参数
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)

# 训练过程示例
model.train()
for epoch in range(3):  # 训练3个周期
    outputs = model(**inputs, labels=torch.tensor(labels))
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    print(f"Epoch {epoch+1} - Loss: {loss.item()}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Loss: 0.7049428224563599
Epoch 2 - Loss: 0.6767551898956299
Epoch 3 - Loss: 0.5498147010803223


下面对学习率设置进行学习：
学习率决定了梯度更新的步长，如何避免模型错过最优解

In [3]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW

# 加载数据集（示例数据）
data = [
    ("The company posted a significant increase in quarterly revenue.", 0),
    ("New heart disease medication approved by FDA.", 1),
    ("Stock market affected by global events.", 0),
    ("Medical advancements in treating rare diseases.", 1)
]
labels = [item[1] for item in data]
texts = [item[0] for item in data]

# 实例化Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# 将标签转换为张量
labels_tensor = torch.tensor(labels)

# 加载预训练的BERT模型并调整参数
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# 将模型设置为训练模式
model.train()

# 定义优化器和学习率调度器
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 3
total_steps = len(inputs["input_ids"]) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# 将数据加载至DataLoader
train_data = DataLoader(list(zip(inputs["input_ids"], inputs["attention_mask"], labels_tensor)), batch_size=2)

# 微调BERT模型
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    total_loss = 0
    for batch in train_data:
        input_ids, attention_mask, labels = batch

        # 梯度清零
        optimizer.zero_grad()

        # 前向传播，获取损失
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # 反向传播
        loss.backward()

        # 梯度裁剪，避免梯度爆炸
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # 参数更新
        optimizer.step()

        # 更新学习率
        scheduler.step()

    avg_loss = total_loss / len(train_data)
    print(f"Average training loss: {avg_loss:.4f}")

# 测试阶段：打印模型参数信息
print("\n部分模型参数示例：")
for name, param in model.named_parameters():
    if "classifier" in name:
        print(f"{name}: {param[:2]}")
        break

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3
Average training loss: 0.6955

Epoch 2/3
Average training loss: 0.5635

Epoch 3/3
Average training loss: 0.4875

部分模型参数示例：
classifier.weight: tensor([[-0.0259, -0.0196, -0.0052,  ...,  0.0098,  0.0145, -0.0071],
        [ 0.0108,  0.0146,  0.0211,  ...,  0.0420, -0.0225,  0.0074]],
       grad_fn=<SliceBackward0>)


更加高级的微调方式，是对参数进行高效微调
[LoRA论文连接](https://arxiv.org/pdf/2106.09685)
[Pre-Tuning论文连接](https://arxiv.org/abs/2101.00190)

In [None]:
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

import torch
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import torch.optim as optim

# 初始化BERT模型和tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# 定义LoRA插入函数
class LoRA(nn.Module):
    def __init__(self, input_dim, rank):
        super(LoRA, self).__init__()
        # 定义低秩矩阵
        self.low_rank_left = nn.Parameter(torch.randn(input_dim, rank))# A
        self.low_rank_right = nn.Parameter(torch.randn(rank, input_dim))# B
        self.scaling_factor = 1.0 / (rank ** 0.5)

    def forward(self, x):
        # 低秩矩阵的插入
        lora_update = torch.matmul(self.low_rank_left, self.low_rank_right) * self.scaling_factor
        return x + torch.matmul(x, lora_update)

# 将LoRA应用到模型的encoder层
for layer in model.encoder.layer:
    layer.attention.self.query = LoRA(layer.attention.self.query.in_features, rank=8)

# 定义Prefix Tuning类
class PrefixTuning(nn.Module):
    def __init__(self, model, prefix_length=10, hidden_size=768):
        super(PrefixTuning, self).__init__()
        # 创建前缀向量
        self.prefix_embeddings = nn.Parameter(torch.randn(prefix_length, hidden_size))
        self.prefix_length = prefix_length
        self.hidden_size = hidden_size
        self.model = model

    def forward(self, input_ids, attention_mask):
        # 获取输入嵌入
        original_embeddings = self.model.embeddings(input_ids)
        
        # 将前缀添加到输入
        batch_size = input_ids.size(0)
        prefix_embeddings = self.prefix_embeddings.unsqueeze(0).expand(batch_size, -1, -1)
        modified_embeddings = torch.cat([prefix_embeddings, original_embeddings], dim=1)
        
        # 调整attention mask
        extended_attention_mask = torch.cat([torch.ones(batch_size, self.prefix_length).to(attention_mask.device), attention_mask], dim=1)
        return self.model(inputs_embeds=modified_embeddings, attention_mask=extended_attention_mask)

# 将Prefix Tuning集成到BERT中
prefix_tuning = PrefixTuning(model)
optimizer = optim.Adam(prefix_tuning.parameters(), lr=1e-5)

# 准备示例数据
text = "LoRA and Prefix Tuning are efficient methods for adapting large models."
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# 模型训练流程
prefix_tuning.train()
for epoch in range(3):  # 训练3个epoch
    optimizer.zero_grad()
    outputs = prefix_tuning(input_ids=input_ids, attention_mask=attention_mask)
    last_hidden_states = outputs.last_hidden_state
    loss = (last_hidden_states ** 2).mean()
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

# 测试流程
prefix_tuning.eval()
with torch.no_grad():
    outputs = prefix_tuning(input_ids=input_ids, attention_mask=attention_mask)
    print("Output Embeddings:", outputs.last_hidden_state)



'(MaxRetryError("HTTPSConnectionPool(host='hf-mirror.com', port=443): Max retries exceeded with url: /bert-base-uncased/resolve/main/tokenizer_config.json (Caused by ConnectTimeoutError(<HTTPSConnection(host='hf-mirror.com', port=443) at 0x768082c95640>, 'Connection to hf-mirror.com timed out. (connect timeout=10)'))"), '(Request ID: c9cc11ad-a5f9-403f-917e-fbcee580f4c2)')' thrown while requesting HEAD https://hf-mirror.com/bert-base-uncased/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError("HTTPSConnectionPool(host='hf-mirror.com', port=443): Max retries exceeded with url: /bert-base-uncased/resolve/main/tokenizer_config.json (Caused by ConnectTimeoutError(<HTTPSConnection(host='hf-mirror.com', port=443) at 0x768082c94b90>, 'Connection to hf-mirror.com timed out. (connect timeout=10)'))"), '(Request ID: 7599ef6d-d70d-40b5-a8b9-c8933071e81e)')' thrown while requesting HEAD https://hf-mirror.com/bert-base-uncased/resolve/main/tokenizer_config.json
Retrying i

Epoch 1, Loss: 0.3534093499183655
Epoch 2, Loss: 0.2340371161699295
Epoch 3, Loss: 0.30949392914772034
Output Embeddings: tensor([[[-0.5007,  0.0067,  0.4987,  ..., -0.6998, -0.3988,  0.8621],
         [-0.3269,  0.9228, -0.1940,  ..., -0.4193,  0.0240,  0.1988],
         [-0.3774,  0.9690, -0.0234,  ..., -0.3832, -0.2031, -0.0308],
         ...,
         [-0.8675,  0.1132, -0.0237,  ..., -0.6927,  0.0622,  0.7820],
         [-0.7119,  0.6516,  0.4629,  ..., -0.1070,  0.0572,  0.9439],
         [-0.7524,  0.4928,  0.3584,  ..., -1.4589,  0.9578,  0.5290]]])


In [None]:
"""
优化版 SVD-LoRA + Prefix Tuning 混合模型
改进点：
1. 修复参数冻结问题，正确处理 bias
2. 添加设备管理优化
3. 优化训练参数筛选逻辑
4. 添加梯度累积支持
5. 添加学习率调度器
6. 添加模型保存/加载功能
7. 优化正则化计算效率
8. 添加混合精度训练支持
9. 更好的代码结构和注释
"""

import os
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler  # 混合精度训练
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup

# ============== 1. 设置环境与设备 ==============
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ============== 2. 优化的 SVD-LoRA 层 ==============
class SVDLoRALinear(nn.Module):
    """
    SVD-LoRA 层 (创新点：将权重分解为 U * diag(S) * V)
    改进：
    - 正确冻结原始层参数（包括 bias）
    - 使用更稳定的初始化方法
    - 支持可配置的 dropout
    """
    def __init__(self, original_layer, rank=8, alpha=16, dropout=0.0):
        super().__init__()
        # 保存原始层的配置信息
        self.in_features = original_layer.in_features
        self.out_features = original_layer.out_features
        self.has_bias = original_layer.bias is not None
        
        # 冻结原始权重但不替换层（保留原始层结构）
        self.original_weight = original_layer.weight.data.clone()
        original_layer.weight.requires_grad = False
        
        if self.has_bias:
            self.original_bias = original_layer.bias.data.clone()
            original_layer.bias.requires_grad = False
        else:
            self.original_bias = None
        
        # SVD 分解结构：Delta W = U * diag(S) * V
        # 使用 Kaiming 初始化，提高训练稳定性
        self.U = nn.Parameter(torch.randn(self.out_features, rank) * 0.02)
        self.S = nn.Parameter(torch.ones(rank))  # 奇异值对角线
        self.V = nn.Parameter(torch.randn(rank, self.in_features) * 0.02)
        
        self.rank = rank
        self.scaling = alpha / rank
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # 原始权重路径: 使用 F.linear 进行高效计算
        if self.has_bias:
            original_output = F.linear(x, self.original_weight, self.original_bias)
        else:
            original_output = F.linear(x, self.original_weight, None)
        
        # SVD 旁路路径: x @ (V.T @ diag(S) @ U.T)
        # 优化计算顺序: (x @ V.T) * S @ U.T
        lora_input = self.dropout(x)
        lora_output = (lora_input @ self.V.t()) * self.S
        lora_output = lora_output @ self.U.t()
        
        return original_output + lora_output * self.scaling
    
    def extra_repr(self):
        return f'in_features={self.in_features}, out_features={self.out_features}, rank={self.rank}'


# ============== 3. 优化的集成模型 ==============
class HybridSVDModel(nn.Module):
    """
    集成模型 (Prefix-Tuning + SVD-LoRA)
    改进：
    - 支持可配置的 LoRA 注入位置
    - 支持可学习的 Prefix 位置编码
    - 添加 LayerNorm 保持训练稳定
    """
    def __init__(
        self, 
        model_name="bert-base-uncased", 
        prefix_len=10, 
        lora_rank=8,
        lora_alpha=16,
        lora_dropout=0.0,
        lora_targets=["query", "value"]  # 可配置注入位置
    ):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.prefix_len = prefix_len
        self.lora_rank = lora_rank
        hidden_size = self.bert.config.hidden_size
        
        # 注入 SVD-LoRA 到指定层
        self._inject_lora(lora_rank, lora_alpha, lora_dropout, lora_targets)
        
        # Prefix Embedding (连续向量)
        # 使用 BERT 的 embeddings 归一化层
        self.prefix_embedding = nn.Parameter(
            torch.randn(prefix_len, hidden_size) * 0.02
        )
        
        # Prefix LayerNorm 提高稳定性
        self.prefix_ln = nn.LayerNorm(hidden_size)
        
        # 任务头 (以分类任务为例)
        self.classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.GELU(),
            nn.Linear(hidden_size // 2, 2)
        )
    
    def _inject_lora(self, rank, alpha, dropout, targets):
        """注入 LoRA 到指定层"""
        for layer in self.bert.encoder.layer:
            if "query" in targets:
                layer.attention.self.query = SVDLoRALinear(
                    layer.attention.self.query, 
                    rank=rank, 
                    alpha=alpha,
                    dropout=dropout
                )
            if "value" in targets:
                layer.attention.self.value = SVDLoRALinear(
                    layer.attention.self.value, 
                    rank=rank, 
                    alpha=alpha,
                    dropout=dropout
                )
            # 可选：添加 key 和 output 投影的 LoRA
            # if "key" in targets:
            #     layer.attention.self.key = SVDLoRALinear(...)
            # if "output" in targets:
            #     layer.attention.output.dense = SVDLoRALinear(...)

    def forward(self, input_ids, attention_mask):
        batch_size = input_ids.shape[0]
        
        # 1. 处理 Prefix
        raw_embeds = self.bert.embeddings(input_ids)
        
        # 扩展 prefix 到 batch size
        prefix_embeds = self.prefix_embedding.unsqueeze(0).expand(batch_size, -1, -1)
        prefix_embeds = self.prefix_ln(prefix_embeds)  # LayerNorm 提高稳定性
        
        # 拼接 prefix 和原始 embeddings
        inputs_embeds = torch.cat([prefix_embeds, raw_embeds], dim=1)
        
        # 2. 扩展 Attention Mask
        prefix_mask = torch.ones(batch_size, self.prefix_len, device=inputs_embeds.device)
        full_mask = torch.cat([prefix_mask, attention_mask], dim=1)
        
        # 3. 经过 BERT (含 LoRA 旁路)
        outputs = self.bert(inputs_embeds=inputs_embeds, attention_mask=full_mask)
        
        # 取 [CLS] token 的输出进行分类
        # 注意：由于加了 prefix，[CLS] 的位置现在在 index = self.prefix_len
        cls_output = outputs.last_hidden_state[:, self.prefix_len, :]
        return self.classifier(cls_output)
    
    def get_trainable_params(self):
        """获取可训练参数"""
        trainable_params = []
        for n, p in self.named_parameters():
            # 精确匹配 LoRA 参数、Prefix 参数和 Classifier 参数
            if '.U' in n or '.V' in n or '.S' in n or 'prefix' in n or 'classifier' in n:
                if p.requires_grad:
                    trainable_params.append(p)
        return trainable_params
    
    def get_svd_params(self):
        """获取 SVD 奇异值参数（用于正则化）"""
        return [p for n, p in self.named_parameters() if '.S' in n and p.requires_grad]


# ============== 4. 训练器类 ==============
class Trainer:
    """
    训练器封装
    支持：
    - 梯度累积
    - 混合精度训练
    - 学习率调度
    - 模型保存/加载
    """
    def __init__(
        self,
        model,
        tokenizer,
        device,
        learning_rate=1e-4,
        weight_decay=0.01,
        warmup_ratio=0.1,
        gradient_accumulation_steps=4,
        use_amp=True  # 混合精度
    ):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.use_amp = use_amp and device.type == "cuda"
        
        # 优化器配置
        self.optimizer = optim.AdamW(
            model.get_trainable_params(),
            lr=learning_rate,
            weight_decay=weight_decay,
            betas=(0.9, 0.999),
            eps=1e-8
        )
        
        self.criterion = nn.CrossEntropyLoss()
        
        # 混合精度训练
        self.scaler = GradScaler() if self.use_amp else None
    
    def train_step(self, inputs, labels, svd_reg_lambda=0.01):
        """单步训练"""
        self.model.train()
        
        # 前向传播
        if self.use_amp:
            with autocast():
                logits = self.model(inputs['input_ids'], inputs['attention_mask'])
                loss = self.criterion(logits, labels)
                
                # SVD 奇异值稀疏化正则化
                svd_params = self.model.get_svd_params()
                if svd_params:
                    l1_reg = sum(torch.norm(p, 1) for p in svd_params)
                    loss = loss + svd_reg_lambda * l1_reg
        else:
            logits = self.model(inputs['input_ids'], inputs['attention_mask'])
            loss = self.criterion(logits, labels)
            
            # SVD 奇异值稀疏化正则化
            svd_params = self.model.get_svd_params()
            if svd_params:
                l1_reg = sum(torch.norm(p, 1) for p in svd_params)
                loss = loss + svd_reg_lambda * l1_reg
        
        # 梯度缩放（混合精度）
        loss = loss / self.gradient_accumulation_steps
        
        if self.use_amp:
            self.scaler.scale(loss).backward()
        else:
            loss.backward()
        
        return loss.item() * self.gradient_accumulation_steps
    
    def step(self):
        """优化器步骤"""
        if self.use_amp:
            self.scaler.unscale_(self.optimizer)
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.scaler.step(self.optimizer)
            self.scaler.update()
        else:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
        
        self.optimizer.zero_grad()
    
    def set_scheduler(self, num_training_steps, num_warmup_steps):
        """设置学习率调度器"""
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )
    
    def save_model(self, path):
        """保存模型"""
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
        }, path)
        print(f"Model saved to {path}")
    
    def load_model(self, path):
        """加载模型"""
        checkpoint = torch.load(path, map_location=self.device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        print(f"Model loaded from {path}")


# ============== 5. 主训练流程 ==============
def main():
    # 模型配置
    model_config = {
        "model_name": "bert-base-uncased",
        "prefix_len": 10,
        "lora_rank": 8,
        "lora_alpha": 16,
        "lora_dropout": 0.05,
        "lora_targets": ["query", "value"]
    }
    
    # 训练配置
    train_config = {
        "learning_rate": 1e-4,
        "weight_decay": 0.01,
        "warmup_ratio": 0.1,
        "gradient_accumulation_steps": 4,
        "num_epochs": 10,
        "svd_reg_lambda": 0.01
    }
    
    # 初始化 tokenizer 和模型
    tokenizer = BertTokenizer.from_pretrained(model_config["model_name"])
    model = HybridSVDModel(**model_config).to(device)
    
    # 打印可训练参数数量
    trainable_params = model.get_trainable_params()
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params_count = sum(p.numel() for p in trainable_params)
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params_count:,} ({100*trainable_params_count/total_params:.2f}%)")
    
    # 初始化训练器
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        device=device,
        learning_rate=train_config["learning_rate"],
        weight_decay=train_config["weight_decay"],
        warmup_ratio=train_config["warmup_ratio"],
        gradient_accumulation_steps=train_config["gradient_accumulation_steps"]
    )
    
    # 准备示例数据
    texts = ["The movie was fantastic!", "I hated this film.", 
             "Great acting and plot!", "Boring and predictable."]
    labels = torch.tensor([1, 0, 1, 0]).to(device)
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(device)
    
    # 设置学习率调度器
    num_training_steps = (len(texts) // train_config["gradient_accumulation_steps"]) * train_config["num_epochs"]
    num_warmup_steps = int(num_training_steps * train_config["warmup_ratio"])
    trainer.set_scheduler(num_training_steps, num_warmup_steps)
    
    # 训练循环
    print("\n" + "="*50)
    print("Starting training...")
    print("="*50)
    
    global_step = 0
    for epoch in range(train_config["num_epochs"]):
        epoch_loss = 0
        
        # 模拟多个 batch
        for step in range(0, len(texts), 2):
            batch_inputs = {
                'input_ids': inputs['input_ids'][step:step+2],
                'attention_mask': inputs['attention_mask'][step:step+2]
            }
            batch_labels = labels[step:step+2]
            
            loss = trainer.train_step(batch_inputs, batch_labels, train_config["svd_reg_lambda"])
            epoch_loss += loss
            
            # 梯度累积步骤
            if (step + 2) % (2 * train_config["gradient_accumulation_steps"]) == 0:
                trainer.step()
                trainer.scheduler.step()
                global_step += 1
        
        # 打印 epoch 信息
        avg_loss = epoch_loss / (len(texts) // 2)
        current_lr = trainer.scheduler.get_last_lr()[0]
        print(f"Epoch {epoch+1}/{train_config['num_epochs']} | Loss: {avg_loss:.4f} | LR: {current_lr:.2e}")
    
    # 观察奇异值 (模型解释性)
    print("\n" + "="*50)
    print("Top 5 Singular Values in Layer 0 Query:")
    print(model.bert.encoder.layer[0].attention.self.query.S[:5].detach().cpu().numpy())
    
    # 保存模型
    trainer.save_model("hybrid_svd_model.pt")
    
    return model, trainer


if __name__ == "__main__":
    # 需要导入 F
    import torch.nn.functional as F
    model, trainer = main()