### 利用huggingface中预训练模型，实现文本分类模型定制和微调

1. 加载预训练模型定制输出端任务
2. 原始数据进行清洗转换
   - 清理停用词或非法字符
3. 构建Dataset和DataLoader
   - DataLoader的collate_fn参数，在回调函数中使用tokenizer转换模型输入数据
4. 创建模型、损失函数、优化器
5. 训练模型
6. 观察损失调参迭代
7. 模型保存

In [1]:
import zipfile 
import pandas as pd 
import torch 
from torch.utils.data import Dataset, DataLoader 
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, AutoModelForMaskedLM
# Checkpoint shared by the tokenizer and both classifiers.
_CHECKPOINT = 'google-bert/bert-base-chinese'
# Text-classification head with five classes (one per 1-5 star rating).
_NUM_LABELS = 5

# Tokenizer that matches the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)

# Two separately loaded classifiers built from the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    _CHECKPOINT,
    num_labels=_NUM_LABELS,
)
model1 = AutoModelForSequenceClassification.from_pretrained(
    _CHECKPOINT,
    num_labels=_NUM_LABELS,
)
print(model)
# Load the review spreadsheet and turn it into parallel text/label lists.
def extract_and_load_data(excel_file_path):
    """Read the labelled review spreadsheet and return texts and class ids.

    Args:
        excel_file_path: Path of the Excel file; it is handed straight to
            ``pandas.read_excel`` (on Kaggle the file can be read directly).

    Returns:
        A ``(texts, labels)`` tuple of equally long lists. ``texts`` holds the
        review contents and ``labels`` holds integer class ids, i.e. the score
        column shifted down by one so that classes start at 0.
    """
    text_col = '评价内容(content)'
    score_col = '评分（总分5分）(score)'

    df = pd.read_excel(excel_file_path)
    # Rows missing either the text or the score cannot be used for training.
    df = df.dropna(subset=[text_col, score_col]).reset_index(drop=True)
    # A column that contained NaN is read as float64 and stays float after
    # dropna; cast back to int so the labels are valid class indices.
    labels = (df[score_col] - 1).astype(int)
    return df[text_col].tolist(), labels.tolist()

# Minimal map-style dataset pairing each review text with its label.
class CommentDataset(Dataset):
    """Map-style dataset over two parallel sequences: texts and labels.

    Items are returned untokenized as ``(text, label)`` pairs.
    """

    def __init__(self, texts, labels):
        """Store the two sequences; ``texts[i]`` is paired with ``labels[i]``."""
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of samples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at position ``idx``."""
        return self.texts[idx], self.labels[idx]
def build_collate(tokenizer):
    """Create a ``collate_fn`` that tokenizes a batch of ``(text, label)`` pairs.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(texts, return_tensors='pt', padding=True, truncation=True)``.

    Returns:
        A function that maps a list of ``(text, label)`` samples to
        ``(model_inputs, labels)``, where ``model_inputs`` is whatever the
        tokenizer returns and ``labels`` is a 1-D ``torch.long`` tensor.
    """
    def collate_fn(batch):
        # Each sample is (sentence, class label); split into two sequences.
        sentences, labels = zip(*batch)

        # Tokenize the whole batch at once so padding is computed per batch.
        model_inputs = tokenizer(
            list(sentences),
            return_tensors='pt',
            padding=True,
            truncation=True,
        )
        # Force integer class indices: float labels (e.g. from a float64
        # spreadsheet column) would otherwise produce a float tensor.
        labels = torch.tensor(labels, dtype=torch.long)

        return model_inputs, labels
    return collate_fn
    
# NOTE: despite its name, this constant holds the path of the .xlsx file.
ZIP_FILE_PATH = '/kaggle/input/jd_comment_with_label/jd_comment_data.xlsx'  # path of your data file
MODEL_NAME = 'bert-base-chinese'  # replace with the model actually used
BATCH_SIZE = 16

# 1. Load the raw texts and labels.
texts, labels = extract_and_load_data(ZIP_FILE_PATH)

# 2. Wrap them in a dataset; tokenization happens later in the collate function.
dataset = CommentDataset(texts, labels)

# 3. Batched, shuffled loader whose collate_fn runs the tokenizer per batch.
dl = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=build_collate(tokenizer))
# Other parts of this notebook refer to the loader as `dataloader`; bind both
# names to the same object so neither spelling raises NameError.
dataloader = dl

# When fine-tuning a pretrained BERT model, keep the learning rate small
# (1e-4 or 1e-5 recommended).
print(123)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

2025-05-22 12:27:06.814956: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747916827.059298      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747916827.134606      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Train two classifiers side by side for comparison: `model` is fine-tuned
# end to end, `model1` keeps its BERT encoder frozen and only trains the rest.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Freeze the encoder of model1. PyTorch has no `trainable` attribute; the
# parameters themselves must have requires_grad switched off.
for param in model1.bert.parameters():
    param.requires_grad = False

# Move both models to the target device once, before the optimizers are built.
model.to(device)
model1.to(device)

# Same optimizer type and learning rate for both models; optimizer2 only
# receives the parameters of model1 that are still trainable.
optimizer1 = torch.optim.Adam(model.parameters(), lr=1e-5)
optimizer2 = torch.optim.Adam(
    [p for p in model1.parameters() if p.requires_grad],
    lr=1e-5,
)
# No ignore_index: class 0 is a real label and must contribute to the loss.
loss_fn = torch.nn.CrossEntropyLoss()

for epoch in range(5):
    model.train()
    model1.train()
    total_loss1 = 0
    total_loss2 = 0
    for batch in dl:
        X, y = batch
        # Inputs and labels must live on the same device as the models.
        X = {key: value.to(device) for key, value in X.items()}
        y = y.to(device)

        # The forward pass returns an output object; the raw scores of shape
        # [batch, num_labels] are in its `.logits` attribute.
        logits1 = model(**X)
        logits2 = model1(**X)
        loss1 = loss_fn(logits1.logits, y)
        loss2 = loss_fn(logits2.logits, y)

        optimizer1.zero_grad()
        optimizer2.zero_grad()
        loss1.backward()
        loss2.backward()
        optimizer1.step()
        optimizer2.step()

        total_loss1 += loss1.item()
        total_loss2 += loss2.item()
    # Average loss per batch for this epoch.
    print(f"Epoch {epoch+1}, Loss1: {total_loss1/len(dl):.4f}")
    print(f"Epoch {epoch+1}, Loss2: {total_loss2/len(dl):.4f}")

# Save the parameters of the fully fine-tuned model.
torch.save(model.state_dict(), 'bert.pth')

In [None]:
# Rebuild the classifier and restore the fine-tuned weights saved in 'bert.pth'.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

new_model = AutoModelForSequenceClassification.from_pretrained(
    'google-bert/bert-base-chinese',
    num_labels=5,  # must match the num_labels used for training
)

# Load the trained parameters (state_dict); map_location lets a checkpoint
# saved on GPU be restored on a CPU-only machine as well.
new_model.load_state_dict(torch.load('bert.pth', map_location=device))
# Move the reloaded model (not the training-time `model`) to the target device.
new_model = new_model.to(device)
# Evaluation mode disables dropout and other stochastic layers.
new_model.eval()

In [None]:
# Measure classification accuracy of the reloaded model over the data loader.
# NOTE(review): `dl` is built from the same data used for training, so this
# is training accuracy rather than a held-out estimate.
correct = 0
total = 0
new_model.eval()
with torch.no_grad():  # no gradients needed for evaluation (saves memory)
    for batch in dl:
        model_inputs, batch_labels = batch
        # Move the batch to the device the model lives on.
        model_inputs = {k: v.to(device) for k, v in model_inputs.items()}
        batch_labels = batch_labels.to(device)

        # Forward pass; logits have shape [batch_size, num_labels].
        outputs = new_model(**model_inputs)
        logits = outputs.logits

        # Predicted class is the index of the largest logit.
        preds = logits.argmax(dim=1)

        # Accumulate the number of correct predictions and samples seen.
        correct += (preds == batch_labels).sum().item()
        total += batch_labels.size(0)
# Guard against an empty loader to avoid ZeroDivisionError.
accuracy = correct / total if total else 0.0
print(f"Accuracy: {accuracy:.4f}")