# 第三章

本章节将通过使用BERT-mini在waimai_10k的数据集当中进行微调，并且在最后给出微调的具体结果可供大家自行测试！

# 数据预处理

从文档中我们了解到，BERT 模型在输入时会在文本序列首部加入特殊 token <CLS>，并且需要将文本转换为模型可接受的输入格式。我们将使用waimai_10k.csv数据集，该数据集通常包含外卖评论和对应的情感标签（如积极或消极）。

In [1]:
import pandas as pd
from transformers import BertTokenizer

# 加载数据集
data = pd.read_csv('/kaggle/input/waimai-10k/waimai_10k.csv')

# 加载BERTmini的分词器
tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-mini')

# 定义函数将文本转换为模型输入
def tokenize_text(text):
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
    return inputs

# 对数据集中的文本进行分词处理
input_ids = []
attention_masks = []

for text in data['review']:
    inputs = tokenize_text(text)
    input_ids.append(inputs['input_ids'])
    attention_masks.append(inputs['attention_mask'])

# 将标签转换为张量
import torch
labels = torch.tensor(data['label'].values)

vocab.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

# 加载 BERTmini 模型

根据文档当中，BERT 模型架构由 Embedding、Encoder 加上 prediction_heads 组成。我们直接加载预训练的 BERTmini 模型，并修改 prediction_heads 以适应我们的分类任务。

In [2]:
from transformers import BertForSequenceClassification

# 加载预训练的BERTmini模型
model = BertForSequenceClassification.from_pretrained('prajjwal1/bert-mini', num_labels=2)

2025-07-13 04:51:05.579802: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752382265.915600      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752382266.011473      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 划分数据集

为了评估模型的性能，我们将数据集划分为训练集和验证集。

In [3]:
from sklearn.model_selection import train_test_split

# 将输入数据转换为张量
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# 划分训练集和验证集
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=2023, test_size=0.2)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=2023, test_size=0.2)

# 创建数据加载器
使用```DataLoader```来批量加载数据，提高训练效率。

In [4]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# 创建训练集和验证集的数据集
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=16)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=16)

# 微调模型

根据教程文档，微调其实和训练时更新模型参数的策略一致，只不过是在特定的任务中，通过更少的训练数据，更小的batch_size上进行训练，用小成本达到我们想要的效果，值得注意的是模型更新参数的幅度更小。

In [5]:
import torch.nn as nn
import torch.optim as optim
from transformers import BertForSequenceClassification
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

# 定义训练轮数
epochs = 3

# 训练模型
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch+1}: Average training loss = {avg_train_loss}')

    # 验证模型
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask, 
                            labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits

        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_eval_accuracy += (logits.argmax(axis=1) == label_ids).mean()

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print(f'Epoch {epoch+1}: Validation Loss = {avg_val_loss}, Validation Accuracy = {avg_val_accuracy}')

model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

Epoch 1: Average training loss = 0.5812016006310781
Epoch 1: Validation Loss = 0.5427104336023331, Validation Accuracy = 0.7267857142857143
Epoch 2: Average training loss = 0.51846508299311
Epoch 2: Validation Loss = 0.5284070480863253, Validation Accuracy = 0.7367857142857143
Epoch 3: Average training loss = 0.5003033414731423
Epoch 3: Validation Loss = 0.5085438886781534, Validation Accuracy = 0.7560119047619047


In [6]:
new_reviews = [
    "这家店的外卖太好吃了，下次还会点！",
    "这外卖太难吃了，再也不会买了。"
]

# 对新数据进行分词处理
input_ids = []
attention_masks = []

for review in new_reviews:
    inputs = tokenizer(review, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
    input_ids.append(inputs['input_ids'])
    attention_masks.append(inputs['attention_mask'])
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# 创建数据集和数据加载器
prediction_data = TensorDataset(input_ids, attention_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=16)
# 将模型设置为评估模式
model.eval()

# 存储预测结果和概率
predictions = []
probabilities = []
for batch in prediction_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)

    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        preds = torch.argmax(logits, dim=1)

    predictions.extend(preds.cpu().numpy())
    probabilities.extend(probs.cpu().numpy())
# 展示预测结果
for i in range(len(new_reviews)):
    print(f"文本: {new_reviews[i]}")
    print(f"预测标签: {'积极' if predictions[i] == 1 else '消极'}")
    print(f"积极概率: {probabilities[i][1]:.4f}, 消极概率: {probabilities[i][0]:.4f}")
    print("-" * 50)

文本: 这家店的外卖太好吃了，下次还会点！
预测标签: 消极
积极概率: 0.2427, 消极概率: 0.7573
--------------------------------------------------
文本: 这外卖太难吃了，再也不会买了。
预测标签: 消极
积极概率: 0.1318, 消极概率: 0.8682
--------------------------------------------------


# 总结

通过以上步骤，我们结合文档中 BERT 模型的相关知识，使用预训练的 BERT-mini 模型在waimai_10k.csv数据集上进行了微调。在微调过程中，我们遵循了预训练 - 微调的范式，利用已有的预训练模型在特定任务上进行了高效的训练。最终，我们可以通过验证集的准确率来评估模型在该任务上的性能。