直接使用Hugging Face的transformers库原生接口加载预训练BERT（BertForSequenceClassification），并基于IMDB数据集进行微调。特点是代码简洁，依赖库的封装接口，适合快速实现标准任务。

In [2]:
import os
import sys
import logging
import time

import pandas as pd
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

# sklearn.metrics提供了大量用于分类、回归、聚类等任务的评估指标，以及一些工具函数来帮助分析模型的预测结果
# sklearn.model_selection主要用于模型选择和评估。它提供了多种方法来帮助你进行数据集划分、交叉验证、超参数调优等任务
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

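# NOTE: AdamW has been removed from recent transformers releases, so the import
# on the next line only works with an older version, e.g.
# `pip install transformers==4.36.2`. The optimizer actually used further down
# in this file is torch.optim.AdamW.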
from transformers.optimization import AdamW
from transformers import BertTokenizerFast, BertForSequenceClassification
from tqdm import tqdm

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [None]:
# sys.argv[0] is the path of the running script; basename() keeps only the
# file name, which is used as the logger name.
program = os.path.basename(sys.argv[0])
# getLogger() creates the named logger, or returns the existing one if it has
# already been created.
logger = logging.getLogger(program)

# Basic configuration of the logging system (output format):
#   %(asctime)s   -> timestamp
#   %(levelname)s -> log level
#   %(message)s   -> message text
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
# Set the level on the root logger explicitly, separately from basicConfig().
logging.root.setLevel(level=logging.INFO)
# Join the arguments with a space so the logged command line stays readable;
# joining with '' glued the script path and its options together.
logger.info("running %s", ' '.join(sys.argv))

2025-05-06 22:49:00,319: INFO: running d:\Anaconda\envs\nlp\lib\site-packages\ipykernel_launcher.py--f=c:\Users\Administrator\AppData\Roaming\jupyter\runtime\kernel-v369fcaf924f9af8139c5bc274a2c2467f11969760.json


加载数据

In [4]:
# Local (machine-specific) paths to the training and test TSV files; adjust
# them to wherever the dataset lives on your machine.
labeledTrainDataPath = r"D:\workplace\NLP_learning\dataset\labeledTrainData.tsv"
testDataPath = r"D:\workplace\NLP_learning\dataset\testData.tsv"

def ReadData(path):
    """Load a tab-separated file with a header row into a DataFrame.

    Args:
        path: Location of the TSV file to read.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    read_options = {
        "header": 0,        # the first row holds the column names
        "delimiter": "\t",  # field separator (tab for TSV, comma for CSV)
        "quoting": 3,       # csv.QUOTE_NONE: quotes are kept as ordinary characters
    }
    return pd.read_csv(path, **read_options)
# Read both TSV files with the same reader, then preview the first training rows.
train, test = (ReadData(tsv_path) for tsv_path in (labeledTrainDataPath, testDataPath))
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


利用BERT的tokenizer进行分词编码

In [5]:
# Pull the raw review texts and sentiment labels out of the DataFrames.
# Series.tolist() is positional, so it does not depend on the DataFrame having
# a default 0..N-1 index the way pairing enumerate() with
# train["sentiment"][i] did.
train_texts = train["review"].tolist()
train_labels = train["sentiment"].tolist()
test_texts = test["review"].tolist()

# train_test_split splits texts and labels in lockstep; 20% of the labelled
# data is held out for validation.
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2)

# 'bert-base-uncased' is a model name predefined in the Hugging Face
# Transformers library and denotes one specific BERT configuration.
# "uncased" means the input text is lower-cased, which suits tasks where
# letter case does not matter (such as sentiment analysis).
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# The tokenizer returns a dict-like structure:
#   {
#     'input_ids':      [[token ids of sentence 1], [token ids of sentence 2], ...]   text mapped to vocabulary ids
#     'attention_mask': [[mask of sentence 1], [mask of sentence 2], ...]             1 = real token, 0 = padding
#     'token_type_ids': [[segment ids of sentence 1], ...]                            sentence A/B marker (usually all 0 for single-sentence tasks)
#   }
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

print(type(train_encodings))



<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [6]:
# Show every encoded field of the first training sample.
first_sample = {field: values[0] for field, values in train_encodings.items()}
print("第一条样本的编码:")
print(first_sample)

第一条样本的编码:
{'input_ids': [101, 1000, 2023, 5046, 3185, 2003, 2061, 10231, 1010, 1996, 5889, 2064, 2025, 2552, 3426, 2027, 3849, 2000, 2022, 3752, 2013, 1037, 2338, 1998, 1996, 2466, 2003, 2061, 1006, 10587, 4783, 1007, 5365, 1012, 1012, 1996, 2069, 3364, 2040, 2106, 1037, 7929, 3105, 2001, 5292, 12462, 4103, 22479, 26036, 2063, 1012, 1012, 1017, 1013, 2184, 2065, 2017, 2215, 1037, 2428, 2204, 5046, 3185, 3422, 8937, 1010, 2307, 5889, 1998, 1037, 2514, 24146, 2466, 1023, 1013, 2184, 1000, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

使用DataLoader

In [7]:
class TrainDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with optional labels.

    Args:
        encodings: Dict-like object (exposing ``items()`` and ``values()``)
            that maps each field name to a per-sample sequence of values.
        labels: Per-sample labels aligned with ``encodings``. May be left as
            ``None`` for unlabelled data, in which case items carry no
            ``"labels"`` entry and the length is taken from ``encodings``.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, one entry per field."""
        # encodings.items() yields (field name, per-sample values) pairs;
        # pick the idx-th sample of every field.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # The default labels=None previously crashed here; only attach labels
        # when they were actually supplied.
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples."""
        if self.labels is not None:
            return len(self.labels)
        # Without labels, use the length of the first encoded field
        # (0 when there are no fields at all).
        first_field = next(iter(self.encodings.values()), None)
        return 0 if first_field is None else len(first_field)
    
class TestDataset(torch.utils.data.Dataset):
    """Label-free dataset over tokenizer encodings.

    Args:
        encodings: Dict-like object mapping each field name to a per-sample
            sequence of values.
        num_samples: Value reported by ``len()``; the caller supplies it.
    """

    def __init__(self, encodings, num_samples=0):
        self.num_samples = num_samples
        self.encodings = encodings

    def __len__(self):
        """Return the sample count given at construction time."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, one entry per field."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        return sample
    
# Wrap each split's encodings in a Dataset.
train_dataset = TrainDataset(encodings=train_encodings, labels=train_labels)
val_dataset = TrainDataset(encodings=val_encodings, labels=val_labels)
test_dataset = TestDataset(encodings=test_encodings, num_samples=len(test_texts))

# Only the training loader shuffles; validation and test keep their order.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)


DataLoader小总结

batch的格式由两个因素决定：
- TrainDataset.__getitem__的返回值
- DataLoader的自动批处理功能（将多个样本的字典按字段堆叠）
```
示例：假设batch_size=2，原始数据如下
样本1: {'input_ids': [101, 2023, 3185, 102], 'attention_mask': [1,1,1,1], 'labels': 1}
样本2: {'input_ids': [101, 1045, 2134, 102], 'attention_mask': [1,1,1,1], 'labels': 0}
输出的batch：
{
    'input_ids': tensor([
        [101, 2023, 3185, 102],  # 样本1
        [101, 1045, 2134, 102]   # 样本2
    ]),
    'attention_mask': tensor([
        [1, 1, 1, 1], 
        [1, 1, 1, 1]
    ]),
    'labels': tensor([1, 0])  # 样本1和样本2的标签
}
```

创建模型

In [None]:
# Conditional expression: use the GPU when CUDA is available, else the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Load pretrained BERT with a fully connected classification layer on top.
# num_labels defaults to 2; pass num_labels for a multi-class problem.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
# Move the model weights to the selected device.
model.to(device)
# Put the model in training mode, which changes how it behaves in the
# forward and backward passes.
model.train()

# Reference torch.optim explicitly: this assignment rebinds the name `optim`
# from the imported module alias to the optimizer instance, so the former
# `optim.AdamW(...)` raised AttributeError whenever this cell was run again.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Fine-tune for 3 epochs; after each epoch's training pass, evaluate on the
# validation split and show the metrics on the progress bar.
for epoch in range(3):
    start = time.time()
    # Running sums are plain Python numbers (via .item()), so no autograd
    # history or device tensors are kept alive across iterations.
    train_loss, val_losses = 0.0, 0.0
    train_correct, val_correct = 0, 0
    n, m = 0, 0  # training / validation samples seen so far

    with tqdm(total=len(train_loader), desc="Epoch %d" % epoch) as pbar:
        # Validation below switches to eval mode, so switch back to training
        # mode at the start of every epoch.
        model.train()
        for batch in train_loader:
            optim.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optim.step()

            # Weight by batch size so a smaller final batch does not skew the
            # averages (assumes outputs.loss is the batch mean).
            batch_size = labels.size(0)
            n += batch_size
            train_loss += loss.item() * batch_size
            train_correct += (outputs.logits.argmax(dim=1) == labels).sum().item()
            train_acc = train_correct / n

            pbar.set_postfix({'epoch': '%d' % (epoch),
                              'train loss': '%.4f' % (train_loss / n),
                              'train acc': '%.2f' % (train_acc)
                              })
            pbar.update(1)

        # Evaluate in eval mode; otherwise the validation metrics are
        # computed with dropout still active.
        model.eval()
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

                batch_size = labels.size(0)
                m += batch_size
                val_losses += outputs.loss.item() * batch_size
                val_correct += (outputs.logits.argmax(dim=1) == labels).sum().item()
        end = time.time()
        runtime = end - start
        # max(..., 1) avoids dividing by zero if a loader yields no batches.
        train_acc = train_correct / max(n, 1)
        val_acc = val_correct / max(m, 1)
        pbar.set_postfix({'epoch': '%d' % (epoch),
                          'train loss': '%.4f' % (train_loss / max(n, 1)),
                          'train acc': '%.2f' % (train_acc),
                          'val loss': '%.4f' % (val_losses / max(m, 1)),
                          'val acc': '%.2f' % (val_acc),
                          'time': '%.2f' % (runtime)})


Epoch 0: 100%|██████████| 2500/2500 [12:54<00:00,  3.23it/s, epoch=0, train loss=0.2822, train acc=0.88, val loss=0.2362, val acc=0.91, time=774.32]
Epoch 1: 100%|██████████| 2500/2500 [12:39<00:00,  3.29it/s, epoch=1, train loss=0.1681, train acc=0.94, val loss=0.2186, val acc=0.92, time=759.89]
Epoch 2: 100%|██████████| 2500/2500 [12:42<00:00,  3.28it/s, epoch=2, train loss=0.1050, train acc=0.96, val loss=0.2714, val acc=0.90, time=762.98]


In [10]:
# Predict the sentiment of every test review and write the results to CSV.
# The model was put in training mode earlier in this file; switch to eval
# mode so dropout is disabled during prediction.
model.eval()
test_pred = []
with torch.no_grad(), tqdm(total=len(test_loader), desc='Predction') as pbar:
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # The index of the largest logit is the predicted class.
        test_pred.extend(outputs.logits.argmax(dim=1).cpu().tolist())

        pbar.update(1)

output_path = "./result/bert_native.csv"
# to_csv does not create missing directories, so create the folder first.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
result_output = pd.DataFrame(data={"id": test["id"], "sentiment": test_pred})
# quoting=3 is csv.QUOTE_NONE: values are written without added quoting.
result_output.to_csv(output_path, index=False, quoting=3)
logging.info('result saved!')

Predction: 100%|██████████| 1563/1563 [07:41<00:00,  3.38it/s]
2025-05-06 23:35:14,791: INFO: result saved!
