In [1]:
from datasets import load_dataset

# 加载 CoNLL 2003 数据集
dataset = load_dataset("conll2003")
train_dataset = dataset['train']


# 设置设备，检查是否可以使用 GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [5]:
from transformers import BertTokenizer, BertModel
import torch

# 加载预训练的 BERT 模型和 tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
bert_model = BertModel.from_pretrained('bert-base-cased')


In [6]:
# 查看训练数据集中有多少个独特的标签
all_labels = []
for batch in train_dataset:
    all_labels.extend(batch['ner_tags'])

unique_labels = set(all_labels)
print(f"Unique labels in dataset: {unique_labels}")
print(f"Number of unique labels: {len(unique_labels)}")
num_labels = len(unique_labels)  # 将 num_labels 设置为实际标签数量


Unique labels in dataset: {0, 1, 2, 3, 4, 5, 6, 7, 8}
Number of unique labels: 9


In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel

class SimpleNERModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_labels):
        super(SimpleNERModel, self).__init__()
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)  # 第一个全连接层（中间层）
        self.relu = nn.ReLU()  # ReLU 激活函数
        self.fc2 = nn.Linear(hidden_dim, num_labels)  # 输出层

    def forward(self, embeddings):
        x = self.fc1(embeddings)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# 初始化 BERT 模型和 tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
bert_model = BertModel.from_pretrained('bert-base-cased').to(device)

# 加载数据
from datasets import load_dataset
dataset = load_dataset("conll2003")
train_dataset = dataset['train']

# 模型参数
embedding_dim = 768  # BERT embedding 的维度
hidden_dim = 128  # 中间层的维度

# 初始化模型
model = SimpleNERModel(embedding_dim, hidden_dim, num_labels).to(device)
optimizer = optim.Adam(model.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss()


In [18]:
from torch.utils.data import DataLoader
epochs = 20
pad_token_label_id = -100

def get_embeddings(tokens):
    inputs = tokenizer(tokens, return_tensors="pt", is_split_into_words=True, padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state, inputs['attention_mask']  # 返回 embeddings 和 attention_mask

def pad_labels(labels, max_length, pad_token_label_id):
    """对 labels 进行 padding，长度补齐到 max_length"""
    labels = labels + [pad_token_label_id] * (max_length - len(labels))
    return labels

# 自定义 collate_fn 函数来对 tokens 和 labels 进行 padding
def collate_fn(batch):
    tokens = [item['tokens'] for item in batch]
    labels = [item['ner_tags'] for item in batch]

    max_length = max(len(token_list) for token_list in tokens)

    padded_tokens = [token_list + [''] * (max_length - len(token_list)) for token_list in tokens]
    padded_labels = [label_list + [pad_token_label_id] * (max_length - len(label_list)) for label_list in labels]

    return {'tokens': padded_tokens, 'ner_tags': padded_labels}

batch_size = 1024
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# 开始训练
for epoch in range(epochs):
    batch_counter = 0
    for batch in train_dataloader:
        tokens = batch['tokens']
        labels = batch['ner_tags']

        # 获取 tokens 的 embeddings 和 attention mask
        embeddings, attention_mask = get_embeddings(tokens)

        # 对 labels 进行 padding，长度与 embeddings 的 sequence length 匹配
        padded_labels = [pad_labels(label_list, embeddings.size(1), pad_token_label_id) for label_list in labels]
        padded_labels = torch.tensor(padded_labels).to(device)  # 将 labels 转移到 GPU

        # 转换为 tensor 并展平
        outputs = model(embeddings)
        outputs = outputs.view(-1, num_labels)  # 将输出展平为 [batch_size * seq_length, num_labels]

        padded_labels = padded_labels.view(-1)  # 将 labels 展平为 [batch_size * seq_length]

        # 计算损失
        loss = loss_fn(outputs, padded_labels)

        # 反向传播并更新权重
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # 输出当前 batch 计数器和损失值
        batch_counter += 1
        print(f'Epoch {epoch+1}, Batch {batch_counter}, Loss: {loss.item()}')

    print(f'Epoch {epoch+1} completed.')


Epoch 1, Batch 1, Loss: 0.7691693902015686
Epoch 1, Batch 2, Loss: 0.7697692513465881
Epoch 1, Batch 3, Loss: 0.7938210368156433
Epoch 1, Batch 4, Loss: 0.7496778964996338
Epoch 1, Batch 5, Loss: 0.7899244427680969
Epoch 1, Batch 6, Loss: 0.7413403987884521
Epoch 1, Batch 7, Loss: 0.7822661995887756
Epoch 1, Batch 8, Loss: 0.7747786045074463
Epoch 1, Batch 9, Loss: 0.7699350714683533
Epoch 1, Batch 10, Loss: 0.7893337607383728
Epoch 1, Batch 11, Loss: 0.7792841196060181
Epoch 1, Batch 12, Loss: 0.7833759188652039
Epoch 1, Batch 13, Loss: 0.7742283344268799
Epoch 1, Batch 14, Loss: 0.7822579145431519
Epoch 1 completed.
Epoch 2, Batch 1, Loss: 0.7663796544075012
Epoch 2, Batch 2, Loss: 0.7588261365890503
Epoch 2, Batch 3, Loss: 0.7753508687019348
Epoch 2, Batch 4, Loss: 0.7632139921188354
Epoch 2, Batch 5, Loss: 0.762175440788269
Epoch 2, Batch 6, Loss: 0.7678371071815491
Epoch 2, Batch 7, Loss: 0.782738447189331
Epoch 2, Batch 8, Loss: 0.7580418586730957
Epoch 2, Batch 9, Loss: 0.773213