In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer,AutoModelForTokenClassification,TrainingArguments,Trainer,DataCollatorForTokenClassification
import torch
import numpy as np
import os
os.environ["WANDB_DISABLED"] = "true"

2025-05-29 12:44:38.705042: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748522678.881520      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748522678.933854      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")

使用设备: cuda


In [3]:
# 加载数据集
dataset = load_dataset("doushabao4766/msra_ner_k_V3")
train_dataset = dataset["train"]
test_dataset = dataset["test"]
# 查看训练集第一条样本
print(train_dataset[0])

# 查看特征（字段）名称
print(train_dataset.features)

# 定义标签映射（根据数据集实际标签调整）
label_list = ["O", "B-ORG", "I-ORG", "B-PER", "I-PER", "B-LOC", "I-LOC"]  # 示例标签
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

# 加载分词器和模型
model_name = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(label_list), id2label=id2label, label2id=label2id
)

README.md:   0%|          | 0.00/697 [00:00<?, ?B/s]

(…)-00000-of-00001-42717a92413393f9.parquet:   0%|          | 0.00/13.9M [00:00<?, ?B/s]

(…)-00000-of-00001-8899cab5fdab45bc.parquet:   0%|          | 0.00/946k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45001 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3443 [00:00<?, ? examples/s]

{'id': '0', 'tokens': ['当', '希', '望', '工', '程', '救', '助', '的', '百', '万', '儿', '童', '成', '长', '起', '来', '，', '科', '教', '兴', '国', '蔚', '然', '成', '风', '时', '，', '今', '天', '有', '收', '藏', '价', '值', '的', '书', '你', '没', '买', '，', '明', '日', '就', '叫', '你', '悔', '不', '当', '初', '！'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'knowledge': ''}
{'id': Value(dtype='string', id=None), 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None), 'knowledge': Value(dtype='string', id=None)}


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# 加载分词器和模型（保持不变）
model_name = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(label_list), id2label=id2label, label2id=label2id
)
model = model.to(device)
# 优化1：仅使用数据子集（减少90%数据量）
train_dataset = train_dataset.select(range(min(20000, len(train_dataset))))  # 仅取前100条
test_dataset = test_dataset.select(range(min(8000, len(test_dataset))))      # 仅取前20条

# 优化2：简化标签对齐逻辑 + 减少序列长度
def tokenize_and_align_labels(examples):
    # 优化3：缩短max_length到64（减少50%计算量）
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=64,  # 从128减少到64
        add_special_tokens=True
    )
    
    # 优化4：向量化标签处理（避免循环）
    labels = []
    word_ids_list = [tokenized_inputs.word_ids(i) for i in range(len(examples["ner_tags"]))]
    
    for i, label_seq in enumerate(examples["ner_tags"]):
        label_ids = [
            -100 if word_idx is None else label_seq[word_idx]
            for word_idx in word_ids_list[i]
        ]
        labels.append(label_ids)
    
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# 优化5：禁用进度条显示 + 降低映射并行度
tokenized_train = train_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    batch_size=8,          # 减小批处理大小
    num_proc=1,            # 禁用多进程（减少内存开销）
    load_from_cache_file=False  # 避免缓存处理
)

tokenized_test = test_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    batch_size=8,
    num_proc=1,
    load_from_cache_file=False
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3443 [00:00<?, ? examples/s]

In [5]:
# 训练参数
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    report_to = "none"
)

# 数据整理器
data_collator = DataCollatorForTokenClassification(tokenizer)

# 训练器
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
)

# 训练模型
trainer.train()

# 定义预测函数
def predict_entities(text):
    chars = list(text)
    inputs = tokenizer(
        chars,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1).squeeze().cpu().tolist()
    
    # 提取实体
    entities = []
    current_entity = None
    for idx, (word, pred_id) in enumerate(zip(text, predictions)):
        label = id2label.get(pred_id, "O")
        if label.startswith("B-"):
            if current_entity:
                entities.append(current_entity)
            current_entity = {"entity": label[2:], "content": word}
        elif label.startswith("I-"):
            if current_entity and current_entity["entity"] == label[2:]:
                current_entity["content"] += word
            else:
                current_entity = None  # 忽略不匹配的 I- 标签
        else:
            if current_entity:
                entities.append(current_entity)
                current_entity = None
    if current_entity:
        entities.append(current_entity)
    return entities



Step,Training Loss
500,0.0609
1000,0.0181
1500,0.0093




In [6]:
# 测试输入输出
text = "双方确定了今后发展中美关系的指导方针。"
entities = predict_entities(text)
print(entities)

[{'entity': 'LOC', 'content': '美'}, {'entity': 'LOC', 'content': '关'}]
