In [4]:
%%writefile week12_ddp.py

import os
from transformers import DataCollatorForTokenClassification, TrainingArguments, Trainer
import numpy as np
from datasets import ClassLabel
import evaluate
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch
from datasets import load_dataset
import torch.distributed as dist
import torch.multiprocessing as mp

# Set up the distributed environment
def setup(rank, world_size):
    """Join the NCCL process group used for single-node multi-GPU training.

    Args:
        rank: Index of the calling worker process; also used as the index of
            the CUDA device this process is bound to.
        world_size: Total number of worker processes in the group.
    """
    # Rendezvous endpoint shared by every worker on this machine.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    # Bind the process to its own GPU before the NCCL group is created so
    # that collectives issued by this rank do not default to device 0.
    torch.cuda.set_device(rank)
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

# Tear down the distributed environment
def cleanup():
    """Destroy the default process group if one is currently initialised.

    Safe to call when no group exists (for example from an error-handling
    path that runs before or after the group's lifetime); in that case the
    call is a no-op.
    """
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()

def train(rank, world_size):
    """Fine-tune ``google-bert/bert-base-chinese`` for NER in one worker process.

    The worker joins the process group via ``setup``, loads the
    ``doushabao4766/msra_ner_k_V3`` dataset, tokenises it, trains with the
    Hugging Face ``Trainer`` and saves the final model to ``bert-msra-ner``.
    Per-epoch checkpoints are written under ``./ner_model``. The process group
    is always torn down via ``cleanup`` once the worker finishes or fails.

    Args:
        rank: Index of this worker; used as the CUDA device index for the
            model and passed to ``TrainingArguments`` as ``local_rank``.
        world_size: Total number of worker processes.
    """
    setup(rank, world_size)
    try:
        ds = load_dataset('doushabao4766/msra_ner_k_V3')
        # Label names come from the dataset schema
        # (author's note: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']).
        label_list = ds["train"].features["ner_tags"].feature.names

        label2id = {label: i for i, label in enumerate(label_list)}
        id2label = {i: label for i, label in enumerate(label_list)}

        # Load the pretrained BERT model and tokenizer. The size of the
        # classification head is derived from the label list so that it can
        # never disagree with the id2label / label2id mappings.
        model = AutoModelForTokenClassification.from_pretrained(
            "google-bert/bert-base-chinese",
            num_labels=len(label_list),
            id2label=id2label,
            label2id=label2id
        )
        model.to(rank)
        tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")

        def tokenize_and_align_labels(examples):
            """Tokenise a batch of pre-split examples and align tags to tokens.

            Positions without a source word (``word_id`` is ``None``) get the
            label -100; every other token, including each sub-token of a
            split word, gets the tag of the word it came from.
            """
            tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

            all_labels = []
            for i, tags in enumerate(examples["ner_tags"]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)
                all_labels.append(
                    [-100 if word_idx is None else tags[word_idx] for word_idx in word_ids]
                )

            tokenized_inputs["labels"] = all_labels
            return tokenized_inputs

        # Apply the preprocessing to every split
        tokenized_datasets = ds.map(tokenize_and_align_labels, batched=True)

        # Training arguments
        training_args = TrainingArguments(
            output_dir="./ner_model",
            save_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=2,
            weight_decay=0.01,
            logging_dir="./logs",
            logging_steps=50,
            report_to="none",  # do not report to wandb or similar services
            local_rank=rank,   # rank of the current process
            fp16=True,               # mixed-precision training
            lr_scheduler_type='linear',  # linear learning-rate schedule
            warmup_steps=100,        # number of warm-up steps
            ddp_find_unused_parameters=False  # DDP performance optimisation
        )

        # Data collator
        data_collator = DataCollatorForTokenClassification(tokenizer)

        # Evaluation metric for sequence labelling
        metric = evaluate.load("seqeval")

        def compute_metrics(p):
            """Turn (logits, label ids) into overall seqeval scores.

            Positions whose gold label is -100 are excluded from both the
            predictions and the references.
            """
            predictions, labels = p
            predictions = np.argmax(predictions, axis=2)
            true_predictions = [
                [label_list[pred_id] for (pred_id, gold_id) in zip(prediction, label) if gold_id != -100]
                for prediction, label in zip(predictions, labels)
            ]
            true_labels = [
                [label_list[gold_id] for (pred_id, gold_id) in zip(prediction, label) if gold_id != -100]
                for prediction, label in zip(predictions, labels)
            ]
            results = metric.compute(predictions=true_predictions, references=true_labels)
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }

        # Trainer
        # NOTE(review): eval_dataset and compute_metrics are wired in, but no
        # evaluation strategy is set in TrainingArguments above - confirm
        # whether evaluation during training is intended.
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        # Train
        trainer.train()

        # Save the final model
        trainer.save_model("bert-msra-ner")
    finally:
        # Always release the process group, even if training raised.
        cleanup()

def main():
    """Launch one ``train`` worker process per visible CUDA device.

    Raises:
        RuntimeError: If no CUDA device is visible. Spawning zero workers
            would run no training at all, so this is reported explicitly.
    """
    world_size = torch.cuda.device_count()
    if world_size < 1:
        raise RuntimeError(
            "No CUDA devices are visible; NCCL-based distributed training needs at least one GPU."
        )
    # mp.spawn passes the process index as the first argument (rank) of train.
    mp.spawn(train, args=(world_size,), nprocs=world_size, join=True)

# Run main() only when this file is executed as a script
# (e.g. `python week12_ddp.py`), not when it is imported as a module.
if __name__ == "__main__":
    main()

Overwriting week12_ddp.py


In [5]:
# Quietly (-q) install `evaluate` and `seqeval`; week12_ddp.py loads the "seqeval" metric through `evaluate`.
!pip install -q evaluate seqeval
# Run the training script written by the %%writefile cell as a separate Python process.
!python week12_ddp.py

2025-06-12 15:57:14.105863: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749743834.129113     239 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749743834.136200     239 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-12 15:57:23.763183: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749743843.785603     253 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749743843.792842     253 cuda_blas.cc:1

In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification  # 或其他模型类
from datasets import load_dataset

# Fine-tuned checkpoint used for inference; defined once so the tokenizer and
# the model are always loaded from the same directory.
CHECKPOINT_DIR = "/kaggle/working/ner_model/checkpoint-1407"

# The dataset is loaded here only to recover the tag names from its schema.
ds = load_dataset('doushabao4766/msra_ner_k_V3')
# Author's note: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
label_list = ds["train"].features["ner_tags"].feature.names

# Mappings between tag names and their integer ids.
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_DIR)
model = AutoModelForTokenClassification.from_pretrained(CHECKPOINT_DIR)

def ner_inference(text: str):
    """Extract named entities from ``text`` with the module-level model.

    The text is split into single characters, which are passed to the
    tokenizer as pre-split words. The predicted tags are decoded as B-/I-
    prefixed spans: a ``B-X`` tag opens an entity of type ``X``, an ``I-X``
    tag extends the open entity when its type is also ``X``, and any other
    tag closes the open entity. Only the first sub-token of each character is
    used, so a character that the tokenizer splits into several tokens
    contributes to an entity at most once.

    The tokenizer is called with ``truncation=True``, so characters beyond the
    truncation limit are not tagged.

    Args:
        text: Raw input string.

    Returns:
        A list of ``{"entity": <type>, "content": <text>}`` dicts in order of
        appearance; an empty list for empty input.
    """
    # Nothing to tag: return before touching the model or tokenizer.
    if not text:
        return []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)

    words = list(text)

    encoding = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True
    )
    # Read word_ids first: the plain dict built on the next line no longer has them.
    word_ids = encoding.word_ids()
    inputs = {k: v.to(device) for k, v in encoding.items()}  # move tensors to the device

    with torch.no_grad():
        outputs = model(**inputs)
    pred_ids = outputs.logits.argmax(dim=-1)[0].tolist()

    entities = []
    cur_entity = []
    cur_type = None
    previous_word_idx = None

    for idx, word_idx in enumerate(word_ids):
        # Skip positions without a source character, and every sub-token
        # after the first one of the same character.
        if word_idx is None or word_idx == previous_word_idx:
            continue
        previous_word_idx = word_idx

        label = id2label[pred_ids[idx]]
        # "B-LOC" -> ("B", "LOC"); a tag without "-" such as "O" -> ("O", None).
        prefix, ent_type = (label.split("-", 1) + [None])[:2]

        if prefix == "B":
            # A new entity starts: emit the one in progress, if any.
            if cur_entity:
                entities.append({"entity": cur_type, "content": "".join(cur_entity)})
            cur_entity = [words[word_idx]]
            cur_type = ent_type

        elif prefix == "I" and cur_type == ent_type:
            cur_entity.append(words[word_idx])

        else:
            # Any other tag (including an I- tag whose type does not match
            # the open entity) closes the entity in progress.
            if cur_entity:
                entities.append({"entity": cur_type, "content": "".join(cur_entity)})
                cur_entity = []
                cur_type = None

    # Emit an entity that runs up to the end of the input.
    if cur_entity:
        entities.append({"entity": cur_type, "content": "".join(cur_entity)})

    return entities

# Quick smoke test: run inference on one sample sentence and print the extracted entities.
text = "双方确定了今后发展中美关系的指导方针。"
print(ner_inference(text))

[{'entity': 'LOC', 'content': '中'}, {'entity': 'LOC', 'content': '美'}]
