In [15]:
%%writefile ner_ddp.py

# 导包
import os
from transformers import AutoModelForTokenClassification , AutoTokenizer 
from transformers import DataCollatorForTokenClassification , TrainingArguments , Trainer
from datasets import load_dataset
import numpy as np
import evaluate   # pip install evaluate
import seqeval  # pip install seqeval
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import get_linear_schedule_with_warmup
import torch.optim as optim
import torch
from tqdm import tqdm
from transformers import pipeline
import torch.distributed as dist
import torch.multiprocessing as mp

# 设置分布式环境
def setup(rank, world_size):
    """Initialise the NCCL process group for one worker process.

    Args:
        rank: Index of the calling process within the job.
        world_size: Total number of processes taking part in the job.
    """
    # Rendezvous address and port used by the default env:// init method.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

# 清理分布式环境
def cleanup():
    """Destroy the default process group if one is currently initialised.

    Safe to call when no group exists (for example after a failed setup),
    so it can be used unconditionally from a ``finally`` clause.
    """
    if dist.is_initialized():
        dist.destroy_process_group()

def train(rank, world_size):
    """Fine-tune ``bert-base-chinese`` for NER inside one spawned worker.

    Args:
        rank: Index of this worker; also used as the device the model is
            moved to and as ``TrainingArguments.local_rank``.
        world_size: Total number of workers in the job.
    """
    setup(rank, world_size)
    try:
        # Load the dataset.
        ds = load_dataset('doushabao4766/msra_ner_k_V3')

        # Load the tokenizer.
        tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

        # Build the tag list. A tuple (not a set) keeps the tag -> id mapping
        # identical in every worker process and across runs.
        # NOTE(review): the order must match the integer encoding of
        # `ner_tags` in the dataset; upstream msra_ner appears to list
        # PER, ORG, LOC -- TODO confirm against the dataset card.
        entities = ('per', 'loc', 'org')
        tags = ['O']
        for entity in entities:
            tags.append('B-' + entity.upper())
            tags.append('I-' + entity.upper())

        max_length = 512

        def data_input_proc(item):
            """Tokenize a batch of pre-split examples and attach labels."""
            input_data = tokenizer(
                item['tokens'],
                truncation=True,            # cut sequences above max_length
                max_length=max_length,
                add_special_tokens=False,   # no [CLS]/[SEP]: keeps labels aligned
                is_split_into_words=True,   # input is already split per character
            )
            # Truncate labels to the same limit as the tokens.
            # Assumes each input character maps to exactly one token.
            input_data['labels'] = [lbl[:max_length] for lbl in item['ner_tags']]
            return input_data

        ds1 = ds.map(data_input_proc, batched=True)

        # Keep only the model inputs and expose them as torch tensors.
        ds1.set_format('torch', columns=['input_ids',       # token ids
                                         'token_type_ids',  # segment ids
                                         'attention_mask',  # attention mask
                                         'labels'])         # NER label ids
        local_rank = rank

        # Readable label mappings stored in the model config.
        id2lbl = {i: tag for i, tag in enumerate(tags)}
        lbl2id = {tag: i for i, tag in enumerate(tags)}

        model = AutoModelForTokenClassification.from_pretrained(
            "bert-base-chinese",       # pretrained checkpoint
            num_labels=len(tags),      # number of output classes
            id2label=id2lbl,
            label2id=lbl2id,
        )
        model.to(local_rank)

        # Load the metric once instead of on every evaluation call.
        seqeval_metric = evaluate.load('seqeval')

        def compute_metric(result):
            """Compute seqeval scores from a ``(predictions, labels)`` tuple."""
            logits, label_ids = result
            # Highest-scoring class along the last axis -> predicted label id.
            pred_ids = np.argmax(logits, axis=2)
            # Map ids to tag strings, dropping positions padded with -100.
            # Both lists are built from the raw arrays so they stay aligned.
            predictions = [[tags[p] for p, l in zip(ps, ls) if l != -100]
                           for ps, ls in zip(pred_ids, label_ids)]
            references = [[tags[l] for l in ls if l != -100]
                          for ls in label_ids]
            return seqeval_metric.compute(predictions=predictions,
                                          references=references)

        # Pads every batch dynamically so inputs and labels line up.
        data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer,
                                                           padding=True)

        # Training configuration.
        args = TrainingArguments(
            output_dir='ner_train',            # where checkpoints are written
            num_train_epochs=3,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=32,
            report_to='tensorboard',           # log to TensorBoard
            eval_strategy='epoch',             # evaluate once per epoch
            local_rank=local_rank,             # rank of this process
            fp16=True,                         # mixed precision
            lr_scheduler_type='linear',
            warmup_steps=100,
            ddp_find_unused_parameters=False,
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=ds1['train'],
            eval_dataset=ds1['test'],
            compute_metrics=compute_metric,
            data_collator=data_collator,
        )
        trainer.train()
    finally:
        # Release the process group even when training fails.
        if dist.is_initialized():
            dist.destroy_process_group()

def main():
    """Spawn one training process per visible CUDA device.

    Raises:
        RuntimeError: If no CUDA device is available.
    """
    world_size = torch.cuda.device_count()
    if world_size < 1:
        raise RuntimeError("No CUDA devices found; NCCL training needs at least one GPU.")
    # `args` must be a tuple: spawn calls train(rank, *args).
    mp.spawn(train, args=(world_size,), nprocs=world_size, join=True)

if __name__ == "__main__":
    main()

    # Quick inference check with a saved checkpoint. A separate name keeps
    # the imported `pipeline` factory usable afterwards.
    # NOTE(review): the checkpoint step in this path depends on dataset size,
    # batch size and GPU count -- confirm it exists after training.
    ner_pipeline = pipeline('token-classification', 'ner_train/checkpoint-2112')

    entities = ner_pipeline('双方确定了今后发展中美关系的指导方针')
    print(entities)
    

Overwriting ner_ddp.py


In [16]:
!python ner_ddp.py

2025-06-12 14:46:09.634288: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749739569.657559     150 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749739569.664672     150 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-12 14:46:19.306809: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749739579.329691     165 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749739579.336684     165 cuda_blas.cc:1