In [4]:
!pip -q install evaluate seqeval

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
bigframes 1.42.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.9.

In [6]:
%%writefile ner_ddp.py

from transformers import AutoModelForTokenClassification,AutoTokenizer,DataCollatorForTokenClassification,TrainingArguments,Trainer
from datasets import load_dataset
import torch
import evaluate
import seqeval
import numpy as np
import os 
import torch.distributed as dist
import torch.multiprocessing as mp

#设置分布式环境
def setup(rank, world_size):
    """Join the NCCL process group for this worker.

    Args:
        rank: Index of this worker. Also used as the CUDA device index, which
            assumes a single-node run with one process per GPU.
        world_size: Total number of workers in the group.
    """
    os.environ['MASTER_ADDR'] = "localhost"
    os.environ['MASTER_PORT'] = "12355"
    # Export the torchrun-style variables as well. Workers started through
    # mp.spawn do not get them automatically, and libraries that detect a
    # distributed run from the environment (e.g. the Hugging Face Trainer via
    # Accelerate) look for them.
    os.environ['RANK'] = str(rank)
    os.environ['LOCAL_RANK'] = str(rank)
    os.environ['WORLD_SIZE'] = str(world_size)
    # Bind this process to its own GPU before NCCL starts, so collectives and
    # default CUDA allocations do not all land on device 0.
    torch.cuda.set_device(rank)
    dist.init_process_group('nccl', rank=rank, world_size=world_size)

#清理分布式环境
def cleanup():
    """Destroy the default process group if one is currently initialised.

    Safe to call more than once, and safe to call when the process group was
    never created (for example after a failed ``setup``).
    """
    # destroy_process_group() raises when no default group exists, which would
    # mask the original error if cleanup runs from an error-handling path.
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()

def train(rank, world_size):
    """Fine-tune ``bert-base-chinese`` for NER inside one distributed worker.

    Joins the process group, tokenizes the ``doushabao4766/msra_ner_k_V3``
    dataset, builds a token-classification model over the seven BIO tags and
    runs a Hugging Face ``Trainer`` for one epoch, scoring the ``test`` split
    with seqeval. The process group is torn down when training finishes or
    fails.

    Args:
        rank: Index of this worker; also used as the CUDA device index.
        world_size: Total number of workers taking part in training.
    """
    setup(rank, world_size)
    try:
        # Load the dataset.
        ds = load_dataset("doushabao4766/msra_ner_k_V3")

        # Tag vocabulary; the list index is the label id.
        tags = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']

        # Load the tokenizer.
        tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

        def data_input_proc(item):
            """Tokenize a batch of pre-split sentences and attach aligned labels."""
            input_data = tokenizer(item['tokens'],
                                   # Truncate sequences longer than 512 tokens.
                                   truncation=True,
                                   # Do not add [CLS] / [SEP].
                                   add_special_tokens=False,
                                   max_length=512,
                                   # Input is already split into words.
                                   is_split_into_words=True)

            if not tokenizer.is_fast:
                # Slow tokenizers cannot report word ids; fall back to plain
                # truncation of the word-level tags.
                input_data['labels'] = [lbl[:512] for lbl in item['ner_tags']]
                return input_data

            # Align word-level tags to sub-tokens: the first piece of each word
            # carries the tag, every other position is ignored (-100). This
            # keeps labels and input_ids the same length even when a word maps
            # to zero or several pieces.
            labels = []
            for batch_index, word_labels in enumerate(item['ner_tags']):
                aligned = []
                previous_word = None
                for word_id in input_data.word_ids(batch_index=batch_index):
                    if word_id is None or word_id == previous_word:
                        aligned.append(-100)
                    else:
                        aligned.append(word_labels[word_id])
                    previous_word = word_id
                labels.append(aligned)
            input_data['labels'] = labels
            return input_data

        # map() returns a new dataset; the original is left unchanged.
        ds1 = ds.map(data_input_proc, batched=True)

        # Expose the model inputs as PyTorch tensors.
        ds1.set_format('torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

        # Label id <-> tag name mappings stored in the model config.
        id2label = {i: tag for i, tag in enumerate(tags)}
        label2id = {tag: i for i, tag in enumerate(tags)}

        model = AutoModelForTokenClassification.from_pretrained('bert-base-chinese',
                                                                num_labels=len(tags),
                                                                id2label=id2label,
                                                                label2id=label2id)
        model.to(rank)

        # TrainingArguments
        args = TrainingArguments(output_dir='msra_ner_train',
                                 num_train_epochs=1,
                                 per_device_train_batch_size=8,
                                 per_device_eval_batch_size=8,
                                 report_to='tensorboard',
                                 # 'epoch' or 'steps' (with e.g. eval_steps=400).
                                 eval_strategy='epoch',
                                 # Rank of the current process.
                                 local_rank=rank,
                                 # Mixed-precision training.
                                 fp16=True,
                                 # Learning-rate schedule.
                                 lr_scheduler_type='linear',
                                 # Warm-up steps.
                                 warmup_steps=100,
                                 # Skip the unused-parameter scan in DDP.
                                 ddp_find_unused_parameters=False)

        # Load the metric once instead of on every evaluation pass. A distinct
        # name avoids shadowing the imported ``seqeval`` module.
        seqeval_metric = evaluate.load("seqeval")

        def compute_metric(result):
            """Compute seqeval scores from a ``(predictions, labels)`` tuple."""
            logits, label_ids = result
            # axis=2 picks the highest-scoring tag at every position.
            predicted_ids = np.argmax(logits, axis=2)

            # Drop ignored positions (-100). Both lists are built from the
            # original id arrays, so interior -100 entries cannot shift
            # predictions against references.
            true_predictions = [[tags[p] for p, l in zip(ps, ls) if l != -100]
                                for ps, ls in zip(predicted_ids, label_ids)]
            true_labels = [[tags[l] for l in ls if l != -100]
                           for ls in label_ids]

            # Precision, recall, F1, etc. for sequence labelling.
            return seqeval_metric.compute(predictions=true_predictions,
                                          references=true_labels)

        data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

        # Trainer
        trainer = Trainer(model,
                          args,
                          train_dataset=ds1['train'],
                          eval_dataset=ds1['test'],
                          data_collator=data_collator,
                          compute_metrics=compute_metric)

        trainer.train()
    finally:
        # Release the process group even when training raises. The guard keeps
        # this safe if the group has already been torn down.
        if dist.is_initialized():
            cleanup()

def main():
    """Spawn one training process per visible CUDA device.

    Raises:
        RuntimeError: If no CUDA device is visible. Spawning zero workers would
            otherwise return without training anything.
    """
    world_size = torch.cuda.device_count()
    if world_size < 1:
        raise RuntimeError("ner_ddp.py needs at least one CUDA device, but none is visible")
    # mp.spawn passes the worker index as the first positional argument (rank).
    mp.spawn(train, args=(world_size,), nprocs=world_size, join=True)

# Only launch training when the file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()

Overwriting ner_ddp.py


In [7]:
!python ner_ddp.py

2025-06-12 13:14:37.508472: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749734077.532322     584 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749734077.539396     584 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-12 13:14:48.005683: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-12 13:14:48.018796: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:17497

In [22]:
# Build a token-classification pipeline from a saved checkpoint directory
# (presumably written by ner_ddp.py, whose output_dir is 'msra_ner_train').
from transformers import pipeline
# NOTE(review): this rebinds `pipeline` from the factory function to the
# pipeline instance; a distinct name (e.g. `ner_pipeline`) would be clearer,
# but other cells may rely on the current binding.
pipeline = pipeline('token-classification', '/kaggle/working/msra_ner_train/checkpoint-2813')

# Run the model on a sample sentence.
result = pipeline('双方确定了今后发展中美关系的指导方针')

print(result)

# Print the predicted tag next to the token it was assigned to.
for item in result:
    print(item['entity'],item['word'])

Device set to use cuda:0


[{'entity': 'B-ORG', 'score': 0.9986308, 'index': 10, 'word': '中', 'start': 9, 'end': 10}, {'entity': 'B-ORG', 'score': 0.9980075, 'index': 11, 'word': '美', 'start': 10, 'end': 11}]
B-ORG 中
B-ORG 美


B-ORG 中
B-ORG 美
