1. 利用上周NER模型训练任务代码，复现课堂案例中：动态学习率、混合精度、DDP训练实现。
2. 利用课堂案例，实现分布式DDP模型训练。存盘后加载实现推理。

In [1]:
!pip -q install evaluate seqeval

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
bigframes 1.42.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.9

In [7]:
%%writefile ner_ddp2.py

from transformers import AutoModelForTokenClassification,AutoTokenizer,DataCollatorForTokenClassification,TrainingArguments,Trainer
from datasets import load_dataset
import torch
import evaluate
import seqeval
import numpy as np
import os 
import torch.distributed as dist
import torch.multiprocessing as mp

#设置分布式环境
def setup(rank,world_size):
    os.environ['MASTER_ADDR'] ='localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group('nccl',rank=rank,world_size=world_size)

#清理分布式环境
def cleanup():
    dist.destory_process_group()

def train(rank,world_size):
    setup(rank,world_size)

    #加载datasets
    ds = load_dataset('nlhappy/CLUE-NER')

    #entity_index
    entites = ['O'] + list({'movie', 'name', 'game', 'address', 'position', \
               'company', 'scene', 'book', 'organization', 'government'})
    tags = ['O']
    for entity in entites[1:]:
        tags.append('B-' + entity.upper())
        tags.append('I-' + entity.upper())
    
    entity_index = {entity:i for i, entity in enumerate(entites)}

    tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')

    def entity_tags_proc(item):
        # item即是dataset中记录
        text_len = len(item['text'])  # 根据文本长度生成tags列表
        tags = [0] * text_len    # 初始值为‘O’
        # 遍历实体列表，所有实体类别标记填入tags
        entites = item['ents']
        for ent in entites:
            indices = ent['indices']  # 实体索引
            label = ent['label']   # 实体名
            tags[indices[0]] = entity_index[label] * 2 - 1
            for idx in indices[1:]:
                tags[idx] = entity_index[label] * 2
        return {'ent_tag': tags}

    # 使用自定义回调函数处理数据集记录
    ds1 = ds.map(entity_tags_proc)

    def data_input_proc(item):
        # 输入文本先拆分为字符，再转换为模型输入的token索引
        batch_texts = [list(text) for text in item['text']]
        # 导入拆分为字符的文本列表时，需要设置参数is_split_into_words=True
        input_data = tokenizer(batch_texts, truncation=True, add_special_tokens=False, max_length=512, 
                               is_split_into_words=True, padding='max_length')
        input_data['labels'] = [tag + [0] * (512 - len(tag)) for tag in item['ent_tag']]
        return input_data

    ds2 = ds1.map(data_input_proc,batched=True)

    id2lbl = {i:tag for i, tag in enumerate(tags)}
    lbl2id = {tag:i for i, tag in enumerate(tags)}
    
    model = AutoModelForTokenClassification.from_pretrained('google-bert/bert-base-chinese', 
                                                            num_labels=21,
                                                            id2label=id2lbl,
                                                            label2id=lbl2id)
    model.to(rank)

    args = TrainingArguments(
         output_dir="ner_train_DDP",
        num_train_epochs = 1,
        save_safetensors=False,
        per_device_train_batch_size=16,  # 训练批次
        per_device_eval_batch_size=16,
        report_to='tensorboard',
        eval_strategy="epoch",
        local_rank=rank,
        fp16=True, 
        lr_scheduler_type='linear', # 动态学习率
        warmup_steps=100,        # 预热步数
        ddp_find_unused_parameters=False  # 优化DDP性能
    )

    def compute_metric(result):
        # result 是一个tuple (predicts, labels)
        
        # 获取评估对象
        seqeval = evaluate.load('seqeval')
        predicts,labels = result
        predicts = np.argmax(predicts, axis=2)
        
        # 准备评估数据
        predicts = [[tags[p] for p,l in zip(ps,ls) if l != -100]
                     for ps,ls in zip(predicts,labels)]
        labels = [[tags[l] for p,l in zip(ps,ls) if l != -100]
                     for ps,ls in zip(predicts,labels)]
        results = seqeval.compute(predictions=predicts, references=labels)
    
        return results

    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

    trainer = Trainer(
        model,
        args,
        train_dataset=ds2['train'],
        eval_dataset=ds2['validation'],
        data_collator=data_collator,
        compute_metrics=compute_metric
    )
    trainer.train()

def main():
    world_size = torch.cuda.device_count()
    mp.spawn(train, args=(world_size,), nprocs=world_size, join=True)

if __name__ == "__main__":
    main()

Overwriting ner_ddp2.py


In [8]:
!python ner_ddp2.py

2025-06-13 14:21:03.598500: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749824463.621986    2903 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749824463.630382    2903 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-13 14:21:13.794616: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749824473.816968    2917 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749824473.823970    2917 cuda_blas.cc:1

In [14]:
from transformers import pipeline
pipeline = pipeline('token-classification', '/kaggle/working/ner_train_DDP/checkpoint-336')

resutls = pipeline("加勒比海盗3：世界尽头》的去年同期成绩死死甩在身后，后者则即将赶超《变形金刚》")
resutls

Device set to use cuda:0


[{'entity': 'B-MOVIE',
  'score': 0.3152406,
  'index': 1,
  'word': '加',
  'start': 0,
  'end': 1},
 {'entity': 'I-MOVIE',
  'score': 0.5219301,
  'index': 2,
  'word': '勒',
  'start': 1,
  'end': 2},
 {'entity': 'I-MOVIE',
  'score': 0.49803004,
  'index': 3,
  'word': '比',
  'start': 2,
  'end': 3},
 {'entity': 'I-MOVIE',
  'score': 0.40631694,
  'index': 4,
  'word': '海',
  'start': 3,
  'end': 4},
 {'entity': 'I-GAME',
  'score': 0.4993898,
  'index': 5,
  'word': '盗',
  'start': 4,
  'end': 5},
 {'entity': 'I-GAME',
  'score': 0.5256959,
  'index': 6,
  'word': '3',
  'start': 5,
  'end': 6},
 {'entity': 'I-GAME',
  'score': 0.38661325,
  'index': 7,
  'word': '：',
  'start': 6,
  'end': 7},
 {'entity': 'I-MOVIE',
  'score': 0.27445993,
  'index': 8,
  'word': '世',
  'start': 7,
  'end': 8},
 {'entity': 'I-MOVIE',
  'score': 0.5715637,
  'index': 9,
  'word': '界',
  'start': 8,
  'end': 9},
 {'entity': 'I-MOVIE',
  'score': 0.42640588,
  'index': 10,
  'word': '尽',
  'start': 9,


In [15]:
for item in resutls:
    ite = item['entity'],item['word']
    print(ite)

('B-MOVIE', '加')
('I-MOVIE', '勒')
('I-MOVIE', '比')
('I-MOVIE', '海')
('I-GAME', '盗')
('I-GAME', '3')
('I-GAME', '：')
('I-MOVIE', '世')
('I-MOVIE', '界')
('I-MOVIE', '尽')
('I-MOVIE', '头')
('I-MOVIE', '》')
('B-MOVIE', '《')
('B-MOVIE', '变')
('I-GAME', '形')
('I-GAME', '金')
('I-GAME', '刚')
('I-GAME', '》')
