In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
###利用上周NER模型训练任务代码，复现课堂案例中：动态学习率、混合精度、DDP训练实现。

In [1]:
# 升级numpy（如果cesium已兼容numpy 2.0+）
!pip install numpy>=2.0

# 降级rich
!pip install rich<14

# 降级所有NVIDIA CUDA库（根据torch 2.6.0+cu124的要求）
!pip install nvidia-cublas-cu12==12.4.5.8 \
            nvidia-cudnn-cu12==9.1.0.70 \
            nvidia-cufft-cu12==11.2.1.3 \
            nvidia-curand-cu12==10.3.5.147 \
            nvidia-cusolver-cu12==11.6.1.9 \
            nvidia-cusparse-cu12==12.3.1.170 \
            nvidia-nvjitlink-cu12==12.4.127

# 升级fsspec
!pip install fsspec==2025.3.2

/bin/bash: line 1: 14: No such file or directory
Collecting nvidia-cublas-cu12==12.4.5.8
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cufft-cu12==11.2.1.3
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nvjitlink-cu12==12.4.127
  Downloading nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5

In [2]:
!pip -q install evaluate seqeval

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
bigframes 1.42.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.[0m[31m
[0m

In [9]:
%%writefile week12_01_ner_ddp.py

import os
import numpy as np
from transformers import AutoModelForTokenClassification, AutoTokenizer,DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
import torch
import evaluate  # pip install evaluate
import seqeval   # pip install seqeval
from datasets import load_dataset
import torch.distributed as dist
import torch.multiprocessing as mp

# 设置分布式环境
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

# 清理分布式环境
def cleanup():
    dist.destroy_process_group()
    

def train(rank, world_size):
    setup(rank, world_size)
    # 数据集
    ds = load_dataset("doushabao4766/msra_ner_k_V3")
    tokenizer=AutoTokenizer.from_pretrained('bert-base-chinese')

    ##entity_index
    entites=['0','PER','ORG','LOC']
    tags=['0']
    for entity in entites[1:]:
        tags.append('B-'+entity.upper())
        tags.append('I-'+entity.upper())
    entity_index={entity:i for i,entity in enumerate(entites)}

    
    
    def data_input_proc(item):
        # 输入文本先拆分为字符，再转换为模型输入的token索引
        # batch_texts = [list(text) for text in item['text']]
        # 导入拆分为字符的文本列表时，需要设置参数is_split_into_words=True
        input_data=tokenizer(item['tokens'],
                         truncation=True,
                         add_special_tokens=False,
                         max_length=512,
                         is_split_into_words=True, ###所有文本都拆分成字符列表
                            return_offsets_mapping=True)
        labels=[lbl[:512] for lbl in item['ner_tags']] ##截断超过512的标签
        input_data['labels']=labels
        return input_data

    ds2 = ds.map(data_input_proc, batched=True)  # batch_size 1000
    
    
    local_rank = rank
    
    id2lbl = {i:tag for i, tag in enumerate(tags)}
    lbl2id = {tag:i for i, tag in enumerate(tags)}
    
    model = AutoModelForTokenClassification.from_pretrained('google-bert/bert-base-chinese', 
                                                            num_labels=7,
                                                            id2label=id2lbl,
                                                            label2id=lbl2id)
    model.to(local_rank)
    
    args = TrainingArguments(
        output_dir="ner_train",  # 模型训练工作目录（tensorboard，临时模型存盘文件，日志）
        num_train_epochs = 3,    # 训练 epoch
        save_safetensors=False,  # 设置False保存文件可以通过torch.load加载
        per_device_train_batch_size=8,  # 训练批次
        per_device_eval_batch_size=16,
        report_to='tensorboard',  # 训练输出记录
        eval_strategy="epoch",
        local_rank=local_rank,   # 当前进程 RANK
        fp16=True,               # 使用混合精度
        lr_scheduler_type='linear',  # 动态学习率
        warmup_steps=100,        # 预热步数
        ddp_find_unused_parameters=False  # 优化DDP性能
    )
    
    def compute_metric(result):
        # result 是一个tuple (predicts, labels)
        
        # 获取评估对象
        seqeval = evaluate.load('seqeval')
        predicts,labels = result
        predicts = np.argmax(predicts, axis=2)
        
        # 准备评估数据
        predicts = [[tags[p] for p,l in zip(ps,ls) if l != -100]
                     for ps,ls in zip(predicts,labels)]
        labels = [[tags[l] for p,l in zip(ps,ls) if l != -100]
                     for ps,ls in zip(predicts,labels)]
        results = seqeval.compute(predictions=predicts, references=labels)
    
        return results
    
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)
    
    trainer = Trainer(
        model,
        args,
        train_dataset=ds2['train'],
        eval_dataset=ds2['test'],
        data_collator=data_collator,
        compute_metrics=compute_metric
    )
    
    trainer.train()
    cleanup()

def main():
    world_size = torch.cuda.device_count()
    mp.spawn(train, args=(world_size,), nprocs=world_size, join=True)

if __name__ == "__main__":
    main()

Overwriting week12_01_ner_ddp.py


In [10]:
!python week12_01_ner_ddp.py

2025-06-12 11:21:02.327581: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749727262.350854    1577 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749727262.358033    1577 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-12 11:21:11.907329: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749727271.932015    1592 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749727271.938984    1592 cuda_blas.cc:1

In [11]:
from transformers import pipeline

2025-06-12 12:55:19.364387: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749732919.388136     115 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749732919.395364     115 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [12]:
pipeline=pipeline('token-classification','ner_train/checkpoint-8439')

Device set to use cuda:0


In [13]:
pipeline('双方确定了今后发展中美关系的指导方针')

[{'entity': '0', 'score': 1.0, 'index': 1, 'word': '双', 'start': 0, 'end': 1},
 {'entity': '0', 'score': 1.0, 'index': 2, 'word': '方', 'start': 1, 'end': 2},
 {'entity': '0', 'score': 1.0, 'index': 3, 'word': '确', 'start': 2, 'end': 3},
 {'entity': '0', 'score': 1.0, 'index': 4, 'word': '定', 'start': 3, 'end': 4},
 {'entity': '0', 'score': 1.0, 'index': 5, 'word': '了', 'start': 4, 'end': 5},
 {'entity': '0', 'score': 1.0, 'index': 6, 'word': '今', 'start': 5, 'end': 6},
 {'entity': '0', 'score': 1.0, 'index': 7, 'word': '后', 'start': 6, 'end': 7},
 {'entity': '0', 'score': 1.0, 'index': 8, 'word': '发', 'start': 7, 'end': 8},
 {'entity': '0', 'score': 1.0, 'index': 9, 'word': '展', 'start': 8, 'end': 9},
 {'entity': 'B-LOC',
  'score': 0.99999964,
  'index': 10,
  'word': '中',
  'start': 9,
  'end': 10},
 {'entity': 'B-LOC',
  'score': 0.99999964,
  'index': 11,
  'word': '美',
  'start': 10,
  'end': 11},
 {'entity': '0',
  'score': 1.0,
  'index': 12,
  'word': '关',
  'start': 11,
  'end

In [14]:
pipeline('2023年7月我在纽约参加学术会议。')

[{'entity': '0',
  'score': 1.0,
  'index': 1,
  'word': '202',
  'start': 0,
  'end': 3},
 {'entity': '0',
  'score': 1.0,
  'index': 2,
  'word': '##3',
  'start': 3,
  'end': 4},
 {'entity': '0', 'score': 1.0, 'index': 3, 'word': '年', 'start': 4, 'end': 5},
 {'entity': '0', 'score': 1.0, 'index': 4, 'word': '7', 'start': 5, 'end': 6},
 {'entity': '0', 'score': 1.0, 'index': 5, 'word': '月', 'start': 6, 'end': 7},
 {'entity': '0', 'score': 1.0, 'index': 6, 'word': '我', 'start': 7, 'end': 8},
 {'entity': '0', 'score': 1.0, 'index': 7, 'word': '在', 'start': 8, 'end': 9},
 {'entity': 'B-LOC',
  'score': 0.99999976,
  'index': 8,
  'word': '纽',
  'start': 9,
  'end': 10},
 {'entity': 'I-LOC',
  'score': 0.99999976,
  'index': 9,
  'word': '约',
  'start': 10,
  'end': 11},
 {'entity': '0',
  'score': 1.0,
  'index': 10,
  'word': '参',
  'start': 11,
  'end': 12},
 {'entity': '0',
  'score': 1.0,
  'index': 11,
  'word': '加',
  'start': 12,
  'end': 13},
 {'entity': '0',
  'score': 1.0,
  '