In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [69]:
!pip -q install evaluate
!pip -q install seqeval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [70]:
from transformers import AutoModelForTokenClassification, AutoTokenizer,DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
import torch
import evaluate  # pip install evaluate
import seqeval   # pip install seqeval
from datasets import load_dataset

In [71]:
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')

In [136]:
# 加载hf中dataset
ds = load_dataset('doushabao4766/msra_ner_k_V3')
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'knowledge'],
        num_rows: 45001
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'knowledge'],
        num_rows: 3443
    })
})

In [137]:
def data_input_proc(item):
    # 导入拆分为字符的文本列表时，需要设置参数is_split_into_words=True
    input_data = tokenizer(item['tokens'], truncation=True, add_special_tokens=False, max_length=512, 
                           is_split_into_words=True, padding='max_length')
    input_data['labels'] = [(tag + [0] * (512 - len(tag)))[:512] for tag in item['ner_tags']]
    return input_data
    

ds2 = ds.map(data_input_proc, batched=True)  # batched 每次传入自定义方法样本数量多个
ds2.set_format('torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/45001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3443 [00:00<?, ? examples/s]

In [106]:
id2lbl = {'0': 'O','1': 'B-PER','2': 'I-PER','3': 'B-ORG','4': 'I-ORG','5': 'B-LOC','6': 'I-LOC'}
lbl2id = {id2lbl[tag]:tag for tag in id2lbl}

In [107]:
model = AutoModelForTokenClassification.from_pretrained('google-bert/bert-base-chinese', 
                                                        num_labels=len(id2lbl),
                                                        id2label=id2lbl,
                                                        label2id=lbl2id)
model.to('cuda')

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

## 动态学习率

In [140]:
# dataLoader
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
import torch.optim as optim

train_dl = DataLoader(ds2['train'], shuffle=True, batch_size=16)

# 模型参数分组
param_optimizer = list(model.named_parameters())
bert_params, classifier_params = [],[]

for name,params in param_optimizer:
    if 'bert' in name:
        bert_params.append(params)
    else:
        classifier_params.append(params)

param_groups = [
    {'params':bert_params, 'lr':1e-5},
    {'params':classifier_params, 'weight_decay':0.1, 'lr':1e-3}
]

# optimizer
optimizer = optim.AdamW(param_groups) # 优化器

# 学习率调度器
train_steps = len(train_dl) * 5
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=100, 
                                            num_training_steps=train_steps)

In [11]:
from tqdm import tqdm

DEVICE='cuda'

for epoch in range(5):
    model.train()
    tpbar = tqdm(train_dl)
    for items in tpbar:
        items = {k:v.to(DEVICE) for k,v in items.items()}
        optimizer.zero_grad()
        outputs = model(**items)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
    
        tpbar.set_description(f'Epoch:{epoch+1} ' + 
                          f'bert_lr:{scheduler.get_lr()[0]} ' + 
                          f'classifier_lr:{scheduler.get_lr()[1]} '+
                          f'Loss:{loss.item():.4f}')

Epoch:1 bert_lr:9.000000000000001e-07 classifier_lr:8.999999999999999e-05 Loss:1.8119:   0%|          | 9/2813 [00:15<1:20:12,  1.72s/it]


KeyboardInterrupt: 

## 混合精度

In [142]:
from tqdm import tqdm
import torch

DEVICE='cuda'

# 梯度计算缩放器
scaler = torch.GradScaler()

for epoch in range(5):
    model.train()
    tpbar = tqdm(train_dl)
    for items in tpbar:
        items = {k:v.to(DEVICE) for k,v in items.items()}
        optimizer.zero_grad()

        with torch.autocast(device_type='cuda'):
            outputs = model(**items)
        loss = outputs.loss

        # 缩放loss后，调用backward
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
    
        tpbar.set_description(f'Epoch:{epoch+1} ' + 
                          f'bert_lr:{scheduler.get_lr()[0]} ' + 
                          f'classifier_lr:{scheduler.get_lr()[1]} '+
                          f'Loss:{loss.item():.4f}')

Epoch:1 bert_lr:8.627998567848193e-06 classifier_lr:0.0008627998567848192 Loss:0.0029:  72%|███████▏  | 2016/2813 [14:43<05:49,  2.28it/s]


KeyboardInterrupt: 

## 分布式

In [151]:
%%writefile ner_ddp.py

import os
import numpy as np
from transformers import AutoModelForTokenClassification, AutoTokenizer,DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
import torch
import evaluate  # pip install evaluate
import seqeval   # pip install seqeval
from datasets import load_dataset
import torch.distributed as dist
import torch.multiprocessing as mp

# 设置分布式环境
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

# 清理分布式环境
def cleanup():
    dist.destroy_process_group()
    

def train(rank, world_size):
    setup(rank, world_size)
    # 数据集
    ds = load_dataset('doushabao4766/msra_ner_k_V3')

    tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')
    
    def data_input_proc(item):
        # 导入拆分为字符的文本列表时，需要设置参数is_split_into_words=True
        input_data = tokenizer(item['tokens'], truncation=True, add_special_tokens=False, max_length=512, 
                               is_split_into_words=True, padding='max_length')
        input_data['labels'] = [(tag + [0] * (512 - len(tag)))[:512] for tag in item['ner_tags']]
        return input_data
        
    ds2 = ds.map(data_input_proc, batched=True)  # batched 每次传入自定义方法样本数量多个
    ds2.set_format('torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
    
    id2lbl = {'0': 'O','1': 'B-PER','2': 'I-PER','3': 'B-ORG','4': 'I-ORG','5': 'B-LOC','6': 'I-LOC'}
    lbl2id = {id2lbl[tag]:tag for tag in id2lbl}
    
    local_rank = rank
    
    model = AutoModelForTokenClassification.from_pretrained('google-bert/bert-base-chinese', 
                                                            num_labels=len(id2lbl),
                                                            id2label=id2lbl,
                                                            label2id=lbl2id)
    model.to(local_rank)
    
    args = TrainingArguments(
        output_dir="ner_train",  # 模型训练工作目录（tensorboard，临时模型存盘文件，日志）
        num_train_epochs = 3,    # 训练 epoch
        save_safetensors=False,  # 设置False保存文件可以通过torch.load加载
        per_device_train_batch_size=16,  # 训练批次
        per_device_eval_batch_size=16,
        report_to='tensorboard',  # 训练输出记录
        eval_strategy="epoch",
        local_rank=local_rank,   # 当前进程 RANK
        fp16=True,               # 使用混合精度
        lr_scheduler_type='linear',  # 动态学习率
        warmup_steps=100,        # 预热步数
        ddp_find_unused_parameters=False  # 优化DDP性能
    )
    
    def compute_metric(result):
        # result 是一个tuple (predicts, labels)
        
        # 获取评估对象
        seqeval = evaluate.load('seqeval')
        predicts,labels = result
        predicts = np.argmax(predicts, axis=2)
        
        # 准备评估数据
        predicts = [[id2lbl[str(p)] for p,l in zip(ps,ls) if l != -100]
                     for ps,ls in zip(predicts,labels)]
        labels = [[id2lbl[str(l)] for p,l in zip(ps,ls) if l != -100]
                     for ps,ls in zip(predicts,labels)]
        results = seqeval.compute(predictions=predicts, references=labels)
    
        return results
    
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)
    
    trainer = Trainer(
        model,
        args,
        train_dataset=ds2['train'],
        eval_dataset=ds2['test'],
        data_collator=data_collator,
        compute_metrics=compute_metric
    )
    
    trainer.train()

def main():
    world_size = torch.cuda.device_count()
    mp.spawn(train, args=(world_size,), nprocs=world_size, join=True)

if __name__ == "__main__":
    main()

Overwriting ner_ddp.py


In [152]:
!python ner_ddp.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2025-06-14 10:22:55.881491: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749896575.905887     539 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749896575.913490     539 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-14 10:23:05.150857: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749896585.174502     553 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749896585.181345     553 cuda_blas.cc:1