1、手动实现动态学习率、混合精度、DDP训练

In [37]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
from torch.utils.data import DataLoader
import torch.optim as optim
from tqdm import tqdm
import torch
from transformers import get_linear_schedule_with_warmup

In [38]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = 'google-bert/bert-base-chinese'
EPOCHES = 3
# 数据预处理
ds = load_dataset("doushabao4766/msra_ner_k_V3")
# 对ds中的数据进行过滤:过滤掉tokens为空的数据
def data_filter(item):
    return len(item['tokens']) > 0
ds['train'] = ds['train'].filter(data_filter)
ds['test'] = ds['test'].filter(data_filter)
tags = ds['train'].features['ner_tags'].feature.names
entites = ['O', 'PER', 'ORG', 'LOC']
entity_index = {e:i for i,e in enumerate(entites)}
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_length = tokenizer.model_max_length # 512

In [39]:
def data_input_proc(item):
    input_data_list = []
    # 对tokens进行分词,而不是将tokens合并成句子再分词,因为合并成句子再分词会导致input_ids的长度和ner_tags的长度不一致
    # is_split_into_words=True已经分词不需要再分词，https://hf.cloudwisdom.top/docs/transformers/main/en/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__
    input_data = tokenizer(item['tokens'], 
                           truncation=True, 
                           add_special_tokens=False, 
                           max_length=512, 
                           is_split_into_words=True,
                           padding='max_length')
    # 对ner_tags的长度也进行截取和input_data长度一致
    ner_tags = [n[:512] for n in item['ner_tags']]
    ner_tags = [n+[0]*(512-len(n)) for n in ner_tags]
    # DataCollatorForTokenClassification中需要有labels这个标签
    input_data['labels'] = ner_tags
    return input_data

In [40]:
ds = ds.map(data_input_proc, batched=True)
ds.set_format(type="torch", columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/3442 [00:00<?, ? examples/s]

In [41]:
def gen_dl_model():
    train_dl = DataLoader(ds['train'], shuffle=True, batch_size=16)
    id2label = {i:tag for i, tag in enumerate(tags)}
    label2id = {tag:i for i, tag in enumerate(tags)}
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=7, id2label=id2label, label2id=label2id)
    return train_dl, model

In [46]:
# 模型训练
train_dl, model = gen_dl_model()
model.to(DEVICE)
optimizer = optim.AdamW(model.parameters(), lr=1e-4)
for epoch in range(EPOCHES):
    model.train()
    tpbar = tqdm(train_dl)
    for items in tpbar:
        items = {k:v.to(DEVICE) for k,v in items.items()}
        optimizer.zero_grad()
        outputs = model(**items)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        tpbar.set_description(f'epoch={epoch+1},loss={loss.item()}')

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
epoch=1,loss=0.07638794183731079:   0%|          | 3/2813 [00:06<1:45:42,  2.26s/it]


KeyboardInterrupt: 

In [45]:
# 动态学习率
train_dl, model = gen_dl_model()
model.to(DEVICE)
model_named_params = list(model.named_parameters())
bert_params, classifier_params = [], []
for name, params in model_named_params:
    if 'bert' in name:
        bert_params.append(params)
    else:
        classifier_params.append(params)
param_groups = [
    {'params': bert_params, 'lr':1e-5},
    {'params': classifier_params, 'lr':1e-3}
]
optimizer = optim.AdamW(param_groups)
# 学习率调度器
train_steps = len(train_dl) * EPOCHES
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=train_steps
)
for epoch in range(EPOCHES):
    model.train()
    tpbar = tqdm(train_dl)
    for items in tpbar:
        items = {k:v.to(DEVICE) for k,v in items.items()}
        optimizer.zero_grad()
        outputs = model(**items)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        tpbar.set_description(f'epoch={epoch+1},' + 
                             f'loss={loss.item()},' + 
                             f'bert_lr={scheduler.get_lr()[0]},' + 
                             f'classifier_lr={scheduler.get_lr()[1]}')

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
epoch=1,loss=1.4927350282669067,bert_lr=1.7000000000000002e-06,classifier_lr=0.00017:   1%|          | 17/2813 [00:32<1:29:46,  1.93s/it]               


KeyboardInterrupt: 

In [48]:
# 混合精度训练
train_dl, model = gen_dl_model()
model.to(DEVICE)
# 梯度计算缩放器
scaler = torch.GradScaler()

model_named_params = list(model.named_parameters())
bert_params, classifier_params = [], []
for name, params in model_named_params:
    if 'bert' in name:
        bert_params.append(params)
    else:
        classifier_params.append(params)
param_groups = [
    {'params': bert_params, 'lr':1e-5},
    {'params': classifier_params, 'lr':1e-3}
]
optimizer = optim.AdamW(param_groups)
# 学习率调度器
train_steps = len(train_dl) * EPOCHES
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=train_steps
)
for epoch in range(EPOCHES):
    model.train()
    tpbar = tqdm(train_dl)
    for items in tpbar:
        items = {k:v.to(DEVICE) for k,v in items.items()}
        optimizer.zero_grad()
        with torch.autocast(device_type = 'cuda'):
            outputs = model(**items)
        loss = outputs.loss
        # 缩放loss后，反向传播
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        tpbar.set_description(f'epoch={epoch+1},' + 
                             f'loss={loss.item()},' + 
                             f'bert_lr={scheduler.get_lr()[0]},' + 
                             f'classifier_lr={scheduler.get_lr()[1]}')

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
epoch=1,loss=0.09300730377435684,bert_lr=4.1e-06,classifier_lr=0.00041:   1%|▏         | 41/2813 [00:19<21:34,  2.14it/s]                             


KeyboardInterrupt: 

分布式训练

In [5]:
%%writefile ddp_simple.py

import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
from torch.utils.data import DataLoader, DistributedSampler
import torch.optim as optim
from tqdm import tqdm
import torch
from transformers import get_linear_schedule_with_warmup
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

# 对ds中的数据进行过滤:过滤掉tokens为空的数据
def data_filter(item):
    return len(item['tokens']) > 0

def data_input_proc_fn(tokenizer):
    def data_input_proc(item):
        input_data_list = []
        # 对tokens进行分词,而不是将tokens合并成句子再分词,因为合并成句子再分词会导致input_ids的长度和ner_tags的长度不一致
        # is_split_into_words=True已经分词不需要再分词，https://hf.cloudwisdom.top/docs/transformers/main/en/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__
        input_data = tokenizer(item['tokens'], 
                               truncation=True, 
                               add_special_tokens=False, 
                               max_length=512, 
                               is_split_into_words=True,
                               padding='max_length')
        # 对ner_tags的长度也进行截取和input_data长度一致
        ner_tags = [n[:512] for n in item['ner_tags']]
        ner_tags = [n+[0]*(512-len(n)) for n in ner_tags]
        # DataCollatorForTokenClassification中需要有labels这个标签
        input_data['labels'] = ner_tags
        return input_data
    return data_input_proc

def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group('nccl', rank=rank, world_size=world_size)

def cleanup():
    dist.destroy_process_group()

# 模型训练
def train(rank, world_size):
    setup(rank, world_size)
    
    model_name = 'google-bert/bert-base-chinese'
    EPOCHES = 3
    # 数据预处理
    ds = load_dataset("doushabao4766/msra_ner_k_V3")
    ds['train'] = ds['train'].filter(data_filter)
    ds['test'] = ds['test'].filter(data_filter)
    tags = ds['train'].features['ner_tags'].feature.names
    entites = ['O', 'PER', 'ORG', 'LOC']
    entity_index = {e:i for i,e in enumerate(entites)}
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    max_length = tokenizer.model_max_length # 512
    ds = ds.map(data_input_proc_fn(tokenizer), batched=True)
    ds.set_format(type="torch", columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
    
    # 构建模型
    id2label = {i:tag for i, tag in enumerate(tags)}
    label2id = {tag:i for i, tag in enumerate(tags)}
    model_tmp = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=7, id2label=id2label, label2id=label2id)
    model_tmp.to(rank)
    model = DDP(model_tmp, device_ids=[rank])
    # 分布式训练采样器
    sampler = DistributedSampler(ds['train'], num_replicas=world_size, rank=rank)
    train_dl = DataLoader(ds['train'], sampler=sampler, batch_size=16)

    # 动态学习率
    model_named_params = list(model.named_parameters())
    bert_params, classifier_params = [], []
    for name, params in model_named_params:
        if 'bert' in name:
            bert_params.append(params)
        else:
            classifier_params.append(params)
    param_groups = [
        {'params': bert_params, 'lr':1e-5},
        {'params': classifier_params, 'lr':1e-3}
    ]
    optimizer = optim.AdamW(param_groups)
    # 学习率调度器
    train_steps = len(train_dl) * EPOCHES
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=100,
        num_training_steps=train_steps
    )

    # 梯度计算缩放器(混合精度)
    scaler = torch.GradScaler()
    
    for epoch in range(EPOCHES):
        model.train()
        tpbar = tqdm(train_dl)
        for items in tpbar:
            items = {k:v.to(rank) for k,v in items.items()}
            optimizer.zero_grad()
            with torch.autocast(device_type = 'cuda'):
                outputs = model(**items)
            loss = outputs.loss
            # 缩放loss后，反向传播
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            tpbar.set_description(f'rank={rank},' + 
                                 f'epoch={epoch+1},' + 
                                 f'loss={loss.item()},' + 
                                 f'bert_lr={scheduler.get_lr()[0]},' + 
                                 f'classifier_lr={scheduler.get_lr()[1]}')
    cleanup()

def main():
    world_size = torch.cuda.device_count()
    mp.spawn(train, args=(world_size,), nprocs=world_size, join=True)

if __name__ == '__main__':
    main()

Overwriting ddp_simple.py


In [6]:
!python ddp_simple.py

2025-06-14 11:41:03.050989: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749901263.076372     249 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749901263.084284     249 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-14 11:41:13.539285: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749901273.566147     264 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749901273.574525     264 cuda_blas.cc:1