In [45]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [46]:
!pip -q install evaluate seqeval

In [64]:
# Imports
from transformers import AutoModelForTokenClassification , AutoTokenizer 
from transformers import DataCollatorForTokenClassification , TrainingArguments , Trainer
from datasets import load_dataset
import numpy as np
import evaluate   # pip install evaluate
import seqeval  # pip install seqeval
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import get_linear_schedule_with_warmup
import torch.optim as optim
import torch
from tqdm import tqdm

In [48]:
def load_and_prepare_data():
    """Load the NER dataset and build the tag list and entity-type index.

    Returns:
        tuple: ``(ds, tags, entity_index)`` where ``ds`` is whatever
        ``load_dataset`` returns for ``doushabao4766/msra_ner_k_V3``,
        ``tags`` is the BIO tag list defined in this function, and
        ``entity_index`` maps ``'O'`` plus each entity type (alphabetical
        order) to a consecutive integer index.
    """
    ds = load_dataset("doushabao4766/msra_ner_k_V3")

    # BIO tag inventory. It is written out here, not read from the dataset.
    tags = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

    # Strip the B-/I- prefix to get the distinct entity types (LOC/ORG/PER),
    # then put 'O' in front of the alphabetically sorted types.
    entity_names = {tag.split('-')[-1] for tag in tags if tag != 'O'}
    entity_types = ['O', *sorted(entity_names)]

    entity_index = dict(zip(entity_types, range(len(entity_types))))

    return ds, tags, entity_index

In [65]:
# Load the dataset
ds = load_dataset('doushabao4766/msra_ner_k_V3')

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

# Peek at the first training example: its tokens and their tag ids
for items in ds['train']:
    for column in ('tokens', 'ner_tags'):
        print(items[column])
    break

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

['当', '希', '望', '工', '程', '救', '助', '的', '百', '万', '儿', '童', '成', '长', '起', '来', '，', '科', '教', '兴', '国', '蔚', '然', '成', '风', '时', '，', '今', '天', '有', '收', '藏', '价', '值', '的', '书', '你', '没', '买', '，', '明', '日', '就', '叫', '你', '悔', '不', '当', '初', '！']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [66]:
# Collect the distinct tag ids that occur in the training split.
# The loop variable is deliberately NOT called `tags`: that name holds the
# label list used by the model config and by compute_metric, and re-running
# this cell would otherwise overwrite it with a dataset row.
tags_id = set()
for example in ds['train']:
    tags_id.update(example['ner_tags'])

tags_id

{0, 1, 2, 3, 4, 5, 6}

In [67]:
# Build the tag list; a tag's position in this list is its label id.
# The entity types are kept in an explicit ordered list (PER, ORG, LOC).
# Iterating a set of strings would make the order, and therefore the
# tag <-> id mapping, depend on Python's per-process hash randomisation.
entites = ['per', 'org', 'loc']
tags = ['O']
for entity in entites:
    tags.append('B-' + entity.upper())  # upper() converts to upper case
    tags.append('I-' + entity.upper())
tags

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [68]:

# 创建构建方法 [tag + [0] * (512 - len(tag)) for tag in item['ner_tags']]
def data_input_proc(item):
    """Tokenize a batch of pre-split sentences and align NER labels to tokens.

    Args:
        item: a batch from ``ds.map(..., batched=True)``; reads
            ``item['tokens']`` (one list of tokens per sentence) and
            ``item['ner_tags']`` (one list of tag ids per sentence).

    Returns:
        The tokenizer output with an added ``'labels'`` entry holding one
        label list per sentence, the same length as that sentence's
        ``input_ids``.

    Labels are aligned through the tokenizer's ``word_ids()`` mapping instead
    of being sliced to ``max_length``. Slicing is only correct when every
    input token becomes exactly one word-piece; a token that produces zero or
    several pieces would shift all later labels. The first piece of a token
    keeps the token's label and any further pieces get ``-100``, which
    ``compute_metric`` filters out. Requires a fast tokenizer, because
    ``word_ids()`` is not available on slow tokenizers.

    NOTE(review): special tokens stay disabled, as in the original code;
    confirm that inference-time tokenization matches this setup.
    """
    input_data = tokenizer(item['tokens'],
                           truncation=True,            # cut sequences longer than max_length
                           max_length=512,             # at most 512 tokens
                           add_special_tokens=False,   # no [CLS]/[SEP]
                           is_split_into_words=True)   # sentences are already split into tokens

    labels = []
    for batch_index, word_labels in enumerate(item['ner_tags']):
        # word_ids() only covers tokens that survived truncation
        word_ids = input_data.word_ids(batch_index=batch_index)
        aligned = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None or word_id == previous_word_id:
                # special token, or a continuation piece of the same input token
                aligned.append(-100)
            else:
                aligned.append(word_labels[word_id])
            previous_word_id = word_id
        labels.append(aligned)

    input_data['labels'] = labels
    return input_data
ds1 = ds.map(data_input_proc , batched = True)

Map:   0%|          | 0/45001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3443 [00:00<?, ? examples/s]

In [69]:
# Show every column of the first processed training example, one per line.
for item in ds1['train']:
    for column in ('tokens', 'ner_tags', 'knowledge', 'input_ids',
                   'token_type_ids', 'attention_mask', 'labels'):
        print(item[column])
    break

['当', '希', '望', '工', '程', '救', '助', '的', '百', '万', '儿', '童', '成', '长', '起', '来', '，', '科', '教', '兴', '国', '蔚', '然', '成', '风', '时', '，', '今', '天', '有', '收', '藏', '价', '值', '的', '书', '你', '没', '买', '，', '明', '日', '就', '叫', '你', '悔', '不', '当', '初', '！']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

[2496, 2361, 3307, 2339, 4923, 3131, 1221, 4638, 4636, 674, 1036, 4997, 2768, 7270, 6629, 3341, 8024, 4906, 3136, 1069, 1744, 5917, 4197, 2768, 7599, 3198, 8024, 791, 1921, 3300, 3119, 5966, 817, 966, 4638, 741, 872, 3766, 743, 8024, 3209, 3189, 2218, 1373, 872, 2637, 679, 2496, 1159, 8013]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[0,

In [70]:
# Expose only the columns the model consumes, returned as torch tensors.
model_columns = [
    'input_ids',        # token index sequence
    'token_type_ids',   # segment ids
    'attention_mask',   # attention mask
    'labels',           # NER label sequence
]
ds1.set_format('torch', columns=model_columns)

# Print the first formatted training example
for item in ds1['train']:
    print(item)
    break

{'input_ids': tensor([2496, 2361, 3307, 2339, 4923, 3131, 1221, 4638, 4636,  674, 1036, 4997,
        2768, 7270, 6629, 3341, 8024, 4906, 3136, 1069, 1744, 5917, 4197, 2768,
        7599, 3198, 8024,  791, 1921, 3300, 3119, 5966,  817,  966, 4638,  741,
         872, 3766,  743, 8024, 3209, 3189, 2218, 1373,  872, 2637,  679, 2496,
        1159, 8013]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1]), 'labels': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0])}


In [71]:
# Readable label mappings passed to the model so its config carries tag names.
id2lbl = dict(enumerate(tags))
lbl2id = {tag: idx for idx, tag in id2lbl.items()}

model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-chinese",     # pretrained checkpoint
    num_labels=len(tags),    # number of output classes
    id2label=id2lbl,
    label2id=lbl2id,
)
model

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [72]:
# Collator that pads each batch (inputs and labels) to a common length
data_collator = DataCollatorForTokenClassification(tokenizer = tokenizer , padding = True)
# Use it in the DataLoader
train_dl = DataLoader(
    ds1['train'],
    batch_size=16,
    shuffle = True,
    collate_fn = data_collator
)

# Single source of truth for the epoch count: it drives both the scheduler
# length and the training loop below.
NUM_EPOCHS = 5
# Defined before first use; falls back to CPU when CUDA is not available.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

model.to(DEVICE)

# Split the parameters into two groups

# All named parameters of the model
param_optimizer = list(model.named_parameters())
bert_params, classifier_params = [],[]

for name,params in param_optimizer:
    # Parameters whose name contains 'bert' belong to the pretrained encoder
    if 'bert' in name:
        bert_params.append(params)
    else:
        classifier_params.append(params)

param_groups = [
    {'params':bert_params, 'lr':1e-5},  # low learning rate keeps the pretrained encoder stable
    {'params':classifier_params, 'weight_decay':0.1, 'lr':1e-3} # higher learning rate plus weight decay for the new classifier head
]

# optimizer
optimizer = optim.AdamW(param_groups)

# Learning-rate scheduler

# Total number of optimizer steps over which the rate decays to 0
train_steps = len(train_dl) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            # warm-up: steps to ramp from 0 to the configured rate
                                            num_warmup_steps=100,
                                            # decay: total steps after which the rate reaches 0
                                            num_training_steps=train_steps)


# Sanity check: print the tensor shapes of one batch
for item in train_dl:
    print(item['input_ids'].shape,
          item['token_type_ids'].shape,
          item['attention_mask'].shape,
          item['labels'].shape)
    break

for epoch in range(NUM_EPOCHS):
    model.train()
    tpbar = tqdm(train_dl)
    for items in tpbar:
        # Move the batch tensors to the target device
        items = {k:v.to(DEVICE) for k,v in items.items()}
        # Forward pass; labels are part of the batch
        outputs = model(**items)
        # Loss computed by the model
        loss = outputs.loss
        # Back-propagate to compute gradients
        loss.backward()
        # Update the model parameters
        optimizer.step()
        # Advance the learning-rate schedule
        scheduler.step()
        # Reset gradients for the next step
        optimizer.zero_grad()

        # get_last_lr() is the supported way to read the current rates;
        # it returns one value per parameter group, in group order.
        bert_lr, classifier_lr = scheduler.get_last_lr()
        tpbar.set_description(f'Epoch:{epoch+1} ' +
                              f'bert_lr:{bert_lr} ' +
                              f'classifier_lr:{classifier_lr} ' +
                              f'Loss:{loss.item():.4f}')

torch.Size([16, 90]) torch.Size([16, 90]) torch.Size([16, 90]) torch.Size([16, 90])


Epoch:1 bert_lr:8.057286072323668e-06 classifier_lr:0.0008057286072323666 Loss:0.0498: 100%|██████████| 2813/2813 [16:35<00:00,  2.83it/s]
Epoch:2 bert_lr:6.042964554242751e-06 classifier_lr:0.000604296455424275 Loss:0.0105: 100%|██████████| 2813/2813 [16:37<00:00,  2.82it/s]  
Epoch:3 bert_lr:4.028643036161834e-06 classifier_lr:0.0004028643036161833 Loss:0.0121: 100%|██████████| 2813/2813 [16:38<00:00,  2.82it/s]  
Epoch:4 bert_lr:2.014321518080917e-06 classifier_lr:0.00020143215180809166 Loss:0.0001: 100%|██████████| 2813/2813 [16:33<00:00,  2.83it/s] 
Epoch:5 bert_lr:0.0 classifier_lr:0.0 Loss:0.0001: 100%|██████████| 2813/2813 [16:41<00:00,  2.81it/s]                                      


In [73]:
def compute_metric(result):
    """Compute seqeval metrics for token-classification predictions.

    Args:
        result: a ``(predictions, labels)`` pair. ``predictions`` holds the
            per-token class scores (argmax is taken over axis 2) and
            ``labels`` holds the reference tag ids, with ``-100`` marking
            positions to ignore.

    Returns:
        The dict returned by the ``seqeval`` metric's ``compute``.

    Predictions and references are filtered together from the same
    ``(prediction, label)`` pairs. The previous version overwrote
    ``predicts`` with the already-filtered lists and then zipped those
    against the unfiltered label rows, so the references were only right
    when every ``-100`` sat at the end of the row.
    """
    # Load the sequence-labelling metric
    metric = evaluate.load('seqeval')
    # Unpack the model outputs and the reference label ids
    logits, label_ids = result
    # Highest-scoring class id for every token
    pred_ids = np.argmax(logits , axis = 2)

    true_predictions, true_labels = [], []
    for pred_row, label_row in zip(pred_ids, label_ids):
        # Keep only positions with a real label and map ids to tag names
        kept = [(tags[p], tags[l]) for p, l in zip(pred_row, label_row) if l != -100]
        true_predictions.append([pred_tag for pred_tag, _ in kept])
        true_labels.append([label_tag for _, label_tag in kept])

    results = metric.compute(predictions = true_predictions , references = true_labels)
    return results

In [74]:
# Collator handed to the Trainer: pads each batch using the tokenizer.
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
)

In [75]:
args = TrainingArguments(
    output_dir='ner_train',            # directory for model outputs
    num_train_epochs=3,                # number of training epochs
    # save_safetensor = False  # (disabled) would turn off safetensors saving so torch.load can read the weights
    per_device_train_batch_size=32,    # training batch size per device
    per_device_eval_batch_size=32,     # evaluation batch size per device
    report_to='tensorboard',           # log training records to TensorBoard
    eval_strategy='epoch',             # evaluate once per epoch
)

In [76]:
trainer = Trainer(
    model=model,                      # model to train
    args=args,                        # training configuration
    train_dataset=ds1['train'],       # training split
    eval_dataset=ds1['test'],         # evaluation split
    compute_metrics=compute_metric,   # metric function called at evaluation
    data_collator=data_collator,      # batch collator
)

In [77]:

# Run the Trainer-based training; the call's return value is displayed as the cell output.
trainer.train()



Epoch,Training Loss,Validation Loss,Loc,Org,Per,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.0144,0.033448,"{'precision': 0.9479092841956059, 'recall': 0.9379382889200562, 'f1': 0.9428974268593585, 'number': 2852}","{'precision': 0.8187456926257753, 'recall': 0.9, 'f1': 0.8574521833273187, 'number': 1320}","{'precision': 0.9413298565840938, 'recall': 0.9607451763140386, 'f1': 0.9509384260783669, 'number': 1503}",0.913897,0.935154,0.924403,0.991583
2,0.0102,0.031717,"{'precision': 0.9583184894905593, 'recall': 0.9431977559607293, 'f1': 0.950698003180774, 'number': 2852}","{'precision': 0.8731778425655977, 'recall': 0.9075757575757576, 'f1': 0.8900445765230313, 'number': 1320}","{'precision': 0.9519104084321476, 'recall': 0.9614105123087159, 'f1': 0.9566368752068851, 'number': 1503}",0.936107,0.939736,0.937918,0.993087
3,0.003,0.038207,"{'precision': 0.9593292900463789, 'recall': 0.9428471248246845, 'f1': 0.9510167992926614, 'number': 2852}","{'precision': 0.8776097912167027, 'recall': 0.9234848484848485, 'f1': 0.8999630860095975, 'number': 1320}","{'precision': 0.9572086899275839, 'recall': 0.9673985362608117, 'f1': 0.9622766379880874, 'number': 1503}",0.93889,0.944846,0.941858,0.993504




Downloading builder script: 0.00B [00:00, ?B/s]

Trainer is attempting to log a value of "{'precision': 0.9479092841956059, 'recall': 0.9379382889200562, 'f1': 0.9428974268593585, 'number': 2852}" of type <class 'dict'> for key "eval/LOC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8187456926257753, 'recall': 0.9, 'f1': 0.8574521833273187, 'number': 1320}" of type <class 'dict'> for key "eval/ORG" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9413298565840938, 'recall': 0.9607451763140386, 'f1': 0.9509384260783669, 'number': 1503}" of type <class 'dict'> for key "eval/PER" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9583184894905593, 'recall': 0.9431977559607293, 'f1': 0.95069

TrainOutput(global_step=2112, training_loss=0.008169087835333565, metrics={'train_runtime': 2324.0708, 'train_samples_per_second': 58.089, 'train_steps_per_second': 0.909, 'total_flos': 1.180990200098808e+16, 'train_loss': 0.008169087835333565, 'epoch': 3.0})

In [81]:
from transformers import pipeline

# Give the pipeline instance its own name; assigning it to `pipeline` would
# shadow the factory function imported above.
ner_pipeline = pipeline('token-classification', 'ner_train/checkpoint-2112')

# NOTE: `text` holds the pipeline's predictions for the sentence, not the sentence itself.
text = ner_pipeline('双方确定了今后发展中美关系的指导方针')
text

Device set to use cuda:0
