1. 参考课堂案例，使用指定的数据集，编写代码实现ner模型训练和推理。
https://huggingface.co/datasets/doushabao4766/msra_ner_k_V3
doushabao4766/msra_ner_k_V3
2. 完成预测结果的实体抽取。
    输入：“双方确定了今后发展中美关系的指导方针。”
    输出：[{"entity":"ORG","content":"中"},{"entity":"ORG","content":"美"}]
3. 整理Dataset、Trainer、TrainingArgument、DataCollator、Evaluate 知识点，总结文档

In [85]:
from transformers import AutoModelForTokenClassification,AutoTokenizer,DataCollatorForTokenClassification
from transformers import TrainingArguments,Trainer
import torch
import evaluate
import seqeval
from datasets import load_dataset
import numpy as np

In [86]:
ds = load_dataset("doushabao4766/msra_ner_k_V3")
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'knowledge'],
        num_rows: 45001
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'knowledge'],
        num_rows: 3443
    })
})

In [87]:
for item in ds['train']:
    break
 
print(item)

{'id': '0', 'tokens': ['当', '希', '望', '工', '程', '救', '助', '的', '百', '万', '儿', '童', '成', '长', '起', '来', '，', '科', '教', '兴', '国', '蔚', '然', '成', '风', '时', '，', '今', '天', '有', '收', '藏', '价', '值', '的', '书', '你', '没', '买', '，', '明', '日', '就', '叫', '你', '悔', '不', '当', '初', '！'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'knowledge': ''}


****实体映射字典****

In [88]:
#验证tag标签数量
tags_id = set()
for items in ds['train']:
    tags_id.update(items['ner_tags'])

tags_id


{0, 1, 2, 3, 4, 5, 6}

In [89]:
# #entity_index
# entites = ['O'] + list({'PER','LOC','ORG',})
# tags = ['O']
# for entity in entites[1:]:
#     tags.append('B-' + entity.upper())
#     tags.append('I-' + entity.upper())

# entity_index = {entity:i for i, entity in enumerate(entites)}

# print(entity_index)
# print(tags)

In [90]:
tags = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']

In [91]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

def data_input_proc(item):
    input_data = tokenizer(item['tokens'],
                           #自动截断超过512部分
                          truncation=True,
                            #关闭[cls]&[sep]
                          add_special_tokens=False,
                          max_length=512,
                           #告诉tokenizer已经分好词，无需再次分词
                          is_split_into_words=True )
    labels = [lbl[:512] for lbl in item['ner_tags']]
    input_data['labels'] = labels
    return input_data
    
ds1 = ds.map(data_input_proc,batched=True)

In [92]:
ds1.set_format('torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

for item in ds1['train']:
   
    print(item)
    break

{'input_ids': tensor([2496, 2361, 3307, 2339, 4923, 3131, 1221, 4638, 4636,  674, 1036, 4997,
        2768, 7270, 6629, 3341, 8024, 4906, 3136, 1069, 1744, 5917, 4197, 2768,
        7599, 3198, 8024,  791, 1921, 3300, 3119, 5966,  817,  966, 4638,  741,
         872, 3766,  743, 8024, 3209, 3189, 2218, 1373,  872, 2637,  679, 2496,
        1159, 8013]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1]), 'labels': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0])}


**构建模型对象**

In [93]:
# 创建标签映射字典
id2label = {i: tags for i, tags in enumerate(tags)}
label2id = {tags: i for i, tags in enumerate(tags)}

model = AutoModelForTokenClassification.from_pretrained('bert-base-chinese', 
                                                       num_labels = len(tags),
                                                       id2label=id2label,
                                                       label2id=label2id)
model

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

**模型训练TranningArguments**

In [112]:
args = TrainingArguments(output_dir = 'msra_ner_train', # 模型训练工作目录（tensorboard，临时模型存盘文件，日志）
                        num_train_epochs=1,
                        per_device_train_batch_size=32,
                        per_device_eval_batch_size=32,
                        report_to='tensorboard',
                         #可选epoch
                        eval_strategy='steps',
                        eval_steps=400)

**模型训练 Trainer**

In [113]:
#metric方法
def compute_metric(result):
    # result 是一个tuple (predicts, labels)

    #获取评估对象
    seqeval = evaluate.load("seqeval")
    predicts,labels = result
    predicts = np.argmax(predicts,axis=2)

    #准备评估数据
    predicts = [[tags[p]for p,l in zip(ps,ls) if l != -100]
                for ps,ls in zip(predicts,labels)]
    labels = [[tags[l]for p,l in zip(ps,ls) if l != -100]
                for ps,ls in zip(predicts,labels)]
    #根据预测标签和真实标签，自动计算序列标注任务的评估指标（精确率、召回率、F1分数等）
    results = seqeval.compute(predictions=predicts, references=labels)

    return results
    

In [96]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

In [114]:
trainer = Trainer(
    model,
    args,
    train_dataset=ds1['train'],
    eval_dataset=ds1['test'],
    data_collator=data_collator,
    compute_metrics=compute_metric
)

In [115]:
trainer.train()

Step,Training Loss,Validation Loss,Loc,Org,Per,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
400,No log,0.045489,"{'precision': 0.8651603498542274, 'recall': 0.8992424242424243, 'f1': 0.8818722139673105, 'number': 1320}","{'precision': 0.9432019879304224, 'recall': 0.9316269284712483, 'f1': 0.9373787264067738, 'number': 2852}","{'precision': 0.9446666666666667, 'recall': 0.9427811044577512, 'f1': 0.9437229437229436, 'number': 1503}",0.924767,0.927048,0.925906,0.991742
800,0.005700,0.041782,"{'precision': 0.8575498575498576, 'recall': 0.9121212121212121, 'f1': 0.8839941262848753, 'number': 1320}","{'precision': 0.9576361694553222, 'recall': 0.9431977559607293, 'f1': 0.950362126832715, 'number': 2852}","{'precision': 0.9353099730458221, 'recall': 0.9234863606121091, 'f1': 0.929360562437228, 'number': 1503}",0.927155,0.930749,0.928948,0.992241
1200,0.004000,0.040397,"{'precision': 0.8785454545454545, 'recall': 0.9151515151515152, 'f1': 0.8964749536178107, 'number': 1320}","{'precision': 0.9540148567385921, 'recall': 0.9456521739130435, 'f1': 0.9498151082937137, 'number': 2852}","{'precision': 0.9392117568470274, 'recall': 0.9354624085163007, 'f1': 0.9373333333333332, 'number': 1503}",0.931918,0.935859,0.933884,0.992799


Trainer is attempting to log a value of "{'precision': 0.8651603498542274, 'recall': 0.8992424242424243, 'f1': 0.8818722139673105, 'number': 1320}" of type <class 'dict'> for key "eval/LOC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9432019879304224, 'recall': 0.9316269284712483, 'f1': 0.9373787264067738, 'number': 2852}" of type <class 'dict'> for key "eval/ORG" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9446666666666667, 'recall': 0.9427811044577512, 'f1': 0.9437229437229436, 'number': 1503}" of type <class 'dict'> for key "eval/PER" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8575498575498576, 'recall': 0.9121212121212121

TrainOutput(global_step=1407, training_loss=0.0044942315622857045, metrics={'train_runtime': 754.939, 'train_samples_per_second': 59.609, 'train_steps_per_second': 1.864, 'total_flos': 3241443989101428.0, 'train_loss': 0.0044942315622857045, 'epoch': 1.0})

In [106]:
from transformers import pipeline
pipeline = pipeline('token-classification', '/kaggle/working/msra_ner_train/checkpoint-4221')

Device set to use cuda:0


In [107]:
pipeline('双方确定了今后发展中美关系的指导方针')

[{'entity': 'B-ORG',
  'score': 0.9988446,
  'index': 10,
  'word': '中',
  'start': 9,
  'end': 10},
 {'entity': 'B-ORG',
  'score': 0.9980627,
  'index': 11,
  'word': '美',
  'start': 10,
  'end': 11}]