In [None]:
import evaluate  # pip install evaluate
import numpy as np
import seqeval   # pip install seqeval
import torch
from datasets import load_dataset
from transformers import AutoModelForTokenClassification, AutoTokenizer,DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer

## 实体映射数据集词典准备

In [None]:
# Download/load the MSRA NER dataset from the Hugging Face Hub.
# `load_dataset` is already imported in the notebook's first (imports) cell,
# so it is not re-imported here.
ds = load_dataset("doushabao4766/msra_ner_k_V3")
# Display the loaded object (splits and features) as the cell output.
ds

In [None]:
# Excerpt of the dataset card, kept for reference: it fixes the id -> tag order
# that `tags` below must reproduce.
'''
dataset_info:
  features:
    - name: id
      dtype: string
    - name: tokens
      sequence: string
    - name: ner_tags
      sequence:
        class_label:
          names:
            '0': O
            '1': B-PER
            '2': I-PER
            '3': B-ORG
            '4': I-ORG
            '5': B-LOC
            '6': I-LOC
    - name: knowledge
      dtype: string
  splits:
    - name: train
      num_bytes: 56718636
'''
# Entity types in the same order as the dataset card. A list (not a set) is
# used so the order is deterministic, and PER / ORG are separate entries.
entites = ['O', 'PER', 'ORG', 'LOC']

# The dataset already ships `ner_tags` using the ids listed above, so data
# preparation only has to rebuild the tag names in that exact order and keep
# the labels aligned with the tokenizer output.
# Resulting order: O, B-PER, I-PER, B-ORG, I-ORG, B-LOC, I-LOC.
tags = ['O'] + [
    prefix + entity.upper()
    for entity in entites[1:]
    for prefix in ('B-', 'I-')
]

# Entity type -> position in `entites`.
entity_index = {entity: i for i, entity in enumerate(entites)}

In [None]:
def data_input_proc(item):
    """Tokenize a batch of pre-split sentences and align the NER labels to it.

    Args:
        item: a batch from ``Dataset.map(..., batched=True)``; ``item['tokens']``
            is a list of token lists and ``item['ner_tags']`` the matching list
            of label-id lists.

    Returns:
        The tokenizer output with an extra ``labels`` entry holding one label
        list per sample, each exactly as long as that sample's ``input_ids``.

    Notes:
        Relies on the module-level ``tokenizer``, which must be a "fast"
        tokenizer because ``word_ids()`` is used for the alignment.
    """
    # The inputs are already split into tokens, so the tokenizer has to be told
    # so; otherwise each token list is not treated as one sentence.
    input_data = tokenizer(item['tokens'], truncation=True, add_special_tokens=False,
                           max_length=512, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_labels in enumerate(item['ner_tags']):
        previous_word = None
        label_ids = []
        # word_ids() maps every produced token back to the index of the source
        # token, which keeps labels in step after truncation / sub-word splits.
        for word_idx in input_data.word_ids(batch_index=batch_idx):
            if word_idx is None or word_idx == previous_word:
                # Special tokens and sub-word continuations get -100, the value
                # the metric code filters out.
                label_ids.append(-100)
            else:
                label_ids.append(word_labels[word_idx])
            previous_word = word_idx
        aligned_labels.append(label_ids)

    input_data['labels'] = aligned_labels
    return input_data

# Tokenizer for the same checkpoint that the model cell loads; it is read as a
# global by data_input_proc and by the data collator further down.
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')

# batched=True passes several samples to data_input_proc per call.
# The source dataset is `ds` (no `ds1` is defined anywhere in the notebook).
ds2 = ds.map(data_input_proc, batched=True)

In [None]:
# Hyper-parameters and bookkeeping options handed to the Trainer.
args = TrainingArguments(
    # Working directory for training (tensorboard, temporary checkpoints, logs).
    output_dir='ner_train',
    # Number of training epochs.
    num_train_epochs=3,
    # Batch sizes per device for training and evaluation.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate at the end of every epoch.
    eval_strategy='epoch',
    # Where training metrics are reported.
    report_to='tensorboard',
    # Set to False so the saved files can be loaded with torch.load.
    save_safetensors=False,
)

In [None]:
# Bidirectional maps between class ids and BIO tag names; they are stored in the
# model config.
id2lbl = {i: tag for i, tag in enumerate(tags)}
lbl2id = {tag: i for i, tag in enumerate(tags)}

# num_labels is derived from `tags` so the classification head always matches
# the label maps (a hard-coded count that disagrees with id2label is rejected or
# silently overridden, depending on the transformers version).
model = AutoModelForTokenClassification.from_pretrained(
    'google-bert/bert-base-chinese',
    num_labels=len(tags),
    id2label=id2lbl,
    label2id=lbl2id,
)
# Display the model architecture as the cell output.
model

In [None]:
# Metric callback for the Trainer.
def compute_metric(result):
    """Compute seqeval metrics for one evaluation pass.

    Args:
        result: a ``(predicts, labels)`` pair. ``predicts`` holds per-token class
            scores and is reduced with argmax over axis 2; ``labels`` holds the
            reference label ids, with -100 marking positions to ignore.

    Returns:
        Whatever ``seqeval.compute`` returns for the decoded tag sequences.
    """
    # Load the seqeval metric object.
    metric = evaluate.load('seqeval')
    predicts, labels = result
    predict_ids = np.argmax(predicts, axis=2)

    # Decode ids to tag names, skipping ignored positions. Both lists are built
    # from the same raw (prediction, label) pairs so they stay aligned; reusing
    # the already-filtered predictions for the labels would shift them.
    predicted_tags = []
    reference_tags = []
    for pred_row, label_row in zip(predict_ids, labels):
        kept = [(p, l) for p, l in zip(pred_row, label_row) if l != -100]
        predicted_tags.append([tags[p] for p, _ in kept])
        reference_tags.append([tags[l] for _, l in kept])

    return metric.compute(predictions=predicted_tags, references=reference_tags)


In [None]:
# Pads input_ids and labels of each batch to a common length.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

# NOTE(review): the dataset card excerpt in this notebook only shows a `train`
# split; fall back to `test` when no `validation` split exists -- confirm which
# split is intended for evaluation.
eval_split = 'validation' if 'validation' in ds2 else 'test'

# Train on the tokenized dataset (`ds2`); the raw `ds` has no input_ids/labels
# columns for the model to consume.
trainer = Trainer(
    model,
    args,
    train_dataset=ds2['train'],
    eval_dataset=ds2[eval_split],
    data_collator=data_collator,
    compute_metrics=compute_metric
)
trainer.train()

In [None]:
# Inference: run the fine-tuned model on raw text and map the predicted BIO
# tags back onto the original characters to print the recognised entities.
def extract_entities(chars, label_names):
    """Group BIO-tagged characters into ``(entity_type, text)`` spans.

    Args:
        chars: the characters of one sentence.
        label_names: one tag name per character ('O', 'B-XXX' or 'I-XXX').

    Returns:
        A list of ``(entity_type, text)`` tuples in order of appearance. An
        'I-' tag that does not continue a span of the same type is ignored.
    """
    entities = []
    current_type, current_chars = None, []
    for ch, label in zip(chars, label_names):
        if label.startswith('B-'):
            # A new entity starts; flush the one being built, if any.
            if current_chars:
                entities.append((current_type, ''.join(current_chars)))
            current_type, current_chars = label[2:], [ch]
        elif label.startswith('I-') and current_type == label[2:]:
            current_chars.append(ch)
        else:
            if current_chars:
                entities.append((current_type, ''.join(current_chars)))
            current_type, current_chars = None, []
    if current_chars:
        entities.append((current_type, ''.join(current_chars)))
    return entities


sentence = ["双方确定了今后发展中美关系的指导方针"]
# Split every sentence into characters, mirroring the pre-split `tokens`
# column used for training.
sentence_chars = [list(text) for text in sentence]

encoded = tokenizer(sentence_chars, is_split_into_words=True, truncation=True,
                    add_special_tokens=False, max_length=512, padding=True,
                    return_tensors='pt')
model_inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}

model.eval()
with torch.no_grad():
    logits = model(**model_inputs).logits
predicted_ids = logits.argmax(dim=-1).cpu().tolist()

id2label = model.config.id2label
for batch_idx, chars in enumerate(sentence_chars):
    # Characters the model never saw (e.g. truncated away) stay 'O'.
    char_labels = ['O'] * len(chars)
    seen_words = set()
    for position, word_idx in enumerate(encoded.word_ids(batch_index=batch_idx)):
        # Skip padding/special tokens and sub-word continuations: only the
        # first token of each character decides its label.
        if word_idx is None or word_idx in seen_words:
            continue
        seen_words.add(word_idx)
        char_labels[word_idx] = id2label[predicted_ids[batch_idx][position]]
    for entity_type, content in extract_entities(chars, char_labels):
        print(f"entity:{entity_type},content:{content}")