
1. 参考课堂案例，使用指定的数据集，编写代码实现 NER 模型的训练和推理。
https://huggingface.co/datasets/doushabao4766/msra_ner_k_V3
2. 完成预测结果的实体抽取。
    输入：“双方确定了今后发展中美关系的指导方针。”
    输出：[{"entity":"ORG","content":"中"},{"entity":"ORG","content":"美"}]
3. 整理Dataset、Trainer、TrainingArguments、DataCollator、Evaluate 知识点，总结文档

In [None]:
import os
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
)
from transformers import TrainingArguments, Trainer
import evaluate
from datasets import DownloadConfig, load_dataset
import numpy as np
# Base working directory; a plain literal is enough (no interpolation needed).
current_dir = "/kaggle/"

In [1]:
# Load the dataset named in the assignment from the Hugging Face Hub.
# NOTE(review): later cells read the `tokens` / `ner_tags` columns and the
# `train` / `test` splits -- confirm this dataset exposes them.
ds = load_dataset("doushabao4766/msra_ner_k_V3")
ds

NameError: name 'load_dataset' is not defined

In [None]:
# Peek at the first training row: print its characters, then its tag ids.
for items in ds["train"]:
    for column in ("tokens", "ner_tags"):
        print(items[column])
    break

In [None]:
# Tokenizer of the "bert-base-chinese" checkpoint (the same checkpoint name is
# used when the model is built further down).
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

实体标签映射字典
'O':0
'B-PER':1
'I-PER':2
'B-LOC':3
'I-LOC':4
'B-ORG':5
'I-ORG':6

In [None]:
# Verify how many distinct tag ids actually occur in the training split.
tags_id = {tag for row in ds["train"] for tag in row["ner_tags"]}

tags_id

In [None]:
# entity_index
# Entity types are kept in a list (not a set) so that label ids are identical
# on every run; iterating a set of strings gives a run-dependent order.
# The "outside" tag is the letter "O" (not the digit zero), which is what
# seqeval and the token-classification pipeline expect.
# Order follows the tag table listed earlier in this notebook
# (O, PER, LOC, ORG).
# NOTE(review): confirm this order against the dataset's own `ner_tags`
# label definition before training.
entites = ["O", "PER", "LOC", "ORG"]
tags = ["O"]

# Every entity type contributes a begin (B-) and an inside (I-) tag.
for entity in entites[1:]:
    tags.append("B-" + entity.upper())
    tags.append("I-" + entity.upper())

entity_index = {entity: i for i, entity in enumerate(entites)}
entity_index
tags

In [None]:
def data_input_proc(item):
    """Tokenize a batch of pre-split sentences and attach aligned labels.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        item: A batch with ``"tokens"`` (one list of characters per sentence)
            and ``"ner_tags"`` (one list of tag ids per sentence).

    Returns:
        The tokenizer output with an extra ``"labels"`` entry holding one
        label list per sentence, the same length as that sentence's
        ``input_ids``.
    """
    # The text is already split into characters, so the tokenizer must be
    # told with is_split_into_words=True.
    input_data = tokenizer(
        item["tokens"],
        is_split_into_words=True,
        truncation=True,
        add_special_tokens=False,
        max_length=512,
        return_offsets_mapping=True,
    )

    # Align labels through word_ids() instead of slicing the tag list: a
    # character may yield zero or several word pieces, and a plain slice
    # would then leave labels and input_ids with different lengths.
    labels = []
    for row, word_tags in enumerate(item["ner_tags"]):
        aligned = []
        previous = None
        for word_id in input_data.word_ids(batch_index=row):
            if word_id is None or word_id == previous:
                # Special tokens and continuation pieces are ignored by the
                # loss and by the metric (label -100).
                aligned.append(-100)
            else:
                aligned.append(word_tags[word_id])
            previous = word_id
        labels.append(aligned)

    input_data["labels"] = labels
    return input_data


In [None]:
# Tokenize every split batch-wise with data_input_proc.
ds1 = ds.map(data_input_proc, batched=True)

# Return only the model inputs, as torch tensors, when rows are read.
model_columns = ["input_ids", "token_type_ids", "attention_mask", "labels"]
ds1.set_format(type="torch", columns=model_columns)

In [None]:
# Print the first processed training row as a sanity check, then stop.
for item in ds1["train"]:
    print(item)
    break

In [None]:
# Build the model object: a token-classification head on "bert-base-chinese"
# with one output class per entry in `tags`.
id2lbl = dict(enumerate(tags))
lbl2id = {tag: idx for idx, tag in id2lbl.items()}
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-chinese",
    num_labels=len(tags),
    id2label=id2lbl,
    label2id=lbl2id,
)
model

In [None]:
# Model training: TrainingArguments
args = TrainingArguments(
    output_dir="msra_ner_train",  # directory where the model is saved
    num_train_epochs=3,  # number of training epochs
    save_safetensors=False,  # False so the saved file can be loaded with torch.load()
    per_device_train_batch_size=32,  # training batch size
    per_device_eval_batch_size=32,  # evaluation batch size
    report_to="tensorboard",  # logging backend
    eval_strategy="epoch",  # evaluation schedule
)

In [None]:
# Model training: metric function handed to Trainer
def compute_metric(result):
    """Score predicted tag sequences against the references with seqeval.

    Args:
        result: A pair ``(predictions, labels)``. ``predictions`` is reduced
            with ``argmax`` over axis 2 to get one tag id per position;
            ``labels`` holds the reference tag ids, with ``-100`` marking
            positions to skip.

    Returns:
        The value returned by ``seqeval.compute`` for the filtered sequences.
    """
    # Load the evaluator.
    seqeval = evaluate.load("seqeval")
    logits, label_ids = result
    predicted_ids = np.argmax(logits, axis=2)

    # Filter predictions and references in one pass over the ORIGINAL
    # arrays. Building the references from already-filtered predictions
    # misaligns them whenever -100 is not purely trailing padding.
    predicts, labels = [], []
    for ps, ls in zip(predicted_ids, label_ids):
        kept = [(tags[p], tags[l]) for p, l in zip(ps, ls) if l != -100]
        predicts.append([pred for pred, _ in kept])
        labels.append([ref for _, ref in kept])

    # Compute the evaluation metrics.
    results = seqeval.compute(predictions=predicts, references=labels)

    return results

In [None]:
# Collator that batches the tokenized rows for token classification, with
# padding enabled.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)
# Instantiate the Trainer: train on the "train" split, evaluate on the "test"
# split, and score with compute_metric.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds1["train"],
    eval_dataset=ds1["test"],
    data_collator=data_collator,
    compute_metrics=compute_metric,
)

In [None]:
# Run model training with the Trainer configured above.
trainer.train()

In [None]:
# Model inference
# Import the factory under an alias so that binding the result to `pipeline`
# does not overwrite the factory function itself.
from transformers import pipeline as build_pipeline

# Pass the tokenizer explicitly so loading does not depend on tokenizer files
# being present in the checkpoint directory.
# NOTE(review): the checkpoint directory name depends on the number of
# training steps of the run -- confirm it exists under "msra_ner_train".
pipeline = build_pipeline(
    'token-classification',
    "msra_ner_train/checkpoint-2112",
    tokenizer=tokenizer,
)

In [None]:
# Run the token-classification pipeline on the sample sentence from the task.
pipeline("双方确定了今后发展中美关系的指导方针")