In [1]:
import datasets
import evaluate
import numpy as np
import pandas as pd
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
EPOCHS = 5  # 模型学习训练集的轮次
BATCH_SIZE = 8  # 模型并行学习的样本数量
PRETRAINED_MODEL_NAME_OR_PATH = "../models/chinese-lert-base"  # 预训练模型路径
CSV_PATH = (
    "../datasets/label_studio实体识别数据集.csv"  # label-studio标注的实体识别数据集路径
)
OUPUT_CHECKPOINT_PATH = (
    "../models/token_classification_checkpoint"  # 训练过程中产生的模型文件
)
OUTPUT_MODEL_PATH = "../models/token_classification_model"  # 微调后的文本分类模型路径
VAL_SIZE = 0.1  # 验证集的占比
MAX_LENGTH = 256  # 输入文本的token长度，lert模型最大只能是512
SEQEVAL_PATH = "../common/seqeval.py"  # 评估脚本本地路径

In [3]:
# 读取从label-stuido打标好的序列标注文件
data = pd.read_csv(CSV_PATH)

# 数据处理和转换
formatted_data = []
label_set = set()  # 用集合存储标签，用于获取标签和数字的映射关系
for idx, row in data.iterrows():
    tokens = list(row["text"])  # 将文本拆分为单词列表
    ner_tags = ["O"] * len(tokens)  # 初始化标签列表，默认为非实体('0')
    if isinstance(row["label"], str):
        entities = eval(row["label"])  # 提取实体注释
        for entity in entities:
            start = entity["start"]
            end = entity["end"]
            label = entity["labels"][0]
            label_set.add(label)
            ner_tags[start] = f"B-{label}"  # 实体开头位置标记为'B-实体类别'
            for i in range(start + 1, end):
                ner_tags[i] = f"I-{label}"  # 实体内部位置标记为'I-实体类别'
    formatted_data.append({"id": str(idx), "tokens": tokens, "ner_tags": ner_tags})

# 获取标签和数字的映射关系
label_name_list = ["O"]
for label_name in label_set:
    start_label_name = f"B-{label_name}"
    end_label_name = f"I-{label_name}"
    label_name_list.append(start_label_name)
    label_name_list.append(end_label_name)
label2id = {}
id2label = {}
for i, category in enumerate(label_name_list):
    label2id[category] = i
    id2label[i] = category

print("标签映射关系：", label2id)

标签映射关系： {'O': 0, 'B-姓名': 1, 'I-姓名': 2, 'B-地址': 3, 'I-地址': 4}


In [4]:
# 将中文标签转换为数字
id_list = []
tokens_nested_list = []
ner_tags_nested_list = []
for item in formatted_data:
    id_list.append(item["id"])
    tokens_nested_list.append(item["tokens"])
    ner_tags_list = [label2id[str_label] for str_label in item["ner_tags"]]
    ner_tags_nested_list.append(ner_tags_list)

# 将数据转换为datasets.Dataset对象
data_dict = {
    "id": id_list,
    "tokens": tokens_nested_list,
    "ner_tags": ner_tags_nested_list,
}

dataset = datasets.Dataset.from_dict(data_dict)

# 使用 train_test_split 函数来拆分数据集
split_data = dataset.train_test_split(test_size=VAL_SIZE)
print(split_data)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 90
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 10
    })
})


In [5]:
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH, use_fast=True)


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
        is_split_into_words=True,
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(
            batch_index=i
        )  # 将标记映射到它们各自的单词。
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # 将特殊字符设置为 -100。
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # 仅标记给定单词的第一个标记。
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_wnut = split_data.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 90/90 [00:00<00:00, 3103.52 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 1628.35 examples/s]


In [6]:
seqeval = evaluate.load(SEQEVAL_PATH)


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_name_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_name_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "accuracy": results["overall_accuracy"],
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
    }

In [7]:
model = AutoModelForTokenClassification.from_pretrained(
    PRETRAINED_MODEL_NAME_OR_PATH,
    num_labels=len(label_name_list),
    id2label=id2label,
    label2id=label2id,
)

# 使得模型的tensor连续，避免报错
for param in model.parameters():
    param.data = param.data.contiguous()

training_args = TrainingArguments(
    output_dir=OUPUT_CHECKPOINT_PATH,
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
    fp16=True,
)

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, max_length=MAX_LENGTH, padding="max_length"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
model.half()
model.save_pretrained(OUTPUT_MODEL_PATH)
tokenizer.save_pretrained(OUTPUT_MODEL_PATH)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ../models/chinese-lert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.129822,0.964232,0.0,0.0,0.0
2,No log,0.094467,0.961252,0.166667,0.222222,0.190476
3,No log,0.071128,0.967213,0.222222,0.222222,0.222222
4,No log,0.066515,0.971684,0.3,0.333333,0.315789
5,No log,0.065245,0.971684,0.3,0.333333,0.315789


  _warn_prf(average, modifier, msg_start, len(result))


('../models/token_classification_model\\tokenizer_config.json',
 '../models/token_classification_model\\special_tokens_map.json',
 '../models/token_classification_model\\vocab.txt',
 '../models/token_classification_model\\added_tokens.json',
 '../models/token_classification_model\\tokenizer.json')

In [8]:
import torch
from transformers import pipeline
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    OUTPUT_MODEL_PATH,
    truncation=True,
    max_length=MAX_LENGTH,
    padding="max_length",
    use_fast=True,
)

ner = pipeline(
    task="ner",
    model=OUTPUT_MODEL_PATH,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    aggregation_strategy="simple",
)

print(ner("猩红女巫太好看啦！钢铁侠也不赖"))

Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': '姓名', 'score': np.float32(0.81164455), 'word': '猩 红 女 巫', 'start': 0, 'end': 4}, {'entity_group': '姓名', 'score': np.float32(0.8348796), 'word': '钢 铁 侠', 'start': 9, 'end': 12}]
