In [7]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch
from datasets import load_dataset
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
import evaluate
import seqeval
import numpy as np

In [8]:
# Load the `doushabao4766/msra_ner_k_V3` dataset with `datasets.load_dataset`.
ds = load_dataset("doushabao4766/msra_ner_k_V3")
# Bare expression so the notebook displays the available splits and columns.
ds

README.md:   0%|          | 0.00/697 [00:00<?, ?B/s]

(…)-00000-of-00001-42717a92413393f9.parquet:   0%|          | 0.00/13.9M [00:00<?, ?B/s]

(…)-00000-of-00001-8899cab5fdab45bc.parquet:   0%|          | 0.00/946k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45001 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3443 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'knowledge'],
        num_rows: 45001
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'knowledge'],
        num_rows: 3443
    })
})

In [9]:
# Peek at the first training example: its tokens and the matching integer tag ids.
# Index the row first so that only one example is read, rather than fetching
# the entire column and then taking element 0.
first_example = ds['train'][0]
print(first_example['tokens'])
print(first_example['ner_tags'])

['当', '希', '望', '工', '程', '救', '助', '的', '百', '万', '儿', '童', '成', '长', '起', '来', '，', '科', '教', '兴', '国', '蔚', '然', '成', '风', '时', '，', '今', '天', '有', '收', '藏', '价', '值', '的', '书', '你', '没', '买', '，', '明', '日', '就', '叫', '你', '悔', '不', '当', '初', '！']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [10]:
# Build the BIO tag inventory: 'O' followed by a B-/I- pair per entity type.
entites = ['O', 'PER', 'ORG', 'LOC']
tags = ['O'] + [
    prefix + name.upper()
    for name in entites[1:]
    for prefix in ('B-', 'I-')
]
# Entity type -> its position in `entites`.
entity_index = dict(zip(entites, range(len(entites))))
print(len(tags))

7


In [11]:
# Label-id mappings, stored in the model config so predictions can be mapped
# back to tag names.
id2lbl = {i: tag for i, tag in enumerate(tags)}
lbl2id = {lbl: i for i, lbl in enumerate(tags)}
# Size the classification head from the tag list instead of a hard-coded 7,
# so the head cannot drift out of sync if the entity types change.
model = AutoModelForTokenClassification.from_pretrained(
    'google-bert/bert-base-chinese',
    num_labels=len(tags),
    id2label=id2lbl,
    label2id=lbl2id,
)
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')
# Use the GPU when one is available; fall back to CPU rather than crashing
# on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

In [12]:
def data_input_process(item):
    """Tokenize one pre-split example and align its NER tags with the tokens.

    Args:
        item: a single example providing ``"tokens"`` (list of words) and
            ``"ner_tags"`` (one tag id per word).

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Positions whose
        word id is ``None`` (e.g. special tokens inserted by the tokenizer) get
        ``-100``, the value conventionally ignored by the loss; every other
        position gets the tag of the word it came from.
    """
    tokenized = tokenizer(
        item["tokens"],
        truncation=True,
        is_split_into_words=True,
    )
    tokenized['labels'] = [
        -100 if word_idx is None else item['ner_tags'][word_idx]
        for word_idx in tokenized.word_ids()
    ]
    return tokenized

In [13]:
# Run the tokenization / label alignment over every example of each split.
ds1= ds.map(data_input_process)

Map:   0%|          | 0/45001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3443 [00:00<?, ? examples/s]

In [14]:
# Display the mapped dataset to check which columns the mapping step added.
ds1

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'knowledge', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 45001
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'knowledge', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3443
    })
})

In [15]:
# Expose only the model-input columns, formatted as torch tensors.
ds1.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [20]:
# Training configuration: 3 epochs, batch size 32 per device, logs to TensorBoard.
args = TrainingArguments(
    output_dir="ner_train",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    # Save checkpoints as regular PyTorch files rather than safetensors so
    # that they can be loaded with torch.load().
    save_safetensors=False,
    report_to="tensorboard",
)

In [21]:
# metric
def compute_metrics(result):
    """Score predictions against the reference tags with the seqeval metric.

    Args:
        result: a ``(predictions, labels)`` pair. ``predictions`` is arg-maxed
            over axis 2 to obtain one tag id per position; ``labels`` holds the
            reference tag ids, with ``-100`` marking positions to skip.

    Returns:
        The value returned by the seqeval metric's ``compute``.
    """
    # Named `metric` so the local does not shadow the imported `seqeval` module.
    metric = evaluate.load('seqeval')
    scores, label_ids = result
    pred_ids = np.argmax(scores, axis=2)

    predicts, references = [], []
    for pred_row, label_row in zip(pred_ids, label_ids):
        # Keep only positions that carry a real label, then map ids to tag names.
        kept = [(p, l) for p, l in zip(pred_row, label_row) if l != -100]
        predicts.append([tags[p] for p, _ in kept])
        references.append([tags[l] for _, l in kept])

    return metric.compute(predictions=predicts, references=references)

In [22]:
# Collator that pads inputs and labels batch by batch.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds1['train'],
    # Without an evaluation dataset the Trainer never calls compute_metrics;
    # wiring in the held-out split makes `trainer.evaluate()` usable.
    eval_dataset=ds1['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [23]:
# Fine-tune the model; the returned TrainOutput is displayed as the cell result.
trainer.train()



Step,Training Loss
500,0.0405
1000,0.0161
1500,0.0097
2000,0.0053




TrainOutput(global_step=2112, training_loss=0.017159820223848026, metrics={'train_runtime': 2271.2109, 'train_samples_per_second': 59.441, 'train_steps_per_second': 0.93, 'total_flos': 1.1941498425154188e+16, 'train_loss': 0.017159820223848026, 'epoch': 3.0})

In [38]:
# Single-sentence inference on CPU with the fine-tuned model.
model.eval()
model.to('cpu')
sentence = "双方确定了今后发展中美关系的指导方针。"
tokens = [char for char in sentence]  # split the sentence into characters
encoding = tokenizer(tokens, is_split_into_words=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**encoding)
logits = outputs.logits
# Highest-scoring tag id per position; squeeze() drops the singleton batch dimension.
predictions = logits.argmax(dim=-1).squeeze().tolist()
predictions

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [41]:
# Decode the per-token BIO predictions into entity spans.
# A "B-X" tag opens a new entity of type X; following "I-X" tags of the same
# type extend it. Collecting only the "B-" tokens would cut every
# multi-character entity down to its first character.
word_ids = encoding.word_ids()
entities = []
current = None            # entity currently being assembled, if any
previous_word_idx = None  # used to skip extra sub-tokens of the same word
for idx, word_idx in enumerate(word_ids):
    if word_idx is None:
        # Position without a source word (e.g. special tokens).
        continue
    if word_idx == previous_word_idx:
        # Later sub-token of a word that was already handled.
        continue
    previous_word_idx = word_idx

    label = id2lbl[predictions[idx]]
    if label.startswith("B-"):
        current = {"entity": label[2:], "content": tokens[word_idx]}
        entities.append(current)
    elif label.startswith("I-") and current is not None and current["entity"] == label[2:]:
        current["content"] += tokens[word_idx]
    else:
        # "O", or an "I-" tag with no matching open entity: close the span.
        current = None
print(entities)

[{'entity': 'LOC', 'content': '中'}, {'entity': 'LOC', 'content': '美'}]
