In [22]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [7]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, pipeline
from transformers import TrainingArguments, Trainer
import evaluate  # pip install evaluate
import seqeval   # pip install seqeval

In [8]:
# 数据预处理
ds = load_dataset("doushabao4766/msra_ner_k_V3")

README.md:   0%|          | 0.00/697 [00:00<?, ?B/s]

(…)-00000-of-00001-42717a92413393f9.parquet:   0%|          | 0.00/13.9M [00:00<?, ?B/s]

(…)-00000-of-00001-8899cab5fdab45bc.parquet:   0%|          | 0.00/946k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45001 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3443 [00:00<?, ? examples/s]

In [9]:
# 对ds中的数据进行过滤:过滤掉tokens为空的数据
def data_filter(item):
    return len(item['tokens']) > 0
ds['train'] = ds['train'].filter(data_filter)
ds['test'] = ds['test'].filter(data_filter)
ds

Filter:   0%|          | 0/45001 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3443 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'knowledge'],
        num_rows: 45000
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'knowledge'],
        num_rows: 3442
    })
})

In [10]:
tags = ds['train'].features['ner_tags'].feature.names
entites = ['O', 'PER', 'ORG', 'LOC']
entity_index = {e:i for i,e in enumerate(entites)}
model_name = 'google-bert/bert-base-chinese'

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_length = tokenizer.model_max_length # 512

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

In [12]:
def data_input_proc(item):
    input_data_list = []
    # 对tokens进行分词,而不是将tokens合并成句子再分词,因为合并成句子再分词会导致input_ids的长度和ner_tags的长度不一致
    for one in item['tokens']:
        # 截取最大长度否则在模型训练时会提示:The size of tensor a (706) must match the size of tensor b (512) at non-singleton dimension 1
        if len(one) > max_length:
            one = one[:512]
        input_data_one = tokenizer(one, truncation=True, add_special_tokens=False, max_length=512)
        adjust_input_data = {key: [i for idarr in value for i in idarr] for key,value in input_data_one.items()}
        input_data_list.append(adjust_input_data)
    input_data = {}
    for input_data_one in input_data_list:
        for k,v in input_data_one.items():
            input_data.setdefault(k, []).append(v)
    # 对ner_tags的长度也进行截取和input_data长度一致
    ner_tags = [n[:512] for n in item['ner_tags']]
    # DataCollatorForTokenClassification中需要有labels这个标签
    input_data['labels'] = ner_tags
    return input_data

In [13]:
ds = ds.map(data_input_proc, batched=True)
# subds = ds['train'].select(range(10))

Map:   0%|          | 0/45000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3442 [00:00<?, ? examples/s]

In [14]:
ds.set_format(type="torch", columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [36]:
# 模型训练
train_args = TrainingArguments(
    output_dir='ner_train',
    num_train_epochs=3,
    save_safetensors=False,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    report_to='tensorboard',
    eval_strategy="epoch",
    learning_rate=1e-4
)

In [37]:
id2label = {i:tag for i, tag in enumerate(tags)}
label2id = {tag:i for i, tag in enumerate(tags)}
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=7, id2label=id2label, label2id=label2id)
# label_pad_token_id默认为-100
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
# result是EvalPrediction类型的一个简单数据类,包含两个主要属性:predictions,label_ids,一般都是numpy数组
def compute_metrics(result):
    predicts,labels = result
    # predicts.shape = (样本数量, padding后的sequence_length, num_labels)
    # labels.shape = (样本数量, padding后的sequence_length)
    # 获取评估对象
    seqeval = evaluate.load('seqeval')
    predicts = np.argmax(predicts, axis=2)
    # 准备评估数据
    predicts = [[tags[p] for p,l in zip(ps,ls) if l != -100]
                 for ps,ls in zip(predicts,labels)]
    labels = [[tags[l] for l in ls if l != -100]
                 for ls in labels]
    results = seqeval.compute(predictions=predicts, references=labels)
    return results

In [39]:
trainer = Trainer(
    model, 
    train_args,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [40]:
# 开始训练
trainer.train()



Epoch,Training Loss,Validation Loss,Loc,Org,Per,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.059,0.030972,"{'precision': 0.9194050501556554, 'recall': 0.9319775596072931, 'f1': 0.9256486157060768, 'number': 2852}","{'precision': 0.7833787465940054, 'recall': 0.8712121212121212, 'f1': 0.8249641319942611, 'number': 1320}","{'precision': 0.92488860598345, 'recall': 0.9667332002661344, 'f1': 0.9453480806766428, 'number': 1503}",0.887184,0.927048,0.906678,0.990814
2,0.0197,0.027452,"{'precision': 0.9533005617977528, 'recall': 0.9519635343618513, 'f1': 0.9526315789473685, 'number': 2852}","{'precision': 0.8420684835779175, 'recall': 0.9128787878787878, 'f1': 0.8760450745183569, 'number': 1320}","{'precision': 0.9534120734908137, 'recall': 0.9667332002661344, 'f1': 0.9600264288074001, 'number': 1503}",0.9259,0.946784,0.936226,0.992511
3,0.0055,0.028544,"{'precision': 0.9590540063536886, 'recall': 0.9526647966339411, 'f1': 0.95584872471416, 'number': 2852}","{'precision': 0.888807607900512, 'recall': 0.9204545454545454, 'f1': 0.9043542984741347, 'number': 1320}","{'precision': 0.9530733641771315, 'recall': 0.959414504324684, 'f1': 0.9562334217506632, 'number': 1503}",0.940662,0.94696,0.9438,0.993486


Trainer is attempting to log a value of "{'precision': 0.9194050501556554, 'recall': 0.9319775596072931, 'f1': 0.9256486157060768, 'number': 2852}" of type <class 'dict'> for key "eval/LOC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.7833787465940054, 'recall': 0.8712121212121212, 'f1': 0.8249641319942611, 'number': 1320}" of type <class 'dict'> for key "eval/ORG" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.92488860598345, 'recall': 0.9667332002661344, 'f1': 0.9453480806766428, 'number': 1503}" of type <class 'dict'> for key "eval/PER" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9533005617977528, 'recall': 0.9519635343618513, 

TrainOutput(global_step=2112, training_loss=0.02307896087454124, metrics={'train_runtime': 2184.6791, 'train_samples_per_second': 61.794, 'train_steps_per_second': 0.967, 'total_flos': 1.1824996653769728e+16, 'train_loss': 0.02307896087454124, 'epoch': 3.0})

In [None]:
# 进行命名实体识别
ner = pipeline('ner', model=model, tokenizer=tokenizer)
seq = '双方确定了今后发展中美关系的指导方针。'
ner_result = ner(seq)
print(ner_result)

Device set to use cuda:0


[{'entity': 'B-LOC', 'score': 0.9998104, 'index': 10, 'word': '中', 'start': 9, 'end': 10}, {'entity': 'B-LOC', 'score': 0.99977416, 'index': 11, 'word': '美', 'start': 10, 'end': 11}]
