# 数据集介绍

来源https://storage.googleapis.com/cluebenchmark/tasks/cluener_public.zip

# 导入包

In [1]:
import torch,json
import sys,os
import pandas as pd
import random
import numpy as np
from nlp_basictasks.tasks import Ner
from nlp_basictasks.evaluation import nerEvaluator
from nlp_basictasks.readers.ner import InputExample

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


# 获取数据

In [2]:
def _create_examples(input_path,mode):
    examples = []
    with open(input_path, 'r') as f:
        idx = 0
        for line in f:
            json_d = {}
            line = json.loads(line.strip())
            text = line['text']
            label_entities = line.get('label', None)
            words = list(text)
            labels = ['O'] * len(words)
            if label_entities is not None:
                for key, value in label_entities.items():
                    for sub_name, sub_index in value.items():
                        for start_index, end_index in sub_index:
                            assert ''.join(words[start_index:end_index + 1]) == sub_name
                            if start_index == end_index:
                                labels[start_index] = 'S-' + key
                            else:
                                labels[start_index] = 'B-' + key
                                labels[start_index + 1:end_index + 1] = ['I-' + key] * (len(sub_name) - 1)
            json_d['id'] = f"{mode}_{idx}"
            json_d['context'] = " ".join(words)
            json_d['tag'] = " ".join(labels)
            json_d['raw_context'] = "".join(words)
            idx += 1
            examples.append(json_d)
    return examples

In [3]:
data=_create_examples('/data/nfs14/nfs/aisearch/asr/xhsun/datasets/cluener/train.json',mode='train')

In [4]:
len(data)

10748

In [5]:
data[0]

{'id': 'train_0',
 'context': '浙 商 银 行 企 业 信 贷 部 叶 老 桂 博 士 则 从 另 一 个 角 度 对 五 道 门 槛 进 行 了 解 读 。 叶 老 桂 认 为 ， 对 目 前 国 内 商 业 银 行 而 言 ，',
 'tag': 'B-company I-company I-company I-company O O O O O B-name I-name I-name O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O',
 'raw_context': '浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读。叶老桂认为，对目前国内商业银行而言，'}

# 构造训练集和验证集

In [6]:
train_examples=[]
for example in data:
    seq_in=example['context'].strip().split(' ')
    seq_out=example['tag'].strip().split(' ')
    assert len(seq_in)==len(seq_out)
    train_examples.append(InputExample(seq_in=seq_in,seq_out=seq_out))

dev_examples=train_examples[-2000:]
dev_seq_in=[]
dev_seq_out=[]
for example in dev_examples:
    dev_seq_in.append(example.seq_in)
    dev_seq_out.append(example.seq_out)
train_examples=train_examples[:-2000]

In [7]:
# output_path="/data/nfs14/nfs/aisearch/asr/xhsun/CompetitionModel/tmp_ner"
# if not os.path.exists(os.path.join(output_path,'label2id.json')):
label_set=set()
for examples in data:
    label_list=examples['tag'].strip().split(' ')
    for label in label_list:
        label_set.add(label)
label2id={'[PAD]':0}
for label in label_set:
    label2id[label]=len(label2id)

In [9]:
print(label2id)

{'[PAD]': 0, 'B-position': 1, 'I-name': 2, 'B-organization': 3, 'I-game': 4, 'I-book': 5, 'S-address': 6, 'B-government': 7, 'S-position': 8, 'I-scene': 9, 'S-company': 10, 'B-name': 11, 'I-company': 12, 'B-book': 13, 'B-address': 14, 'I-government': 15, 'S-name': 16, 'B-movie': 17, 'O': 18, 'B-game': 19, 'I-position': 20, 'I-address': 21, 'B-company': 22, 'B-scene': 23, 'I-movie': 24, 'I-organization': 25}


# 定义路径加载模型

In [10]:
model_path='/data/nfs14/nfs/aisearch/asr/xhsun/CommonModel/chinese-roberta-wwm/'
ner_model=Ner(model_path,label2id=label2id,use_crf=True,use_bilstm=True,device='cpu',batch_first=True)

2021-08-25 09:32:18 - INFO - __init__ - 53 : The label2id is
 {"[PAD]": 0, "B-position": 1, "I-name": 2, "B-organization": 3, "I-game": 4, "I-book": 5, "S-address": 6, "B-government": 7, "S-position": 8, "I-scene": 9, "S-company": 10, "B-name": 11, "I-company": 12, "B-book": 13, "B-address": 14, "I-government": 15, "S-name": 16, "B-movie": 17, "O": 18, "B-game": 19, "I-position": 20, "I-address": 21, "B-company": 22, "B-scene": 23, "I-movie": 24, "I-organization": 25}
2021-08-25 09:32:18 - INFO - __init__ - 270 : Loading model from /data/nfs14/nfs/aisearch/asr/xhsun/CommonModel/chinese-roberta-wwm/, which is from huggingface model
2021-08-25 09:32:18 - INFO - get_config_dict - 177 : loading configuration file /data/nfs14/nfs/aisearch/asr/xhsun/CommonModel/chinese-roberta-wwm/config.json
2021-08-25 09:32:18 - INFO - from_pretrained - 404 : loading bert model file /data/nfs14/nfs/aisearch/asr/xhsun/CommonModel/chinese-roberta-wwm/
2021-08-25 09:32:18 - INFO - from_pretrained - 423 : Bert

In [11]:
from torch.utils.data import DataLoader
batch_size=32
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
evaluator=nerEvaluator(label2id=label2id,seq_in=dev_seq_in,seq_out=dev_seq_out)

2021-08-25 09:32:38 - INFO - __init__ - 39 : Total evaluate nums : 2000
2021-08-25 09:32:38 - INFO - __init__ - 40 : input is string : False, input is list : True
2021-08-25 09:32:38 - INFO - __init__ - 41 : seq in and out like : 
['并', '且', '更', '有', '动', '力', '练', '习', 'W', 'A', 'R', '3', '，', '为', '一', '战', '成', '名', '而', '准', '备', '着', '，']	['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-game', 'I-game', 'I-game', 'I-game', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
2021-08-25 09:32:38 - INFO - __init__ - 42 : In this evaluator, slot contains ([PAD] B-position I-name B-organization I-game I-book S-address B-government S-position I-scene S-company B-name I-company B-book B-address I-government S-name B-movie O B-game I-position I-address B-company B-scene I-movie I-organization)


# 训练模型

In [12]:
ner_model.fit(train_dataloader=train_dataloader,evaluator=evaluator,epochs=5,
              output_path='/data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/')

2021-08-25 09:34:26 - INFO - fit - 155 : label2id has been saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/label2id.json
2021-08-25 09:34:26 - INFO - fit - 164 : 一个epoch 下，每隔54个step会输出一次loss，每隔137个step会评估一次模型


HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

2021-08-25 09:34:29 - INFO - __call__ - 59 : nerEvaluator: Evaluating the model on  dataset in epoch 0 after 1 steps:
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        PAD]     0.0000    0.0000    0.0000         0
     address     0.0001    0.0019    0.0002       528
        book     0.0000    0.0000    0.0000       193
     company     0.0000    0.0000    0.0000       521
        game     0.0000    0.0000    0.0000       409
  government     0.0000    0.0000    0.0000       339
       movie     0.0000    0.0000    0.0000       212
        name     0.0021    0.0271    0.0039       700
organization     0.0000    0.0000    0.0000       648
    position     0.0000    0.0000    0.0000       584
       scene     0.0000    0.0000    0.0000       270

   micro avg     0.0004    0.0045    0.0007      4404
   macro avg     0.0002    0.0026    0.0004      4404
weighted avg     0.0003    0.0045    0.0006      4404



2021-08-25 09:34:53 - INFO - save_pretrained - 509 : Model weights saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/pytorch_model.bin
2021-08-25 09:34:53 - INFO - save_pretrained - 150 : Configuration saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/config.json
2021-08-25 09:34:53 - INFO - save_vocab - 51 : Vocab saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/vocab.txt
2021-08-25 09:34:53 - INFO - fit - 239 : In epoch 0, training_step 0, the eval score is 0.0006749801724574341, previous eval score is -9999999, model has been saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/
2021-08-25 09:36:21 - INFO - fit - 217 : Epoch : 0, train_step : 54/1370, loss_value : 2.9803987785621926 
2021-08-25 09:37:43 - INFO - fit - 217 : Epoch : 0, train_step : 108/1370, loss_value : 1.4688917133543227 
2021-08-25 09:38:27 - INFO - __call__ - 59 : nerEvaluator: Evaluating the model on  dataset in epoch 0 after 138 steps:
  _warn_prf(average, modifier, msg_start, len(r

              precision    recall  f1-score   support

        PAD]     0.0000    0.0000    0.0000         0
     address     0.0609    0.0795    0.0690       528
        book     0.0000    0.0000    0.0000       193
     company     0.1073    0.0653    0.0811       521
        game     0.1019    0.1663    0.1264       409
  government     0.0625    0.0413    0.0497       339
       movie     0.0000    0.0000    0.0000       212
        name     0.0949    0.0500    0.0655       700
organization     0.0291    0.0139    0.0188       648
    position     0.0000    0.0000    0.0000       584
       scene     0.0000    0.0000    0.0000       270

   micro avg     0.0784    0.0459    0.0579      4404
   macro avg     0.0415    0.0378    0.0373      4404
weighted avg     0.0536    0.0459    0.0466      4404



2021-08-25 09:38:50 - INFO - save_pretrained - 509 : Model weights saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/pytorch_model.bin
2021-08-25 09:38:50 - INFO - save_pretrained - 150 : Configuration saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/config.json
2021-08-25 09:38:50 - INFO - save_vocab - 51 : Vocab saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/vocab.txt
2021-08-25 09:38:50 - INFO - fit - 239 : In epoch 0, training_step 137, the eval score is 0.05787136513393496, previous eval score is 0.0006749801724574341, model has been saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/
2021-08-25 09:39:28 - INFO - fit - 217 : Epoch : 0, train_step : 162/1370, loss_value : 0.9069536041330408 
2021-08-25 09:40:54 - INFO - fit - 217 : Epoch : 0, train_step : 216/1370, loss_value : 0.599122573932012 
2021-08-25 09:42:15 - INFO - fit - 217 : Epoch : 0, train_step : 270/1370, loss_value : 0.43696941435337067 





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

2021-08-25 09:42:24 - INFO - __call__ - 59 : nerEvaluator: Evaluating the model on  dataset in epoch 1 after 1 steps:


              precision    recall  f1-score   support

     address     0.3419    0.4261    0.3794       528
        book     0.5514    0.6114    0.5799       193
     company     0.7122    0.5509    0.6212       521
        game     0.6258    0.7482    0.6815       409
  government     0.4694    0.5664    0.5134       339
       movie     0.5144    0.5047    0.5095       212
        name     0.7239    0.8614    0.7867       700
organization     0.7296    0.7037    0.7164       648
    position     0.5124    0.3904    0.4431       584
       scene     0.2251    0.1593    0.1866       270

   micro avg     0.5732    0.5824    0.5778      4404
   macro avg     0.5406    0.5522    0.5418      4404
weighted avg     0.5726    0.5824    0.5724      4404



2021-08-25 09:42:47 - INFO - save_pretrained - 509 : Model weights saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/pytorch_model.bin
2021-08-25 09:42:47 - INFO - save_pretrained - 150 : Configuration saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/config.json
2021-08-25 09:42:47 - INFO - save_vocab - 51 : Vocab saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/vocab.txt
2021-08-25 09:42:48 - INFO - fit - 239 : In epoch 1, training_step 0, the eval score is 0.5777677666403874, previous eval score is 0.05787136513393496, model has been saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/
2021-08-25 09:44:14 - INFO - fit - 217 : Epoch : 1, train_step : 108/1370, loss_value : 0.3421557576016144 
2021-08-25 09:45:38 - INFO - fit - 217 : Epoch : 1, train_step : 216/1370, loss_value : 0.29418859272091474 
2021-08-25 09:46:26 - INFO - __call__ - 59 : nerEvaluator: Evaluating the model on  dataset in epoch 1 after 138 steps:


              precision    recall  f1-score   support

     address     0.4474    0.5076    0.4756       528
        book     0.6596    0.6425    0.6509       193
     company     0.6845    0.7121    0.6980       521
        game     0.6273    0.8191    0.7105       409
  government     0.7224    0.7906    0.7549       339
       movie     0.5885    0.5330    0.5594       212
        name     0.8478    0.8671    0.8573       700
organization     0.7605    0.7840    0.7720       648
    position     0.7251    0.6866    0.7054       584
       scene     0.4274    0.5556    0.4831       270

   micro avg     0.6672    0.7141    0.6898      4404
   macro avg     0.6490    0.6898    0.6667      4404
weighted avg     0.6747    0.7141    0.6922      4404



2021-08-25 09:46:51 - INFO - save_pretrained - 509 : Model weights saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/pytorch_model.bin
2021-08-25 09:46:51 - INFO - save_pretrained - 150 : Configuration saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/config.json
2021-08-25 09:46:51 - INFO - save_vocab - 51 : Vocab saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/vocab.txt
2021-08-25 09:46:51 - INFO - fit - 239 : In epoch 1, training_step 274, the eval score is 0.6898442640930028, previous eval score is 0.5777677666403874, model has been saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/
2021-08-25 09:47:30 - INFO - fit - 217 : Epoch : 1, train_step : 324/1370, loss_value : 0.26097009772503815 
2021-08-25 09:48:53 - INFO - fit - 217 : Epoch : 1, train_step : 432/1370, loss_value : 0.22804903873690852 
2021-08-25 09:50:17 - INFO - fit - 217 : Epoch : 1, train_step : 540/1370, loss_value : 0.24189564640875216 





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

2021-08-25 09:50:24 - INFO - __call__ - 59 : nerEvaluator: Evaluating the model on  dataset in epoch 2 after 1 steps:


              precision    recall  f1-score   support

     address     0.5145    0.5038    0.5091       528
        book     0.7824    0.6891    0.7328       193
     company     0.6979    0.7716    0.7329       521
        game     0.7158    0.8191    0.7640       409
  government     0.6977    0.8171    0.7527       339
       movie     0.7629    0.6981    0.7291       212
        name     0.8148    0.8800    0.8462       700
organization     0.7807    0.7747    0.7777       648
    position     0.7478    0.7209    0.7341       584
       scene     0.5516    0.5741    0.5626       270

   micro avg     0.7130    0.7391    0.7258      4404
   macro avg     0.7066    0.7248    0.7141      4404
weighted avg     0.7128    0.7391    0.7246      4404



2021-08-25 09:50:47 - INFO - save_pretrained - 509 : Model weights saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/pytorch_model.bin
2021-08-25 09:50:47 - INFO - save_pretrained - 150 : Configuration saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/config.json
2021-08-25 09:50:47 - INFO - save_vocab - 51 : Vocab saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/vocab.txt
2021-08-25 09:50:47 - INFO - fit - 239 : In epoch 2, training_step 0, the eval score is 0.7258334262459584, previous eval score is 0.6898442640930028, model has been saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/
2021-08-25 09:52:12 - INFO - fit - 217 : Epoch : 2, train_step : 162/1370, loss_value : 0.18043310374573426 
2021-08-25 09:53:38 - INFO - fit - 217 : Epoch : 2, train_step : 324/1370, loss_value : 0.17925456262848996 
2021-08-25 09:54:26 - INFO - __call__ - 59 : nerEvaluator: Evaluating the model on  dataset in epoch 2 after 138 steps:


              precision    recall  f1-score   support

     address     0.5174    0.5644    0.5399       528
        book     0.6897    0.7254    0.7071       193
     company     0.6959    0.7774    0.7344       521
        game     0.6963    0.8240    0.7548       409
  government     0.6874    0.8496    0.7599       339
       movie     0.6927    0.7123    0.7023       212
        name     0.8154    0.8771    0.8451       700
organization     0.7611    0.7963    0.7783       648
    position     0.7076    0.7997    0.7508       584
       scene     0.5259    0.6778    0.5922       270

   micro avg     0.6907    0.7718    0.7290      4404
   macro avg     0.6789    0.7604    0.7165      4404
weighted avg     0.6931    0.7718    0.7297      4404



2021-08-25 09:54:50 - INFO - save_pretrained - 509 : Model weights saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/pytorch_model.bin
2021-08-25 09:54:50 - INFO - save_pretrained - 150 : Configuration saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/config.json
2021-08-25 09:54:50 - INFO - save_vocab - 51 : Vocab saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/vocab.txt
2021-08-25 09:54:51 - INFO - fit - 239 : In epoch 2, training_step 411, the eval score is 0.7290080428954424, previous eval score is 0.7258334262459584, model has been saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/
2021-08-25 09:55:30 - INFO - fit - 217 : Epoch : 2, train_step : 486/1370, loss_value : 0.16929205534634767 
2021-08-25 09:56:57 - INFO - fit - 217 : Epoch : 2, train_step : 648/1370, loss_value : 0.17633160962550729 
2021-08-25 09:58:24 - INFO - fit - 217 : Epoch : 2, train_step : 810/1370, loss_value : 0.16995186607042947 





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

2021-08-25 09:58:32 - INFO - __call__ - 59 : nerEvaluator: Evaluating the model on  dataset in epoch 3 after 1 steps:


              precision    recall  f1-score   support

     address     0.5153    0.5417    0.5282       528
        book     0.7421    0.8497    0.7923       193
     company     0.7271    0.7774    0.7514       521
        game     0.7799    0.8142    0.7967       409
  government     0.7083    0.8525    0.7738       339
       movie     0.7500    0.7075    0.7282       212
        name     0.8556    0.8886    0.8718       700
organization     0.7761    0.8025    0.7891       648
    position     0.6982    0.7962    0.7440       584
       scene     0.5514    0.6556    0.5990       270

   micro avg     0.7178    0.7745    0.7451      4404
   macro avg     0.7104    0.7686    0.7374      4404
weighted avg     0.7200    0.7745    0.7456      4404



2021-08-25 09:58:56 - INFO - save_pretrained - 509 : Model weights saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/pytorch_model.bin
2021-08-25 09:58:56 - INFO - save_pretrained - 150 : Configuration saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/config.json
2021-08-25 09:58:56 - INFO - save_vocab - 51 : Vocab saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/vocab.txt
2021-08-25 09:58:56 - INFO - fit - 239 : In epoch 3, training_step 0, the eval score is 0.7450851900393184, previous eval score is 0.7290080428954424, model has been saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/
2021-08-25 10:00:24 - INFO - fit - 217 : Epoch : 3, train_step : 216/1370, loss_value : 0.13424867681331104 
2021-08-25 10:01:52 - INFO - fit - 217 : Epoch : 3, train_step : 432/1370, loss_value : 0.1345812348579919 
2021-08-25 10:02:38 - INFO - __call__ - 59 : nerEvaluator: Evaluating the model on  dataset in epoch 3 after 138 steps:


              precision    recall  f1-score   support

     address     0.5127    0.6098    0.5571       528
        book     0.7740    0.8342    0.8030       193
     company     0.7265    0.7850    0.7546       521
        game     0.7638    0.8460    0.8028       409
  government     0.7111    0.8348    0.7680       339
       movie     0.7500    0.7075    0.7282       212
        name     0.8658    0.8943    0.8798       700
organization     0.7881    0.7978    0.7929       648
    position     0.7198    0.7654    0.7419       584
       scene     0.6385    0.6148    0.6264       270

   micro avg     0.7276    0.7782    0.7520      4404
   macro avg     0.7250    0.7690    0.7455      4404
weighted avg     0.7313    0.7782    0.7533      4404



2021-08-25 10:03:03 - INFO - save_pretrained - 509 : Model weights saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/pytorch_model.bin
2021-08-25 10:03:03 - INFO - save_pretrained - 150 : Configuration saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/config.json
2021-08-25 10:03:03 - INFO - save_vocab - 51 : Vocab saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/vocab.txt
2021-08-25 10:03:03 - INFO - fit - 239 : In epoch 3, training_step 548, the eval score is 0.7520298441957427, previous eval score is 0.7450851900393184, model has been saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/
2021-08-25 10:03:41 - INFO - fit - 217 : Epoch : 3, train_step : 648/1370, loss_value : 0.14171323028427582 
2021-08-25 10:05:05 - INFO - fit - 217 : Epoch : 3, train_step : 864/1370, loss_value : 0.13317134300315822 
2021-08-25 10:06:30 - INFO - fit - 217 : Epoch : 3, train_step : 1080/1370, loss_value : 0.13435397876633537 





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

2021-08-25 10:06:38 - INFO - __call__ - 59 : nerEvaluator: Evaluating the model on  dataset in epoch 4 after 1 steps:
2021-08-25 10:07:00 - INFO - fit - 243 : No improvement over previous best eval score (0.749591 vs 0.752030), patience = 9


              precision    recall  f1-score   support

     address     0.5222    0.5795    0.5494       528
        book     0.7910    0.8238    0.8071       193
     company     0.7163    0.7754    0.7447       521
        game     0.7549    0.8509    0.8000       409
  government     0.7250    0.8555    0.7848       339
       movie     0.7730    0.6745    0.7204       212
        name     0.8514    0.9000    0.8750       700
organization     0.7737    0.8071    0.7900       648
    position     0.6804    0.7911    0.7316       584
       scene     0.6320    0.6296    0.6308       270

   micro avg     0.7215    0.7800    0.7496      4404
   macro avg     0.7220    0.7687    0.7434      4404
weighted avg     0.7233    0.7800    0.7497      4404



2021-08-25 10:08:24 - INFO - fit - 217 : Epoch : 4, train_step : 270/1370, loss_value : 0.11987927142116758 
2021-08-25 10:09:53 - INFO - fit - 217 : Epoch : 4, train_step : 540/1370, loss_value : 0.11430554354080448 
2021-08-25 10:10:40 - INFO - __call__ - 59 : nerEvaluator: Evaluating the model on  dataset in epoch 4 after 138 steps:


              precision    recall  f1-score   support

     address     0.5371    0.5890    0.5619       528
        book     0.7570    0.8394    0.7961       193
     company     0.7566    0.7697    0.7631       521
        game     0.7903    0.8386    0.8138       409
  government     0.7366    0.8496    0.7890       339
       movie     0.7600    0.7170    0.7379       212
        name     0.8542    0.8871    0.8704       700
organization     0.7768    0.8164    0.7961       648
    position     0.7023    0.7877    0.7425       584
       scene     0.6151    0.6630    0.6381       270

   micro avg     0.7329    0.7825    0.7569      4404
   macro avg     0.7286    0.7757    0.7509      4404
weighted avg     0.7347    0.7825    0.7574      4404



2021-08-25 10:11:05 - INFO - save_pretrained - 509 : Model weights saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/pytorch_model.bin
2021-08-25 10:11:05 - INFO - save_pretrained - 150 : Configuration saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/config.json
2021-08-25 10:11:05 - INFO - save_vocab - 51 : Vocab saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/BERT/vocab.txt
2021-08-25 10:11:05 - INFO - fit - 239 : In epoch 4, training_step 685, the eval score is 0.7568636064133538, previous eval score is 0.7520298441957427, model has been saved in /data/nfs14/nfs/aisearch/asr/xhsun/tmp_model/
2021-08-25 10:11:43 - INFO - fit - 217 : Epoch : 4, train_step : 810/1370, loss_value : 0.11183709927179196 
2021-08-25 10:13:10 - INFO - fit - 217 : Epoch : 4, train_step : 1080/1370, loss_value : 0.10980823525676021 
2021-08-25 10:14:38 - INFO - fit - 217 : Epoch : 4, train_step : 1350/1370, loss_value : 0.11510282023637383 






# 模型预测

In [25]:
print(data[1401]['raw_context'])

美军有3万人阵亡。与此同时，SAS队长普莱斯带领小队找出幕后黑手扎卡耶夫，


In [26]:
print(data[1401]['tag'].split(' '))

['B-government', 'I-government', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-position', 'I-position', 'B-name', 'I-name', 'I-name', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-name', 'I-name', 'I-name', 'I-name', 'O']


In [27]:
print(ner_model.predict("美军有3万人阵亡。与此同时，SAS队长普莱斯带领小队找出幕后黑手扎卡耶夫，"))

[['B-government', 'I-government', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-position', 'B-position', 'I-position', 'B-name', 'I-name', 'I-name', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-name', 'I-name', 'I-name', 'I-name', 'O']]
