In [1]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries imported below.
warnings.filterwarnings("ignore")

# NOTE(review): jieba, pickle, torch.nn and torch.optim are not referenced by
# any cell visible in this notebook (gc appears only in commented-out code).
import os
import gc
import json
import jieba
import torch
import pickle
import codecs
import torch.nn as nn
import torch.optim as optim
import pandas as pd

import sys
# Put a local checkout of ark-nlp 0.0.5 on the import path. The path is
# absolute and machine-specific, so it must be adjusted on another machine.
sys.path.append('/home/shencj/workspace/code/nlp/Frame/ark-nlp-0.0.5/')

from ark_nlp.model.ner.global_pointer_bert import GlobalPointerBert
from ark_nlp.model.ner.global_pointer_bert import GlobalPointerBertConfig
from ark_nlp.model.ner.global_pointer_bert import Dataset
from ark_nlp.model.ner.global_pointer_bert import Task
from ark_nlp.model.ner.global_pointer_bert import get_default_model_optimizer
from ark_nlp.model.ner.global_pointer_bert import Tokenizer
from ark_nlp.factory.utils.seed import set_seed
# Fixed seed for repeatable runs; presumably seeds the Python/NumPy/torch RNGs
# -- TODO confirm in ark_nlp.factory.utils.seed.
set_seed(42)

In [2]:
# Data locations.
#
# The CMeIE directory defaults to the path used on the original author's
# machine; set the CMEIE_DATA_DIR environment variable to point the notebook
# at a different copy of the dataset without editing this cell.
cmeie_data_dir = os.environ.get(
    'CMEIE_DATA_DIR',
    '/home/shencj/workspace/data/medical/CBLUE/CMeIE',
)

train_data_path = os.path.join(cmeie_data_dir, 'CMeIE_train.json')
dev_data_path = os.path.join(cmeie_data_dir, 'CMeIE_dev.json')

### 一、数据读入与处理

#### 1. 数据读入

In [3]:
def _entity_span(text, mention, entity_type):
    """Return ``(mention, entity_type, start_idx, end_idx)`` for ``mention`` in ``text``.

    The span is the FIRST occurrence of ``mention``; ``end_idx`` is inclusive.
    Raises ``ValueError`` (from ``str.index``) when ``mention`` is not a
    substring of ``text``.
    """
    start_idx = text.index(mention)  # looked up once and reused for the end index
    return (mention, entity_type, start_idx, start_idx + len(mention) - 1)


def data_preprocess(data_path):
    """Turn a JSON-lines file of text + SPO triples into NER records.

    Every non-blank line of ``data_path`` is parsed as a JSON object holding a
    ``text`` string and an ``spo_list``. For each triple, two entities are
    produced: the ``subject`` string (always labelled '疾病') and the
    ``object['@value']`` string (labelled with ``object_type['@value']``).
    Duplicate entities within one record are removed.

    Args:
        data_path: path to a UTF-8 encoded JSON-lines file.

    Returns:
        A list with one dict per record, containing:
          * ``text``     -- the original text,
          * ``entities`` -- de-duplicated ``(mention, type, start_idx, end_idx)``
            tuples, ordered by start index (ties broken by end index, type and
            mention so the order is reproducible between runs),
          * ``label``    -- the same entities as dicts with the keys
            ``entity``, ``type``, ``start_idx`` and ``end_idx``.

    Raises:
        ValueError: if a line is not valid JSON, or an entity string does not
            occur in its record's text.
        KeyError: if a record or triple lacks one of the fields read above.
    """
    data_list = []

    # The built-in text-mode open() only breaks lines on \n / \r / \r\n.
    # codecs.open(...).readlines() also breaks on characters such as U+2028 and
    # U+0085, which may legally appear unescaped inside a JSON string and would
    # cut a record in half.
    with open(data_path, mode='r', encoding='utf8') as f:
        for line_ in f:
            line_ = line_.strip()
            if not line_:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            sample_ = json.loads(line_)
            text_ = sample_['text']

            # A set removes entities that are repeated across triples.
            spans_ = set()
            for triple_ in sample_['spo_list']:
                # NOTE(review): every subject is labelled '疾病'; any subject
                # type carried by the triple itself is not consulted -- confirm
                # that this is intended.
                spans_.add(_entity_span(text_, triple_['subject'], '疾病'))
                spans_.add(_entity_span(
                    text_,
                    triple_['object']['@value'],
                    triple_['object_type']['@value'],
                ))

            # Sort on the whole tuple (start first): set iteration order depends
            # on string hashing, so sorting on start alone would leave entities
            # that share a start index in a run-dependent order.
            entities_ = sorted(spans_, key=lambda x: (x[2], x[3], x[1], x[0]))

            data_list.append({
                'text': text_,
                'entities': entities_,
                'label': [
                    {
                        'entity': entity_[0],
                        'type': entity_[1],
                        'start_idx': entity_[2],
                        'end_idx': entity_[3],
                    }
                    for entity_ in entities_
                ],
            })
    return data_list

# Parse both splits into lists of records first ...
train_data_list = data_preprocess(train_data_path)
dev_data_list = data_preprocess(dev_data_path)

# ... then wrap each list in a DataFrame (one row per record).
train_data_df = pd.DataFrame(data=train_data_list)
dev_data_df = pd.DataFrame(data=dev_data_list)

In [4]:
# Show the label dicts of the first training record as a sanity check.
train_data_df.loc[0, 'label']

[{'entity': '产后抑郁症', 'type': '疾病', 'start_idx': 0, 'end_idx': 4},
 {'entity': '轻度情绪失调', 'type': '疾病', 'start_idx': 14, 'end_idx': 19}]

In [5]:
# Wrap the train/dev DataFrames in ark_nlp Dataset objects. The training
# dataset's `cat2id` attribute is read later to size the model's label space
# and to build the predictor.
ner_train_dataset = Dataset(train_data_df)
ner_dev_dataset = Dataset(dev_data_df)

In [6]:
# del train_data_df, dev_data_df
# gc.collect()

#### 2. 词典创建和生成分词器

In [7]:
# A vocabulary can be built first and then loaded into the tokenizer,
# or the tokenizer can load the vocabulary automatically.
# bert_vocab = transformers.AutoTokenizer.from_pretrained('nghuyong/ernie-1.0')
# tokenizer = TransfomerTokenizer(bert_vocab, max_seq_len=30)

In [8]:
# Tokenizer built from the 'nghuyong/ernie-1.0' vocabulary with max_seq_len=128.
# Presumably inputs longer than 128 tokens are truncated -- TODO confirm in ark_nlp.
tokenizer = Tokenizer(vocab='nghuyong/ernie-1.0', max_seq_len=128)

#### 3. ID化

In [9]:
# Run ID conversion on both datasets with the shared tokenizer. The return
# values are discarded, so this presumably updates the datasets in place
# -- TODO confirm in ark_nlp.
ner_train_dataset.convert_to_ids(tokenizer)
ner_dev_dataset.convert_to_ids(tokenizer)

<br>

### 二、模型构建

#### 1. 模型参数设置

In [10]:
# Load the 'nghuyong/ernie-1.0' configuration and set num_labels to the number
# of entries in the training dataset's cat2id mapping.
config = GlobalPointerBertConfig.from_pretrained('nghuyong/ernie-1.0', 
                                                 num_labels=len(ner_train_dataset.cat2id))

#### 2. 模型创建

In [11]:
# Release unused memory cached by PyTorch's CUDA allocator before the model is
# created (memory still held by live tensors is not freed).
torch.cuda.empty_cache()

In [12]:
# Instantiate the GlobalPointer network from the 'nghuyong/ernie-1.0'
# checkpoint. As the log below reports, the checkpoint's `cls.predictions.*`
# weights are not used and some weights (e.g. `classifier.weight`) are newly
# initialised.
dl_module = GlobalPointerBert.from_pretrained('nghuyong/ernie-1.0', 
                                              config=config)

Some weights of the model checkpoint at nghuyong/ernie-1.0 were not used when initializing GlobalPointerBert: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing GlobalPointerBert from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GlobalPointerBert from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GlobalPointerBert were not initialized from the model checkpoint at nghuyong/ernie-1.0 and are newly initialized: ['classifier.weight', 'class

<br>

### 三、任务构建

#### 1. 任务参数和必要部件设定

In [13]:
# Training hyper-parameters: number of epochs and batch size.
# NOTE(review): `num_epoches` is not referenced by the fit() calls in this
# notebook, which pass epochs=7 and epochs=5 literally -- confirm whether it
# should be wired in or removed. `batch_size` is passed to both fit() calls.
num_epoches = 1
batch_size = 32

In [14]:
# Build the optimizer for dl_module with ark_nlp's default helper; the
# optimizer type and its settings are chosen inside that helper and are not
# visible here.
optimizer = get_default_model_optimizer(dl_module)

#### 2. 任务创建

In [15]:
# Bundle network and optimizer into an ark_nlp Task, which drives training.
# 'gpce' is presumably the name of the loss function and cuda_device=0
# presumably selects GPU 0 -- TODO confirm against Task's signature.
model = Task(dl_module, optimizer, 'gpce', cuda_device=0)

#### 3. 训练

In [16]:
# Stage 1: train for 7 epochs at lr=5e-5; the log prints an eval line on the
# dev set after every epoch.
# NOTE(review): the "precision" and "recall" values in the log are in the
# thousands, so they appear to be raw counts rather than ratios -- check
# ark_nlp's evaluation logging before quoting them.
model.fit(ner_train_dataset, 
          ner_dev_dataset,
          lr=5e-5,
          epochs=7, 
          batch_size=batch_size,
         )

 22%|██▏       | 100/449 [00:35<02:00,  2.89it/s]

[99/449],train loss is:2.422450


 45%|████▍     | 200/449 [01:09<01:25,  2.91it/s]

[199/449],train loss is:1.520836


 67%|██████▋   | 300/449 [01:44<00:51,  2.89it/s]

[299/449],train loss is:1.183863


 89%|████████▉ | 400/449 [02:19<00:17,  2.87it/s]

[399/449],train loss is:1.006858


100%|██████████| 449/449 [02:36<00:00,  2.87it/s]


epoch:[0],train loss is:0.947977 

eval loss is 0.442798, precision is:9350.0, recall is:27499.0, f1_score is:0.6800247281719335


 22%|██▏       | 100/449 [00:34<02:03,  2.82it/s]

[99/449],train loss is:0.410804


 45%|████▍     | 200/449 [01:10<01:28,  2.82it/s]

[199/449],train loss is:0.411750


 67%|██████▋   | 300/449 [01:45<00:53,  2.79it/s]

[299/449],train loss is:0.407424


 89%|████████▉ | 400/449 [02:21<00:17,  2.81it/s]

[399/449],train loss is:0.405122


100%|██████████| 449/449 [02:39<00:00,  2.82it/s]


epoch:[1],train loss is:0.403318 

eval loss is 0.390152, precision is:9162.0, recall is:25854.0, f1_score is:0.7087491297284753


 22%|██▏       | 100/449 [00:36<02:06,  2.75it/s]

[99/449],train loss is:0.336541


 45%|████▍     | 200/449 [01:12<01:31,  2.71it/s]

[199/449],train loss is:0.334079


 67%|██████▋   | 300/449 [01:49<00:54,  2.76it/s]

[299/449],train loss is:0.336347


 89%|████████▉ | 400/449 [02:26<00:17,  2.73it/s]

[399/449],train loss is:0.336749


100%|██████████| 449/449 [02:44<00:00,  2.72it/s]


epoch:[2],train loss is:0.336270 

eval loss is 0.376740, precision is:9935.0, recall is:27123.0, f1_score is:0.7325885779596653


 22%|██▏       | 100/449 [00:36<02:08,  2.72it/s]

[99/449],train loss is:0.275779


 45%|████▍     | 200/449 [01:13<01:30,  2.74it/s]

[199/449],train loss is:0.280637


 67%|██████▋   | 300/449 [01:50<00:54,  2.74it/s]

[299/449],train loss is:0.280101


 89%|████████▉ | 400/449 [02:27<00:18,  2.70it/s]

[399/449],train loss is:0.282460


100%|██████████| 449/449 [02:45<00:00,  2.71it/s]


epoch:[3],train loss is:0.284255 

eval loss is 0.380837, precision is:10007.0, recall is:27035.0, f1_score is:0.7402996116145737


 22%|██▏       | 100/449 [00:37<02:08,  2.72it/s]

[99/449],train loss is:0.232147


 45%|████▍     | 200/449 [01:14<01:32,  2.68it/s]

[199/449],train loss is:0.235906


 67%|██████▋   | 300/449 [01:52<00:54,  2.72it/s]

[299/449],train loss is:0.238495


 89%|████████▉ | 400/449 [02:29<00:18,  2.71it/s]

[399/449],train loss is:0.239948


100%|██████████| 449/449 [02:47<00:00,  2.69it/s]


epoch:[4],train loss is:0.240424 

eval loss is 0.397741, precision is:10225.0, recall is:27477.0, f1_score is:0.744258834661717


 22%|██▏       | 100/449 [00:37<02:10,  2.67it/s]

[99/449],train loss is:0.193756


 45%|████▍     | 200/449 [01:15<01:32,  2.68it/s]

[199/449],train loss is:0.198217


 67%|██████▋   | 300/449 [01:53<00:55,  2.67it/s]

[299/449],train loss is:0.199503


 89%|████████▉ | 400/449 [02:31<00:18,  2.69it/s]

[399/449],train loss is:0.202782


100%|██████████| 449/449 [02:49<00:00,  2.65it/s]


epoch:[5],train loss is:0.204128 

eval loss is 0.447514, precision is:10819.0, recall is:28838.0, f1_score is:0.7503294264512103


 22%|██▏       | 100/449 [00:37<02:10,  2.66it/s]

[99/449],train loss is:0.159568


 45%|████▍     | 200/449 [01:16<01:32,  2.70it/s]

[199/449],train loss is:0.165006


 67%|██████▋   | 300/449 [01:54<00:55,  2.68it/s]

[299/449],train loss is:0.165984


 89%|████████▉ | 400/449 [02:31<00:18,  2.68it/s]

[399/449],train loss is:0.167606


100%|██████████| 449/449 [02:49<00:00,  2.64it/s]


epoch:[6],train loss is:0.167984 

eval loss is 0.455114, precision is:9716.0, recall is:26126.0, f1_score is:0.7437801423868943


In [17]:
# Stage 2: call fit() again on the same Task with a lower lr=2e-5 for 5 more
# epochs. In the log below the train loss keeps falling from where stage 1
# ended, while the dev eval loss rises every epoch (0.51 -> 0.63) and the
# reported f1_score stays around 0.76.
model.fit(ner_train_dataset, 
          ner_dev_dataset,
          lr=2e-5,
          epochs=5, 
          batch_size=batch_size,
         )

 22%|██▏       | 100/449 [00:37<02:10,  2.68it/s]

[99/449],train loss is:0.121330


 45%|████▍     | 200/449 [01:15<01:34,  2.63it/s]

[199/449],train loss is:0.120803


 67%|██████▋   | 300/449 [01:53<00:55,  2.67it/s]

[299/449],train loss is:0.117548


 89%|████████▉ | 400/449 [02:31<00:18,  2.67it/s]

[399/449],train loss is:0.117804


100%|██████████| 449/449 [02:49<00:00,  2.65it/s]


epoch:[0],train loss is:0.117908 

eval loss is 0.509424, precision is:10562.0, recall is:27791.0, f1_score is:0.7601021913569141


 22%|██▏       | 100/449 [00:37<02:11,  2.65it/s]

[99/449],train loss is:0.099623


 45%|████▍     | 200/449 [01:15<01:35,  2.62it/s]

[199/449],train loss is:0.098516


 67%|██████▋   | 300/449 [01:53<00:56,  2.65it/s]

[299/449],train loss is:0.098094


 89%|████████▉ | 400/449 [02:31<00:18,  2.65it/s]

[399/449],train loss is:0.097878


100%|██████████| 449/449 [02:49<00:00,  2.65it/s]


epoch:[1],train loss is:0.097765 

eval loss is 0.554002, precision is:10532.0, recall is:27529.0, f1_score is:0.7651567437974499


 22%|██▏       | 100/449 [00:38<02:13,  2.62it/s]

[99/449],train loss is:0.081067


 45%|████▍     | 200/449 [01:15<01:33,  2.67it/s]

[199/449],train loss is:0.084846


 67%|██████▋   | 300/449 [01:53<00:56,  2.64it/s]

[299/449],train loss is:0.085359


 89%|████████▉ | 400/449 [02:31<00:18,  2.65it/s]

[399/449],train loss is:0.085514


100%|██████████| 449/449 [02:49<00:00,  2.65it/s]


epoch:[2],train loss is:0.085788 

eval loss is 0.568163, precision is:10507.0, recall is:27669.0, f1_score is:0.7594781163034443


 22%|██▏       | 100/449 [00:37<02:20,  2.49it/s]

[99/449],train loss is:0.074434


 45%|████▍     | 200/449 [01:15<01:34,  2.64it/s]

[199/449],train loss is:0.075413


 67%|██████▋   | 300/449 [01:53<00:56,  2.66it/s]

[299/449],train loss is:0.074585


 89%|████████▉ | 400/449 [02:31<00:18,  2.63it/s]

[399/449],train loss is:0.075585


100%|██████████| 449/449 [02:49<00:00,  2.64it/s]


epoch:[3],train loss is:0.076178 

eval loss is 0.614939, precision is:10496.0, recall is:27613.0, f1_score is:0.7602216347372615


 22%|██▏       | 100/449 [00:37<02:11,  2.65it/s]

[99/449],train loss is:0.065741


 45%|████▍     | 200/449 [01:15<01:33,  2.65it/s]

[199/449],train loss is:0.065736


 67%|██████▋   | 300/449 [01:53<00:59,  2.49it/s]

[299/449],train loss is:0.066396


 89%|████████▉ | 400/449 [02:31<00:18,  2.63it/s]

[399/449],train loss is:0.066652


100%|██████████| 449/449 [02:49<00:00,  2.65it/s]


epoch:[4],train loss is:0.067229 

eval loss is 0.632210, precision is:10352.0, recall is:27204.0, f1_score is:0.7610645493309808


<br>

### 四、生成提交数据

In [18]:
import json
from ark_nlp.model.ner.global_pointer_bert import Predictor

In [19]:
# Build a Predictor from the Task's underlying network (`model.module`), the
# tokenizer used for training and the training dataset's cat2id mapping.
ner_predictor_instance = Predictor(model.module, tokenizer, ner_train_dataset.cat2id)

In [20]:
# Location of the test split. The directory defaults to the original author's
# path and can be overridden with the CMEIE_DATA_DIR environment variable so
# the notebook runs on another machine without editing this cell.
test_data_path = os.path.join(
    os.environ.get('CMEIE_DATA_DIR', '/home/shencj/workspace/data/medical/CBLUE/CMeIE'),
    'CMeIE_test.json',
)

# The test file is JSON-lines: one JSON object per line.
test_df = pd.read_json(test_data_path, lines=True)

# Predict entities for each test text, one sample at a time, keeping the
# original text alongside its predictions.
submit = [
    {
        'text': _text,
        'entities': ner_predictor_instance.predict_one_sample(_text),
    }
    for _text in test_df['text'].to_list()
]

In [21]:
# Destination of the submission file.
# NOTE(review): the file name says "CMeEE" while the data files read by this
# notebook are named CMeIE -- confirm the intended name.
output_path = './output_datasets/CMeEE_test.json'

# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write all predictions as a single JSON array; ensure_ascii=False keeps
# Chinese characters readable instead of \uXXXX escapes.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(submit, f, ensure_ascii=False)