In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import json
import jieba
import torch
import pickle
import codecs
import torch.nn as nn
import torch.optim as optim
import pandas as pd

import sys
sys.path.append('/home/shencj/workspace/code/nlp/Frame/ark-nlp-0.0.5/')

from ark_nlp.model.re.afea_bert import AFEABert
from ark_nlp.model.re.afea_bert import AFEABertConfig
from ark_nlp.model.re.afea_bert import Dataset
from ark_nlp.model.re.afea_bert import Task
from ark_nlp.model.re.afea_bert import get_default_model_optimizer
from ark_nlp.model.re.afea_bert import Tokenizer
from ark_nlp.factory.utils.seed import set_seed
# Fix the random seed through ark_nlp's helper before any data or model work.
# NOTE(review): presumably seeds Python/NumPy/PyTorch RNGs - confirm in ark_nlp.factory.utils.seed.
set_seed(42)

### 一、数据读入与处理

#### 1. 数据读入

In [2]:
# Dataset locations.
# Set the CMEIE_DATA_DIR environment variable to point at your own copy of the
# CMeIE corpus; when it is unset, the original author's directory is used so the
# resulting paths are unchanged.
CMEIE_DATA_DIR = os.environ.get(
    'CMEIE_DATA_DIR',
    '/home/shencj/workspace/data/medical/CBLUE/CMeIE',
)

train_data_path = os.path.join(CMEIE_DATA_DIR, 'CMeIE_train.json')
dev_data_path = os.path.join(CMEIE_DATA_DIR, 'CMeIE_dev.json')

In [3]:
def _find_span(text, mention):
    """Return the inclusive ``(start, end)`` character offsets of ``mention`` in ``text``.

    Only the first occurrence is located, so a mention that appears several
    times in ``text`` is always mapped to its first position.

    Raises:
        ValueError: if ``mention`` does not occur in ``text``.
    """
    start = text.find(mention)
    if start < 0:
        raise ValueError(f'entity {mention!r} not found in text {text!r}')
    return start, start + len(mention) - 1


def data_preprocess(data_path):
    """Read a JSON-lines file of annotated sentences into a list of records.

    Each non-blank line must be a JSON object with a ``text`` string and an
    ``spo_list`` of triples carrying ``subject``, ``predicate``,
    ``object['@value']`` and ``object_type['@value']``.

    Args:
        data_path: path to the UTF-8 encoded JSON-lines file.

    Returns:
        A list of dicts, one per input line, with keys:
          * ``text``: the sentence;
          * ``entities``: de-duplicated ``(mention, type, start, end)`` tuples
            sorted by ``start``; entities sharing a start offset keep their
            order of first appearance;
          * ``triples``: one 9-item list per input triple, laid out as
            ``[subject, '疾病', s_start, s_end, predicate,
            object, object_type, o_start, o_end]``.
        Offsets are inclusive character indices into ``text``. The subject
        type is always recorded as ``'疾病'``.

    Raises:
        ValueError: if a subject or object mention is not found in its text.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    data_list = []

    # Built-in open() splits only on real newlines. codecs.open() splits on
    # every str.splitlines() boundary (U+2028, U+0085, ...), which would cut a
    # JSON record in two whenever such a character occurs inside a string.
    with open(data_path, mode='r', encoding='utf8') as f:
        for line_ in f:
            line_ = line_.strip()
            if not line_:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue

            sample_ = json.loads(line_)
            text_ = sample_['text']
            entities_ = []
            triples_ = []

            for triple_ in sample_['spo_list']:
                subject_ = triple_['subject']
                object_ = triple_['object']['@value']
                object_type_ = triple_['object_type']['@value']

                # Locate each mention once and reuse the span everywhere.
                subject_start_, subject_end_ = _find_span(text_, subject_)
                object_start_, object_end_ = _find_span(text_, object_)

                subject_entity_ = (subject_, '疾病', subject_start_, subject_end_)
                object_entity_ = (object_, object_type_, object_start_, object_end_)

                entities_.append(subject_entity_)
                entities_.append(object_entity_)
                triples_.append(
                    [*subject_entity_, triple_['predicate'], *object_entity_]
                )

            # dict.fromkeys de-duplicates while preserving insertion order, so
            # the stable sort below yields the same result on every run
            # (a set would leave ties in hash-dependent order).
            entities_ = sorted(dict.fromkeys(entities_), key=lambda x: x[2])

            data_list.append({
                'text': text_,
                'entities': entities_,
                'triples': triples_,
            })
    return data_list

# Parse both splits with the same preprocessing, then wrap each record list in
# a DataFrame (columns: text / entities / triples).
train_data_list, dev_data_list = (
    data_preprocess(split_path)
    for split_path in (train_data_path, dev_data_path)
)

train_df, dev_df = pd.DataFrame(train_data_list), pd.DataFrame(dev_data_list)

In [4]:
# Relation label set: every predicate (index 4 of a triple) seen in the training
# data, plus "None" for entity pairs that hold no relation.
# The predicates are sorted because iterating a set of strings yields a
# different order on each interpreter start (string hash randomization), which
# would make the label order - and presumably the label ids derived from it -
# change from run to run.
categories = sorted(
    {triple[4] for triples in train_df['triples'] for triple in triples}
) + ['None']

re_train_dataset = Dataset(train_df, categories=categories)
re_dev_dataset = Dataset(dev_df, categories=categories,
                         is_train=False)

#### 2. 词典创建和生成分词器

In [5]:
# Load the Hugging Face tokenizer published with the 'nghuyong/ernie-1.0'
# checkpoint; it is later handed to ark_nlp's Tokenizer as the vocabulary.
from transformers import AutoTokenizer
bert_vocab = AutoTokenizer.from_pretrained('nghuyong/ernie-1.0')

In [6]:
# Register an opening "[type]" and closing "[/type]" marker token for every
# entity type found in the training data.
# The types are sorted so the markers are added in the same order on every run;
# iterating a raw set of strings is order-unstable across interpreter starts.
entity_categories = sorted(
    {entity[1] for entities in train_df['entities'] for entity in entities}
)
special_tokens = []
for category in entity_categories:
    special_tokens.extend((f'[{category}]', f'[/{category}]'))
# Left as the last expression so the cell still displays the number of tokens added.
bert_vocab.add_special_tokens({'additional_special_tokens': special_tokens})

22

In [7]:
# Wrap the vocabulary in the AFEA-BERT tokenizer with a sequence-length setting of 100.
# NOTE(review): presumably longer inputs are truncated to max_seq_len - confirm in ark_nlp.
tokenizer = Tokenizer(bert_vocab, max_seq_len=100)

#### 3. ID化

In [8]:
# Convert both datasets to id form with the tokenizer. The return values are
# discarded, so the conversion presumably happens in place - confirm in ark_nlp.
re_train_dataset.convert_to_ids(tokenizer)
re_dev_dataset.convert_to_ids(tokenizer)

<br>

### 二、模型构建

#### 1. 模型参数设置

In [9]:
# Model config loaded from the ERNIE 1.0 checkpoint; the number of output
# labels is the size of the training dataset's label mapping (cat2id).
bert_config = AFEABertConfig.from_pretrained('nghuyong/ernie-1.0',
                                               num_labels=len(re_train_dataset.cat2id))

#### 2. 模型创建

In [10]:
# Instantiate AFEABert from the ERNIE 1.0 weights using the config above.
# The load message printed below lists the pretraining-head weights that are
# not used and the linear/classifier weights that are newly initialized.
dl_module = AFEABert.from_pretrained('nghuyong/ernie-1.0',
                                       config=bert_config)

Some weights of the model checkpoint at nghuyong/ernie-1.0 were not used when initializing AFEABert: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing AFEABert from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AFEABert from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AFEABert were not initialized from the model checkpoint at nghuyong/ernie-1.0 and are newly initialized: ['linear.weight', 'linear.bias', 'classifier.weight', 'classifie

<br>

### 三、任务构建

#### 1. 任务参数和必要部件设定

In [11]:
# Optimizer produced by ark_nlp's default factory for this module; no
# hyperparameters are overridden here, so the library defaults apply.
optimizer = get_default_model_optimizer(dl_module)

#### 2. 任务创建

In [12]:
# Bundle the module and optimizer into a training task.
# NOTE(review): 'ce' presumably selects cross-entropy loss and cuda_device=0 the
# first GPU - confirm against ark_nlp's Task; a GPU-less machine may need a different setting.
model = Task(dl_module, optimizer, 'ce', cuda_device=0)

#### 3. 训练

In [13]:
# Train for 2 epochs with batch size 32. The dev dataset is passed as the
# second argument; the per-epoch classification_report printed below is
# presumably computed on it - confirm in ark_nlp's Task.fit.
model.fit(
    re_train_dataset,
    re_dev_dataset,
    epochs=2,
    batch_size=32
)

 22%|██▏       | 100/449 [00:26<01:33,  3.73it/s]

[99/449],train loss is:1.309226,train evaluation is:6.698444


 45%|████▍     | 200/449 [00:53<01:07,  3.71it/s]

[199/449],train loss is:1.051003,train evaluation is:6.857333


 67%|██████▋   | 300/449 [01:20<00:40,  3.68it/s]

[299/449],train loss is:0.908890,train evaluation is:7.067333


 89%|████████▉ | 400/449 [01:47<00:13,  3.68it/s]

[399/449],train loss is:0.804335,train evaluation is:7.257056


100%|██████████| 449/449 [02:01<00:00,  3.71it/s]


epoch:[0],train loss is:0.768883,train evaluation is:7.240238 

classification_report: 
               precision    recall  f1-score   support

          病史       0.00      0.00      0.00        34
        多发群体       0.83      0.24      0.37       143
         死亡率       0.00      0.00      0.00        13
      风险评估因素       0.00      0.00      0.00       109
          病因       0.65      0.46      0.54       622
        鉴别诊断       0.73      0.30      0.42       258
        发病年龄       1.00      0.08      0.14        52
          预防       0.00      0.00      0.00        67
          阶段       0.00      0.00      0.00        30
         发病率       0.61      0.40      0.48        63
        预后状况       0.00      0.00      0.00        40
        手术治疗       0.75      0.75      0.75       166
        药物治疗       0.81      0.91      0.86       893
        发病部位       0.79      0.44      0.57       279
       实验室检查       0.63      0.78      0.70       406
          化疗       0.00      0.00      0.00   

 22%|██▏       | 100/449 [00:27<01:35,  3.66it/s]

[99/449],train loss is:0.425494,train evaluation is:7.401111


 45%|████▍     | 200/449 [00:54<01:08,  3.65it/s]

[199/449],train loss is:0.401296,train evaluation is:7.730000


 67%|██████▋   | 300/449 [01:22<00:40,  3.64it/s]

[299/449],train loss is:0.384893,train evaluation is:7.790222


 89%|████████▉ | 400/449 [01:49<00:13,  3.64it/s]

[399/449],train loss is:0.370685,train evaluation is:7.871056


100%|██████████| 449/449 [02:02<00:00,  3.66it/s]


epoch:[1],train loss is:0.366227,train evaluation is:7.830141 

classification_report: 
               precision    recall  f1-score   support

          病史       0.00      0.00      0.00        34
        多发群体       0.68      0.79      0.73       143
         死亡率       0.00      0.00      0.00        13
      风险评估因素       0.33      0.02      0.03       109
          病因       0.69      0.75      0.72       622
        鉴别诊断       0.72      0.59      0.65       258
        发病年龄       0.74      0.62      0.67        52
          预防       0.69      0.33      0.44        67
          阶段       1.00      0.07      0.12        30
         发病率       0.60      0.78      0.68        63
        预后状况       1.00      0.17      0.30        40
        手术治疗       0.73      0.87      0.79       166
        药物治疗       0.84      0.94      0.89       893
        发病部位       0.72      0.79      0.75       279
       实验室检查       0.71      0.85      0.78       406
          化疗       0.00      0.00      0.00   

<br>

### 四、模型预测

In [14]:
# NOTE(review): tqdm is imported here but not used in any cell shown.
from tqdm import tqdm
from ark_nlp.model.re.afea_bert import Predictor

# Build a predictor from the trained module, the tokenizer and the training
# dataset's label mapping (cat2id).
afea_re_predictor_instance = Predictor(model.module, tokenizer, re_train_dataset.cat2id)

In [15]:
# Example input: a sentence and its entity mentions as
# (mention, type, start, end) tuples, where start/end are inclusive character
# offsets into `text` (e.g. 'ERCP' covers text[17:21]).
text = '急性胰腺炎@有研究显示，进行早期 ERCP （24 小时内）可以降低梗阻性胆总管结石患者的并发症发生率和死亡率； 但是，对于无胆总管梗阻的胆汁性急性胰腺炎患者，不需要进行早期 ERCP。'
entities = [('急性胰腺炎', '疾病', 0, 4), ('ERCP', '检查', 17, 20)]

In [16]:
# Predict relations for the sample's entities. The output below contains one
# [head, relation, tail] entry per ordered entity pair; topk=1 presumably keeps
# only the best-scoring label for each pair - confirm in ark_nlp's Predictor.
afea_re_predictor_instance.predict_one_sample(text, entities, topk=1)

[[('急性胰腺炎', '疾病', 0, 4), '影像学检查', ('ERCP', '检查', 17, 20)],
 [('ERCP', '检查', 17, 20), 'None', ('急性胰腺炎', '疾病', 0, 4)]]