In [None]:
import warnings
warnings.filterwarnings("ignore")

import os
import jieba
import torch
import pickle
import torch.nn as nn
import torch.optim as optim
import pandas as pd

from tokenizer import TransfomerTokenizer as Tokenizer
from utils import convert_ner_task_uie_df
from prompt_uie import PromptUIE as Module
from prompt_uie_information_extraction_dataset import PromptUIEDataset as Dataset
from prompt_uie_information_extraction_task import PromptUIETask as Task
from ark_nlp.nn import BertConfig as ModuleConfig
from ark_nlp.factory.optimizer import get_default_bert_optimizer as get_default_model_optimizer

In [None]:
# Both CMeEE splits live in the same directory, so the prefix is spelled once.
cmeee_dir = './data/source_datasets/CMeEE'
train_data_path = cmeee_dir + '/CMeEE_train.json'
dev_data_path = cmeee_dir + '/CMeEE_dev.json'

In [None]:
# Pretrained checkpoint identifier; the tokenizer, config, and model cells below all load from it.
# NOTE(review): looks like a Hugging Face Hub repo id rather than a local path — confirm.
model_path = 'freedomking/prompt-uie-base'

### 一、数据读入与处理

#### 1. 数据读入

In [None]:
# Read the train and dev JSON files through one shared reader expression.
train_data_df, dev_data_df = (
    pd.read_json(split_path)
    for split_path in (train_data_path, dev_data_path)
)

In [None]:
# Rename the 'entities' column to 'label', the column name the following cells index.
label_column = {'entities': 'label'}
train_data_df = train_data_df.rename(columns=label_column)
dev_data_df = dev_data_df.rename(columns=label_column)

In [None]:
# Maps each short entity-type code in the data to the Chinese name that is
# substituted into the labels and later used as the prediction prompt.
type2name = dict(
    dis='疾病',          # disease
    sym='临床表现',      # clinical manifestation
    pro='医疗程序',      # medical procedure
    equ='医疗设备',      # medical equipment
    dru='药物',          # drug
    ite='医学检验项目',  # medical test item
    bod='身体',          # body
    dep='科室',          # department
    mic='微生物类',      # microorganism
)

In [None]:
def convert_entity_type(labels):
    """Return copies of ``labels`` with each ``type`` code mapped through ``type2name``.

    Args:
        labels: Iterable of dicts, each providing ``start_idx``, ``end_idx``,
            ``type`` and ``entity``.

    Returns:
        A new list of dicts holding exactly those four keys, in that order,
        with ``type`` replaced by ``type2name[type]``. The input dicts are
        not modified.

    Raises:
        KeyError: If a label lacks one of the four keys, or its ``type`` is
            not a key of ``type2name``.
    """
    return [
        {
            'start_idx': item['start_idx'],
            'end_idx': item['end_idx'],
            'type': type2name[item['type']],
            'entity': item['entity'],
        }
        for item in labels
    ]

In [None]:
# Series.apply takes the function object directly, so no lambda wrapper is needed.
for split_df in (train_data_df, dev_data_df):
    split_df['label'] = split_df['label'].apply(convert_entity_type)

In [None]:
# Convert the training frame with the project's convert_ner_task_uie_df helper before it is wrapped in Dataset.
# NOTE(review): negative_ratio=2 presumably sets how many negative prompt samples are generated per positive one — confirm in utils.
train_data_df = convert_ner_task_uie_df(train_data_df, negative_ratio=2)

In [None]:
# Same conversion for the dev frame, this time with negative_ratio=0.
# NOTE(review): presumably 0 turns negative sampling off for evaluation — confirm in utils.
dev_data_df = convert_ner_task_uie_df(dev_data_df, negative_ratio=0)

In [None]:
# Wrap the converted frames in the project's PromptUIEDataset (imported as Dataset).
ner_train_dataset = Dataset(train_data_df)
ner_dev_dataset = Dataset(dev_data_df)

#### 2. 词典创建和生成分词器

In [None]:
# Build the tokenizer from the checkpoint named by model_path.
# NOTE(review): max_seq_len=100 presumably caps the tokenized length of prompt + text, so longer inputs may be truncated — confirm.
tokenizer = Tokenizer(vocab=model_path, max_seq_len=100)

#### 3. ID化

In [None]:
# Run the tokenizer-based ID conversion on both datasets, train first, then dev.
for dataset in (ner_train_dataset, ner_dev_dataset):
    dataset.convert_to_ids(tokenizer)

<br>

### 二、模型构建

#### 1. 模型参数设置

In [None]:
# Load the model configuration (ark_nlp BertConfig) from the same checkpoint as the tokenizer.
config = ModuleConfig.from_pretrained(model_path)

#### 2. 模型创建

In [None]:
# Release unused cached GPU memory held by PyTorch's allocator before the model is loaded.
torch.cuda.empty_cache()

In [None]:
# Instantiate PromptUIE (imported as Module) from the pretrained checkpoint, using the config loaded above.
dl_module = Module.from_pretrained(model_path, config=config)

<br>

### 三、任务构建

#### 1. 任务参数和必要部件设定

In [None]:
# Training hyperparameters passed to model.fit below: number of epochs and batch size.
num_epoches = 5
batch_size = 32

In [None]:
# Build the optimizer for dl_module with ark_nlp's get_default_bert_optimizer, using its default arguments.
optimizer = get_default_model_optimizer(dl_module)

#### 2. 任务创建

In [None]:
# Bundle the module and optimizer into the training task.
# NOTE(review): the third positional argument (None) is presumably the loss function, left for the task to choose,
# and cuda_device=0 presumably selects the first GPU — confirm against PromptUIETask.
model = Task(dl_module, optimizer, None, cuda_device=0)

#### 3. 训练

In [None]:
# Train on the training dataset, passing the dev dataset as the second positional argument.
# NOTE(review): lr is supplied here, while the optimizer was built earlier without it — confirm that fit applies this value.
model.fit(ner_train_dataset, 
          ner_dev_dataset,
          lr=1e-5,
          epochs=num_epoches, 
          batch_size=batch_size
         )

<br>

### 四、生成提交数据

In [None]:
import json

from tqdm import tqdm
from prompt_uie_information_extraction_predictor import PromptUIEPredictor as Predictor

In [None]:
# Wrap the task's module (model.module) and the tokenizer in a predictor used for per-sample inference below.
ner_predictor_instance = Predictor(model.module, tokenizer)

In [None]:
# Run every entity-type prompt over each test sentence and collect the predictions.
test_df = pd.read_json('./data/source_datasets/CMeEE/CMeEE_test.json')

submit = []
for _text in tqdm(test_df['text'].to_list()):
    # One predictor call per (text, prompt) pair, in type2name order. The
    # emitted 'type' is the short code (the type2name key), not the Chinese
    # prompt string.
    entities = [
        {
            'start_idx': hit['start_idx'],
            'end_idx': hit['end_idx'],
            'type': type_code,
            'entity': hit['entity'],
        }
        for type_code, prompt in type2name.items()
        for hit in ner_predictor_instance.predict_one_sample([_text, prompt])
    ]
    submit.append({'text': _text, 'entities': entities})

In [None]:
output_path = './submit_CMeEE_test.json'

# Serialize straight to the file handle; ensure_ascii=False keeps the Chinese
# text as UTF-8 characters instead of \uXXXX escapes.
with open(output_path, 'w', encoding='utf-8') as out_file:
    json.dump(submit, out_file, ensure_ascii=False)