In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import jieba
import torch
import pickle
import torch.nn as nn
import torch.optim as optim
import pandas as pd

from ark_nlp.model.ner.global_pointer_ernie import GlobalPointerErnie
from ark_nlp.model.ner.global_pointer_ernie import GlobalPointerErnieConfig
from ark_nlp.model.ner.global_pointer_ernie import Dataset
from ark_nlp.model.ner.global_pointer_ernie import Task
from ark_nlp.model.ner.global_pointer_ernie import get_default_model_optimizer
from ark_nlp.model.ner.global_pointer_ernie import Tokenizer

In [2]:
# Locations of the CMeEE train / dev splits (relative to this notebook).
train_data_path, dev_data_path = (
    '../data/source_datasets/CMeEE/CMeEE_train.json',
    '../data/source_datasets/CMeEE/CMeEE_dev.json',
)

### 一、数据读入与处理

#### 1. 数据读入

In [3]:
# Load the CMeEE train / dev JSON files into DataFrames.
train_data_df, dev_data_df = (
    pd.read_json(split_path) for split_path in (train_data_path, dev_data_path)
)

In [4]:
# Expose the raw 'entities' annotations under the column name 'label',
# which is the name the following cells select and pass on to Dataset.
train_data_df = train_data_df.rename({'entities': 'label'}, axis='columns')
dev_data_df = dev_data_df.rename({'entities': 'label'}, axis='columns')

In [5]:
# Keep only the text and its annotations, and serialise each annotation
# value with str().
# NOTE(review): presumably Dataset expects 'label' as a string — confirm.
train_data_df = (
    train_data_df
    .loc[:, ['text', 'label']]
    .assign(label=lambda frame: frame['label'].apply(str))
)

dev_data_df = (
    dev_data_df
    .loc[:, ['text', 'label']]
    .assign(label=lambda frame: frame['label'].apply(str))
)

In [6]:
# Wrap the train / dev frames in ark_nlp NER Dataset objects.
ner_train_dataset, ner_dev_dataset = Dataset(train_data_df), Dataset(dev_data_df)

#### 2. 词典创建和生成分词器

In [7]:
# The vocabulary can be built first and then handed to the tokenizer,
# or the tokenizer can load it automatically (as done in the next cell).
# bert_vocab = transformers.AutoTokenizer.from_pretrained('nghuyong/ernie-1.0')
# tokenizer = TransfomerTokenizer(bert_vocab, max_seq_len=30)

In [8]:
# Build the tokenizer from the 'nghuyong/ernie-1.0' vocabulary.
# NOTE(review): max_seq_len presumably caps the encoded length at 100 tokens — confirm.
tokenizer = Tokenizer(vocab='nghuyong/ernie-1.0', max_seq_len=100)

#### 3. ID化

In [9]:
# Convert both datasets to id form with the tokenizer. The return value is
# discarded, so this presumably updates each dataset in place — confirm.
ner_train_dataset.convert_to_ids(tokenizer)
ner_dev_dataset.convert_to_ids(tokenizer)

<br>

### 二、模型构建

#### 1. 模型参数设置

In [10]:
# Start from the pretrained ERNIE configuration and set num_labels to the
# number of entries in the training set's cat2id mapping.
config = GlobalPointerErnieConfig.from_pretrained(
    'nghuyong/ernie-1.0',
    num_labels=len(ner_train_dataset.cat2id),
)

#### 2. 模型创建

In [11]:
# Release unused cached GPU memory held by PyTorch before loading the model.
torch.cuda.empty_cache()

In [12]:
# Load the pretrained ERNIE weights into the GlobalPointer NER model,
# using the configuration prepared above in this notebook.
dl_module = GlobalPointerErnie.from_pretrained(
    'nghuyong/ernie-1.0',
    config=config,
)

Some weights of the model checkpoint at nghuyong/ernie-1.0 were not used when initializing GlobalPointerErnie: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing GlobalPointerErnie from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GlobalPointerErnie from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GlobalPointerErnie were not initialized from the model checkpoint at nghuyong/ernie-1.0 and are newly initialized: ['classifier.weight', 'c

<br>

### 三、任务构建

#### 1. 任务参数和必要部件设定

In [13]:
# Training hyper-parameters; both are passed to model.fit below.
batch_size = 32   # forwarded as batch_size
num_epoches = 1   # forwarded as epochs (number of training epochs)

In [14]:
# Use ark_nlp's default optimizer for this model.
optimizer = get_default_model_optimizer(dl_module)

#### 2. 任务创建

In [15]:
# Build the training task around the model and optimizer.
# The original hard-coded cuda_device=1, which names a GPU that does not
# exist on a single-GPU machine. Keep GPU 1 when at least two CUDA devices
# are visible, otherwise fall back to index 0.
# NOTE(review): how Task treats cuda_device on a CPU-only host is not shown
# in this notebook — confirm.
# NOTE(review): 'gpce' is presumably the name of the loss function — confirm.
cuda_device = 1 if torch.cuda.device_count() > 1 else 0
model = Task(dl_module, optimizer, 'gpce', cuda_device=cuda_device)

#### 3. 训练

In [16]:
# Train on the training set and evaluate on the dev set, using the
# hyper-parameters defined earlier in this notebook.
model.fit(
    ner_train_dataset,
    ner_dev_dataset,
    lr=1e-5,
    epochs=num_epoches,
    batch_size=batch_size,
)

 21%|██▏       | 100/469 [00:34<02:15,  2.72it/s]

[99/469],train loss is:4.714920


 43%|████▎     | 200/469 [01:08<01:28,  3.05it/s]

[199/469],train loss is:3.293940


 64%|██████▍   | 300/469 [01:41<00:56,  3.00it/s]

[299/469],train loss is:2.624494


 85%|████████▌ | 400/469 [02:18<00:27,  2.55it/s]

[399/469],train loss is:2.201189


100%|██████████| 469/469 [02:45<00:00,  2.83it/s]


epoch:[0],train loss is:1.998847

eval loss is 0.728579, precision is:6956.0, recall is:29462.0, f1_score is:0.472201479872378


<br>

### 四、模型验证与保存

#### 1. 模型验证

In [24]:
from ark_nlp.model.ner.global_pointer_ernie import Predictor

In [25]:
# Build a predictor from the trained module, the tokenizer and the
# training set's cat2id mapping.
ner_predictor_instance = Predictor(model.module, tokenizer, ner_train_dataset.cat2id)

In [26]:
# Smoke-test single-sentence prediction; the result is a list of entity
# dicts with start_idx / end_idx / entity / type (see the cell output).
ner_predictor_instance.predict_one_sample('今天扎的维生素K1')

[{'start_idx': 4, 'end_idx': 8, 'entity': '维生素K1', 'type': 'dru'}]

#### 2. Batch模型验证

In [28]:
# Load the unlabeled test split, keeping only the text column.
test_data_df = (
    pd.read_json('../data/source_datasets/CMeEE/CMeEE_test.json')
    .loc[:, ['text']]
)

# Build a test-mode dataset that reuses the training set's categories.
ner_test_dataset = Dataset(
    test_data_df,
    categories=ner_train_dataset.categories,
    is_test=True,
)

In [29]:
# Batch prediction is left disabled: the test report at the end of this
# notebook notes that no batch interface is available.
# predict_label = ner_predictor_instance.predict_batch(ner_test_dataset)

#### 3. 多样本验证

In [None]:
# Reload the test texts and run single-sample prediction on each one,
# collecting [text, predicted entities] pairs.
test_data_df = pd.read_json('../data/source_datasets/CMeEE/CMeEE_test.json').loc[:, ['text']]

record_ = [
    [sentence, ner_predictor_instance.predict_one_sample(sentence)]
    for sentence in test_data_df['text'].to_list()
]

In [None]:
# Spot-check one [text, predicted entities] record.
record_[22]

<br>

### 五、模型测试报告

In [None]:
# Test report (kept as comments: this is a code cell, and the original plain
# text was not valid Python, so Restart & Run All would stop here).
# 1. Basic functionality test: passed
# 2. One-sample prediction: passed
# 3. Batch prediction: no batch interface available
# 4. Known issue: predicting after too little training raises a dimension error