In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import jieba
import torch
import pickle
import torch.nn as nn
import torch.optim as optim
import pandas as pd

from ark_nlp.model.ner.w2ner_bert import W2NERBert
from ark_nlp.model.ner.w2ner_bert import W2NERBertConfig
from ark_nlp.model.ner.w2ner_bert import Dataset
from ark_nlp.model.ner.w2ner_bert import Task
from ark_nlp.model.ner.w2ner_bert import get_default_model_optimizer
from ark_nlp.factory.lr_scheduler import get_default_linear_schedule_with_warmup
from ark_nlp.model.ner.w2ner_bert import Tokenizer
from ark_nlp.factory.utils.seed import set_seed
# Fix the seed through ark_nlp's helper so repeated runs start from the same
# random state (which generators it seeds is decided inside ark_nlp).
set_seed(123)

In [2]:
# Dataset file locations (relative paths, resolved against the working directory).

# Training and development splits of the resume-zh dataset.
train_data_path = '../data/source_datasets/resume-zh/train.json'
dev_data_path = '../data/source_datasets/resume-zh/dev.json'

### 一、数据读入与处理

#### 1. 数据读入

In [3]:
# Read the train and dev JSON files into DataFrames, in that order.
train_data_df, dev_data_df = (
    pd.read_json(split_path)
    for split_path in (train_data_path, dev_data_path)
)

In [4]:
def get_label(train_data_df):
    """Build one list of entity dicts per row of the given frame.

    Args:
        train_data_df: Frame with a 'sentence' column (a sliceable sequence of
            string tokens per row) and a 'ner' column (an iterable of
            annotations per row, each offering 'index' and 'type' keys).

    Returns:
        A list with one entry per row. Each entry is a list of dicts with the
        keys 'idx' (the annotation's 'index' value), 'type' (its 'type'
        value) and 'entity' (the tokens from the first to the last listed
        index, inclusive, joined into one string), in that key order.
    """
    def _describe(tokens, annotation):
        # The surface text is the slice spanning the first through the last
        # listed position; positions in between are not consulted.
        positions = annotation['index']
        first, last = positions[0], positions[-1]
        return {
            'idx': positions,
            'type': annotation['type'],
            'entity': ''.join(tokens[first:last + 1]),
        }

    return [
        [_describe(tokens, annotation) for annotation in annotations]
        for tokens, annotations in zip(train_data_df['sentence'], train_data_df['ner'])
    ]

In [5]:
# Give each split a 'label' column (entity dicts from get_label) and a 'text'
# column (the sentence tokens concatenated into one string).
for split_df in (train_data_df, dev_data_df):
    split_df['label'] = get_label(split_df)
    split_df['text'] = split_df['sentence'].apply(''.join)

In [6]:
# Keep only the 'text' and 'label' columns and turn every label list into its
# str() form.
# NOTE(review): presumably the string form is what ark_nlp's Dataset expects — confirm.
train_data_df = train_data_df[['text', 'label']].assign(
    label=lambda frame: frame['label'].map(str)
)
dev_data_df = dev_data_df[['text', 'label']].assign(
    label=lambda frame: frame['label'].map(str)
)

In [7]:
# Wrap the training frame in the W2NER Dataset.
ner_train_dataset = Dataset(train_data_df)
# Hand the training dataset's categories to the dev dataset so both are built
# with the same category list.
ner_dev_dataset = Dataset(dev_data_df, categories=ner_train_dataset.categories)

#### 2. 词典创建和生成分词器

In [8]:
# Tokenizer built from the 'bert-base-chinese' vocabulary.
# NOTE(review): max_seq_len=180 presumably caps the encoded sequence length —
# confirm against ark_nlp's Tokenizer.
tokenizer = Tokenizer(vocab='bert-base-chinese', max_seq_len=180)

#### 3. ID化

In [9]:
# Convert both datasets to ids with the shared tokenizer (training set first).
for split_dataset in (ner_train_dataset, ner_dev_dataset):
    split_dataset.convert_to_ids(tokenizer)

<br>

### 二、模型构建

#### 1. 模型参数设置

In [10]:
# Load the 'bert-base-chinese' configuration; the label count is taken from
# the size of the training dataset's cat2id.
config = W2NERBertConfig.from_pretrained(
    'bert-base-chinese',
    num_labels=len(ner_train_dataset.cat2id),
)

#### 2. 模型创建

In [11]:
# Release cached GPU memory held by PyTorch's allocator before the model is
# created.
torch.cuda.empty_cache()

In [12]:
# Instantiate W2NERBert from the 'bert-base-chinese' checkpoint with the
# `config` object.
dl_module = W2NERBert.from_pretrained('bert-base-chinese', config=config)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing W2NERBert: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing W2NERBert from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing W2NERBert from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of W2NERBert were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['dis_embs.weight', 'reg_embs.weig

<br>

### 三、任务构建

#### 1. 任务参数和必要部件设定

In [13]:
# Training schedule: number of epochs and mini-batch size.
num_epoches = 10
batch_size = 6

In [14]:
# Optimizer produced by ark_nlp's default factory for this module (its
# hyper-parameters are not visible in this notebook).
optimizer = get_default_model_optimizer(dl_module)

In [15]:
# Mind the horizon of the learning-rate decay: t_total should equal the number
# of steps the run really takes. Batches per epoch are rounded UP so that a
# trailing partial batch (when the dataset size is not a multiple of
# batch_size) is counted; floor division would make the schedule shorter than
# the run. Assumes the training loader keeps the last partial batch — TODO confirm.
steps_per_epoch = (len(ner_train_dataset) + batch_size - 1) // batch_size
t_total = steps_per_epoch * num_epoches
scheduler = get_default_linear_schedule_with_warmup(optimizer, t_total, warmup_ratio=0.1)

#### 2. 任务创建

In [16]:
# Bundle the module, optimizer, loss ('ce') and scheduler into the training
# task. `cuda_device=0` selects the first GPU.
# NOTE(review): grad_clip=5.0 is presumably the gradient-clipping threshold —
# confirm against ark_nlp's Task.
model = Task(dl_module, optimizer, 'ce', cuda_device=0, scheduler=scheduler, grad_clip=5.0)

#### 3. 训练

In [17]:
# Train on the training dataset with the dev dataset passed for validation
# (the recorded log shows an evaluation after epoch 0).
model.fit(ner_train_dataset, ner_dev_dataset, epochs=num_epoches, batch_size=batch_size)

 16%|█▌        | 100/637 [00:37<03:19,  2.69it/s]

[99/637],train loss is:0.800307


 31%|███▏      | 200/637 [01:15<02:43,  2.67it/s]

[199/637],train loss is:0.441193


 47%|████▋     | 300/637 [01:52<02:06,  2.66it/s]

[299/637],train loss is:0.307234


 63%|██████▎   | 400/637 [02:30<01:29,  2.66it/s]

[399/637],train loss is:0.234650


 78%|███████▊  | 500/637 [03:07<00:51,  2.66it/s]

[499/637],train loss is:0.190234


 94%|█████████▍| 600/637 [03:45<00:14,  2.64it/s]

[599/637],train loss is:0.159898


100%|██████████| 637/637 [03:59<00:00,  2.66it/s]


epoch:[0],train loss is:0.150993 

eval loss is 0.003817, precision is:0.9596518987341772, recall is:0.8102872411489646, f1_score is:0.8786671495834842


  2%|▏         | 11/637 [00:04<03:59,  2.61it/s]


KeyboardInterrupt: 

<br>

### 四、模型预测

#### 1. 模型验证

In [None]:
# Test split of the resume-zh dataset (relative path).
test_data_path = '../data/source_datasets/resume-zh/test.json'

In [None]:
# Load the test split and give it the same preprocessing as the train/dev
# frames: entity labels, concatenated text, column subset, stringified labels.
test_data_df = pd.read_json(test_data_path)

test_data_df['label'] = get_label(test_data_df)
test_data_df['text'] = test_data_df['sentence'].apply(lambda x: ''.join(x))

test_data_df = test_data_df.loc[:,['text', 'label']]
test_data_df['label'] = test_data_df['label'].apply(lambda x: str(x))

# Build the dataset from the TEST frame (not the dev frame) so the evaluation
# really runs on the test split; categories come from the training dataset.
ner_test_dataset = Dataset(test_data_df, categories=ner_train_dataset.categories)

ner_test_dataset.convert_to_ids(tokenizer)

In [None]:
# Evaluate the task's current model on the dataset held in ner_test_dataset.
model.evaluate(ner_test_dataset)