In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import jieba
import torch
import pickle
import torch.nn as nn
import torch.optim as optim
import pandas as pd

import sys
sys.path.append('/home/shencj/workspace/code/nlp/Frame/ark-nlp-0.0.5/')

from ark_nlp.model.ner.w2ner_bert import W2NERBert
from ark_nlp.model.ner.w2ner_bert import CrfBertConfig
from ark_nlp.model.ner.w2ner_bert import Dataset
from ark_nlp.model.ner.w2ner_bert import Task
from ark_nlp.model.ner.w2ner_bert import get_default_model_optimizer
from ark_nlp.factory.lr_scheduler import get_default_linear_schedule_with_warmup
from ark_nlp.model.ner.w2ner_bert import Tokenizer
from ark_nlp.factory.utils.seed import set_seed
# Fix the seed (123) through ark-nlp's seed helper before any data or model
# work happens. NOTE(review): which random number generators the helper
# covers is not visible here — confirm in ark_nlp.factory.utils.seed.
set_seed(123)

In [2]:
# Data file locations.
# The corpus directory can be overridden with the RESUME_ZH_DATA_DIR
# environment variable so the notebook is not tied to one machine's home
# directory; when the variable is unset the original location is used.
data_dir = os.environ.get(
    'RESUME_ZH_DATA_DIR',
    '/home/shencj/workspace/data/corpus/ner/cn/resume-zh',
)

train_data_path = os.path.join(data_dir, 'train.json')
dev_data_path = os.path.join(data_dir, 'dev.json')

### 一、数据读入与处理

#### 1. 数据读入

In [3]:
# Read the train and dev splits from their JSON files into DataFrames.
train_data_df, dev_data_df = (
    pd.read_json(json_path) for json_path in (train_data_path, dev_data_path)
)

In [4]:
def get_label(train_data_df):
    """Build the per-sentence entity annotations.

    Args:
        train_data_df: table-like object exposing a ``'sentence'`` column
            (one token sequence per row) and a ``'ner'`` column (per row, a
            list of dicts that each carry an ``'index'`` list and a
            ``'type'``).

    Returns:
        list: one list per row, containing a dict for every annotated entity
        with the keys ``start_idx``, ``end_idx``, ``type`` and ``entity``
        (the tokens from the first through the last index, concatenated).
    """
    def _to_entity(tokens, annotation):
        # Only the first and last positions of the index list are used.
        first, last = annotation['index'][0], annotation['index'][-1]
        return {
            'start_idx': first,
            'end_idx': last,
            'type': annotation['type'],
            'entity': ''.join(tokens[first:last + 1]),
        }

    return [
        [_to_entity(tokens, annotation) for annotation in annotations]
        for tokens, annotations in zip(train_data_df['sentence'], train_data_df['ner'])
    ]

In [5]:
# Attach the parsed entity annotations ('label') and the space-joined tokens
# ('text') to each split.
for split_df in (train_data_df, dev_data_df):
    split_df['label'] = get_label(split_df)
    split_df['text'] = split_df['sentence'].apply(' '.join)

In [6]:
# Keep only the 'text' and 'label' columns and turn every label list into its
# string representation.
train_data_df = train_data_df[['text', 'label']].assign(
    label=lambda frame: frame['label'].apply(str)
)
dev_data_df = dev_data_df[['text', 'label']].assign(
    label=lambda frame: frame['label'].apply(str)
)

In [7]:
# Wrap both splits in the ark-nlp W2NER Dataset class. The dev set is built
# with the categories taken from the training set, presumably so that both
# splits share one label inventory.
ner_train_dataset = Dataset(train_data_df)
ner_dev_dataset = Dataset(dev_data_df, categories=ner_train_dataset.categories)

#### 2. 词典创建和生成分词器

In [8]:
# Do not set max_seq_len below 180, otherwise an index error is raised.
tokenizer = Tokenizer(vocab='bert-base-chinese', max_seq_len=180)

#### 3. ID化

In [9]:
# Convert both datasets to ids with the tokenizer created above.
ner_train_dataset.convert_to_ids(tokenizer)
ner_dev_dataset.convert_to_ids(tokenizer)

<br>

### 二、模型构建

#### 1. 模型参数设置

In [10]:
# Load the 'bert-base-chinese' configuration and size the output layer to the
# number of categories found in the training set.
config = CrfBertConfig.from_pretrained('bert-base-chinese',
                                         num_labels=len(ner_train_dataset.cat2id))

#### 2. 模型创建

In [11]:
# Release cached, unused GPU memory held by PyTorch before the model is built.
torch.cuda.empty_cache()

In [12]:
# Instantiate the W2NER model from the 'bert-base-chinese' checkpoint using
# the configuration prepared above.
dl_module = W2NERBert.from_pretrained('bert-base-chinese',
                                    config=config)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing W2NERBert: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing W2NERBert from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing W2NERBert from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of W2NERBert were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['dis_embs.weight', 'reg_embs.weig

<br>

### 三、任务构建

#### 1. 任务参数和必要部件设定

In [13]:
# Number of training epochs.
num_epoches = 20
# The reference implementation uses batch_size = 12; this notebook uses 6.
batch_size = 6

In [14]:
from transformers import AdamW

# Separate the BERT encoder parameters from the remaining parameters.
# Membership is tested by object identity and the model's own parameter order
# is kept, so `other_params` is built in the same order on every run (a plain
# set difference yields an arbitrary order).
bert_params = set(dl_module.bert.parameters())
bert_param_ids = {id(p) for p in bert_params}
other_params = [p for p in dl_module.parameters() if id(p) not in bert_param_ids]

# Parameter-name fragments conventionally excluded from weight decay. Every
# group below currently uses weight_decay=0.0, so the split only matters once
# a non-zero decay is configured for the first group.
no_decay = ['bias', 'LayerNorm.weight']
params = [
    # BERT weights that would be subject to weight decay: small learning rate.
    {'params': [p for n, p in dl_module.bert.named_parameters() if not any(nd in n for nd in no_decay)],
     'lr': 5e-6,
     'weight_decay': 0.0},
    # BERT biases and LayerNorm weights: same learning rate, never decayed.
    {'params': [p for n, p in dl_module.bert.named_parameters() if any(nd in n for nd in no_decay)],
     'lr': 5e-6,
     'weight_decay': 0.0},
    # Everything outside the BERT encoder: larger learning rate.
    {'params': other_params,
     'lr': 1e-3,
     'weight_decay': 0.0},
]

optimizer = AdamW(params, lr=1e-3, weight_decay=0.0)

# Total number of optimizer steps for the learning-rate schedule. The last,
# smaller batch of an epoch is still a training step, so the per-epoch step
# count is rounded up; floor division would make the schedule end before
# training does whenever the dataset size is not a multiple of batch_size.
steps_per_epoch = (len(ner_train_dataset) + batch_size - 1) // batch_size
t_total = steps_per_epoch * num_epoches
scheduler = get_default_linear_schedule_with_warmup(optimizer, t_total, warmup_ratio=0.1)

#### 2. 任务创建

In [15]:
# Bundle the model, the optimizer, the 'ce' loss, the scheduler and gradient
# clipping (grad_clip=5.0) into the training task.
# NOTE(review): `cude_device` looks like a misspelling of `cuda_device` —
# confirm against the Task signature of the installed ark-nlp version before
# renaming it.
model = Task(dl_module, optimizer, 'ce', cude_device=0, scheduler=scheduler, grad_clip=5.0)

#### 3. 训练

In [None]:
# Train on the training set for `num_epoches` epochs with the configured
# batch size; the dev set is passed as the validation dataset.
model.fit(ner_train_dataset,
          ner_dev_dataset,
          epochs=num_epoches,
          batch_size=batch_size
         )

 16%|█▌        | 100/637 [00:37<03:21,  2.66it/s]

[99/637],train loss is:1.041140


 31%|███▏      | 200/637 [01:15<02:45,  2.64it/s]

[199/637],train loss is:0.565830


 47%|████▋     | 300/637 [01:53<02:08,  2.62it/s]

[299/637],train loss is:0.402918


 63%|██████▎   | 400/637 [02:31<01:30,  2.62it/s]

[399/637],train loss is:0.310090


 78%|███████▊  | 500/637 [03:09<00:52,  2.59it/s]

[499/637],train loss is:0.251957


 94%|█████████▍| 600/637 [03:47<00:14,  2.61it/s]

[599/637],train loss is:0.212280


100%|██████████| 637/637 [04:01<00:00,  2.63it/s]


epoch:[0],train loss is:0.200581 

928 1003 1497
eval loss is 0.007458, precision is:0.9252243270189432, recall is:0.6199064796259185, f1_score is:0.7424000000000001


 16%|█▌        | 100/637 [00:38<03:25,  2.61it/s]

[99/637],train loss is:0.009265


 31%|███▏      | 200/637 [01:16<02:56,  2.48it/s]

[199/637],train loss is:0.008254


 47%|████▋     | 300/637 [01:55<02:08,  2.62it/s]

[299/637],train loss is:0.007770


 63%|██████▎   | 400/637 [02:34<01:33,  2.54it/s]

[399/637],train loss is:0.007112


 78%|███████▊  | 500/637 [03:12<00:52,  2.59it/s]

[499/637],train loss is:0.006578


 94%|█████████▍| 600/637 [03:51<00:14,  2.62it/s]

[599/637],train loss is:0.006182


100%|██████████| 637/637 [04:05<00:00,  2.59it/s]


epoch:[1],train loss is:0.006091 

1401 1485 1497
eval loss is 0.002324, precision is:0.9434343434343434, recall is:0.935871743486974, f1_score is:0.9396378269617707


 16%|█▌        | 100/637 [00:38<03:27,  2.59it/s]

[99/637],train loss is:0.004318


 31%|███▏      | 200/637 [01:17<02:47,  2.60it/s]

[199/637],train loss is:0.003763


 47%|████▋     | 300/637 [01:55<02:08,  2.62it/s]

[299/637],train loss is:0.003324


 63%|██████▎   | 400/637 [02:34<01:32,  2.55it/s]

[399/637],train loss is:0.003201


 78%|███████▊  | 500/637 [03:12<00:52,  2.59it/s]

[499/637],train loss is:0.003046


 94%|█████████▍| 600/637 [03:51<00:14,  2.56it/s]

[599/637],train loss is:0.003006


100%|██████████| 637/637 [04:05<00:00,  2.59it/s]


epoch:[2],train loss is:0.002907 

1433 1521 1497
eval loss is 0.002097, precision is:0.9421433267587114, recall is:0.957247828991316, f1_score is:0.949635520212061


 16%|█▌        | 100/637 [00:38<03:26,  2.59it/s]

[99/637],train loss is:0.001848


 31%|███▏      | 200/637 [01:17<02:46,  2.62it/s]

[199/637],train loss is:0.001943


 47%|████▋     | 300/637 [01:56<02:07,  2.64it/s]

[299/637],train loss is:0.002025


 63%|██████▎   | 400/637 [02:34<01:31,  2.60it/s]

[399/637],train loss is:0.002113


 78%|███████▊  | 500/637 [03:13<00:51,  2.64it/s]

[499/637],train loss is:0.002148


 94%|█████████▍| 600/637 [03:51<00:14,  2.56it/s]

[599/637],train loss is:0.002108


100%|██████████| 637/637 [04:06<00:00,  2.59it/s]


epoch:[3],train loss is:0.002083 

1448 1529 1497
eval loss is 0.001867, precision is:0.94702419882276, recall is:0.9672678690714763, f1_score is:0.9570389953734303


 16%|█▌        | 100/637 [00:38<03:32,  2.53it/s]

[99/637],train loss is:0.001360


 31%|███▏      | 200/637 [01:17<02:50,  2.56it/s]

[199/637],train loss is:0.001409


 47%|████▋     | 300/637 [01:56<02:09,  2.60it/s]

[299/637],train loss is:0.001655


 63%|██████▎   | 400/637 [02:34<01:29,  2.64it/s]

[399/637],train loss is:0.001644


 78%|███████▊  | 500/637 [03:13<00:52,  2.60it/s]

[499/637],train loss is:0.001593


 94%|█████████▍| 600/637 [03:52<00:14,  2.56it/s]

[599/637],train loss is:0.001594


 99%|█████████▊| 628/637 [04:03<00:03,  2.57it/s]