In [None]:
import warnings
warnings.filterwarnings("ignore")

import os
import jieba
import torch
import pickle
import torch.nn as nn
import torch.optim as optim
import pandas as pd

from ark_nlp.model.ner.w2ner_bert import W2NERBert
from ark_nlp.model.ner.w2ner_bert import W2NERBertConfig
from ark_nlp.model.ner.w2ner_bert import Dataset
from ark_nlp.model.ner.w2ner_bert import Task
from ark_nlp.factory.lr_scheduler import get_default_linear_schedule_with_warmup
from ark_nlp.model.ner.w2ner_bert import Tokenizer
from ark_nlp.factory.utils.seed import set_seed
# Fix the seed via ark_nlp's helper before any data or model code runs
# (presumably seeds the RNGs used during training — confirm which libraries it covers).
set_seed(123)

In [None]:
# Dataset file locations (resume-zh train / dev splits), given relative to the working directory

train_data_path = '../data/source_datasets/resume-zh/train.json'
dev_data_path = '../data/source_datasets/resume-zh/dev.json'

### 一、数据读入与处理

#### 1. 数据读入

In [None]:
# Read the train and dev JSON files into DataFrames, train first, then dev
train_data_df, dev_data_df = (
    pd.read_json(json_path) for json_path in (train_data_path, dev_data_path)
)

In [None]:
def get_label(train_data_df):
    """Build one list of entity dicts per row from the 'sentence' and 'ner' columns.

    Args:
        train_data_df: any mapping/DataFrame exposing a 'sentence' column (token
            sequences) and a 'ner' column (lists of dicts with 'index' and 'type').

    Returns:
        A list with one entry per row; each entry is a list of dicts with the keys
        'idx' (the annotation's index list), 'type' and 'entity' (the tokens from the
        first to the last index, inclusive, joined into one string).
    """

    def _to_entity(tokens, annotation):
        # Key order (idx, type, entity) is kept on purpose: the labels are later
        # turned into strings, where insertion order is visible.
        span = annotation['index']
        return {
            'idx': span,
            'type': annotation['type'],
            'entity': ''.join(tokens[span[0]:span[-1] + 1]),
        }

    return [
        [_to_entity(tokens, annotation) for annotation in annotations]
        for tokens, annotations in zip(train_data_df['sentence'], train_data_df['ner'])
    ]

In [None]:
# Train split: structured entity labels plus the token list re-joined into plain text
train_data_df['label'] = get_label(train_data_df)
train_data_df['text'] = train_data_df['sentence'].map(lambda tokens: ''.join(tokens))
# Dev split: the same two derived columns
dev_data_df['label'] = get_label(dev_data_df)
dev_data_df['text'] = dev_data_df['sentence'].map(lambda tokens: ''.join(tokens))

In [None]:
# Keep only the 'text' and 'label' columns and turn every label list into its str() form
train_data_df = train_data_df[['text', 'label']]
train_data_df['label'] = train_data_df['label'].map(str)
dev_data_df = dev_data_df[['text', 'label']]
dev_data_df['label'] = dev_data_df['label'].map(str)

In [None]:
# Wrap both splits in the W2NER Dataset. The dev dataset is handed the categories of the
# training dataset instead of deriving its own.
ner_train_dataset = Dataset(train_data_df)
ner_dev_dataset = Dataset(dev_data_df, categories=ner_train_dataset.categories)

#### 2. 词典创建和生成分词器

In [None]:
# W2NER tokenizer built on the 'bert-base-chinese' vocabulary with max_seq_len=180
tokenizer = Tokenizer(vocab='bert-base-chinese', max_seq_len=180)

#### 3. ID化

In [None]:
# Run the id-conversion step on both datasets with the shared tokenizer
ner_train_dataset.convert_to_ids(tokenizer)
ner_dev_dataset.convert_to_ids(tokenizer)

<br>

### 二、模型构建

#### 1. 模型参数设置

In [None]:
# Model config loaded from the 'bert-base-chinese' checkpoint; num_labels is the size of
# the training dataset's cat2id mapping.
# The class is imported at the top of the notebook as `W2NERBertConfig`; the earlier
# spelling `W2nerBertConfig` is not defined and raised NameError on a fresh kernel.
config = W2NERBertConfig.from_pretrained('bert-base-chinese',
                                         num_labels=len(ner_train_dataset.cat2id))

#### 2. 模型创建

In [None]:
# Release cached GPU memory held by PyTorch before the model is instantiated
torch.cuda.empty_cache()

In [None]:
# Instantiate the W2NER model from the 'bert-base-chinese' checkpoint using the config above
dl_module = W2NERBert.from_pretrained('bert-base-chinese',
                                    config=config)

<br>

### 三、任务构建

#### 1. 任务参数和必要部件设定

In [None]:
# Number of training epochs and the batch size (both are also used below to size the LR schedule)
num_epoches = 10
batch_size = 6

In [None]:
# transformers.AdamW is deprecated in favour of the PyTorch implementation
from torch.optim import AdamW

def get_w2ner_model_optimizer(
    dl_module,
    lr: float = 1e-3,
    bert_lr: float = 5e-6,
    weight_decay: float = 0.0
):
    """Build an AdamW optimizer with separate parameter groups for the encoder and the rest.

    Three groups are created, in this order:
      1. ``dl_module.bert`` parameters that are subject to weight decay
         (learning rate ``bert_lr``, decay ``weight_decay``);
      2. ``dl_module.bert`` bias / ``LayerNorm.weight`` parameters
         (learning rate ``bert_lr``, decay fixed at 0.0);
      3. every remaining parameter of ``dl_module``
         (learning rate ``lr``, decay ``weight_decay``).

    Args:
        dl_module: a ``torch.nn.Module`` exposing its encoder as the ``bert`` attribute.
        lr: learning rate for the non-BERT parameters.
        bert_lr: learning rate for the BERT parameters.
        weight_decay: decay applied to groups 1 and 3.

    Returns:
        A ``torch.optim.AdamW`` instance configured with the three groups above.
    """
    no_decay = ('bias', 'LayerNorm.weight')

    # Split the encoder parameters by name into "decayed" and "not decayed"
    bert_decay_params = []
    bert_no_decay_params = []
    for name, param in dl_module.bert.named_parameters():
        if any(marker in name for marker in no_decay):
            bert_no_decay_params.append(param)
        else:
            bert_decay_params.append(param)

    # Filter by identity instead of a set difference so the remaining parameters keep
    # the module's registration order (set ordering of tensors is not deterministic).
    bert_param_ids = {id(param) for param in dl_module.bert.parameters()}
    other_params = [
        param for param in dl_module.parameters() if id(param) not in bert_param_ids
    ]

    params = [
        {'params': bert_decay_params,
         'lr': bert_lr,
         'weight_decay': weight_decay},
        # Bias and LayerNorm weights must not be decayed; this group used to receive
        # `weight_decay` as well, which made the name-based split pointless.
        {'params': bert_no_decay_params,
         'lr': bert_lr,
         'weight_decay': 0.0},
        {'params': other_params,
         'lr': lr,
         'weight_decay': weight_decay},
    ]

    # eps=1e-6 matches the default of the transformers implementation used previously
    optimizer = AdamW(params, lr=lr, weight_decay=weight_decay, eps=1e-6)

    return optimizer

In [None]:
optimizer = get_w2ner_model_optimizer(dl_module)

# Mind the number of steps the learning-rate schedule runs over:
# (len(ner_train_dataset) // batch_size) steps per epoch times the number of epochs,
# with warmup_ratio=0.1 passed to the scheduler factory.
# NOTE(review): the floor division leaves a trailing partial batch out of the count —
# confirm this matches how Task.fit batches the data.
t_total = len(ner_train_dataset) // batch_size * num_epoches
scheduler = get_default_linear_schedule_with_warmup(optimizer, t_total, warmup_ratio=0.1)

#### 2. 任务创建

In [None]:
# Bundle the model, optimizer, loss selector ('ce'), LR scheduler and gradient clipping
# threshold into the training task.
# The device keyword is `cuda_device`; it was previously misspelled `cude_device`, so the
# intended GPU index never reached Task under its proper name.
model = Task(dl_module, optimizer, 'ce', cuda_device=0, scheduler=scheduler, grad_clip=5.0)

#### 3. 训练

In [None]:
# Train on the training dataset, passing the dev dataset as the second (validation) argument;
# epochs and batch_size are the same values used to size the LR schedule.
model.fit(ner_train_dataset,
          ner_dev_dataset,
          epochs=num_epoches,
          batch_size=batch_size
         )

<br>

### 四、模型预测

#### 1. 模型验证

In [None]:
# Location of the resume-zh test split, relative to the working directory
test_data_path = '../data/source_datasets/resume-zh/test.json'

In [None]:
# Load the test split and derive the same 'label' / 'text' columns as for train and dev
test_data_df = pd.read_json(test_data_path)

test_data_df['label'] = get_label(test_data_df)
test_data_df['text'] = test_data_df['sentence'].apply(lambda x: ''.join(x))

test_data_df = test_data_df.loc[:,['text', 'label']]
test_data_df['label'] = test_data_df['label'].apply(lambda x: str(x))

# Build the dataset from the TEST frame. It was previously built from dev_data_df, so the
# "test" evaluation silently re-scored the dev set. Categories come from the training set.
ner_test_dataset = Dataset(test_data_df, categories=ner_train_dataset.categories)

ner_test_dataset.convert_to_ids(tokenizer)

In [None]:
# Score the trained task on ner_test_dataset
model.evaluate(ner_test_dataset)