In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import jieba
import torch
import pickle
import torch.nn as nn
import torch.optim as optim
import pandas as pd

from ark_nlp.model.tm.bert import Bert
from ark_nlp.model.tm.bert import BertConfig
from ark_nlp.model.tm.bert import Dataset
from ark_nlp.model.tm.bert import Task
from ark_nlp.model.tm.bert import get_default_model_optimizer
from ark_nlp.model.tm.bert import Tokenizer

### 一、数据读入与处理

#### 1. 数据读入

In [2]:
# Load the KUAKE-QTR train and dev splits. In each frame the 'query' and
# 'title' columns are renamed to 'text_a' and 'text_b', and only the text pair
# plus its 'label' column is kept.
train_data_df = (
    pd.read_json('../data/source_datasets/KUAKE-QTR/KUAKE-QTR_train.json')
    .rename(columns={'query': 'text_a', 'title': 'text_b'})
    .loc[:, ['text_a', 'text_b', 'label']]
)

dev_data_df = (
    pd.read_json('../data/source_datasets/KUAKE-QTR/KUAKE-QTR_dev.json')
    .rename(columns={'query': 'text_a', 'title': 'text_b'})
    .loc[:, ['text_a', 'text_b', 'label']]
)

In [4]:
# Wrap the train and dev DataFrames (in that order) in ark_nlp Dataset objects.
tm_train_dataset, tm_dev_dataset = [
    Dataset(frame) for frame in (train_data_df, dev_data_df)
]

#### 2. 词典创建和生成分词器

In [5]:
# Build the tokenizer from the 'nghuyong/ernie-1.0' vocabulary with max_seq_len=30.
# NOTE(review): presumably max_seq_len caps the combined text_a/text_b token
# length — confirm against ark_nlp's Tokenizer.
tokenizer = Tokenizer(vocab='nghuyong/ernie-1.0', max_seq_len=30)

#### 3. ID化

In [6]:
# Run ID conversion on the train and dev datasets with the tokenizer built above.
# NOTE(review): the return value is not used, so convert_to_ids presumably
# updates each dataset in place — confirm against ark_nlp's Dataset.
tm_train_dataset.convert_to_ids(tokenizer)
tm_dev_dataset.convert_to_ids(tokenizer)

<br>

### 二、模型构建

#### 1. 模型参数设置

In [12]:
# Load the 'nghuyong/ernie-1.0' configuration and set num_labels to the number
# of entries in the training dataset's cat2id mapping.
config = BertConfig.from_pretrained('nghuyong/ernie-1.0',
                                    num_labels=len(tm_train_dataset.cat2id))

#### 2. 模型创建

In [13]:
# Ask PyTorch to release cached, currently unused GPU memory before the model is created.
torch.cuda.empty_cache()

In [14]:
# Instantiate ark_nlp's Bert matching model from the 'nghuyong/ernie-1.0'
# checkpoint using the config above. As the load log below reports, the
# classifier weights are not part of the checkpoint and are newly initialised,
# so the model has to be fine-tuned before its predictions are meaningful.
dl_module = Bert.from_pretrained('nghuyong/ernie-1.0', 
                                 config=config)

Some weights of the model checkpoint at nghuyong/ernie-1.0 were not used when initializing Bert: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing Bert from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Bert from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Bert were not initialized from the model checkpoint at nghuyong/ernie-1.0 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on

<br>

### 三、任务构建

#### 1. 任务参数和必要部件设定

In [15]:
# 设置运行次数
# Training hyper-parameters: number of epochs and mini-batch size.
num_epoches, batch_size = 10, 32

In [18]:
# Build the optimizer for dl_module with ark_nlp's default-optimizer helper.
optimizer = get_default_model_optimizer(dl_module)

#### 2. 任务创建

In [19]:
# Bundle the module and optimizer into a Task.
# NOTE(review): 'ce' presumably selects cross-entropy loss and cuda_device=0
# the first GPU — confirm against ark_nlp's Task.
model = Task(dl_module, optimizer, 'ce', cuda_device=0)

#### 3. 训练

In [None]:
# Fine-tune on the training dataset; the dev dataset is passed as the second
# argument (presumably used for per-epoch evaluation — confirm against Task.fit).
# The epoch count is taken from `num_epoches`, defined alongside `batch_size`
# in the hyper-parameter cell, rather than a separate hard-coded literal, so
# the configured value is the one that actually runs.
model.fit(
    tm_train_dataset,
    tm_dev_dataset,
    lr=2e-5,
    epochs=num_epoches,
    batch_size=batch_size,
)

<br>

### 四、模型验证与保存

#### 1. 模型验证

In [21]:
from ark_nlp.model.tm.bert import Predictor

In [22]:
# Wrap the Task's module, the tokenizer and the training label mapping (cat2id)
# in a Predictor for inference.
tm_predictor_instance = Predictor(model.module, tokenizer, tm_train_dataset.cat2id)

In [26]:
# Predict for a single [query, title] pair. The recorded output lists all four
# labels, apparently ordered from most to least likely (compare the scores
# returned by the return_proba=True call in the next cell).
tm_predictor_instance.predict_one_sample(['13个月宝宝不会说话，但会指', '十五个月大的宝宝还不会说话怎么办？'])

['1', '2', '0', '3']

In [28]:
# Same pair with return_proba=True: each label comes back paired with its score.
tm_predictor_instance.predict_one_sample(['13个月宝宝不会说话，但会指', '十五个月大的宝宝还不会说话怎么办？'], return_proba=True)

[('1', 0.6151635646820068),
 ('2', 0.28690773248672485),
 ('0', 0.09583227336406708),
 ('3', 0.002096480457112193)]

#### 2. Batch模型验证

In [29]:
# Load the unlabeled KUAKE-QTR test split, renaming 'query'/'title' to
# 'text_a'/'text_b' and keeping only the text pair.
test_data_df = (
    pd.read_json('../data/source_datasets/KUAKE-QTR/KUAKE-QTR_test.json')
    .rename(columns={'query': 'text_a', 'title': 'text_b'})
    .loc[:, ['text_a', 'text_b']]
)

# Build the test Dataset with the training dataset's categories and
# is_test=True, then run ID conversion with the shared tokenizer.
tm_test_dataset = Dataset(
    test_data_df,
    categories=tm_train_dataset.categories,
    is_test=True,
)
tm_test_dataset.convert_to_ids(tokenizer)

In [31]:
# Run batched prediction over the ID-converted test dataset.
predict_label = tm_predictor_instance.predict_batch(tm_test_dataset)

#### 3. 多样本验证

In [35]:
# Re-read the test split with its original 'query'/'title' column names.
test_data_df = pd.read_json(
    '../data/source_datasets/KUAKE-QTR/KUAKE-QTR_test.json'
).loc[:, ['query', 'title']]

# Score every pair one sample at a time; each record is
# [query, title, result of predict_one_sample for that pair].
record_ = [
    [query, title, tm_predictor_instance.predict_one_sample([query, title])]
    for query, title in zip(test_data_df['query'], test_data_df['title'])
]

In [36]:
# Spot-check one record: [query, title, predict_one_sample output].
record_[1024]

['劳拉停了又突然吃 感觉头晕', '服用维生素c突然停药后感觉头晕', ['0', '1', '2', '3']]

<br>

### 五、模型测试报告

In [22]:
1. 基本功能测试 通过
2. one sample predict 通过
3. batch predict 通过
4. 多样本验证