In [None]:
import pandas as pd
import torch
from transformers import BertConfig

from ark_nlp.model.tm.unsupervised_simcse import SimCSE
from ark_nlp.model.tm.unsupervised_simcse import ModuleConfig
from ark_nlp.model.tm.unsupervised_simcse import Dataset
from ark_nlp.model.tm.unsupervised_simcse import Task
from ark_nlp.model.tm.unsupervised_simcse import get_default_model_optimizer
from ark_nlp.model.tm.unsupervised_simcse import Tokenizer
from ark_nlp.model.tm.unsupervised_simcse import Predictor

In [None]:
# Locations of the LCQMC train and dev splits (read later as tab-separated files).
train_data_path, dev_data_path = (
    '../data/source_datasets/LCQMC/train.txt',
    '../data/source_datasets/LCQMC/dev.txt',
)

### 一、数据读入与处理

#### 1. 数据读入

In [None]:
# Training frame: keep only `text_a` and duplicate it into `text_b`, so every
# training pair is a sentence paired with itself (no label column is kept).
train_data_df = (
    pd.read_csv(train_data_path, sep='\t')
    .loc[:, ['text_a']]
    .assign(text_b=lambda frame: frame['text_a'])
)

# Dev frame: keep both sentences together with the label column.
dev_data_df = pd.read_csv(dev_data_path, sep='\t').loc[:, ['text_a', 'text_b', 'label']]

In [None]:
# Wrap the train and dev frames in Dataset objects (train is built first, then dev).
simcse_train_dataset, simcse_dev_dataset = (
    Dataset(frame) for frame in (train_data_df, dev_data_df)
)

#### 2. 词典创建和生成分词器

In [None]:
# Load the tokenizer, built from the 'nghuyong/ernie-1.0' vocabulary with max_seq_len=64.
tokenizer = Tokenizer(vocab='nghuyong/ernie-1.0', max_seq_len=64)

#### 3. ID化

In [None]:
# Run convert_to_ids on the train and dev datasets (in that order) with the shared tokenizer.
for split_dataset in (simcse_train_dataset, simcse_dev_dataset):
    split_dataset.convert_to_ids(tokenizer)

<br>

### 二、模型构建

#### 1. 模型参数设置

In [None]:
# Load the configuration published with the 'nghuyong/ernie-1.0' checkpoint,
# overriding num_labels to 2. The resulting object is handed to
# SimCSE.from_pretrained when the model is created.
bert_config = BertConfig.from_pretrained(
    'nghuyong/ernie-1.0',
    num_labels=2
)

In [None]:
# Release cached, unoccupied GPU memory held by PyTorch's allocator before the model is created.
torch.cuda.empty_cache()

#### 2. 模型创建

In [None]:
# Instantiate SimCSE from the 'nghuyong/ernie-1.0' checkpoint using the config
# prepared above and dropout=0.3.
dl_module = SimCSE.from_pretrained('nghuyong/ernie-1.0', config=bert_config, dropout=0.3)

<br>

### 三、任务构建

#### 1. 任务参数和必要部件设定

In [None]:
# Training hyper-parameters: number of epochs and mini-batch size.
num_epoches, batch_size = 10, 64

In [None]:
# Build two optimizer parameter groups from the module's named parameters:
#   * parameters whose name contains 'pooler' are left out entirely;
#   * names containing "bias" or "LayerNorm.weight" get weight_decay 0.0;
#   * everything else gets weight_decay 0.01.
no_decay = ["bias", "LayerNorm.weight"]

param_optimizer = [
    (name, param)
    for name, param in dl_module.named_parameters()
    if 'pooler' not in name
]

decay_params, no_decay_params = [], []
for name, param in param_optimizer:
    if any(marker in name for marker in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': no_decay_params, 'weight_decay': 0.0},
]

#### 2. 任务创建

In [None]:
# Wrap the module in a Task bound to CUDA device 0.
# NOTE(review): 'adamw' and 'ce' are presumably the optimizer and loss-function
# names — confirm against Task's signature.
model = Task(dl_module, 'adamw', 'ce', cuda_device=0)

#### 3. 训练

In [None]:
# Train on the training dataset, passing the dev dataset as the second
# positional argument (presumably used for evaluation — confirm against Task.fit).
# The mini-batch size comes from the `batch_size` setting instead of a repeated
# literal, so editing the settings cell actually takes effect here.
# NOTE(review): `num_epoches` is set to 10 in the settings cell, but this call
# trains for a single epoch — confirm which value is intended.
model.fit(
    simcse_train_dataset,
    simcse_dev_dataset,
    lr=1e-5,
    epochs=1,
    batch_size=batch_size,
    params=optimizer_grouped_parameters
)

<br>

### 四、模型验证与保存

#### 1. 模型验证

In [None]:
# Build a predictor from the trained module, the shared tokenizer and the dev
# dataset's category-to-id mapping.
simcse_predictor_instance = Predictor(model.module, tokenizer, simcse_dev_dataset.cat2id)

In [None]:
# Smoke-test single-pair inference with the predictor's default arguments.
simcse_predictor_instance.predict_one_sample(['感冒', '恐惧'])

In [None]:
# Same sentence pair with return_proba=True.
# NOTE(review): presumably this also returns the score behind the label — confirm against Predictor.
simcse_predictor_instance.predict_one_sample(['感冒', '恐惧'], return_proba=True)

In [None]:
# Same sentence pair with threshold=None.
# NOTE(review): presumably this returns the raw similarity instead of a thresholded label — confirm against Predictor.
simcse_predictor_instance.predict_one_sample(['感冒', '恐惧'], threshold=None)

#### 2. Batch模型验证

In [None]:
# Location of the LCQMC test split used for the batch validation below.
test_data_path = '../data/source_datasets/LCQMC/test.txt'

In [None]:
# Read the tab-separated test split and keep the sentence pair plus its label.
test_data_df = pd.read_csv(test_data_path, sep='\t').loc[:, ['text_a', 'text_b', 'label']]

In [None]:
# Build the test dataset in test mode and convert it to ids.
# The training frame carries no `label` column (only text_a / text_b), so the
# categories are taken from the dev dataset — the same dataset whose cat2id
# mapping configures the predictor — keeping the label vocabulary consistent.
simcse_test_dataset = Dataset(
    test_data_df,
    categories=simcse_dev_dataset.categories,
    is_test=True
)
simcse_test_dataset.convert_to_ids(tokenizer)

In [None]:
# Run the predictor's predict_batch over the whole test dataset and keep the result.
predict_label = simcse_predictor_instance.predict_batch(simcse_test_dataset)

#### 3. 多样本验证

In [None]:
# Reload the test split so this cell is self-contained.
test_data_path = '../data/source_datasets/LCQMC/test.txt'
test_data_df = pd.read_csv(test_data_path, sep='\t').loc[:, ['text_a', 'text_b', 'label']]

# Score every test pair one at a time; each record is
# [text_a, text_b, result of predict_one_sample on that pair].
record_ = [
    [sentence_a, sentence_b, simcse_predictor_instance.predict_one_sample([sentence_a, sentence_b])]
    for sentence_a, sentence_b in zip(test_data_df['text_a'], test_data_df['text_b'])
]

<br>

### 五、模型测试报告

1. 基本功能测试 通过
2. one sample predict 通过
3. batch predict  通过
4. 多样本验证  通过