In [1]:
!pip install --upgrade paddlenlp -q

In [None]:
# 查看当前使用 paddle以及生态工具的版本
!pip list | grep paddle

paddlehub              2.0.4          
paddlenlp              2.0.5          
paddlepaddle-gpu       2.1.0.post101  
tb-paddle              0.3.6          


## 比赛结果


## 任务 & 数据集
文本相似度：https://aistudio.baidu.com/aistudio/competition/detail/45

## 数据浏览 & 数据处理

In [None]:
# 解压数据集, 如果存在跳过此步骤
# !unzip -q data/data52714/bq_corpus.zip 
# !unzip -q data/data52714/lcqmc.zip 
# !unzip -q data/data52714/paws-x-zh.zip

# print("--- lcqmc ----")
# !head -n 5 lcqmc/train.tsv
# print("---bq_corpus----")
# !head -n 5 bq_corpus/train.tsv
# print("---paws-x-zh----")
# !head -n 5 paws-x-zh/train.tsv

In [None]:
import paddle
import paddlenlp
from paddlenlp.datasets import load_dataset

def get_ds(ds_name='lcqmc'):
    """Load the train/dev/test splits of one corpus through the "lcqmc" reader.

    Args:
        ds_name: Directory that holds ``train.tsv``, ``dev.tsv`` and
            ``test.tsv``. The notebook uses 'lcqmc', 'bq_corpus' and
            'paws-x-zh'.

    Returns:
        The three datasets returned by ``load_dataset``, in the order
        train, dev, test.
    """
    split_paths = ['{}/{}.tsv'.format(ds_name, split)
                   for split in ('train', 'dev', 'test')]

    # The reader name stays "lcqmc" for every corpus; only the files differ.
    train_ds, dev_ds, test_ds = load_dataset("lcqmc", data_files=split_paths)
    return train_ds, dev_ds, test_ds

# Select exactly one dataset per run of the notebook.
# data_name = 'lcqmc'

# data_name = 'bq_corpus'

# Note: some rows of the paws-x-zh dataset contain only a single column, so
# paddlenlp.datasets.lcqmc.py has to be patched before loading it.
# See: https://github.com/PaddlePaddle/PaddleNLP/pull/553/files
data_name = 'paws-x-zh'

# Load the three splits of the selected dataset.
train_ds, dev_ds, test_ds = get_ds(data_name)

In [None]:
# Display the first entry of test_ds.data as a quick sanity check.
test_ds.data[0]

In [None]:
# List of pretrained models: https://github.com/PaddlePaddle/PaddleNLP/blob/develop/docs/model_zoo/transformers.rst

# ERNIE-GRAM (alternative, currently disabled)
# tokenizer = paddlenlp.transformers.ErnieGramTokenizer.from_pretrained('ernie-gram-zh')

# RoBERTa: the tokenizer name must match the pretrained model loaded later.
tokenizer = paddlenlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext-large')

def convert_example(example, tokenizer, max_seq_length=512, is_test=False):
    """Encode one (query, title) pair into model input features.

    Args:
        example: Mapping with "query" and "title" entries, plus a "label"
            entry when ``is_test`` is False.
        tokenizer: Callable invoked with ``text``, ``text_pair`` and
            ``max_seq_len``; its result must expose "input_ids" and
            "token_type_ids".
        max_seq_length: Value forwarded to the tokenizer as ``max_seq_len``.
        is_test: When True the label is not read and not returned.

    Returns:
        ``(input_ids, token_type_ids)`` when ``is_test`` is True, otherwise
        ``(input_ids, token_type_ids, label)`` where ``label`` is a
        one-element int64 numpy array.
    """
    encoded = tokenizer(
        text=example["query"],
        text_pair=example["title"],
        max_seq_len=max_seq_length)
    features = (encoded["input_ids"], encoded["token_type_ids"])

    # Prediction data carries no label, so only the two input fields are returned.
    if is_test:
        return features

    return features + (np.array([example["label"]], dtype="int64"),)
        

In [None]:
import paddle
import numpy as np
# 为了后续方便使用，我们使用python偏函数（partial）给 convert_example 赋予一些默认参数
from functools import partial
from paddlenlp.data import Tuple, Pad, Stack


# Sample-conversion function for the training and dev sets (labels included).
trans_func = partial(
    convert_example,
    tokenizer=tokenizer,
    max_seq_length=512)

# Each training sample yields three fields: input_ids, token_type_ids, labels,
# so one batching operation is defined per field.
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input_ids
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # token_type_ids
    Stack(dtype="int64")  # label
): [data for data in fn(samples)]
# Distributed sampler: splits the training data automatically and supports
# multi-GPU training.
# A larger batch_size trains faster but uses more GPU memory; balance the two.
batch_sampler = paddle.io.DistributedBatchSampler(train_ds, batch_size=32, shuffle=True) #128

# Build train_data_loader on top of train_ds.
# Because DistributedBatchSampler is used, the loader shards the training data
# automatically.
train_data_loader = paddle.io.DataLoader(
        dataset=train_ds.map(trans_func),
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        return_list=True)

# The dev set is evaluated on a single card, so a plain paddle.io.BatchSampler
# is sufficient.
# NOTE(review): `batch_sampler` is rebound here; the training loader above
# already holds a reference to the previous sampler object.
batch_sampler = paddle.io.BatchSampler(dev_ds, batch_size=16, shuffle=False)
dev_data_loader = paddle.io.DataLoader(
        dataset=dev_ds.map(trans_func),
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        return_list=True)

In [None]:
# =======================================================================
# Test set
# Note: prediction data has no label, so convert_example is called with
# is_test=True.
# NOTE(review): this cell rebinds `trans_func`, `batchify_fn` and
# `batch_sampler`, which the train/dev cell also defines.
# =======================================================================
trans_func = partial(
    convert_example,
    tokenizer=tokenizer,
    max_seq_length=512,
    is_test=True)

# Batching for prediction data.
# Prediction samples only carry input_ids and token_type_ids, so batchify_fn
# needs just two Pad objects.
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input_ids
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment_ids
): [data for data in fn(samples)]

# shuffle=False keeps the batches in dataset order.
batch_sampler = paddle.io.BatchSampler(test_ds, batch_size=32, shuffle=False)

# Build the data loader used for prediction.
predict_data_loader =paddle.io.DataLoader(
        dataset=test_ds.map(trans_func),
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        return_list=True)

In [None]:
# Display the first entry of test_ds.data again, after the loaders were built.
test_ds.data[0]

## 建模
详情参考：https://github.com/PaddlePaddle/PaddleNLP/blob/v2.0.2/examples/text_matching/ernie_matching/train_pointwise.py
* 注意： paddlenlp == 2.0.2

主要有以下几种实现方式：
 1. 基于单塔 Point-wise 范式的语义匹配模型 ernie_matching: 模型精度高、计算复杂度高, 适合直接进行语义匹配 2 分类的应用场景。
 2. 基于单塔 Pair-wise 范式的语义匹配模型 ernie_matching: 模型精度高、计算复杂度高, 对文本相似度大小的序关系建模能力更强，适合将相似度特征作为上层排序模块输入特征的应用场景。
 3. 基于双塔 Point-wise 范式的语义匹配模型 SimNet 和 Sentence Transformers, 这 2 种方案计算效率更高，适合对延时要求高、根据语义相似度进行粗排的应用场景。

### 基于单塔 Point-wise 范式的语义匹配模型

#### 环境 init

In [None]:
import random
def set_seed(seed):
    """Seed Python's `random`, NumPy and Paddle with the same value.

    Args:
        seed: Value passed unchanged to each of the three seeding functions.
    """
    for seed_fn in (random.seed, np.random.seed, paddle.seed):
        seed_fn(seed)

In [None]:
# Fix every random number generator before the classifier head is created and
# training starts. Previously `seed` was defined but set_seed() was never
# called, so the value had no effect.
seed = 1000
set_seed(seed)

# Run on whatever device Paddle reports as the current one.
device = paddle.get_device()
paddle.set_device(device)

### 模型定义

In [None]:
# List of pretrained models: https://github.com/PaddlePaddle/PaddleNLP/blob/develop/docs/model_zoo/transformers.rst

# ERNIE-GRAM (alternative, currently disabled)
# pretrained_model = paddlenlp.transformers.ErnieGramModel.from_pretrained('ernie-gram-zh')

# RoBERTa: same model name as the tokenizer created earlier in the notebook.
pretrained_model = paddlenlp.transformers.RobertaModel.from_pretrained('roberta-wwm-ext-large')

In [None]:
import paddle.nn as nn
import paddle.nn.functional as F

# 定义 Point-wise 语义匹配网络
class PointwiseMatching(nn.Layer):
    """Point-wise text-matching network: encoder, dropout, 2-way linear head.

    Args:
        pretrained_model: Encoder layer. It must expose ``config["hidden_size"]``
            and return a pair whose second item is fed to the classifier.
        dropout: Dropout probability; 0.1 is used when None is given.
    """

    def __init__(self, pretrained_model, dropout=None):
        super().__init__()
        self.ptm = pretrained_model

        drop_rate = 0.1 if dropout is None else dropout
        self.dropout = nn.Dropout(drop_rate)

        # Two output units: "similar" vs. "not similar".
        hidden_size = self.ptm.config["hidden_size"]
        self.classifier = nn.Linear(hidden_size, 2)

    def forward(self,
                input_ids,
                token_type_ids=None,
                position_ids=None,
                attention_mask=None):
        """Return softmax probabilities over the two classes.

        ``input_ids`` holds the token ids of both texts concatenated and
        ``token_type_ids`` marks which segment each token belongs to.
        NOTE(review): the output is already softmax-normalised, so any loss
        applied to it must not apply softmax a second time.
        """
        # The second encoder output is used as the representation of the pair.
        _sequence_output, pooled_output = self.ptm(
            input_ids, token_type_ids, position_ids, attention_mask)

        pooled_output = self.dropout(pooled_output)
        return F.softmax(self.classifier(pooled_output))


In [None]:
# Wrap the pretrained encoder with the matching head (default dropout).
model = PointwiseMatching(pretrained_model)

### 设置优化策略

In [None]:
from paddlenlp.transformers import LinearDecayWithWarmup

epochs = 10
num_training_steps = len(train_data_loader) * epochs

# Linear-decay schedule with a warmup proportion of 0.0.
# NOTE(review): this scheduler is NOT passed to the optimizer below, so
# stepping it has no influence on the learning rate actually used. The
# effective schedule is `lr_val`.
lr_scheduler = LinearDecayWithWarmup(5E-5, num_training_steps, 0.0)

# Generate parameter names needed to perform weight decay.
# All bias and LayerNorm parameters are excluded.
decay_params = [
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]

# Optimizer. The learning rate is halved at each milestone of `lr_val`;
# milestones are counted in calls to lr_val.step().
lr_val = paddle.optimizer.lr.MultiStepDecay(learning_rate=5e-6, milestones=[3, 6, 9], gamma=0.5, verbose=True)
optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_val,
    parameters=model.parameters(),
    weight_decay=0.001,
    apply_decay_param_fun=lambda x: x in decay_params)

# Cross-entropy loss. PointwiseMatching.forward already returns softmax
# probabilities, so the loss must not apply softmax again: with the default
# use_softmax=True the probabilities would be squashed a second time, which
# flattens the gradients and bounds the loss away from zero.
criterion = paddle.nn.loss.CrossEntropyLoss(use_softmax=False)

# Accuracy is the evaluation metric.
metric = paddle.metric.Accuracy()

In [None]:
@paddle.no_grad()
def evaluate(model, criterion, metric, data_loader, phase="dev"):
    """Evaluate `model` on `data_loader` and print mean loss and accuracy.

    Args:
        model: Layer called with ``input_ids`` and ``token_type_ids``.
        criterion: Loss callable applied to ``(model output, labels)``.
        metric: Metric object providing ``reset``, ``compute``, ``update`` and
            ``accumulate``. It is reset both before and after evaluation.
        data_loader: Iterable of ``(input_ids, token_type_ids, labels)`` batches.
        phase: Label used in the printed line.

    Returns:
        The value of ``metric.accumulate()`` after all batches were processed.
    """
    model.eval()
    metric.reset()
    losses = []
    for input_ids, token_type_ids, labels in data_loader:
        probs = model(input_ids=input_ids, token_type_ids=token_type_ids)
        losses.append(criterion(probs, labels).numpy())
        metric.update(metric.compute(probs, labels))

    # Accumulate once after the loop: this avoids a redundant call per batch and
    # keeps `accu` defined even when the loader yields no batches.
    accu = metric.accumulate()
    mean_loss = np.mean(losses) if losses else float("nan")
    print("eval {} loss: {:.5}, accu: {:.5}".format(phase, mean_loss, accu))

    # Restore training mode and clear the metric for the caller.
    model.train()
    metric.reset()
    return accu

### 训练

In [None]:
# Path where the best model (by dev accuracy) is saved.
model_file = "{}/model/model_state.pdparams".format(data_name)
# model_file = "{}/roberta_model/model_state.pdparams".format(data_name)
# Echo the path as the cell output.
model_file

In [None]:
# Train loop: fine-tune the model, evaluate periodically on the dev set, keep
# the best checkpoint in `model_file` and one checkpoint per epoch.

import time 
import os

LOG_EVERY = 100    # steps between training-metric printouts
EVAL_EVERY = 200   # steps between dev-set evaluations

global_step = 0
tic_train = time.time()
max_acc = 0
for epoch in range(1, epochs + 1):
    # Trick: after loading a previously trained model, dev_data_loader can be
    # used here instead to train the parameters a bit further.
    for step, batch in enumerate(train_data_loader, start=1):

        input_ids, token_type_ids, labels = batch
        probs = model(input_ids=input_ids, token_type_ids=token_type_ids)
        loss = criterion(probs, labels)
        correct = metric.compute(probs, labels)
        metric.update(correct)
        acc = metric.accumulate()

        global_step += 1

        # Print training metrics every LOG_EVERY steps. The speed uses the same
        # interval; it was previously hard-coded to 10 although the log fires
        # every 100 steps, which under-reported the speed by a factor of 10.
        if global_step % LOG_EVERY == 0:
            print(
                "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                % (global_step, epoch, step, loss, acc,
                    LOG_EVERY / (time.time() - tic_train)))
            tic_train = time.time()
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        # Evaluate on the dev set every EVAL_EVERY steps.
        if global_step % EVAL_EVERY == 0:
            acc = evaluate(model, criterion, metric, dev_data_loader, "dev")

            # Only keep checkpoints that reach 0.85 and beat the best so far.
            if acc >= 0.85 and acc > max_acc:
                print("current step: {}, acc: {}".format(global_step, acc))
                # model_file = "{}/model_{}/model_state.pdparams".format(data_name, global_step)
                if os.path.exists(model_file):
                    os.remove(model_file)
                paddle.save(model.state_dict(), model_file)
                max_acc = acc

    # Advance the schedule that the optimizer actually uses. The loop used to
    # call lr_scheduler.step(), but that scheduler is not attached to the
    # optimizer, so `lr_val` (MultiStepDecay, milestones 3/6/9) never advanced
    # and the learning rate never decayed. The milestones are stepped once per
    # epoch here.
    lr_val.step()

    # Keep one checkpoint per epoch in addition to the best one.
    epoch_model_file = "{}/model_{}/model_state.pdparams".format(data_name, epoch)
    if os.path.exists(epoch_model_file):
        os.remove(epoch_model_file)
    paddle.save(model.state_dict(), epoch_model_file)

### 预测

In [19]:
def predict(model, data_loader):
    """Run `model` over every batch of `data_loader` and stack the outputs.

    Args:
        model: Layer called with ``input_ids`` and ``token_type_ids``; it is
            switched to eval mode first so dropout and similar ops are disabled.
        data_loader: Iterable of ``(input_ids, token_type_ids)`` batches.

    Returns:
        A numpy array with the per-batch model outputs concatenated along
        axis 0 (expected to be [num_samples, 2] for the notebook's two-class
        model).
    """
    model.eval()

    collected = []
    with paddle.no_grad():
        for ids, segment_ids in data_loader:
            batch_output = model(
                input_ids=paddle.to_tensor(ids),
                token_type_ids=paddle.to_tensor(segment_ids))
            collected.append(batch_output.numpy())

    return np.concatenate(collected, axis=0)

In [None]:
# Optional step: rebuild the model and load the best saved checkpoint.
# ERNIE-GRAM (alternative, currently disabled)
# pretrained_model = paddlenlp.transformers.ErnieGramModel.from_pretrained('ernie-gram-zh')

# RoBERTa 
pretrained_model = paddlenlp.transformers.RobertaModel.from_pretrained('roberta-wwm-ext-large')

model = PointwiseMatching(pretrained_model)

# Replace the freshly initialised weights with the ones stored at model_file.
state_dict = paddle.load(model_file)
model.set_dict(state_dict)

[2021-07-13 11:07:21,991] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/roberta-wwm-ext-large/roberta_chn_large.pdparams


In [None]:
# Run the prediction function on the test loader.
y_probs = predict(model, predict_data_loader)

# Turn the predicted probabilities into labels (index of the larger column).
y_preds = np.argmax(y_probs, axis=1)

In [None]:
# import os
# if os.path.exists("submit.zip"):
#     os.remove("submit.zip")

#### 输出预测结果
* 满足比赛的输出格式 （https://aistudio.baidu.com/aistudio/competition/detail/45）

In [None]:
# Write the predictions as a two-column TSV (index, prediction) inside the
# dataset directory, in the format expected by the competition.
file_name = "{}/{}.tsv".format(data_name, data_name)
print("will generate file: ", file_name)

with open(file_name, 'w', encoding="utf-8") as f:
    f.write("index\tprediction\n")
    # One row per test sample, numbered from 0 in prediction order.
    f.writelines(
        "{}\t{}\n".format(idx, y_pred) for idx, y_pred in enumerate(y_preds))

### 打包预测结果，提交至比赛处
* 当所有的数据（lcqmc、bq_corpus、paws-x-zh）都分别训练，并产出预测文件后（lcqmc.tsv、paws-x.tsv、bq_corpus.tsv），我们将其打包

In [None]:
# # 打包预测结果
# !mv paws-x-zh.tsv paws-x.tsv
# !zip submit.zip lcqmc.tsv paws-x.tsv bq_corpus.tsv

  adding: lcqmc.tsv (deflated 65%)
  adding: paws-x.tsv (deflated 64%)
  adding: bq_corpus.tsv (deflated 65%)


In [None]:
# Predict with every per-epoch checkpoint and write one TSV per checkpoint.
# `os` is imported here so the cell also works when the training cell (which
# imports it) was not run in this kernel session.
import os

# Create the output directory up front; open(..., 'w') does not create missing
# directories and would otherwise fail with FileNotFoundError.
out_dir = data_name + "/out_files_ro"
os.makedirs(out_dir, exist_ok=True)

# NOTE(review): 10 presumably mirrors `epochs = 10` from the training setup;
# keep the two in sync.
for i in range(10):
    # Checkpoint directories are numbered from 1 (model_1 .. model_10) ...
    model_file = data_name + "/model_" + str(i+1) + "/model_state.pdparams"
    # pretrained_model = paddlenlp.transformers.ErnieGramModel.from_pretrained('ernie-gram-zh')

    # RoBERTa 
    pretrained_model = paddlenlp.transformers.RobertaModel.from_pretrained('roberta-wwm-ext-large')

    model = PointwiseMatching(pretrained_model)
    state_dict = paddle.load(model_file)
    model.set_dict(state_dict)

    # Run the prediction function.
    y_probs = predict(model, predict_data_loader)

    # Turn the predicted probabilities into labels.
    y_preds = np.argmax(y_probs, axis=1)

    # ... while the output files are numbered from 0 (0.tsv .. 9.tsv).
    file_name = out_dir + "/{}.tsv".format(str(i))
    print("will generate file: ", file_name)
    with open(file_name, 'w', encoding="utf-8") as f:
        f.write("index\tprediction\n")    
        for idx, y_pred in enumerate(y_preds):
            f.write("{}\t{}\n".format(idx, y_pred))

In [2]:
!python create_zip.py

lcqmc/out_files_ro/0.tsv bq_corpus/out_files_ro/0.tsv paws-x-zh/out_files_ro/0.tsv
  adding: submission/bq_corpus.tsv (deflated 65%)
  adding: submission/lcqmc.tsv (deflated 65%)
  adding: submission/paws-x.tsv (deflated 64%)
lcqmc/out_files_ro/1.tsv bq_corpus/out_files_ro/1.tsv paws-x-zh/out_files_ro/1.tsv
  adding: submission/bq_corpus.tsv (deflated 65%)
  adding: submission/lcqmc.tsv (deflated 65%)
  adding: submission/paws-x.tsv (deflated 64%)
lcqmc/out_files_ro/2.tsv bq_corpus/out_files_ro/2.tsv paws-x-zh/out_files_ro/2.tsv
  adding: submission/bq_corpus.tsv (deflated 65%)
  adding: submission/lcqmc.tsv (deflated 65%)
  adding: submission/paws-x.tsv (deflated 64%)
lcqmc/out_files_ro/3.tsv bq_corpus/out_files_ro/3.tsv paws-x-zh/out_files_ro/3.tsv
  adding: submission/bq_corpus.tsv (deflated 65%)
  adding: submission/lcqmc.tsv (deflated 65%)
  adding: submission/paws-x.tsv (deflated 64%)
lcqmc/out_files_ro/4.tsv bq_corpus/out_files_ro/4.tsv paws-x-zh/out_files_ro/4.tsv
  adding: sub