# 读取数据

In [1]:
# pandas 数据集读取，dataframe形式的
import pandas as pd
# 文件读取
import codecs

train_df = pd.read_csv('datasets/text_match/kaggle-quora-question-pairs/train.csv.zip')

train_df = train_df[train_df['question2'].apply(lambda x: isinstance(x, str))]
train_df = train_df[train_df['question1'].apply(lambda x: isinstance(x, str))]

In [2]:
train_df.tail()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1
404287,404287,537928,537929,What is one coin?,What's this coin?,0
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0
404289,404289,537932,537933,What is like to have sex with cousin?,What is it like to have sex with your cousin?,0


In [11]:
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import pandas as pd
import random
import re

In [5]:
# 划分为训练集和验证集
# stratify 按照标签进行采样，训练集和验证部分同分布
q1_train, q1_val, q2_train, q2_val, train_label, test_label =  train_test_split(
    train_df['question1'].iloc[:5000], 
    train_df['question2'].iloc[:5000],
    train_df['is_duplicate'].iloc[:5000],
    test_size=0.2, 
    stratify=train_df['is_duplicate'].iloc[:5000])

# 模型训练

## tokenizer

In [6]:
# pip install transformers
# transformers bert相关的模型使用和加载
from transformers import BertTokenizer
# 分词器，词典

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encoding = tokenizer(list(q1_train), list(q2_train), 
                           truncation=True, padding=True, max_length=100)
test_encoding = tokenizer(list(q1_val), list(q2_val), 
                          truncation=True, padding=True, max_length=100)

In [8]:
# input_ids：字的编码
# token_type_ids：标识是第一个句子还是第二个句子
# attention_mask：标识是不是填充

tokenizer(list(q1_train)[0], list(q2_train)[0])

{'input_ids': [101, 2515, 16021, 23091, 2490, 21025, 10343, 1029, 102, 2129, 2079, 1045, 2131, 2062, 8771, 2006, 16021, 23091, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
# 数据集读取
class QuoraDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    # 读取单个样本
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item
    
    def __len__(self):
        return len(self.labels)

train_dataset = QuoraDataset(train_encoding, list(train_label))
test_dataset = QuoraDataset(test_encoding, list(test_label))

In [12]:
print(train_dataset[0])
tokenizer.decode(train_dataset[0]['input_ids'])

{'input_ids': tensor([  101,  2515, 16021, 23091,  2490, 21025, 10343,  1029,   102,  2129,
         2079,  1045,  2131,  2062,  8771,  2006, 16021, 23091,  1029,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 

'[CLS] does instagram support gifs? [SEP] how do i get more followers on instagram? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [15]:
# 精度计算
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

## 模型训练

In [18]:
from transformers import BertForNextSentencePrediction, AdamW, get_linear_schedule_with_warmup
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 单个读取到批量读取
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

# 优化方法
optim = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * 1

# 学习率预热:在预热期间，学习率从0线性增加到优化器中的初始lr。
# 在预热阶段之后创建一个schedule，使其学习率从优化器中的初始lr线性降低到0
# 有助于减缓模型在初始阶段对mini-batch的提前过拟合现象，保持分布的平稳
# 有助于保持模型深层的稳定性
# 具体可以去看下知乎：https://www.zhihu.com/question/338066667
scheduler = get_linear_schedule_with_warmup(optim, #  The optimizer for which to schedule the learning rate.
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            # The number of steps for the warmup phase.
                                            num_training_steps = total_steps # The total number of training steps.
                                           )

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
# 训练函数
def train():
    model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    for batch in train_loader:
        # 正向传播
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        total_train_loss += loss.item()
        
        # 反向梯度信息
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # 参数更新
        optim.step()
        scheduler.step()

        iter_num += 1
        if(iter_num % 100==0):
            print("epoth: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, iter_num, loss.item(), iter_num/total_iter*100))
        
    print("Epoch: %d, Average training loss: %.4f"%(epoch, total_train_loss/len(train_loader)))
    
def validation():
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    for batch in test_dataloader:
        with torch.no_grad():
            # 正常传播
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        
        loss = outputs[0]
        logits = outputs[1]

        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        
    avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average testing loss: %.4f"%(total_eval_loss/len(test_dataloader)))
    print("-------------------------------")
    

for epoch in range(5):
    print("------------Epoch: %d ----------------" % epoch)
    train()
    validation()

------------Epoch: 0 ----------------
epoth: 0, iter_num: 100, loss: 0.4381, 40.00%
epoth: 0, iter_num: 200, loss: 0.5089, 80.00%
Epoch: 0, Average training loss: 0.5774
Accuracy: 0.7649
Average testing loss: 0.4712
-------------------------------
------------Epoch: 1 ----------------
epoth: 1, iter_num: 100, loss: 0.3928, 40.00%
epoth: 1, iter_num: 200, loss: 0.4835, 80.00%
Epoch: 1, Average training loss: 0.4340
Accuracy: 0.7639
Average testing loss: 0.4711
-------------------------------
------------Epoch: 2 ----------------
epoth: 2, iter_num: 100, loss: 0.3540, 40.00%
epoth: 2, iter_num: 200, loss: 0.4188, 80.00%
Epoch: 2, Average training loss: 0.4311
Accuracy: 0.7659
Average testing loss: 0.4711
-------------------------------
------------Epoch: 3 ----------------
epoth: 3, iter_num: 100, loss: 0.4385, 40.00%
epoth: 3, iter_num: 200, loss: 0.4898, 80.00%
Epoch: 3, Average training loss: 0.4328
Accuracy: 0.7639
Average testing loss: 0.4715
-------------------------------
--------

## FGM对抗训练

In [13]:
class FGM():
    def __init__(self, model):
        self.model = model
        self.backup = {}

    def attack(self, epsilon=0.001):
        # emb_name这个参数要换成你模型中embedding的参数名
        for name, param in self.model.named_parameters():
            if param.requires_grad and 'embeddings.word_embeddings' in name:
                
                # 保存原始参数
                self.backup[name] = param.data.clone()
                
                norm = torch.norm(param.grad)
                if norm != 0:
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self):
        # emb_name这个参数要换成你模型中embedding的参数名
        for name, param in self.model.named_parameters():
            if param.requires_grad and 'embeddings.word_embeddings' in name: 
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}
        
# 训练函数
def train():
    model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    fgm = FGM(model)
    for batch in train_loader:
        # 正向传播
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        total_train_loss += loss.item()
        
        # 反向梯度信息
        loss.backward()
                
        fgm.attack() # 在embedding上添加对抗扰动
        outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
        )
        outputs[0].backward() # 反向传播，并在正常的grad基础上，累加对抗训练的梯度
        fgm.restore() # 恢复embedding参数
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # 参数更新
        optim.step()
        scheduler.step()

        iter_num += 1
        if(iter_num % 100==0):
            print("epoth: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, iter_num, loss.item(), iter_num/total_iter*100))
        
    print("Epoch: %d, Average training loss: %.4f"%(epoch, total_train_loss/len(train_loader)))
    
def validation():
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    for batch in test_dataloader:
        with torch.no_grad():
            # 正常传播
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        
        loss = outputs[0]
        logits = outputs[1]

        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        
    avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average testing loss: %.4f"%(total_eval_loss/len(test_dataloader)))
    print("-------------------------------")
    

for epoch in range(2):
    print("------------Epoch: %d ----------------" % epoch)
    train()
    validation()

------------Epoch: 0 ----------------
epoth: 0, iter_num: 100, loss: 0.3866, 40.00%
epoth: 0, iter_num: 200, loss: 0.6430, 80.00%
Epoch: 0, Average training loss: 0.4196
Accuracy: 0.7450
Average testing loss: 0.4897
-------------------------------
------------Epoch: 1 ----------------
epoth: 1, iter_num: 100, loss: 0.4222, 40.00%
epoth: 1, iter_num: 200, loss: 0.4139, 80.00%
Epoch: 1, Average training loss: 0.4191
Accuracy: 0.7460
Average testing loss: 0.4873
-------------------------------


# 模型预测

In [39]:
predict_example = [
    [
     'What is the approx annual cost of living while studying in UIC Chicago, for an Indian student?',
     'I am having little hairfall problem but I want to use hair styling product. Which one should I prefer out of gel, wax and clay?'
    ],
    [
     "How would the bilateral relationship between India and the USA if Hillary Clinton wins the election?",
     "How would the bilateral relationship between India and the USA be under Hillary Clinton's presidency?"   
    ]
                   ]
predict_example = tokenizer(predict_example, padding=True, max_length=128)

In [40]:
predict_example

{'input_ids': [[101, 2054, 2003, 1996, 22480, 3296, 3465, 1997, 2542, 2096, 5702, 1999, 21318, 2278, 3190, 1010, 2005, 2019, 2796, 3076, 1029, 102, 1045, 2572, 2383, 2210, 2606, 13976, 3291, 2021, 1045, 2215, 2000, 2224, 2606, 20724, 4031, 1012, 2029, 2028, 2323, 1045, 9544, 2041, 1997, 21500, 1010, 13844, 1998, 5726, 1029, 102], [101, 2129, 2052, 1996, 17758, 3276, 2090, 2634, 1998, 1996, 3915, 2065, 18520, 7207, 5222, 1996, 2602, 1029, 102, 2129, 2052, 1996, 17758, 3276, 2090, 2634, 1998, 1996, 3915, 2022, 2104, 18520, 7207, 1005, 1055, 8798, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [41]:
tokenizer.decode(predict_example['input_ids'][0])

'[CLS] what is the approx annual cost of living while studying in uic chicago, for an indian student? [SEP] i am having little hairfall problem but i want to use hair styling product. which one should i prefer out of gel, wax and clay? [SEP]'

In [42]:
with torch.no_grad():
    # 正常传播
    input_ids = torch.tensor(predict_example['input_ids']).to(device)
    attention_mask =  torch.tensor(predict_example['attention_mask']).to(device)
    outputs = model(input_ids, attention_mask=attention_mask)
    print(outputs)
    result = outputs['logits'].argmax(axis=1)
    print(result)

NextSentencePredictorOutput(loss=None, logits=tensor([[ 2.4534, -1.4938],
        [-0.1257,  0.9566]], device='cuda:0'), hidden_states=None, attentions=None)
tensor([0, 1], device='cuda:0')


# 参考例子

- coggle例子：https://mp.weixin.qq.com/s/dE3cbTBH8JhwKjF0u_oxPw