In [None]:
! pip install transformers==4.0.1

Looking in indexes: https://mirror.baidu.com/pypi/simple/
Collecting transformers==4.0.1
[?25l  Downloading https://mirror.baidu.com/pypi/packages/ed/db/98c3ea1a78190dac41c0127a063abf92bd01b4b0b6970a6db1c2f5b66fa0/transformers-4.0.1-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 14.5MB/s eta 0:00:01
[?25hCollecting filelock (from transformers==4.0.1)
  Downloading https://mirror.baidu.com/pypi/packages/93/83/71a2ee6158bb9f39a90c0dea1637f81d5eef866e188e1971a1b1ab01a35a/filelock-3.0.12-py3-none-any.whl
Collecting packaging (from transformers==4.0.1)
[?25l  Downloading https://mirror.baidu.com/pypi/packages/3c/77/e2362b676dc5008d81be423070dd9577fa03be5da2ba1105811900fda546/packaging-21.0-py3-none-any.whl (40kB)
[K     |████████████████████████████████| 40kB 15.6MB/s eta 0:00:01
Collecting tokenizers==0.9.4 (from transformers==4.0.1)
[?25l  Downloading https://mirror.baidu.com/pypi/packages/fb/36/59e4a62254c5fcb43894c6b0e9403ec6f4238cc2422a003ed2e6279a1784/t

![文本相似度](https://img-blog.csdnimg.cn/136dde6917294a3aa8200b7b853d9133.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![train.json](https://img-blog.csdnimg.cn/img_convert/d0484fd918be5f239de10499fb07879c.png)

![test.json](https://img-blog.csdnimg.cn/img_convert/84e411156968837c4071bf0f22a76c2b.png)

In [None]:
# Load the training split (used below as a pandas-style frame with `text_a` / `text_b` columns).
# NOTE(review): this one-argument `read_data(path)` is not defined in any visible cell, and a later
# cell defines `read_data(data_df, train_val_ratio, token2id, mode)` with a different signature —
# confirm where the file loader lives so that Restart & Run All works.
train_df = read_data('data/data100821/train.json')

In [None]:
# Preview the first 50 rows of the training split.
train_df.head(50)

In [None]:
# Load the dev split with the same one-argument loader used for the training split.
dev_df = read_data('data/data100821/dev.json')

In [None]:
# Histogram of the combined length of text_a + text_b, drawn for the train split and then the dev split.
for split_df in (train_df, dev_df):
    (split_df.text_a.str.len() + split_df.text_b.str.len()).hist(bins=20)

In [None]:
# 99th percentile of the combined text_a + text_b length in the training split.
# (The opening parenthesis was missing, which made this cell a SyntaxError.)
(train_df.text_a.str.len() + train_df.text_b.str.len()).quantile(0.99)

In [None]:
import torch
import random
import numpy as np
# Experiment-wide settings: data paths, preprocessing sizes, optimisation
# hyper-parameters and the seed. The device is 'cuda' when PyTorch reports a
# usable GPU and 'cpu' otherwise.
config = dict(
    train_file_path='data/data100821/train.json',
    dev_file_path='data/data100821/dev.json',
    test_file_path='data/data100821/test.json',
    embedding_file_path='data/data100821/sgns.weibo.word.bz2',
    train_val_ratio=0.1,
    vocab_size=30000,
    batch_size=64,
    max_seq_len=64,
    num_epochs=1,
    learning_rate=1e-3,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    logging_step=200,
    seed=2021,
)

def seed_everything(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices).

    Args:
        seed: Value passed unchanged to each library's seeding function.

    Returns:
        The same `seed` that was passed in.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    return seed

# Seed every RNG once, up front; as the last expression of the cell this also echoes the seed.
seed_everything(config['seed'])

In [None]:
def get_embedding(vocab, embedding_file_path):
    """Build an embedding matrix and a token→index map from a bz2 word-vector file.

    The file is read as text-format word vectors: a header line whose first
    field is the token count and whose last field is the vector size, followed
    by one `token v1 v2 ...` line per token. Only tokens contained in `vocab`
    are kept.

    Index layout of the result:
        0 -> '<pad>' (all-zero vector)
        1 -> '<unk>' (all-zero vector)
        2 -> '<bos>' (random vector from `np.random.random`)
        3 -> '<eos>' (random vector from `np.random.random`)
        4.. -> kept tokens, in the order they appear in the file

    Args:
        vocab: Container of tokens; only `token in vocab` and `len(vocab)` are used.
        embedding_file_path: Path to the bz2-compressed embedding file.

    Returns:
        A tuple `(embedding_matrix, token2idx, vocab_size)` where
        `embedding_matrix` is a float tensor with one row per index,
        `token2idx` maps tokens (including the four reserved ones) to row
        indices, and `vocab_size` is `len(vocab) + 4`.

    Raises:
        ValueError: If the header line does not contain at least two fields.

    NOTE(review): `vocab_size` is kept as `len(vocab) + 4` for compatibility.
    The matrix has only `4 + number of vocab tokens found in the file` rows,
    so the two can differ — confirm which value callers actually need.
    """
    print('processing embedding file ...')

    UNK, PAD, BOS, EOS = '<unk>', '<pad>', '<bos>', '<eos>'
    # Listed in index order: PAD=0, UNK=1, BOS=2, EOS=3.
    special_tokens = (PAD, UNK, BOS, EOS)

    token2embedding = {}

    with bz2.open(embedding_file_path) as f:
        meta_info = f.readline().split()
        if len(meta_info) < 2:
            raise ValueError(f'malformed header in embedding file: {embedding_file_path}')
        # Convert to int so the message shows plain numbers instead of bytes reprs (b'...').
        num_tokens, emb_dim = int(meta_info[0]), int(meta_info[-1])
        print(f'{num_tokens} tokens in embedding file in total, vector size is {emb_dim}')

        # Stream the file line by line instead of materialising every line with readlines().
        for line in tqdm(f, total=num_tokens):
            fields = line.split()
            if not fields:
                # Skip blank lines (e.g. a trailing newline) instead of failing on fields[0].
                continue
            token = fields[0].decode('utf8')
            # Reserved tokens keep their fixed indices and vectors; letting the file
            # supply one of them would leave a gap in the index range.
            if token in vocab and token not in special_tokens:
                token2embedding[token] = [float(num) for num in fields[1:]]

    token2idx = {token: idx for idx, token in enumerate(special_tokens)}
    for token in token2embedding:
        token2idx[token] = len(token2idx)

    # Rows 0-3 belong to the reserved tokens; BOS is drawn before EOS so the
    # random draws happen in the same order as before.
    emb_mat = [
        [.0] * emb_dim,
        [.0] * emb_dim,
        np.random.random(emb_dim).tolist(),
        np.random.random(emb_dim).tolist(),
    ]
    # dict preserves insertion order, so these rows line up with indices 4, 5, ...
    emb_mat.extend(token2embedding.values())

    return torch.tensor(emb_mat, dtype=torch.float), token2idx, len(vocab) + 4

In [None]:
# Build the pretrained embedding matrix and token→index map, and overwrite config['vocab_size']
# with the size returned by get_embedding.
# NOTE(review): `vocab` is not defined in any visible cell — confirm the cell that builds it runs first.
embedding_matrix, token2idx, config['vocab_size'] = get_embedding(vocab, config['embedding_file_path'])

In [None]:
from collections import defaultdict
def tokenizer(sent, token2id):
    """Segment `sent` with jieba and convert each token to its id.

    Args:
        sent: Text to segment.
        token2id: Mapping from token to integer id.

    Returns:
        A list of ids, one per jieba token. Tokens missing from `token2id`
        are mapped to 1, the id used for the unknown token.
    """
    unk_id = 1
    token_ids = []
    for piece in jieba.cut(sent):
        token_ids.append(token2id.get(piece, unk_id))
    return token_ids

In [None]:
def read_data(data_df, train_val_ratio, token2id, mode='train'):
    """Turn a data frame into feature containers and label tensors.

    NOTE(review): this is an unfinished template. The body of the row loop
    (between the "new" markers) is blank, so the function is not valid Python
    as written and none of the containers are ever filled. `token2id` is not
    used by the visible code; presumably the missing section tokenizes with it.

    Args:
        data_df: Frame iterated with `.iterrows()`; `len(data_df)` sets the
            progress-bar total and the validation size.
        train_val_ratio: Fraction of `len(data_df)` used to compute `num_val`
            (train mode only).
        token2id: Token→id mapping (unused in the visible code).
        mode: 'train' builds train/validation outputs; any other value builds
            the test outputs.

    Returns:
        In train mode: `(X_train, y_train, X_val, y_val, label2id, id2label)`,
        with `y_train` / `y_val` as long tensors of label ids.
        Otherwise: `(X_test, y_test)` with `y_test` as a long tensor.
    """
    if mode == 'train':
        X_train, y_train = defaultdict(list), []
        X_val, y_val = defaultdict(list), []
        # Number of rows reserved for validation (truncated towards zero).
        num_val = int(len(data_df) * train_val_ratio)
    else:
        X_test, y_test = defaultdict(list), []

    for i, row in tqdm(data_df.iterrows(), desc=f'Preprocessing {mode} data', total=len(data_df)):
        # -------------  new ---------------------------#
        # TODO: per-row preprocessing was left blank in this template.

        # -------------  new ---------------------------#

    if mode == 'train':
        # Label ids follow the sorted order of the distinct labels seen in y_train.
        label2id = {label: i for i, label in enumerate(np.unique(y_train))}
        id2label = {i: label for label, i in label2id.items()}
        y_train = torch.tensor([label2id[label] for label in y_train], dtype=torch.long)
        # A validation label that never occurs in y_train raises KeyError here.
        y_val = torch.tensor([label2id[label] for label in y_val], dtype=torch.long)
        return X_train, y_train, X_val, y_val, label2id, id2label
    else:
        # Test labels are converted as-is; they are not mapped through label2id.
        y_test = torch.tensor(y_test, dtype=torch.long)
        return X_test, y_test

In [None]:
# Preprocess the training frame (split into train/validation) and the test frame.
# NOTE(review): `test_df` is not defined in any visible cell (only `train_df` and `dev_df` are loaded
# above) — confirm where it comes from. build_dataloader below runs the same two read_data calls again.
X_train, y_train, X_val, y_val, label2id, id2label = read_data(train_df, config['train_val_ratio'], token2idx, mode='train')
X_test, y_test = read_data(test_df, config['train_val_ratio'], token2idx, mode='test')

In [None]:
from torch.utils.data import Dataset
class AFQMCDataset(Dataset):
    """Dataset wrapper around preprocessed features `x` and labels `y`.

    NOTE(review): `__getitem__` is left blank in this template, so the class
    is not valid Python until it is filled in.
    """

    def __init__(self, x, y):
        """Store the features `x` and labels `y` without copying or validating them."""
        super(AFQMCDataset, self).__init__()
        self.x = x
        self.y = y

    def __getitem__(self, idx):
        # ------------ new -----------------#
        # TODO: item construction was left blank in this template.

        # ------------ new -----------------#

    def __len__(self):
        """Return the number of examples, taken from the first dimension of `y`."""
        # Relies on `y` exposing `.size(0)`, as the label tensors built by read_data do.
        return self.y.size(0)

```
# TextCNN 中的 collate_fn 函数（供参考）
def collate_fn(examples):
    input_ids_list = []
    labels =[]
    for example in examples:
        input_ids_list.append(example['input_ids'])
        labels.append(example['label'])
    
    # 对齐操作 -- 找到 input_ids_list 中 最长的 句子， 执行短句子补齐
    # 1. 找到 input_ids_list 中 最长的 句子
    max_length = max(len(input_ids) for input_ids in input_ids_list) 
    # 2. 定义一个 input_ids_tensor, 我们要把 每个 input_ids 放入 tensor 中
    input_ids_tensor = torch.zeros((len(labels), max_length), dtype=torch.long)
    for i, input_ids in enumerate(input_ids_list):
        # 得到当前句子的长度
        seq_len = len(input_ids)
        # 第i个句子，填充 seq_len 这么长
        input_ids_tensor[i, :seq_len] = torch.tensor(input_ids, dtype=torch.long)
    
    return {
        'input_ids' : input_ids_tensor,
        'labels' : torch.tensor(labels, dtype=torch.long)
    }
```

In [None]:
from torch.utils.data import DataLoader
def build_dataloader(train_df, test_df, config, vocab):
    """Preprocess both frames and wrap the results in DataLoaders.

    Args:
        train_df: Training frame; split into train and validation by read_data.
        test_df: Test frame.
        config: Settings dict; reads 'train_val_ratio' and 'batch_size'.
        vocab: Token→id mapping forwarded to read_data.

    Returns:
        `(id2label, test_dataloader, train_dataloader, val_dataloader)`.
    """
    X_train, y_train, X_val, y_val, _label2id, id2label = read_data(
        train_df, config['train_val_ratio'], vocab, mode='train')
    X_test, y_test = read_data(
        test_df, config['train_val_ratio'], vocab, mode='test')

    train_dataset = AFQMCDataset(X_train, y_train)
    val_dataset = AFQMCDataset(X_val, y_val)
    test_dataset = AFQMCDataset(X_test, y_test)

    # -----------------new -----------------------#
    # TODO: left blank in this template; presumably `collate_fn` is meant to be defined here.

    # -----------------new -----------------------#

    def make_loader(dataset, shuffle):
        # All three loaders share batch size, worker count and collate function;
        # only the dataset and the shuffle flag differ.
        return DataLoader(dataset=dataset, batch_size=config['batch_size'],
                          num_workers=4, shuffle=shuffle, collate_fn=collate_fn)

    # Only the training loader shuffles.
    train_dataloader = make_loader(train_dataset, True)
    val_dataloader = make_loader(val_dataset, False)
    test_dataloader = make_loader(test_dataset, False)

    return id2label, test_dataloader, train_dataloader, val_dataloader

In [None]:
# Build the train / validation / test loaders; `id2label` is returned alongside them.
id2label, test_dataloader, train_dataloader, val_dataloader = build_dataloader(train_df, test_df, config, token2idx)

In [None]:
from sklearn.metrics import f1_score, accuracy_score
def evaluation(config, model, val_dataloader):
    """Evaluate `model` on `val_dataloader` and return `(avg_val_loss, f1, acc)`.

    NOTE(review): this is an unfinished template. The regions marked "new" are
    blank, so in the visible code `loss`, `logits` and `acc` are never
    assigned and nothing is appended to `labels`; the function cannot run
    until those regions are filled in. `config` is not used by the visible code.

    Args:
        config: Settings dict (unused in the visible code).
        model: Model to evaluate; switched to eval mode here.
        val_dataloader: Iterable of validation batches; `len()` of it is used
            for the progress bar and for averaging the loss.

    Returns:
        Mean per-batch loss, macro-averaged F1, and `acc`.
    """
    model.eval()
    preds = []
    labels = []
    val_loss = 0.
    val_iterator = tqdm(val_dataloader, desc='Evaluation', total=len(val_dataloader))
    with torch.no_grad():
        for batch in val_iterator:
            # -----------new ----------------#
            # TODO: left blank in this template; must provide `loss` and `logits` and collect `labels`.

            # -----------new ----------------#

            val_loss += loss.item()
            # Predicted class = index of the largest logit along the last dimension.
            preds.append(logits.argmax(dim=-1).detach().cpu())

    # Average over the number of batches, not the number of examples.
    avg_val_loss = val_loss / len(val_dataloader)
    labels = torch.cat(labels, dim=0).numpy()
    preds = torch.cat(preds, dim=0).numpy()
    f1 = f1_score(labels, preds, average='macro')
    # -----------new ----------------#
    # TODO: left blank in this template; must provide `acc`.

    # -----------new ----------------#
    return avg_val_loss, f1, acc

In [None]:
from tqdm import trange
from transformers import AdamW

def train(model, config, id2label, train_dataloader, val_dataloader):
    """Train `model` with AdamW for `config['num_epochs']` epochs and return it.

    NOTE(review): this is an unfinished template. The regions marked "new" are
    blank, so in the visible code `loss`, `avg_val_loss`, `f1` and `acc` are
    never assigned; the function cannot run until those regions are filled in.
    `id2label` and `val_dataloader` are not used by the visible code.

    Args:
        model: Model to train; moved to `config['device']`.
        config: Settings dict; reads 'learning_rate', 'device', 'num_epochs'
            and 'logging_step'.
        id2label: Unused in the visible code.
        train_dataloader: Iterable of training batches.
        val_dataloader: Unused in the visible code.

    Returns:
        The same `model` object after the last optimisation step (no
        checkpoint selection happens in the visible code).
    """
    optimizer = AdamW(model.parameters(), lr=config['learning_rate'])
    model.to(config['device'])
    epoch_iterator = trange(config['num_epochs'])

    global_steps = 0
    # Running sum of every batch loss so far.
    train_loss = 0.
    # Value of `train_loss` at the previous logging point.
    logging_loss = 0.

    for epoch in epoch_iterator:

        train_iterator = tqdm(train_dataloader, desc='Training', total=len(train_dataloader))
        model.train()
        for batch in train_iterator:
            # -----------new ----------------#
            # TODO: left blank in this template; must provide `loss`.

            # -----------new ----------------#

            # Clear stale gradients before back-propagating this batch.
            model.zero_grad()
            loss.backward()
            
            optimizer.step()

            train_loss += loss.item()
            global_steps += 1

            if global_steps % config['logging_step'] == 0:
                # Mean training loss over the steps since the previous log line.
                print_train_loss = (train_loss - logging_loss) / config['logging_step']
                logging_loss = train_loss

                # -----------new ----------------#
                # TODO: left blank in this template; must provide `avg_val_loss`, `f1` and `acc`.

                # -----------new ----------------#

                print_log = f'>>> training loss: {print_train_loss:.4f}, valid loss: {avg_val_loss:.4f}, ' \
                            f'valid f1 score: {f1:.4f}, valid acc: {acc:.4f}'
                print(print_log)
                # Switch back to training mode after the logging step.
                model.train()

    return model

In [None]:
def predict(config, id2label, model, test_dataloader):
    """Run `model` over `test_dataloader` and return the predicted labels.

    Args:
        config: Settings dict; reads 'device'.
        id2label: Mapping from predicted class index to label.
        model: Model to run; switched to eval mode here. Its output is
            indexed with `[1]` to obtain the logits.
        test_dataloader: Iterable of batches; each batch is iterated and every
            item in it is moved to the device.

    Returns:
        A list with one label per test example, in loader order.
    """
    progress = tqdm(test_dataloader, desc='Predicting', total=len(test_dataloader))
    model.eval()
    batch_predictions = []
    with torch.no_grad():
        for batch in progress:
            on_device = [tensor.to(config['device']) for tensor in batch]
            outputs = model(on_device)
            # The second element of the model output holds the logits.
            predicted_ids = outputs[1].argmax(dim=-1)
            batch_predictions.append(predicted_ids.detach().cpu())
    label_ids = torch.cat(batch_predictions, dim=0).numpy()
    return [id2label[label_id] for label_id in label_ids]

## 预备知识

### masked_fill

## ESIM模型 

![ESIM](https://img-blog.csdnimg.cn/img_convert/1adb67ec46e87da23fa042f298ff88bb.png)

![ESIM2](https://img-blog.csdnimg.cn/img_convert/6cfe48bd15e9616c0d92eaeebfa50e7b.png)

![pytorch LSTM](https://img-blog.csdnimg.cn/5697d3a5bfec44039e4c1b407c4ec924.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![stackRNN](https://img-blog.csdnimg.cn/02b1b24b629c4defabb888776f9d3f57.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![Local Inference Modeling](https://img-blog.csdnimg.cn/c8afbd13e0da49a080fd8207e31eac8c.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![Local inference collected over sequences](https://img-blog.csdnimg.cn/759a1006a9354deeabdb8e0a7bc5f20a.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![Local inference 3](https://img-blog.csdnimg.cn/176f587346994ceaa2d6e0d3393a69b4.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![The composition layer](https://img-blog.csdnimg.cn/bb58ab8d53454eb2aaf450b5378bcb74.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![Pooling](https://img-blog.csdnimg.cn/66a661431da8440d9cd0efdb3aeac523.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![Summary of all ESIM formulas](https://img-blog.csdnimg.cn/652165f9f0584ac683c0df8d412514be.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

In [None]:
# Hyper-parameters consumed by the ESIM constructor.
model_config = dict(
    embedding=embedding_matrix,  # recorded by the author as torch.Size([5251, 300])
    freeze_emb=True,
    hidden_size=256,
    dropout=0.3,
    num_layers=2,
    concat_layers=True,
    rnn_type='lstm',
    num_labels=len(id2label),
)

In [None]:
class ESIM(nn.Module):
    """ESIM sentence-pair classifier assembled from a settings dict.

    The constructor builds, in order: a pretrained embedding layer, an RNN
    dropout module, an input-encoding stacked RNN, a bidirectional attention
    module, a projection layer, a composition stacked RNN, a classification
    head and an output layer.

    NOTE(review): `forward` is left blank in this template, so the class is
    not valid Python until it is filled in. `RNNDropout`, `StackedBRNN` and
    `BidirectionalAttention` are not defined in the visible cells.
    """
    
    def __init__(self, config):
        """Create the sub-modules.

        Args:
            config: Dict with keys 'embedding' (2-D tensor of pretrained
                vectors), 'freeze_emb', 'dropout', 'hidden_size',
                'num_layers', 'concat_layers', 'rnn_type' ('lstm' or 'gru')
                and 'num_labels'.
        """
        super().__init__()

        # Maps the 'rnn_type' string to the torch RNN class handed to StackedBRNN.
        rnn_mapping = {'lstm': nn.LSTM, 'gru': nn.GRU}
        self.embedding = nn.Embedding.from_pretrained(config['embedding'], freeze=config['freeze_emb'])

        self.rnn_dropout = RNNDropout(p=config['dropout'])
        rnn_size = config['hidden_size']

        # When layer outputs are concatenated, each layer gets an equal share of hidden_size
        # (presumably so the concatenation adds back up to hidden_size — TODO confirm in StackedBRNN).
        if config['concat_layers']:
            rnn_size //= config['num_layers']

        # Encoder over the embedded tokens; input width is the embedding dimension.
        # rnn_size // 2 is presumably the per-direction width — TODO confirm in StackedBRNN.
        self.input_encoding = StackedBRNN(input_size=config['embedding'].size(1),
                                          hidden_size=rnn_size // 2,
                                          num_layers=config['num_layers'],
                                          rnn_type=rnn_mapping[config['rnn_type']],
                                          concat_layers=config['concat_layers'])


        self.attention = BidirectionalAttention()


        # Maps a 4 * hidden_size feature vector back down to hidden_size.
        self.projection = nn.Sequential(
            nn.Linear(4 * config['hidden_size'], config['hidden_size']),
            nn.ReLU()
        )


        # Second stacked RNN, applied to hidden_size-wide inputs.
        self.composition = StackedBRNN(input_size=config['hidden_size'],
                                      hidden_size=rnn_size // 2,
                                      num_layers=config['num_layers'],
                                      rnn_type=rnn_mapping[config['rnn_type']],
                                      concat_layers=config['concat_layers'])



        # Classification head: dropout -> linear (4 * hidden_size -> hidden_size) -> tanh -> dropout.
        self.classification = nn.Sequential(
            nn.Dropout(p=config['dropout']),
            nn.Linear(4 * config['hidden_size'], config['hidden_size']),
            nn.Tanh(),
            nn.Dropout(p=config['dropout']))
            
        # Final linear layer producing one score per label.
        self.out = nn.Linear(config['hidden_size'], config['num_labels'])

    def forward(self, inputs):
        # TODO: the forward pass was left blank in this template.


In [None]:
# Instantiate the ESIM model from the hyper-parameters in model_config.
model = ESIM(model_config)

In [None]:
# Train the model. `train` returns the model object it was given, so despite the name
# `best_model` holds the weights after the final step, not a selected checkpoint.
best_model = train(model, config, id2label, train_dataloader, val_dataloader)

In [None]:
# Predict labels for the test loader; as the last expression of the cell the label list is displayed.
predict(config, id2label, best_model, test_dataloader)