## Esim中的：注意力机制
$$\large \rm e_{ij} = {\overline p_i}^T {\overline q}_j $$


||爱|打|篮|球||
|---|---|---|---|---|---|
|喜|score0|score1|score2|score3|\|step1:score0 = 喜的embedding 与 爱的embedding内积(内积=相似度=未归一化的余弦相似度)|
|欢|||||\|step2:score0-score4 进过softmax变成p0-p3, p0+p1+p2+p3=1|
|打|||||\|step3:喜_hat = 爱embeding\*p0 + 打embedding\*p1 + 篮embedding\*p2 + 球embedding\*p3|
|篮|||||\|即：喜_hat = 喜 与 '爱打篮球‘每个词的相似度的加权平均|
|球|||||\|key point： 注意力机制是词粒度的, 注意力要实现的是语义对齐。（跟词的embedding无关)|


$$\large \rm {\overline  p}_i  = \sum_{j=1}^{l_q}  \frac{exp(e_{ij})}{\sum_{k=1}^{l_q} exp(e_{ij})} {\overline q}_j, \forall_i \in [1, \cdots, l_p]$$


$$\large \rm {\overline  q}_i = \sum_{j=1}^{l_p}  \frac{exp(e_{ij})}{\sum_{k=1}^{l_p} exp(e_{ij})} {\overline p}_i, \forall_i \in [1, \cdots, l_q]$$


In [1]:
import torch
import torch.nn as nn

import pandas as pd
import jieba

from collections import defaultdict

In [2]:
torch.__version__

'1.9.0+cu111'

In [3]:
# 加载停用词
def load_stop_words():
    stop_words = set([])
    fname = './data/stopwords.txt'
    with open(fname, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            stop_words.add(line)
    return stop_words   

In [4]:
stop_words = load_stop_words()

In [5]:
def tokenize(text):
    return [word for word in jieba.cut(text) if word not in stop_words]

In [6]:
# 划分训练集和测试集
def split_data(df, split=0.7):
    df = df.sample(frac=1)
    length = len(df)
    train_data = df[0:length - 5000]
    eval_data = df[length - 5000:]

    return train_data, eval_data

In [7]:
# 把数据转换成index
def seq2index(seq):
    seg = tokenize(seq)
    seg_index = []
    for s in seg:
        seg_index.append(vocab.get(s, 1))
    return seg_index

In [8]:
# 构建词典
def build_vocab(del_word_frequency):
    data = pd.read_csv('./data/LCQMC.csv')
    segment1 = data['sentence1'].apply(tokenize)
    segment2 = data['sentence2'].apply(tokenize)
    

    word_frequency = defaultdict(int)
    for row in segment1:
        for i in row:
            word_frequency[i] += 1
    for row in segment2:
        for i in row:
            word_frequency[i] += 1
    
    word_sort = sorted(word_frequency.items(), key=lambda x: x[1], reverse=True)  # 根据词频降序排序

    f = open('./data/vocab_esim.txt', 'w', encoding='utf-8')
    f.write('[PAD]' + "\n" + '[UNK]' + "\n")
    for d in word_sort:
        if d[1] > del_word_frequency:
            f.write(d[0] + "\n")
    f.close()

In [9]:
build_vocab(del_word_frequency=3)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.386 seconds.
Prefix dict has been built successfully.


In [10]:
vocab = {}
import os
if os.path.exists('./data/vocab_esim.txt'):
    with open('./data/vocab_esim.txt', encoding='utf-8')as file:
        for line in file.readlines():
            vocab[line.strip()] = len(vocab)

In [11]:
len(vocab)

22429

In [12]:
class ESIM(nn.Module):
    
    def __init__(self, vocab_size, char_dim, char_hidden_size, max_len):
        super(ESIM, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, char_dim)
        
        # [batch_size, seq_len, hidden_size]
        # 期望[seq_len, batch_size, hidden_size]
        self.char_lstm = nn.LSTM(input_size=char_dim, 
                                hidden_size=char_hidden_size, 
                                num_layers=1,
                                bidirectional=True,
                                batch_first=True)
                                # drop_out=0.4) 在这里drop_out也是一样的, 两种方式都可以
        
        self.context_lstm = nn.LSTM(input_size=2*char_hidden_size*4,
                                   hidden_size=char_hidden_size,
                                   num_layers=1,
                                   bidirectional=True,
                                   batch_first=True)
        
        
        self.max_pool = nn.MaxPool2d(kernel_size=(max_len, 1))
        
        self.fc1 = nn.Linear(in_features=char_hidden_size*2*4, 
                             out_features=char_hidden_size)
        self.fc2 = nn.Linear(in_features=char_hidden_size, out_features=1)
        
        self.drop_out = nn.Dropout(0.4)
       
    
    def forward(self, char_p, char_q):
        # input encoding
        embedding_p = self.embedding(char_p)
        embedding_q = self.embedding(char_q)
        
        lstm_p, _ = self.char_lstm(embedding_p)
        lstm_q, _ = self.char_lstm(embedding_q)
        lstm_p = self.drop_out(lstm_p)
        lstm_q = self.drop_out(lstm_q)
        
        # Local inference modeling
        # 矩阵乘法：最后两个维度可乘，前面所有维度保持一致
        # [batch_size, seq_len, hidden_size] * [batch_size, seq_len, hidden_size]
        # [seq_len, hidden_size] * [seq_len, hidden_size]
        # [seq_len, hidden_size] * [hidden_size, seq_len]
        # print('lstm_p', lstm_p.shape, lstm_p)
        # print('lstm_q', lstm_q.shape, lstm_q)
        e = torch.matmul(lstm_p, torch.transpose(lstm_q, 1, 2) )
        # [seq_len, seq_len_p, seq_len_q]
        # print('p*q矩阵', e.shape, e)
        
        # attention
        p_hat = torch.matmul(torch.softmax(e, dim=2), lstm_q)
        q_hat = torch.matmul(torch.transpose(torch.softmax(e, dim=1), 1, 2), lstm_p)
        #q_hat = torch.matmul(torch.softmax(e, dim=1), lstm_p)
        # print('p_hat', p_hat)
        # print('q_hat', q_hat)
        # raise Exception("ERR")
        
        p_cat = torch.cat([lstm_p, p_hat, lstm_p-p_hat, lstm_p*p_hat], dim=-1)
        q_cat = torch.cat([lstm_q, q_hat, lstm_q-q_hat, lstm_q*q_hat], dim=-1)
        
        # inference Composition
        p, _ = self.context_lstm(p_cat)
        q, _ = self.context_lstm(q_cat)
        
        # predict
        p_max = self.max_pool(p).squeeze(dim=1)
        q_max = self.max_pool(q).squeeze(dim=1)
        
        p_mean = torch.mean(p, dim=1)
        q_mean = torch.mean(q, dim=1)
        
        y = torch.cat([p_max, q_max, p_mean, q_mean], dim=-1)
        y = self.drop_out(y)
        
        y = self.fc1(y)
        y = torch.tanh(y)
        y = self.drop_out(y)
        y = self.fc2(y)
        y = torch.sigmoid(y)
        y = y.squeeze(dim=-1)
        return y

In [13]:
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

In [14]:
vocab_size = len(vocab)
max_len=10

In [17]:
ESIM(vocab_size=vocab_size,
             char_dim=200,
             char_hidden_size=128,
             max_len=max_len)

ESIM(
  (embedding): Embedding(22429, 200)
  (char_lstm): LSTM(200, 128, batch_first=True, bidirectional=True)
  (context_lstm): LSTM(1024, 128, batch_first=True, bidirectional=True)
  (max_pool): MaxPool2d(kernel_size=(10, 1), stride=(10, 1), padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=1024, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (drop_out): Dropout(p=0.4, inplace=False)
)

In [18]:
# 统一长度
def padding_seq(X, max_len=max_len):
    return np.array([
        np.concatenate([x, [0] * (max_len - len(x))]) if len(x) < max_len else x[:max_len] for x in X
    ])

In [19]:
def load_data(batch_size=128):
    df = pd.read_csv('./data/LCQMC.csv')
    train_df, eval_df = split_data(df)
    train_p = df['sentence1']
    train_q = df['sentence2']
    train_y = df['label']
    eval_p = eval_df['sentence1']
    eval_q = eval_df['sentence2']
    eval_y = eval_df['label']

    train_p = padding_seq(train_p.apply(seq2index))
    train_q = padding_seq(train_q.apply(seq2index))
    train_y = np.array(train_y)

    train_data_set = TensorDataset(torch.from_numpy(train_p),
                                   torch.from_numpy(train_q),
                                   torch.from_numpy(train_y))
    train_data_loader = DataLoader(dataset=train_data_set, batch_size=batch_size, shuffle=True)

    eval_p = padding_seq(eval_p.apply(seq2index))
    eval_q = padding_seq(eval_q.apply(seq2index))
    return train_data_loader, [eval_p, eval_q], eval_y.values

In [20]:
model = ESIM(vocab_size=vocab_size,
         # char_dim=200,
         char_dim=10,
         # char_hidden_size=128,
         char_hidden_size=8,
         max_len=max_len)

In [27]:
# 训练模型
def train():
    global model

    train_data_loader, eval_x, eval_y = load_data(128)
    
    eval_p = eval_x[0]
    eval_q = eval_x[1]
    eval_p = torch.from_numpy(eval_p)
    eval_q = torch.from_numpy(eval_q)
    
    if torch.cuda.is_available():
        model = model.cuda()
        eval_p = eval_p.cuda()
        eval_q = eval_q.cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    loss_func = nn.BCELoss()

    best_acc = 0

    for epoch in range(10):
        for step, (b_p, b_q, b_y) in enumerate(train_data_loader):
            if torch.cuda.is_available():
                b_p = b_p.cuda()
                b_q = b_q.cuda()
                b_y = b_y.cuda()

            output = model(b_p.long(), b_q.long())

            loss = loss_func(output, b_y.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
        # # 每个epoch跑完在计算准确率
        # test_output = model(eval_p, eval_q)
        # pred_y = (test_output.cpu().data.numpy() > 0.5).astype(int)
        # accuracy = float((pred_y == eval_y).astype(int).sum()) / float(eval_y.size)
        # if accuracy > best_acc:
        #     best_acc = accuracy
        #     torch.save(model, 'esim.p')
        #     print('save model, accuracy: %.3f' % accuracy)
        # print('Epoch: ', epoch, '| train loss: %.4f' % loss.cpu().data.numpy(),
        #       '| test accuracy: %.3f' % accuracy)


            if step % 500 == 0:
                # Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; 
                # but got torch.DoubleTensor instead (while checking arguments for embedding)
                # to long
                test_output = model(eval_p.long(), eval_q.long())
                pred_y = (test_output.cpu().data.numpy() > 0.5).astype(int)
                accuracy = float((pred_y == eval_y).astype(int).sum()) / float(eval_y.size)
                if accuracy > best_acc:
                    best_acc = accuracy
                    # torch.save(model, 'esim.p')
                    # print('save model, accuracy: %.3f' % accuracy)
                print('Epoch: ', epoch, '| train loss: %.4f' % loss.cpu().data.numpy(),
                      '| test accuracy: %.3f' % accuracy)


In [28]:
train()

Epoch:  0 | train loss: 0.3092 | test accuracy: 0.898
Epoch:  0 | train loss: 0.2156 | test accuracy: 0.903
Epoch:  0 | train loss: 0.2684 | test accuracy: 0.897
Epoch:  0 | train loss: 0.3032 | test accuracy: 0.900
Epoch:  1 | train loss: 0.3225 | test accuracy: 0.901
Epoch:  1 | train loss: 0.3205 | test accuracy: 0.901
Epoch:  1 | train loss: 0.2474 | test accuracy: 0.902
Epoch:  1 | train loss: 0.2390 | test accuracy: 0.902
Epoch:  2 | train loss: 0.2544 | test accuracy: 0.900
Epoch:  2 | train loss: 0.2818 | test accuracy: 0.905
Epoch:  2 | train loss: 0.2739 | test accuracy: 0.903
Epoch:  2 | train loss: 0.2835 | test accuracy: 0.904
Epoch:  3 | train loss: 0.2516 | test accuracy: 0.899
Epoch:  3 | train loss: 0.2627 | test accuracy: 0.901
Epoch:  3 | train loss: 0.2824 | test accuracy: 0.908
Epoch:  3 | train loss: 0.3010 | test accuracy: 0.905
Epoch:  4 | train loss: 0.2175 | test accuracy: 0.904
Epoch:  4 | train loss: 0.2047 | test accuracy: 0.912
Epoch:  4 | train loss: 0.25

In [29]:
# model_new = torch.load('./esim.p')
model_new = model

In [30]:
model_new.eval()

ESIM(
  (embedding): Embedding(22429, 10)
  (char_lstm): LSTM(10, 8, batch_first=True, bidirectional=True)
  (context_lstm): LSTM(64, 8, batch_first=True, bidirectional=True)
  (max_pool): MaxPool2d(kernel_size=(10, 1), stride=(10, 1), padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=64, out_features=8, bias=True)
  (fc2): Linear(in_features=8, out_features=1, bias=True)
  (drop_out): Dropout(p=0.4, inplace=False)
)

In [33]:
def classfication_predicts(s1, s2):
    s1 = seq2index(s1)
    s1 = torch.from_numpy(padding_seq([s1])).long()
    
    s2 = seq2index(s2)
    s2 = torch.from_numpy(padding_seq([s2])).long()
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    s1 = s1.to(device)
    s2 = s2.to(device)
    
    
    out = model_new(s1, s2)
    # return out.data.cpu().numpy()
    return out.data

In [34]:
s1 = "喜欢打篮球的男生喜欢什么样的女生"
s2 = "爱打篮球的男生喜欢什么样的女生"

In [35]:
classfication_predicts(s1, s2)

tensor([0.8217], device='cuda:0')

In [36]:
s1 = "部落冲突游戏怎么找回"
s2 = "部落冲突重置游戏"

In [37]:
classfication_predicts(s1, s2)

tensor([0.0064], device='cuda:0')

In [38]:
s1 = "大家觉得她好看吗"
s2 = "大家觉得跑男好看吗"

In [39]:
classfication_predicts(s1, s2)

tensor([0.1598], device='cuda:0')

In [40]:
s1 = "人情债怎么还"
s2 = "人情债怎么还？"

In [41]:
classfication_predicts(s1, s2)

tensor([0.9267], device='cuda:0')