<a href="https://colab.research.google.com/github/xx529/NLP/blob/main/ESIM/%E8%9A%82%E8%9A%81%E9%87%91%E8%9E%8D%E8%AF%AD%E4%B9%89%E7%9B%B8%E4%BC%BC%E5%BA%A6%E4%BB%BB%E5%8A%A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
import pandas as pd
from tqdm import tqdm
import json
import bz2
import numpy as np
import jieba
import torch.nn as nn
import random
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torch

In [10]:
seed = 529
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

## Loading

In [11]:
with open('/content/drive/MyDrive/Colab Notebooks Project/NLP/ESIM/train.json') as f:
    train_raw = f.readlines()
df_train = pd.DataFrame([json.loads(train_raw[x]) for x in range(len(train_raw))])
df_train['label'] = df_train['label'].astype(int)
df_train

Unnamed: 0,sentence1,sentence2,label
0,蚂蚁借呗等额还款可以换成先息后本吗,借呗有先息到期还本吗,0
1,蚂蚁花呗说我违约一次,蚂蚁花呗违约行为是什么,0
2,帮我看一下本月花呗账单有没有结清,下月花呗账单,0
3,蚂蚁借呗多长时间综合评估一次,借呗得评估多久,0
4,我的花呗账单是***，还款怎么是***,我的花呗，月结出来说让我还***元，我自己算了一下详细名单我应该还***元,1
...,...,...,...
34329,借呗还款日是,借呗还款日计算方法,0
34330,支付宝怎么关闭花呗自动付款功能,如何关闭余额支付功能,0
34331,为什么我借呗利息越来越高了,借呗分期利率怎么计算的,0
34332,二维码开通花呗信用卡付款,怎么样开通花呗信用卡支付码,0


In [12]:
with open('/content/drive/MyDrive/Colab Notebooks Project/NLP/ESIM/dev.json') as f:
    dev_raw = f.readlines()
df_dev = pd.DataFrame([json.loads(dev_raw[x]) for x in range(len(dev_raw))])
df_dev['label'] = df_dev['label'].astype(int)
df_dev

Unnamed: 0,sentence1,sentence2,label
0,双十一花呗提额在哪,里可以提花呗额度,0
1,花呗支持高铁票支付吗,为什么友付宝不支持花呗付款,0
2,我的蚂蚁花呗支付金额怎么会有限制,我到支付宝实体店消费用花呗支付受金额限制,1
3,为什么有花呗额度不能分期付款,花呗分期额度不足,0
4,赠品不能设置用花呗付款,怎么不能花呗分期付款,0
...,...,...,...
4311,使用的借呗之后，可以出国吗，因为我的工作在国外，我需要在国外还款,借呗这个月分期还款了，还了的钱还可以再次取出用吗,0
4312,还要多久能开通花呗,花呗要多久才会重新开通,0
4313,怎样设置花呗访问手机通讯录,花呗读取不了手机通讯录权限,0
4314,花呗的学历填错了，可不可以改,花呗学历填错了怎样修改,0


In [13]:
with open('/content/drive/MyDrive/Colab Notebooks Project/NLP/ESIM/test.json') as f:
    test_raw = f.readlines()
df_test = pd.DataFrame([json.loads(test_raw[x]) for x in range(len(test_raw))])
df_test

Unnamed: 0,id,sentence1,sentence2
0,0,借呗什么时候会取消,蚂蚁借呗什么时候可以恢复***个月
1,1,网商贷怎么转变成借呗,如何将网商贷切换为借呗
2,2,我的借呗为啥开通不了,我怎么没法开通借呗
3,3,蚂蚁借呗额度怎么不显示了,蚂蚁借呗额度不显示了
4,4,我的借呗没用给关闭了,把我的借呗关了
...,...,...,...
3856,3856,花呗怎么不能再美团使用了,美团 滴滴用不了花呗
3857,3857,花呗的货币是真实的吗,花呗是自己设定的吗
3858,3858,花呗分期后为什么不可以提前还款,花呗还可以负数么
3859,3859,是用户用花呗支付。我们要收取多少比例的费用,我店铺怎么用不了花呗的


In [14]:
with bz2.open('/content/drive/MyDrive/Colab Notebooks Project/NLP/TouTiao_text/sgns.weibo.word.bz2') as f:
    embedding_raw = f.readlines()

In [15]:
info, *emb = embedding_raw
num, embedding_dim = map(int, info.decode('utf8').split())

word_to_idx, idx_to_word = {'<unk>': 0}, {0: '<unk>'}
embedding_mat = np.zeros((num + 1, embedding_dim))

for idx, i in enumerate(emb):
    word, *embedding = i.decode('utf8').split()
    word_to_idx[word] = idx + 1
    idx_to_word[idx + 1] = word
    embedding_mat[idx] = list(map(float, embedding))

## process context

In [16]:
def transform_to_idx(x):
    idx_list = [word_to_idx[i] if i in word_to_idx else 0 for i in jieba.lcut(x)]
    length = len(idx_list)
    if length >= 30:
        return np.array(idx_list[:30])
    else:
        return np.array(idx_list + [0]*(30-length))

for df in [df_train, df_dev, df_test]:
    df['query'] = df['sentence1'].apply(transform_to_idx)
    df['doc'] = df['sentence2'].apply(transform_to_idx)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.971 seconds.
Prefix dict has been built successfully.


## dataset

In [17]:
class MyDataSet(Dataset):
    def __init__(self, query, doc, y=None):
        super().__init__()
        self.x = [(torch.LongTensor(q), torch.LongTensor(d)) for q, d in zip(query, doc)]
        self.y = torch.LongTensor(y) if y is not None else [0] * len(query) 

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

    def __len__(self):
        return len(self.x)

train_set = MyDataSet(df_train['query'], df_train['doc'], df_train['label'])
dev_set = MyDataSet(df_dev['query'], df_dev['doc'], df_dev['label'])
test_set = MyDataSet(df_test['query'], df_test['doc'])

# 加载 DataLoader
train_loader = DataLoader(train_set, batch_size=128, shuffle=True, num_workers=2)
dev_loader = DataLoader(dev_set, batch_size=128, shuffle=True, num_workers=2)
test_loader = DataLoader(test_set, batch_size=128, shuffle=True, num_workers=2)

## ESIM

In [18]:
config = {
    'embedding_mat': torch.Tensor(embedding_mat),  # 词向量矩阵
    'is_embedding_freeze': True,  # 词向量是否进行固定
    'base_rnn': nn.GRU,  # BiLSTM 使用的 rnn 类型
    'stack_rnn_layers': 2,  # BiLSTM stack 的层数
    'input_size': embedding_dim,  # BiLSTM 初始输入的特征维度
    'hidden_size': 32,  # BiLSTM 隐藏层的特征维度
    'rnn_layer': 1  # 每个 BiLSTM 里的层数
}

### Input Encoding

In [19]:
class InputEncoder(nn.Module):
    def __init__(self, config):
        super(InputEncoder, self).__init__()
        
        # stack rnn 层数
        self.layers = config['stack_rnn_layers']
        
        # 预训练词向量
        self.embedding = nn.Embedding.from_pretrained(
            embeddings=config['embedding_mat'], 
            freeze=config['is_embedding_freeze']
            )
        
        # 两层堆叠的 RNN 层，前一层的 output 是后一层的 input
        self.stack_rnn = nn.ModuleList()
        
        for i in range(config['stack_rnn_layers']):
            rnn_input_size = config['input_size'] if i == 0 else 2 * config['hidden_size']

            self.stack_rnn.append(
                config['base_rnn'](
                    input_size=rnn_input_size,
                    hidden_size=config['hidden_size'],
                    num_layers=config['rnn_layer'],
                    bidirectional=True
                    )
                )

    def forward(self, pair_x):
        query, doc = pair_x

        # (batch, seq) - embedding -> (batch, seq, embedding_dim) - transpose -> (seq, batch，input_size)
        # LSTM 的输入需要维度 (seq, batch，input_size)
        # 初始值当作最开始的输出值
        query = self.embedding(query).transpose(1,0)
        doc = self.embedding(doc).transpose(1,0)

        # 每一层 stackrnn 计算
        for i in range(len(self.stack_rnn)):
            query, _ = self.stack_rnn[i](query)  # Outputs: output, (h_n, c_n)
            doc, _ = self.stack_rnn[i](doc)  # Outputs: output, (h_n, c_n)

        return query, doc

In [20]:
for x, y in train_loader:
    temp_x = x
    break

In [21]:
model = InputEncoder(config)
res = model(temp_x)

In [22]:
res[0]
# temp_x

tensor([[[-0.0367, -0.0881,  0.0480,  ...,  0.4109, -0.2363,  0.2303],
         [ 0.0610, -0.0823,  0.0749,  ...,  0.3054, -0.0548,  0.3829],
         [ 0.0477, -0.1109,  0.0140,  ...,  0.4023, -0.1518,  0.2731],
         ...,
         [ 0.3310,  0.0743,  0.0396,  ..., -0.1268,  0.1050,  0.3486],
         [ 0.1289, -0.0859,  0.0745,  ...,  0.4290, -0.0419,  0.2955],
         [ 0.0739,  0.0482,  0.0257,  ...,  0.1887,  0.1385,  0.1444]],

        [[-0.0102, -0.0892,  0.0495,  ...,  0.3295, -0.3082,  0.2372],
         [ 0.1014, -0.1312,  0.0078,  ...,  0.2057, -0.0387,  0.4396],
         [ 0.0070, -0.1813, -0.1038,  ...,  0.2537,  0.0727,  0.2929],
         ...,
         [ 0.4824, -0.0560, -0.0813,  ..., -0.0658, -0.0261,  0.2994],
         [ 0.1704, -0.0424, -0.0183,  ...,  0.5331, -0.0319,  0.2839],
         [ 0.0290,  0.0941, -0.0133,  ...,  0.2436,  0.0883,  0.1499]],

        [[ 0.0233, -0.1791,  0.0413,  ...,  0.3563, -0.2640,  0.2673],
         [ 0.1181, -0.1396,  0.0946,  ...,  0

### Model

In [23]:
class ESIM(nn.Module):
    def __init__(self, config):
        super(ESIM, self).__init__()
        self.input_encoder = InputEncoder(config)

    def forward(self, pair_x):
        x = self.input_encoder(pair_x)

        return x

In [24]:
esim_model = ESIM(config)

for pair_x, y in train_loader:
    output = esim_model(pair_x)
    break

In [25]:
len(output)

2