# Task3

In [1]:
import numpy as np # linear algebra
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import re
import collections
import random
import time
from tqdm import tqdm

import torch
from torch import nn
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import torchtext.vocab as Vocab
import torch.utils.data as Data

torch.cuda.set_device(0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## 1 导入数据 

In [2]:
def read_snli(data_dir, is_train):
    '''
    @params:
        data_dir: 数据集所在位置
        is_train：为True时加载训练集，否则加载测试集
    @return: premises：前提
             hypotheses：假设
             labels：标签
    '''
    def extract_text(s):
        # 移除不必要的信息
        s = re.sub('\\(', '', s)
        s = re.sub('\\)', '', s)
        # 用空格代替两个或多个连续空格
        s = re.sub('\\s{2,}', ' ', s)
        return s.strip()
    '''
    蕴涵：假设可以从前提中推断出来。
    矛盾：假设的否定可以从前提推断出来。
    中立：所有其情况。             
    '''
    label_set = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
    file_name = os.path.join(data_dir, 'snli_1.0_train.txt'
                             if is_train else 'snli_1.0_test.txt')
    with open(file_name, 'r') as f:
        rows = [row.split('\t') for row in f.readlines()[1:]]
    premises = [extract_text(row[1]) for row in rows if row[0] in label_set]
    hypotheses = [extract_text(row[2]) for row in rows if row[0] in label_set]
    labels = [label_set[row[0]] for row in rows if row[0] in label_set]
    return premises, hypotheses, labels

In [3]:
data_dir = "./snli/"
# 加载训练集
train_data = read_snli(data_dir, is_train=True)

In [4]:
for x0, x1, y in zip(train_data[0][:3], train_data[1][:3], train_data[2][:3]):
    print('premise:', x0)
    print('hypothesis:', x1)
    print('label:', y)

premise: A person on a horse jumps over a broken down airplane .
hypothesis: A person is training his horse for a competition .
label: 2
premise: A person on a horse jumps over a broken down airplane .
hypothesis: A person is at a diner , ordering an omelette .
label: 1
premise: A person on a horse jumps over a broken down airplane .
hypothesis: A person is outdoors , on a horse .
label: 0


In [5]:
# 加载测试集
test_data = read_snli(data_dir, is_train=False)

In [6]:
for data in [train_data, test_data]:
    print([[row for row in data[2]].count(i) for i in range(3)])

[183416, 183187, 182764]
[3368, 3237, 3219]


In [7]:
max_length = 0
for x0, x1, y in zip(train_data[0], train_data[1], train_data[2]):
    max_length = max(max_length, len(x0.split()), len(x1.split()))
    
for x0, x1, y in zip(test_data[0], test_data[1], test_data[2]):
    max_length = max(max_length, len(x0.split()), len(x1.split()))

max_length

82

训练集有大约55万组，测试集有大约10000组。结果表明，在训练集和测试集中，“蕴涵”、“矛盾”和“中性”三个标签是平衡的。

In [8]:
def get_tokenized(data):
    '''
    @params:
        data: 数据的列表，列表中的每个元素为 [文本字符串，0/1标签] 二元组
    @return: 切分词后的文本的列表，列表中的每个元素为切分后的词序列
    '''
    def tokenizer(text):
        return [tok.lower() for tok in text.split(' ')]
    
    return [tokenizer(review) for review in data]

def get_vocab(data):
    '''
    @params:
        data: 同上
    @return: 数据集上的词典，Vocab 的实例（freqs, stoi, itos）
    '''
    tokenized_data = get_tokenized(data[0]+data[1])
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter, min_freq=5)

vocab = get_vocab(train_data+test_data)
print('# words in vocab:', len(vocab))

# words in vocab: 16674


In [9]:
def preprocess(data, vocab):
    '''
    @params:
        data: 同上，原始的读入数据
        vocab: 训练集上生成的词典
    @return:
        features: 单词下标序列，形状为 (n, max_l) 的整数张量
        labels: 情感标签，形状为 (n,) 的0/1整数张量
    '''
    max_l = max_length  # 将每条评论通过截断或者补0，使得长度变成max_length

    def pad(x):
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))

    tokenized_data_x0 = get_tokenized(data[0])
    features_x0 = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data_x0])
    
    tokenized_data_x1 = get_tokenized(data[1])
    features_x1 = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data_x0])
    labels = torch.tensor([score for score in data[2]])
    return features_x0, features_x1, labels

In [10]:
train_set = Data.TensorDataset(*preprocess(train_data, vocab))
test_set = Data.TensorDataset(*preprocess(test_data, vocab))

batch_size = 128
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

for x0, x1, y in train_iter:
    print('x0', x0.shape, 'x1', x1.shape, 'y', y.shape)
    break
print('#batches:', len(train_iter))

x0 torch.Size([128, 82]) x1 torch.Size([128, 82]) y torch.Size([128])
#batches: 4292


In [11]:
# for labels, batch in enumerate(test_iter):
#     print(len(batch[2]))

In [12]:
# for x0, x1, y in train_iter:
#     print('x0', x0, 'x1', x1.shape, 'y', y.shape)
#     embeds = nn.Embedding(len(vocab), 100)
# #     print(embeds(x0))
#     print((embeds(x0).transpose(1, 2).contiguous()).transpose(1, 2).shape)
#     break

## 2 加载模型

In [13]:
class ESIM(nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        '''
        @params:
            vocab: 在数据集上创建的词典，用于获取词典大小
            embed_size: 嵌入维度大小
            num_hiddens: 隐藏状态维度大小
            num_layers: 隐藏层个数
        '''
        super(ESIM, self).__init__()
        self.dropout = 0.2
        self.hidden_size = num_hiddens
        self.embeds_dim = embed_size
        self.num_layers = num_layers
                 
        self.embeds = nn.Embedding(len(vocab), self.embeds_dim)
        self.bn_embeds = nn.BatchNorm1d(self.embeds_dim)
        
        self.lstm1 = nn.LSTM(input_size=self.embeds_dim, 
                             hidden_size=self.hidden_size,
                             num_layers=self.num_layers,
                             bidirectional=True)
        self.lstm2 = nn.LSTM(input_size=self.hidden_size * 8, 
                             hidden_size=self.hidden_size, 
                             num_layers=self.num_layers,
                             bidirectional=True)
 
        self.fc = nn.Sequential(
            nn.BatchNorm1d(self.hidden_size * 8),
            nn.Linear(self.hidden_size * 8, 16),
            nn.Dropout(self.dropout),
            nn.Linear(16, 3),
        )
 
    def soft_attention_align(self, x1, x2, mask1, mask2):
        '''
        x1: batch_size * seq_len * dim
        x2: batch_size * seq_len * dim
        '''
        # attention: batch_size * seq_len * seq_len
        attention = torch.matmul(x1, x2.transpose(1, 2))
        # mask1 = mask1.float().masked_fill_(mask1, float('-inf'))
        # mask2 = mask2.float().masked_fill_(mask2, float('-inf'))
 
        # weight: batch_size * seq_len * seq_len
        # weight1 = F.softmax(attention + mask2.unsqueeze(1), dim=-1)
        weight1 = F.softmax(attention, dim=-1)
        x1_align = torch.matmul(weight1, x2)
        # weight2 = F.softmax(attention.transpose(1, 2) + mask1.unsqueeze(1), dim=-1)
        weight2 = F.softmax(attention.transpose(1, 2), dim=-1)
        x2_align = torch.matmul(weight2, x1)
        # x_align: batch_size * seq_len * hidden_size
 
        return x1_align, x2_align
 
    def submul(self, x1, x2):
        mul = x1 * x2
        sub = x1 - x2
        return torch.cat([sub, mul], -1)
 
    def apply_multiple(self, x):
        # input: batch_size * seq_len * (2 * hidden_size)
        p1 = F.avg_pool1d(x.transpose(1, 2), x.size(1)).squeeze(-1)
        p2 = F.max_pool1d(x.transpose(1, 2), x.size(1)).squeeze(-1)
        # output: batch_size * (4 * hidden_size)
        return torch.cat([p1, p2], 1)
 
    def forward(self, *input):
        # batch_size * seq_len
        sent1, sent2 = input[0], input[1]
#         print(sent1.shape, sent2.shape)
        mask1, mask2 = sent1.eq(0), sent2.eq(0)
 
        # embeds: batch_size * seq_len => batch_size * seq_len * dim
        x1 = self.bn_embeds(self.embeds(sent1).transpose(1, 2).contiguous()).transpose(1, 2)
        x2 = self.bn_embeds(self.embeds(sent2).transpose(1, 2).contiguous()).transpose(1, 2)
 
        # batch_size * seq_len * dim =>      batch_size * seq_len * hidden_size
        o1, _ = self.lstm1(x1)
        o2, _ = self.lstm1(x2)
 
        # Attention
        # batch_size * seq_len * hidden_size
        q1_align, q2_align = self.soft_attention_align(o1, o2, mask1, mask2)
 
        # Compose
        # batch_size * seq_len * (8 * hidden_size)
        q1_combined = torch.cat([o1, q1_align, self.submul(o1, q1_align)], -1)
        q2_combined = torch.cat([o2, q2_align, self.submul(o2, q2_align)], -1)
 
        # batch_size * seq_len * (2 * hidden_size)
        q1_compose, _ = self.lstm2(q1_combined)
        q2_compose, _ = self.lstm2(q2_combined)
 
        # Aggregate
        # input: batch_size * seq_len * (2 * hidden_size)
        # output: batch_size * (4 * hidden_size)
        q1_rep = self.apply_multiple(q1_compose)
        q2_rep = self.apply_multiple(q2_compose)
 
        # Classifier
        x = torch.cat([q1_rep, q2_rep], -1)
        out = self.fc(x)
        return out
embed_size, num_hiddens, num_layers = 100, 100, 2
net = ESIM(vocab, embed_size, num_hiddens, num_layers)

In [14]:
# cache_dir = "./"
# glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=cache_dir)

# def load_pretrained_embedding(words, pretrained_vocab):
#     '''
#     @params:
#         words: 需要加载词向量的词语列表，以 itos (index to string) 的词典形式给出
#         pretrained_vocab: 预训练词向量
#     @return:
#         embed: 加载到的词向量
#     '''
#     embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0]) # 初始化为0
#     oov_count = 0 # out of vocabulary
#     for i, word in enumerate(words):
#         try:
#             idx = pretrained_vocab.stoi[word]
#             embed[i, :] = pretrained_vocab.vectors[idx]
#         except KeyError:
#             oov_count += 1
#     if oov_count > 0:
#         print("There are %d oov words." % oov_count)
#     return embed

# net.embeds.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))

In [15]:
def func(*args):
    print(args)
q = 1, 2, 3
func(q)

((1, 2, 3),)


In [16]:
def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        device = list(net.parameters())[0].device 
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for x0, x1, y in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval()
                acc_sum += (net(x0.to(device), x1.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train()
            else:
                if('is_training' in net.__code__.co_varnames):
                    acc_sum += (net(x0, x1, is_training=False).argmax(dim=1) == y).float().sum().item() 
                else:
                    acc_sum += (net(x0, x1).argmax(dim=1) == y).float().sum().item() 
            n += y.shape[0]
    return acc_sum / n

def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for x0, x1, y in train_iter:
            optimizer.zero_grad()
            
            x0 = x0.to(device)
            x1 = x1.to(device)
            y = y.to(device)
            
            y_hat = net(x0,x1)
#             print((y_hat.argmax(dim=1) == y).float().sum()/len(y))
            l = loss(y_hat, y)
            
            
            l.backward()
            optimizer.step()
            
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))

In [None]:
lr, num_epochs = 0.0005, 5
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()

train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 1.1012, train acc 0.333, test acc 0.331, time 1223.3 sec
