In [1]:
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch
import json
import time
import pprint
import collections
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import jdc

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [3]:
PATH = "model/"
train_file = "data/snli_1.0_train.jsonl"
dev_file = "data/snli_1.0_dev.jsonl"
embedding_file = "model/glove.6B.50d.txt"
with open(train_file, 'r') as f:
    lines = f.readlines()
    train_data_raw = [json.loads(line.rstrip()) for line in lines]
    
with open(dev_file, 'r') as f:
    lines = f.readlines()
    dev_data_raw = [json.loads(line.rstrip()) for line in lines]

In [4]:
train_data_raw[0], len(train_data_raw)

({'annotator_labels': ['neutral'],
  'captionID': '3416050480.jpg#4',
  'gold_label': 'neutral',
  'pairID': '3416050480.jpg#4r1n',
  'sentence1': 'A person on a horse jumps over a broken down airplane.',
  'sentence1_binary_parse': '( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )',
  'sentence1_parse': '(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))',
  'sentence2': 'A person is training his horse for a competition.',
  'sentence2_binary_parse': '( ( A person ) ( ( is ( ( training ( his horse ) ) ( for ( a competition ) ) ) ) . ) )',
  'sentence2_parse': '(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (VP (VBG training) (NP (PRP$ his) (NN horse)) (PP (IN for) (NP (DT a) (NN competition))))) (. .)))'},
 550152)

**Suppse we don't need Tree LSTM, so that it's not necessary to exploit the parsing property of each sentence.**

In [5]:
def load_data(raw, depreciated=['-']):
    prem, hypo, label = [], [], []
    for obj in raw:
        # gold_label should not be '-'
        if obj['gold_label'] in depreciated:
            continue
        prem.append(obj['sentence1'])
        hypo.append(obj['sentence2'])
        label.append(obj['gold_label'])
    return prem, hypo, label

def tokenize(lines):
    return [line.rstrip('.').lower().split() for line in lines]

In [6]:
train_prem, train_hypo, train_label = load_data(train_data_raw)
dev_prem, dev_hypo, dev_label = load_data(dev_data_raw)
tokens = tokenize(train_prem) + tokenize(train_hypo) + tokenize(dev_prem) + tokenize(dev_hypo)

In [7]:
print(tokens[:3], len(tokens))

[['a', 'person', 'on', 'a', 'horse', 'jumps', 'over', 'a', 'broken', 'down', 'airplane'], ['a', 'person', 'on', 'a', 'horse', 'jumps', 'over', 'a', 'broken', 'down', 'airplane'], ['a', 'person', 'on', 'a', 'horse', 'jumps', 'over', 'a', 'broken', 'down', 'airplane']] 1118418


**Get the corpus and idx_to_token**

In [8]:
class Vocab:
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        counter = self.count_corpus_(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        self.unk, uniq_tokens = 0, ['<unk>'] + reserved_tokens
        uniq_tokens += [token for token, freq in self.token_freqs if freq >= min_freq and token not in uniq_tokens]
        self.idx_to_token, self.token_to_idx = [], dict()
        for token in uniq_tokens:
            self.idx_to_token.append(token)
            self.token_to_idx[token] = len(self.idx_to_token) - 1
    
    def __len__(self):
        return len(self.idx_to_token)
    
    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]
    
    def to_tokens(self, indices):
        # Recursive definition
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.to_tokens(index) for index in indices]
    
    def count_corpus_(self, tokens):
        if isinstance(tokens[0], list):
            tokens = [token for line in tokens for token in line]
        return collections.Counter(tokens)

In [9]:
vocab, vocab_truncated = Vocab(tokens, reserved_tokens=["<pad>"]), Vocab(tokens, 5, reserved_tokens=["<pad>"])

**Truncated corpus is necessary because many words are really sparse in this corpus**

In [10]:
vocab.to_tokens([100, 200, [300, [4000]]]), len(vocab), vocab[[['cvnlp', ['lol'], 'shit'], 'duck']]

(['boys', 'glasses', ['river', ['bracelet']]],
 44086,
 [[0, [38250], 3941], 1515])

In [11]:
vocab_truncated.to_tokens([100, 200, [300, [4000]]]), len(vocab_truncated), vocab_truncated[[['cvnlp', ['lol'], 'shit'], 'duck']]

(['boys', 'glasses', ['river', ['bracelet']]], 19301, [[0, [0], 3941], 1515])

**Sentence preprocessing**

In [12]:
class SNLIDataset(torch.utils.data.Dataset):
    def __init__(self, premise_tokens, hypothesis_tokens, labels, num_steps, vocab=None):
        self.num_steps = num_steps
        if vocab is None:
            self.vocab = Vocab(premise_tokens + hypothesis_tokens,
                                   min_freq=5, reserved_tokens=['<pad>'])
        else:
            self.vocab = vocab
        self.premises = self._pad(premise_tokens)
        self.hypotheses = self._pad(hypothesis_tokens)
        labels = list(map(lambda x: 0 if x == 'entailment' else 1 if x == 'neutral' else 2, labels))
        self.labels = torch.tensor(labels)
        print('read ' + str(len(self.premises)) + ' examples')

    def _pad(self, lines):
        return torch.tensor([truncate_pad(
            self.vocab[line.split()], self.num_steps, self.vocab['<pad>'])
                         for line in lines])

    def __getitem__(self, idx):
        return (self.premises[idx], self.hypotheses[idx]), self.labels[idx]

    def __len__(self):
        return len(self.premises)
    
def truncate_pad(seq, num_steps, pad):
    return seq[:num_steps] if len(seq) > num_steps else seq + [pad] * (num_steps - len(seq))
    
def load_data_snli(train_set, dev_set, vocab, batch_size=50, num_steps=50):
    train_prem, train_hypo, train_label = train_set
    dev_prem, dev_hypo, dev_label = dev_set
    train_set = SNLIDataset(train_prem, train_hypo, train_label, num_steps, vocab)
    dev_set = SNLIDataset(dev_prem, dev_hypo, dev_label, num_steps, vocab)
    train_iter = torch.utils.data.DataLoader(train_set, batch_size,
                                             shuffle=True)
    dev_iter = torch.utils.data.DataLoader(dev_set, batch_size,
                                            shuffle=False)
    return train_iter, dev_iter

In [13]:
train_iter, dev_iter = load_data_snli((train_prem, train_hypo, train_label),
                                       (dev_prem, dev_hypo, dev_label),
                                        vocab_truncated, batch_size = 100)

read 549367 examples
read 9842 examples


**DataLoader seems not very efficient**

In [14]:
%%timeit
for prem, label in train_iter:
    continue

3.68 s ± 139 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


**Load GloVe**

In [15]:
class GloVe(object):
    def __init__(self):
        with open(embedding_file, 'r', encoding='utf-8') as f:
            words = [x.rstrip().split(' ')[0] for x in f.readlines()]
        with open(embedding_file, 'r', encoding='utf-8') as f:
            vectors = {}
            for line in f:
                vals = line.rstrip().split(' ')
                vectors[vals[0]] = [float(x) for x in vals[1:]]
        
        vocab_size = len(words)
        self.vocab = {w: idx for idx, w in enumerate(words)}
        self.ivocab = {idx: w for idx, w in enumerate(words)}

        vector_dim = len(vectors[self.ivocab[0]])
        W = np.zeros((vocab_size, vector_dim))
        for word, v in vectors.items():
            W[self.vocab[word], :] = v
        
        # normalize each word vector to unit variance
        self.W_norm = np.zeros(W.shape)
        d = (np.sum(W ** 2, 1) ** (0.5))
        self.W_norm = (W.T / d).T
        self.num_feature = vector_dim
    
    def __getitem__(self, word):
        # pretrained, 0/1 (0 for pretrained, 1 for unknown)
        if word in self.vocab:
            return self.W_norm[self.vocab[word], :], 1
        else:
            return np.random.normal(scale=0.6, size=(self.W_norm.shape[1])), 0

In [16]:
def create_embedding(vocab):
    G = GloVe()
    W = torch.zeros((len(vocab), G.num_feature))
    select = []
    for i in range(len(vocab)):
        vector, pretrained = G[vocab[i]]
        W[i] = torch.from_numpy(vector)
        if pretrained:
            select.append(i)
    return nn.Embedding.from_pretrained(W, freeze=False), select

In [17]:
embedding, select = create_embedding(vocab_truncated)

In [18]:
embedding.weight.requires_grad

True

**We need to manually clean the gradient for pretrained part of the embeddings. Fine tunning the embedding may cause overfitting.**

In [19]:
def mlp(num_inputs, num_hiddens, flatten, activation='relu', dropout=0.2):
    net = []
    net.append(nn.Dropout(dropout))
    net.append(nn.Linear(num_inputs, num_hiddens))
    net.append(nn.Tanh() if activation == 'tanh' else nn.ReLU())
    if flatten:
        net.append(nn.Flatten(start_dim=1))
    net.append(nn.Dropout(dropout))
    net.append(nn.Linear(num_hiddens, num_hiddens))
    net.append(nn.Tanh() if activation == 'tanh' else nn.ReLU())
    if flatten:
        net.append(nn.Flatten(start_dim=1))
    return nn.Sequential(*net)

**Attention layer basically computes the alignment of premise $\bar{a}$ with hypothesis and the alignment of hypothesis $\bar{b}$ with premise. If necessary [Chen et al.,2017], a BiLSTM is added to encode the sequence. In paper [Chen et al.,2017], the author indicated that no $F(\cdot)$ is required for the output of BiLSTM, but an MLP is required for the embedded sequence in [Parikh et al. 2016].**

In [20]:
class Attend(nn.Module):
    def __init__(self, num_inputs, num_hiddens, num_layers=2, f='mlp', dropout=0.2):
        super(Attend, self).__init__()
        if f == 'mlp':
            self.f = mlp(num_inputs, num_hiddens, flatten=False, dropout=dropout)
        elif f.lower() == 'bilstm':
            self.f = nn.LSTM(num_inputs, num_hiddens, num_layers, batch_first=True, bidirectional=True, dropout=dropout)
        self.encoder = f

    def forward(self, A, B):
        # f: A/B: (batch_size, seq_len, embed_size) -> A/B_bar: (batch_size, seq_len, hidden_size)
        if self.encoder == 'mlp':
            A_bar = self.f(A)
            B_bar = self.f(B)
        elif self.encoder == 'bilstm':
            A_bar, _ = self.f(A)
            B_bar, _ = self.f(B)
            A, B = A_bar, B_bar
        
        # e: (batch_size, seqA, seqB)
        e = torch.bmm(A_bar, B_bar.permute(0, 2, 1))
        # A/B_tilde: (batch_size, seqB/A, embed_size)
        A_tilde = torch.bmm(F.softmax(e, dim=-1), B)
        B_tilde = torch.bmm(F.softmax(e.permute(0, 2, 1), dim=-1), A)
        
        return (A, B, A_tilde, B_tilde)

In [21]:
net = Attend(8, 4, 'mlp')
net(torch.rand((2, 2, 8)), torch.rand((2, 1, 8)))

(tensor([[[0.9337, 0.0409, 0.0598, 0.0955, 0.1283, 0.3423, 0.4160, 0.6789],
          [0.1245, 0.8826, 0.8863, 0.4941, 0.5254, 0.9943, 0.0803, 0.9632]],
 
         [[0.5675, 0.4124, 0.2126, 0.8347, 0.2480, 0.5631, 0.0050, 0.4898],
          [0.1630, 0.8203, 0.5798, 0.5135, 0.1002, 0.3461, 0.8230, 0.1522]]]),
 tensor([[[0.0063, 0.6608, 0.6430, 0.7623, 0.9438, 0.8079, 0.0252, 0.2524]],
 
         [[0.7647, 0.7510, 0.6696, 0.0868, 0.5204, 0.8029, 0.1255, 0.6940]]]),
 tensor([[[0.0063, 0.6608, 0.6430, 0.7623, 0.9438, 0.8079, 0.0252, 0.2524],
          [0.0063, 0.6608, 0.6430, 0.7623, 0.9438, 0.8079, 0.0252, 0.2524]],
 
         [[0.7647, 0.7510, 0.6696, 0.0868, 0.5204, 0.8029, 0.1255, 0.6940],
          [0.7647, 0.7510, 0.6696, 0.0868, 0.5204, 0.8029, 0.1255, 0.6940]]],
        grad_fn=<BmmBackward0>),
 tensor([[[0.5287, 0.4622, 0.4735, 0.2950, 0.3271, 0.6687, 0.2480, 0.8212]],
 
         [[0.3707, 0.6109, 0.3913, 0.6784, 0.1761, 0.4575, 0.4030, 0.3255]]],
        grad_fn=<BmmBackward0>))

In [22]:
net = Attend(8, 4, 2, 'bilstm')
net(torch.rand((2, 2, 8)), torch.rand((2, 2, 8)))

(tensor([[[ 0.0467, -0.1232, -0.0125, -0.0718, -0.2257,  0.0667, -0.0375,
           -0.1345],
          [ 0.0366, -0.1245, -0.0813, -0.0700, -0.1666, -0.0060, -0.0715,
           -0.1138]],
 
         [[ 0.0365, -0.1135, -0.0176, -0.0869, -0.2208,  0.0744, -0.0135,
           -0.1304],
          [ 0.0636, -0.1514, -0.0421, -0.1403, -0.1443,  0.0665, -0.0142,
           -0.0914]]], grad_fn=<TransposeBackward0>),
 tensor([[[ 0.0558, -0.1317, -0.0351, -0.0809, -0.2062,  0.0782, -0.0124,
           -0.1107],
          [ 0.0677, -0.1630, -0.0480, -0.1058, -0.1464,  0.0362, -0.0006,
           -0.0803]],
 
         [[ 0.0492, -0.1262, -0.0253, -0.1050, -0.2260,  0.0792, -0.0168,
           -0.1418],
          [ 0.0429, -0.1575, -0.0861, -0.0620, -0.1583, -0.0143, -0.0479,
           -0.1033]]], grad_fn=<TransposeBackward0>),
 tensor([[[ 0.0617, -0.1472, -0.0415, -0.0933, -0.1765,  0.0574, -0.0065,
           -0.0956],
          [ 0.0617, -0.1473, -0.0415, -0.0933, -0.1764,  0.0573, -0.0065,

**We would try max and avg_pooling to extract features, instead of summing the $\{v_{ai}\}$ and $\{v_{bi}\}$ together. This avoids the unexpected explosion of sentence length.**

In [23]:
class Aggregate(nn.Module):
    def __init__(self, num_inputs, num_hiddens, num_outputs):
        super(Aggregate, self).__init__()
        self.f = mlp(num_inputs * 4, num_hiddens, activation='relu', flatten=True)
        self.g = mlp(num_inputs * 4, num_inputs, flatten=False)
        self.linear = nn.Linear(num_hiddens, num_outputs)

    def forward(self, A_bar, B_bar, A_tilde, B_tilde):
        # V_A = [A_bar; A_tilde; A_bar-A_tilde; A_bar.*A_tilde]
        V_A = self.g(torch.cat([A_bar, A_tilde, A_bar-A_tilde, torch.mul(A_bar, A_tilde)], dim=2))
        V_B = self.g(torch.cat([B_bar, B_tilde, B_bar-B_tilde, torch.mul(B_bar, B_tilde)], dim=2))
        
        # max_pooling and avg_pooling. Before: (batch_size, seqA/B, embed_dim), After: (batch_size, 4 * embed_dim)
        V_A_avg = torch.mean(V_A, dim=1)
        V_A_max, _ = torch.max(V_A, dim=1)
        V_B_avg = torch.mean(V_B, dim=1)
        V_B_max, _ = torch.max(V_B, dim=1)
        
        # Feed the concatenation of both summarization results into an MLP
        Y_hat = self.linear(self.f(torch.cat([V_A_avg, V_A_max, V_B_avg, V_B_max], dim=1)))
        return Y_hat

In [24]:
aggr = Aggregate(8, 5, 3)  # num_input = num_hidden (of attend) * 32
A, B, A_tilde, B_tilde = net(torch.rand((2, 2, 8)), torch.rand((2, 2, 8)))
# A_bar.size(), A_tilde.size(), torch.mul(A_bar, A_tilde), B_bar.size()
aggr(A, B, A_tilde, B_tilde)

tensor([[0.3276, 0.4355, 0.1557],
        [0.3291, 0.4358, 0.1519]], grad_fn=<AddmmBackward0>)

**The entire Attention network**

In [25]:
class DecomposableAttention(nn.Module):
    def __init__(self, vocab, num_hiddens_1, num_hiddens_2, embedding=None, select=None, embed_dim=50, f='mlp'):
        super(DecomposableAttention, self).__init__()
        if embedding is not None:
            self.embedding = embedding
        else:
            self.embedding = nn.Embedding(len(vocab), embed_dim)
        if select is not None:
            self.select = select   # This is used to clear the gradients
        else:
            self.select = []

        self.attend = Attend(embed_dim, num_hiddens_1, f=f)   # Attend(emb_dim, num_hiddens_1, num_layers=2, f='mlp')
        if f == 'mlp':
            self.name = 'parikh'
            self.aggregate = Aggregate(embed_dim, num_hiddens_2, num_outputs=3) # Aggregate(embed_dim * 32, num_hiddens_2, 3)
        else:
            self.name = 'esim'
            self.aggregate = Aggregate(num_hiddens_1 * 2, num_hiddens_2, num_outputs=3) # Aggregate(num_hidden_1 * 32, num_hiddens_2, 3)
        

    def forward(self, X):
        premises, hypotheses = X
        A = self.embedding(premises)
        B = self.embedding(hypotheses)
        A, B, A_tilde, B_tilde = self.attend(A, B)
        Y_hat = self.aggregate(A, B, A_tilde, B_tilde)
        return Y_hat
    
    def freeze_embedding_grad_(self):
        self.embedding.weight.grad[self.select] = 0      

**In practice, we may fix the embedding for the first two epochs**

In [26]:
def train(net, train_iter, dev_iter, loss, trainer, num_epochs, device, freeze_epoch=3, save=False, suffix=''):
    train_loss = []
    train_acc = []
    dev_acc = []
    for i in range(num_epochs):
        total_acc = []
        total_loss = []
        net.train()
        for j, (X, y) in tqdm(enumerate(train_iter)):
            X[0].to(device), X[1].to(device), y.to(device)
            net.zero_grad()
            logits = net(X)
            L = loss(logits, y)
            y_pred = torch.argmax(logits, dim=1)
            total_loss.append(L.item())
            total_acc.append((y_pred == y).sum() / y.size()[0] * 100)
            if j > 1000:
                total_loss, total_acc = total_loss[1:], total_acc[1:]
            L.backward()
            if i < freeze_epoch:
                net.freeze_embedding_grad_()
            trainer.step()
            
            if (j+1) % 1000 == 0:
                train_loss.append(sum(total_loss) / len(total_loss))
                train_acc.append(sum(total_acc) / len(total_acc))
                print(f"iteration {j+1} of epoch {i+1}, train loss:{train_loss[-1]}, train accuracy:{train_acc[-1]}")
        
        with torch.no_grad():
            total_acc = []
            for X, y in dev_iter:
                X[0].to(device), X[1].to(device), y.to(device)
                logits = net(X)
                L = loss(logits, y)
                y_pred = torch.argmax(logits, dim=1)
                total_acc.append((y_pred == y).sum() / y.size()[0] * 100)
            dev_acc += [sum(total_acc) / len(total_acc)] * (len(train_iter) // 1000)
        
        if save:
            torch.save(net.state_dict(), PATH + net.name + suffix + '.pth')
    return net, (train_loss, train_acc, dev_acc)

**The only difference between Parikh and ESIM lies in the encoder.**

In [27]:
embedding, select = create_embedding(vocab_truncated)
parikh = DecomposableAttention(vocab_truncated, 100, 100, embedding, select, embedding.weight.size()[1], 'mlp')
num_epochs, lr = 2, 0.001
loss = nn.CrossEntropyLoss()
trainer = optim.Adam(parikh.parameters(), lr=lr)
parikh, history_p = train(parikh, train_iter, dev_iter, loss, trainer, num_epochs, device, freeze_epoch=1, save=True, suffix='-50-30-30')

1001it [01:13, 13.14it/s]

iteration 1000 of epoch 1, train loss:1.0249824350476264, train accuracy:47.237998962402344


2001it [02:25, 14.07it/s]

iteration 2000 of epoch 1, train loss:0.9416316803637799, train accuracy:55.97502517700195


3001it [03:39, 13.42it/s]

iteration 3000 of epoch 1, train loss:0.9066073506266683, train accuracy:58.50349807739258


4002it [04:57, 12.53it/s]

iteration 4000 of epoch 1, train loss:0.8854592329376823, train accuracy:59.95304870605469


5002it [06:11, 13.42it/s]

iteration 5000 of epoch 1, train loss:0.8699641615360767, train accuracy:60.74725341796875


5494it [06:47, 13.47it/s]
1001it [01:13, 13.76it/s]

iteration 1000 of epoch 2, train loss:0.8480161155462265, train accuracy:62.2869987487793


2001it [02:27, 12.89it/s]

iteration 2000 of epoch 2, train loss:0.8432458038334841, train accuracy:62.539459228515625


3001it [03:42, 12.76it/s]

iteration 3000 of epoch 2, train loss:0.8367857135140098, train accuracy:62.8211784362793


4002it [04:57, 13.99it/s]

iteration 4000 of epoch 2, train loss:0.8303616816228205, train accuracy:63.22677230834961


5002it [06:11, 12.77it/s]

iteration 5000 of epoch 2, train loss:0.8247710052546445, train accuracy:63.61438751220703


5494it [06:47, 13.48it/s]


In [28]:
embedding, select = create_embedding(vocab)
esim = DecomposableAttention(vocab, 100, 100, embedding, select, embedding.weight.size()[1], 'bilstm')
num_epochs, lr = 2, 0.002
loss = nn.CrossEntropyLoss()
trainer = optim.Adam(esim.parameters(), lr=lr)
esim, history_e = train(esim, train_iter, dev_iter, loss, trainer, num_epochs, device, freeze_epoch=1, save=True, suffix='-50-100-100')

1000it [13:11,  1.30it/s]

iteration 1000 of epoch 1, train loss:1.017000095129013, train accuracy:47.78799819946289


2000it [26:00,  1.31it/s]

iteration 2000 of epoch 1, train loss:0.9204398975386605, train accuracy:56.785213470458984


3000it [38:50,  1.33it/s]

iteration 3000 of epoch 1, train loss:0.8826814089026246, train accuracy:59.72427749633789


4000it [51:41,  1.31it/s]

iteration 4000 of epoch 1, train loss:0.8546697033511532, train accuracy:61.739261627197266


5000it [1:04:32,  1.22it/s]

iteration 5000 of epoch 1, train loss:0.8222182278747444, train accuracy:63.837162017822266


5494it [1:10:53,  1.29it/s]
1000it [12:53,  1.30it/s]

iteration 1000 of epoch 2, train loss:0.7755439891815186, train accuracy:66.50800323486328


2000it [25:45,  1.30it/s]

iteration 2000 of epoch 2, train loss:0.7665508627057909, train accuracy:66.93106842041016


3000it [38:38,  1.27it/s]

iteration 3000 of epoch 2, train loss:0.754696276221242, train accuracy:67.74225616455078


4000it [51:46,  1.28it/s]

iteration 4000 of epoch 2, train loss:0.7479835885030763, train accuracy:68.02198028564453


5000it [1:04:42,  1.31it/s]

iteration 5000 of epoch 2, train loss:0.7436615874121835, train accuracy:68.01798248291016


5494it [1:11:08,  1.29it/s]


In [29]:
def predict_snli(net, vocab, premise, hypothesis):
    net.eval()
    premise = torch.tensor(vocab[truncate_pad(premise.rstrip('.').lower().split(), 50, pad='<pad>')])
    hypothesis = torch.tensor(vocab[truncate_pad(hypothesis.rstrip('.').lower().split(), 50, pad='<pad>')])
    label = torch.argmax(net([premise.reshape((1, -1)),
                           hypothesis.reshape((1, -1))]), dim=1)
    print(net([premise.reshape((1, -1)), hypothesis.reshape((1, -1))]))
    return 'entailment' if label == 0 else 'contradiction' if label == 2 \
            else 'neutral'

**It is questionable because s2 can derive s1 but s1 can't derive s2. These attention model treat the premise and hypothesis equally. I am also curious about whether or not we should \<pad\> our setence, because the pooling layer will squeeze the length.**

In [30]:
s1 = "I have a driving license."
s2 = "I can drive."
(predict_snli(esim, vocab_truncated, s1, s2), predict_snli(esim, vocab_truncated, s2, s1),
 predict_snli(parikh, vocab_truncated, s1, s2), predict_snli(parikh, vocab_truncated, s2, s1))

tensor([[ 0.1788, -0.3292,  0.2063]], grad_fn=<AddmmBackward0>)
tensor([[-0.5025, -0.3422,  0.3792]], grad_fn=<AddmmBackward0>)
tensor([[ 0.2799, -0.2884, -0.1119]], grad_fn=<AddmmBackward0>)
tensor([[-0.8443, -0.1859,  0.8113]], grad_fn=<AddmmBackward0>)


('contradiction', 'contradiction', 'entailment', 'contradiction')

In [31]:
s1 = "good"
s2 = "bad"
(predict_snli(esim, vocab_truncated, s1, s2), predict_snli(esim, vocab_truncated, s2, s1),
 predict_snli(parikh, vocab_truncated, s1, s2), predict_snli(parikh, vocab_truncated, s2, s1))

tensor([[ 0.7665,  0.0025, -0.2533]], grad_fn=<AddmmBackward0>)
tensor([[ 0.8471,  0.2624, -0.5545]], grad_fn=<AddmmBackward0>)
tensor([[ 0.1223,  0.2669, -0.7166]], grad_fn=<AddmmBackward0>)
tensor([[ 0.2049,  0.4520, -1.1317]], grad_fn=<AddmmBackward0>)


('entailment', 'entailment', 'neutral', 'neutral')

In [32]:
s1 = "This church choir sings joyous songs from the book at a church."
s2 = "The church is filled with song."
(predict_snli(esim, vocab_truncated, s1, s2), predict_snli(esim, vocab_truncated, s2, s1),
 predict_snli(parikh, vocab_truncated, s1, s2), predict_snli(parikh, vocab_truncated, s2, s1))

tensor([[-0.1467,  0.0608, -0.0546]], grad_fn=<AddmmBackward0>)
tensor([[-1.1650,  0.9435, -0.0813]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5233,  0.1290, -0.8376]], grad_fn=<AddmmBackward0>)
tensor([[-1.1438,  0.8058, -0.2529]], grad_fn=<AddmmBackward0>)


('neutral', 'neutral', 'entailment', 'neutral')

**Others**

In [33]:
train_loss, train_acc, dev_acc = np.array(history_p[0]), np.array(history_p[1])/100, np.array(history_p[2])/100
np.save('train_loss_parikh', train_loss)
np.save('train_acc_parikh', train_acc)
np.save('dev_acc_parikh', dev_acc)

In [34]:
train_loss, train_acc, dev_acc = np.array(history_e[0]), np.array(history_e[1])/100, np.array(history_e[2])/100
np.save('train_loss_esim', train_loss)
np.save('train_acc_esim', train_acc)
np.save('dev_acc_esim', dev_acc)

**Result on dev_set**

In [35]:
history_p[-1][-1], history_e[-1][-1]

(tensor(65.1785), tensor(69.2530))