In [2]:
import torch
import torch.nn as nn
import torchtext
from torchtext import data
import spacy
import os
from torch.autograd import Variable

# Detect CUDA once and derive the torch device from that flag; the flag is
# echoed so the cell output records which device this run used.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(USE_CUDA)

False


# Hyperparameters

In [3]:
# Mini-batch sizes: a small batch for training, a larger one for validation/test.
batch_size_train, batch_size_eval = 5, 256

# Pretrained vectors to load and the embedding width; the two must agree because
# the vectors are copied straight into an embedding table of width emb_dim.
embed_from = "glove.6B.50d"
emb_dim = 50

# Settings handed to the bidirectional LSTM encoder.
hidden_size, num_layers, rnn_dropout = 100, 1, 0

# Load data
References for loading data with torchtext:

http://anie.me/On-Torchtext/

http://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/

http://mlexplained.com/2018/02/15/language-modeling-tutorial-in-torchtext-practical-torchtext-part-2/

In [4]:
# Directory holding the preprocessed JSON files.
data_dir = 'preprocessed'

# One file per split, in train / dev / test order.
train_fname, dev_fname, test_fname = (
    'train-trial-combined-data-processed.json',
    'dev-data-processed.json',
    'test-data-processed.json',
)

# Combined file; it is only used for building the vocabulary.
combined_fname = 'all-combined-data-processed.json'

Each JSON record has the keys: 'id', 'd_words', 'd_pos', 'd_ner', 'q_words', 'q_pos', 'c_words', 'label', 'in_q', 'in_c', 'lemma_in_q', 'tf', 'p_q_relation', 'p_c_relation'. Only 'd_words', 'q_words', 'c_words' and 'label' are loaded below.

In [5]:
# A single text field shared by document, question and choice tokens. Text is
# lower-cased, and include_lengths=True makes each batch attribute a
# (token ids, lengths) pair.
TEXT = data.ReversibleField(sequential=True, lower=True, include_lengths=True)

# JSON key -> (attribute name on each example, field that processes it).
split_fields = {key: (key, TEXT) for key in ('d_words', 'q_words', 'c_words')}
# Labels are non-sequential and are not mapped through a vocabulary.
split_fields['label'] = ('label', data.Field(sequential=False, use_vocab=False))

train, val, test = data.TabularDataset.splits(
    path=data_dir,
    format='json',
    fields=split_fields,
    train=train_fname,
    validation=dev_fname,
    test=test_fname,
)

# Report how many examples each split contains.
split_sizes = (len(train), len(val), len(test))
print('train: %d, val: %d, test: %d' % split_sizes)

train: 20482, val: 2822, test: 5594


In [6]:
# `combined` is loaded for one purpose only: building the vocabulary.
vocab_fields = {key: (key, TEXT) for key in ('d_words', 'q_words', 'c_words')}
vocab_fields['label'] = ('label', data.Field(sequential=False, use_vocab=False))

combined = data.TabularDataset(
    path=os.path.join(data_dir, combined_fname),
    format='json',
    fields=vocab_fields,
)

# Build the vocabulary and attach the pretrained vectors named by `embed_from`
# (call TEXT.build_vocab(combined) without `vectors=` to skip the pretrained vectors).
TEXT.build_vocab(combined, vectors=embed_from)
vocab_size = len(TEXT.vocab)
print('vocab size: %d' % vocab_size)

vocab size: 10695


In [7]:
# Alternative kept for reference: sort by document length within each batch.
# train_iter, val_iter, test_iter = data.Iterator.splits(
#         (train, val, test), sort_key=lambda x: len(x.d_words),
#         batch_sizes=(batch_size_train, batch_size_eval, batch_size_eval), device=-1,
#         sort_within_batch=True, repeat=False)

# Training uses the small batch size; validation and test share the evaluation
# size. Batches are left unsorted, and every iterator stops after one pass.
splits_in_order = (train, val, test)
sizes_in_order = (batch_size_train, batch_size_eval, batch_size_eval)
train_iter, val_iter, test_iter = data.Iterator.splits(
    splits_in_order,
    batch_sizes=sizes_in_order,
    device=device,
    sort_within_batch=False,
    repeat=False,
)

# Create embedding

In [8]:
# Embedding table with one row per vocabulary entry. It is moved to `device`
# because the iterators place their batches on `device`; left on the CPU, the
# first lookup would fail with a device mismatch whenever CUDA is available.
embedding = nn.Embedding(len(TEXT.vocab), emb_dim).to(device)
# Initialise the table from the pretrained vectors attached to the vocabulary.
# copy_ returns the destination tensor, so the cell still displays the loaded weights.
embedding.weight.data.copy_(TEXT.vocab.vectors)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4180,  0.2497, -0.4124,  ..., -0.1841, -0.1151, -0.7858],
        ...,
        [-0.1160,  1.1429,  0.0260,  ..., -0.8676,  0.0750,  0.8040],
        [-0.5689,  0.9256,  0.7289,  ...,  0.7101, -0.2287,  1.4826],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [9]:
# Sanity check: the table should be (vocabulary size, emb_dim).
embedding.weight.size()

torch.Size([10695, 50])

# Build model
References for the model and for feeding variable-length batches to an LSTM:

https://github.com/intfloat/commonsense-rc

https://towardsdatascience.com/taming-lstms-variable-sized-mini-batches-and-why-pytorch-is-good-for-your-health-61d35642972e

https://discuss.pytorch.org/t/solved-multiple-packedsequence-input-ordering/2106/23

In [54]:
class BLSTM(nn.Module):
    """Bidirectional LSTM encoder for padded, batch-first, variable-length sequences.

    The batch does not have to be ordered by length: `forward` sorts it for
    packing and restores the caller's row order before returning.

    Args:
        input_dim: Size of each input feature vector.
        hidden_size: Hidden size of the LSTM, per direction.
        num_layers: Number of stacked LSTM layers.
        rnn_dropout: Dropout value forwarded to ``nn.LSTM``.
    """

    def __init__(self, input_dim, hidden_size, num_layers, rnn_dropout):
        # The original called super(VLBLSTM, self), naming a class that this
        # notebook never defines, so constructing the module raised NameError.
        super(BLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=rnn_dropout,
            bidirectional=True
        )

    def forward(self, inputs, lengths):
        """Encode a padded batch and return the per-step outputs.

        Args:
            inputs: Padded batch of shape (batch, time, input_dim).
            lengths: 1-D integer tensor with the true length of each row of
                ``inputs``. Every length must be at least 1, since packing
                rejects empty sequences.

        Returns:
            Tensor of shape (batch, time, 2 * hidden_size), in the same row
            order as ``inputs``. Steps beyond a row's length are zero.
        """
        # Packing expects the batch ordered from longest to shortest.
        lengths_sorted, sorted_idx = lengths.sort(descending=True)
        # Keep the index on the same device as the data it selects from.
        sorted_idx = sorted_idx.to(inputs.device)
        inputs_sorted = inputs.index_select(0, sorted_idx)

        inputs_packed = nn.utils.rnn.pack_padded_sequence(
            inputs_sorted, lengths_sorted.tolist(), batch_first=True)
        outputs_packed, _ = self.lstm(inputs_packed)
        # total_length pads the output back to the input's time dimension, so
        # it stays aligned with `inputs` even when the padding is wider than
        # the longest sequence in the batch.
        outputs_sorted, _ = nn.utils.rnn.pad_packed_sequence(
            outputs_packed, batch_first=True, total_length=inputs.size(1))

        # Sorting a permutation yields its inverse; selecting rows with it
        # puts every sequence back at its original batch position.
        _, unsort_idx = sorted_idx.sort()
        return outputs_sorted.index_select(0, unsort_idx)
        

# class TriAN(nn.Module):
#     def __init__(self, embedding):
#         super(TriAN, self).__init__()
#         self.embedding = embedding
#         self.doc_rnn = ???
#         self.question_rnn = ???
#         self.choice_rnn = ???

In [55]:
# Instantiate the encoder and move it to `device`. The iterators place their
# batches on `device`, so an encoder left on the CPU would fail with a device
# mismatch whenever CUDA is available.
rnn = BLSTM(
    input_dim=emb_dim,
    hidden_size=hidden_size,
    num_layers=num_layers,
    rnn_dropout=rnn_dropout,
).to(device)

In [61]:
# Smoke test: run the documents of the first training batch through the encoder.
for i, batch in enumerate(train_iter):
    # Each text attribute unpacks into (token ids, lengths).
    (d_words, d_lengths), (q_words, q_lengths), (c_words, c_lengths) = (
        batch.d_words, batch.q_words, batch.c_words)
    labels = batch.label

    # Swap the first two axes so every id tensor is batch first, as the encoder requires.
    d_words, q_words, c_words = (
        ids.transpose(0, 1) for ids in (d_words, q_words, c_words))

    d_words_embed = embedding(d_words)
    outputs = rnn(d_words_embed, d_lengths)

    # Debug prints, left disabled:
    #     print(d_words.shape, d_lengths.shape, labels.shape)
    #     print(d_lengths)
    #     print(q_lengths)
    #     print(c_lengths)

    # Only the first batch is needed (i is always 0 on the first pass).
    break

tensor([198, 214, 188, 177, 175])
tensor([[[-0.1746,  0.0269,  0.2662,  ..., -0.0008, -0.0549,  0.1810],
         [-0.1200,  0.0884,  0.2625,  ..., -0.0027, -0.0642,  0.0378],
         [-0.1149,  0.1104,  0.2383,  ...,  0.1011,  0.0217,  0.0867],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-0.1444,  0.0177,  0.1477,  ...,  0.0404, -0.0392,  0.1008],
         [-0.1623, -0.0442,  0.0949,  ..., -0.0021, -0.0375,  0.0130],
         [-0.1534,  0.0477,  0.1611,  ...,  0.0179, -0.0453,  0.1759],
         ...,
         [-0.1052,  0.0392,  0.2023,  ...,  0.0442, -0.0547,  0.0747],
         [-0.0661,  0.0552,  0.1639,  ...,  0.0710, -0.0368,  0.0608],
         [-0.0796,  0.0906,  0.1743,  ...,  0.0706, -0.0201,  0.0445]],

        [[-0.1304,  0.1922,  0.2095,  ...,  0.1259, -0.0899,  0.2511],
         [-