In [87]:
import torch
import torch.nn as nn
import torchtext
from torchtext import data
import spacy
import os
from torch.autograd import Variable

USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")
print(USE_CUDA)

False


# Hyperparameters

In [66]:
batch_size_train = 5
batch_size_eval = 256
emb_dim = 50
embed_from = "glove.6B.50d"
hidden_size = 100
num_layers = 1
rnn_dropout = 0

# Load data
refer to 

http://anie.me/On-Torchtext/

http://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/

http://mlexplained.com/2018/02/15/language-modeling-tutorial-in-torchtext-practical-torchtext-part-2/

In [42]:
data_dir = 'preprocessed'
combined_fname = 'all-combined-data-processed.json'
train_fname = 'train-trial-combined-data-processed.json'
dev_fname = 'dev-data-processed.json'
test_fname = 'test-data-processed.json'

we have keys: 'id', 'd_words', 'd_pos', 'd_ner', 'q_words', 'q_pos', 'c_words', 'label', 'in_q', 'in_c', 'lemma_in_q', 'tf', 'p_q_relation', 'p_c_relation'

In [43]:
TEXT = data.ReversibleField(sequential=True, lower=True, include_lengths=True)

train, val, test = data.TabularDataset.splits(
    path=data_dir, train=train_fname,
    validation=dev_fname, test=test_fname, format='json',
    fields={'d_words': ('d_words', TEXT),
            'q_words': ('q_words', TEXT),
            'c_words': ('c_words', TEXT),
            'label': ('label', data.Field(sequential=False, use_vocab=False))
           })

print('train: %d, val: %d, test: %d' % (len(train), len(val), len(test)))

train: 20482, val: 2822, test: 5594


In [44]:
# combined is only used for building vocabulary
combined = data.TabularDataset(
    path=os.path.join(data_dir, combined_fname), format='json',
    fields={'d_words': ('d_words', TEXT),
            'q_words': ('q_words', TEXT),
            'c_words': ('c_words', TEXT),
            'label': ('label', data.Field(sequential=False, use_vocab=False))
           })

# TEXT.build_vocab(combined)
TEXT.build_vocab(combined, vectors=embed_from)
print('vocab size: %d' % len(TEXT.vocab))

vocab size: 10695


In [45]:
# train_iter, val_iter, test_iter = data.Iterator.splits(
#         (train, val, test), sort_key=lambda x: len(x.d_words), 
#         batch_sizes=(batch_size_train, batch_size_eval, batch_size_eval), device=-1, 
#         sort_within_batch=True, repeat=False)

train_iter, val_iter, test_iter = data.Iterator.splits(
        (train, val, test), batch_sizes=(batch_size_train, batch_size_eval, batch_size_eval), device=device, 
        sort_within_batch=False, repeat=False)

# Create embedding

In [46]:
embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
embedding.weight.data.copy_(TEXT.vocab.vectors)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4180,  0.2497, -0.4124,  ..., -0.1841, -0.1151, -0.7858],
        ...,
        [-0.1160,  1.1429,  0.0260,  ..., -0.8676,  0.0750,  0.8040],
        [-0.5689,  0.9256,  0.7289,  ...,  0.7101, -0.2287,  1.4826],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [47]:
embedding.weight.shape

torch.Size([10695, 50])

# Build model
refer to

https://github.com/intfloat/commonsense-rc

https://towardsdatascience.com/taming-lstms-variable-sized-mini-batches-and-why-pytorch-is-good-for-your-health-61d35642972e

https://discuss.pytorch.org/t/solved-multiple-packedsequence-input-ordering/2106/23

In [147]:
# lengths = torch.tensor([len(indices) for indices in indices_list], dtype=torch.long, device=device)
# lengths_sorted, sorted_idx = lengths.sort(descending=True)

# indices_padded = pad_lists(indices, padding_idx, dtype=torch.long, device=device) # custom function
# indices_sorted = indices_padded[sorted_idx]

# embeddings_padded = self.embedding(indices_sorted)
# embeddings_packed = pack_padded_sequence(embeddings_padded, lengths_sorted.tolist(), batch_first=True)

# h, (h_n, _) = self.lstm(embeddings_packed)

# h, _ = pad_packed_sequence(h, batch_first=True, padding_value=padding_idx)

# # Reverses sorting. 
# h = torch.zeros_like(h).scatter_(0, sorted_idx.unsqueeze(1).unsqueeze(1).expand(-1, h.shape[1], h.shape[2]), h)


class VLBLSTM(nn.Module):
    def __init__(self, input_dim, hidden_size, num_layers, rnn_dropout):
        super(VLBLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=rnn_dropout,
            bidirectional=True
        )
        
    def init_hidden(self, batch_size):
        # the weights are of the form (num_layers * 2, batch_size, hidden_size)
        h0 = torch.randn(self.num_layers * 2, batch_size, self.hidden_size)
        c0 = torch.randn(self.num_layers * 2, batch_size, self.hidden_size)

        h0 = Variable(h0, requires_grad=False).to(device)
        c0 = Variable(c0, requires_grad=False).to(device)

        return (h0, c0)
    
    def forward(self, inputs, lengths):
        # requires inputs to be batch first
        
        batch_size = inputs.shape[0]
        h0, c0 = self.init_hidden(batch_size)
        
#         print(lengths)
#         print(inputs)
        
        lengths_sorted, sorted_idx = lengths.sort(descending=True)
        inputs_sorted = inputs[sorted_idx]
        
#         print('~'*80)
#         print(lengths_sorted)
#         print(inputs_sorted)

        print(inputs_sorted.shape)
        inputs_packed = nn.utils.rnn.pack_padded_sequence(inputs_sorted, lengths_sorted.tolist(), batch_first=True)
        h, y = self.lstm(inputs_packed, (h0, c0))
#         h, y = self.lstm(inputs_packed)
        print(h)
#         print(y.shape)
        
        
        return None
        

# class TriAN(nn.Module):
#     def __init__(self, embedding):
#         super(TriAN, self).__init__()
#         self.embedding = embedding
#         self.doc_rnn = ???
#         self.question_rnn = ???
#         self.choice_rnn = ???

In [148]:
rnn = VLBLSTM(emb_dim, hidden_size, num_layers, rnn_dropout)

In [149]:
for i, batch in enumerate(train_iter):
    d_words, d_lengths = batch.d_words
    q_words, q_lengths = batch.q_words
    c_words, c_lengths = batch.c_words
    labels = batch.label
    
    # convert to batch first
    d_words = torch.transpose(d_words, 0, 1)
    q_words = torch.transpose(q_words, 0, 1)
    c_words = torch.transpose(c_words, 0, 1)
    
    d_words_embed = embedding(d_words)
    rnn(d_words_embed, d_lengths)
    
#     print(d_words.shape, d_lengths.shape, labels.shape)
#     print(d_lengths)
#     print(q_lengths)
#     print(c_lengths)

    if i == 0:
        break

torch.Size([5, 181, 50])
PackedSequence(data=tensor([[ 0.2332, -0.2186, -0.1090,  ...,  0.0260, -0.0711,  0.0104],
        [ 0.0996,  0.0185, -0.2400,  ...,  0.1251, -0.2101,  0.1141],
        [-0.0199, -0.0138,  0.0613,  ...,  0.1251, -0.2101,  0.1141],
        ...,
        [-0.0062, -0.1765,  0.0739,  ..., -0.0228,  0.0320, -0.0119],
        [-0.0062, -0.1765,  0.0739,  ..., -0.2049,  0.5399, -0.1181],
        [-0.1133, -0.1470,  0.0147,  ...,  0.2422, -0.1199, -0.0799]],
       grad_fn=<CatBackward>), batch_sizes=tensor([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5