In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

import torchtext
import os
import tensorboard
%load_ext autoreload
%autoreload 2

In [2]:
from torchtext.experimental.datasets import SQuAD1
from torchtext.data.utils import get_tokenizer
# data_dir = '.data'
# data_names = ['dev-v1.1.json', 'train-v1.1.json']
# for data_name in data_names:
#     if not os.path.isfile(os.path.join(data_dir, data_name)):
#         print('download')
#         train, dev = SQuAD1()
#         break
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
# dataset shape: (paragraph, question, answer, span)
train, dev = SQuAD1(tokenizer=tokenizer)

100%|██████████| 87599/87599 [00:20<00:00, 4231.57lines/s]


In [3]:
vocab = train.get_vocab()

In [4]:
# import re
# errors = 0
# print('length of vocab before filtering:', len(vocab.stoi))
# for key, value in list(vocab.stoi.items()):
#     if re.search('\n', key) or re.search(' ', key):
#         errors += 1
#         print(key)
#         vocab.stoi.pop(key)
#         vocab.itos.pop(value)
#         vocab.freqs.pop(key)
#         # vocab.freqs[key] -= 1
#         # if vocab.freqs[key] < 1:
#         #     vocab.freqs.pop(key)
#
# print(errors)
# print('length of vocab after filtering:', len(vocab.stoi))

In [5]:
train, dev = SQuAD1(vocab=vocab)

In [6]:
train_data = [(len(paragraph), len(question), idx, paragraph, question, answer, span)
            for idx, (paragraph, question, answer, span) in enumerate(train)]
# train_data.sort() # sort by length and pad sequences with similar lengths
# paragpraph, question: tensor of indices of words, use itos to get word
# Generate the pad id
pad_id = vocab['<pad>']

In [7]:
# print(train_data[0][3])
# for idx in train_data[0][3]:
#     print(train.get_vocab().itos[idx], sep=' ')

In [8]:
def pad_data(data):
    # Find max length of the mini-batch
    # train.get_vocab()['pad'], dev.get_vocab()['pad'] is equal to 22949
    max_p_len = max(list(zip(*data))[0])
    max_q_len = max(list(zip(*data))[1])
    paragraph_list = list(zip(*data))[3]
    question_list = list(zip(*data))[4]
    answer_list = list(zip(*data))[5]
    span_list = list(zip(*data))[6]
    padded_paragraphs = torch.stack([torch.cat((paragraph,
            torch.LongTensor([pad_id] * (max_p_len - len(paragraph))))) \
            for paragraph in paragraph_list])
    padded_questions = torch.stack([torch.cat((question,
            torch.tensor([pad_id] * (max_q_len - len(question))).long())) \
            for question in question_list])
    paragraph_pad_mask = torch.zeros_like(padded_paragraphs).masked_fill(padded_paragraphs == pad_id, 1)
    question_pad_mask = torch.zeros_like(padded_questions).masked_fill(padded_questions == pad_id, 1)

    return padded_paragraphs, padded_questions, span_list, answer_list, \
           paragraph_pad_mask, question_pad_mask

In [9]:
BATCH_SIZE = 32
from torch.utils.data import DataLoader
trainloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_data)
testloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=pad_data)

In [10]:
# for i, (p, q, a, s) in enumerate(train):
#     print(p,q,a,s)
#     nps = s[0].numpy()
#     tokens = tokenizer(train.data[i][0])
#     print(tokens[int(nps[0])])
#     print(tokens[nps[1]])
#     if i > 5:
#         break

In [11]:
for idx, (padded_paragraphs, padded_questions, span_list, answer_list,
           paragraph_pad_mask, question_pad_mask) in enumerate(trainloader):
    # print(idx, padded_paragraphs, padded_questions, span_list, answer_list,
    #        paragraph_pad_mask, question_pad_mask)
    print(span_list)
    if idx > 1:
        break

([tensor([32, 33])], [tensor([27, 27])], [tensor([217, 221])], [tensor([19, 20])], [tensor([24, 24])], [tensor([81, 81])], [tensor([87, 89])], [tensor([47, 63])], [tensor([45, 45])], [tensor([16, 16])], [tensor([57, 57])], [tensor([41, 44])], [tensor([13, 14])], [tensor([50, 50])], [tensor([3, 8])], [tensor([0, 1])], [tensor([79, 80])], [tensor([118, 130])], [tensor([13, 13])], [tensor([36, 36])], [tensor([18, 20])], [tensor([158, 161])], [tensor([0, 0])], [tensor([74, 76])], [tensor([31, 31])], [tensor([44, 45])], [tensor([ 9, 16])], [tensor([68, 68])], [tensor([111, 111])], [tensor([124, 126])], [tensor([27, 28])], [tensor([102, 102])])
([tensor([21, 22])], [tensor([105, 107])], [tensor([110, 111])], [tensor([110, 110])], [tensor([57, 58])], [tensor([77, 78])], [tensor([31, 33])], [tensor([ 3, 11])], [tensor([41, 42])], [tensor([36, 36])], [tensor([54, 57])], [tensor([162, 162])], [tensor([66, 67])], [tensor([64, 83])], [tensor([37, 37])], [tensor([29, 45])], [tensor([25, 26])], [ten

In [12]:
# print(train.get_vocab()['pad'], dev.get_vocab()['pad'])

In [13]:
glove_vec = torchtext.vocab.GloVe(name='840B', dim=300)

In [14]:
def build_word_embedding(vocab, pre_trained_emb_vec):
    print(pre_trained_emb_vec.dim)
    weights_matrix = np.zeros((len(vocab), pre_trained_emb_vec.dim))
    words_found = 0
    no_word = 0
    for i, (word, _) in enumerate(vocab.freqs.most_common()):
        try:
            word_index = pre_trained_emb_vec.stoi[word]
            weights_matrix[i] = pre_trained_emb_vec[word_index]
            words_found += 1
        except:
            no_word += 1 # no such word in pre_trained_embedding: zero vector
    print('words not found:', no_word)
    print('words found:', words_found)
    return weights_matrix

In [15]:
# for key, value in vocab.freqs.items():
#     if re.search(' ', key):
#         print(key, value)
# for i, word in enumerate(vocab.freqs.most_common()):
#     print(word)
#     if i > 5:
#         break

In [16]:
word_emb_table = build_word_embedding(vocab, glove_vec)

300
words not found: 17435
words found: 86591


In [17]:
# glove_vec.vectors[:5]

In [38]:
import spacy
nlp = spacy.load('en_core_web_sm')

def exact_match(paragraphs_indices, questions_indices, itos):
    # process one paragraph sentence, one question sentence
    exact_match_table = np.zeros((len(paragraphs_indices), len(questions_indices[0])))
    for i, (paragraph_indices, question_indices) in \
            enumerate(zip(paragraphs_indices, questions_indices)):
        print(paragraphs_indices)
        print(paragraphs_indices.size())
        paragraph_sentence = ' '.join(word for word in itos[paragraph_indices])
        question_sentence = ' '.join(word for word in itos[question_indices])
        paragraph_sentence_uncased = paragraph_sentence.lower()
        question_sentence_uncased = question_sentence.lower()
        paragraph_processed = nlp(paragraph_sentence)
        question_processed = nlp(question_sentence)
        question_lemmas = [lem.lemma_ for lem in question_processed]
        paragraph_uncased_processed = nlp(paragraph_sentence_uncased)
        question_uncased_processed = nlp(question_sentence_uncased)
        for j, paragraph_word, paragraph_word_uncased in \
                enumerate(zip(paragraph_processed, paragraph_uncased_processed)):
            if paragraph_word.text == '<pad>':
                continue
            em_tensor = torch.LongTensor([0, 0, 0])
            # original
            if paragraph_word.text in question_sentence:
                em_tensor[0] = 1
            # lemma
            if paragraph_word.lemma_ in question_lemmas:
                em_tensor[1] = 1
            # uncased
            if paragraph_word_uncased.text in question_sentence_uncased:
                em_tensor[2] = 1
            exact_match_table[i][j] = em_tensor

    return exact_match_table




In [39]:
class AlignedQuestionEmbedding(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.relu = nn.ReLU()
        self.linear = nn.Linear(input_dim, input_dim)

    def forward(self, paragraph, question, question_pad_mask):
        p = self.relu(self.linear(paragraph))

        q = self.relu(self.linear(question))
        q = question.permute(0, 2, 1)

        dot_product = torch.bmm(p, q)
        dot_product = dot_product.masked_fill(question_pad_mask == 1, -np.inf)

        dot_product_flatten = dot_product.view(-1, question.size(1))

        attn_score = F.softmax(dot_product_flatten, dim=1)
        attn_score = attn_score.view(-1, paragraph.shape[1], question.shape[1])

        aligned_embedding = torch.bmm(attn_score, question)
        return aligned_embedding

In [40]:
P_ENCODING_NUM = 2
class MultiLayerBiLSTM(nn.Module):

    def __init__(self, input_size, hidden_size, nlayers, dropout):
        super().__init__()
        self.nlayers = nlayers

        self.lstms = nn.ModuleList()
        self.dropout = nn.Dropout(p=dropout)
        self.lstms.append(nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True))
        for i in range(1, nlayers):
            self.lstms.append(nn.LSTM(hidden_size * 2, hidden_size,
                                      batch_first=True, bidirectional=True))

    def forward(self, x):
        lstm_output = self.dropout(x)
        lstm_output, (hidden_state, cell_state) = self.lstms[0](lstm_output)
        hidden_states = [hidden_state]

        for i in range(1, self.nlayers):
            lstm_output = self.dropout(hidden_state)
            lstm_output, (hidden_state, cell_state) = self.lstms[i](lstm_output)
            hidden_states.append(hidden_state)

        output = torch.cat(hidden_states, dim=2)

        output = self.dropout(output)
        return output

In [41]:
class QuestionEncoding(nn.Module):
    def __init__(self, input_size, hidden_size, nlayers, dropout):
        super().__init__()
        self.input_size = input_size
        self.linear = nn.Linear(input_size, 1)
        self.lstm = MultiLayerBiLSTM(input_size, hidden_size, nlayers, dropout)

    def forward(self, x, question_mask):
        x_lstm = self.lstm(x)
        x = x.view(-1, self.input_size)
        x = self.linear(x) # attention score
        x = x.masked_fill(question_mask == 1, -np.inf) # masking
        x = F.softmax(x, dim=1)

        x = x.unsqueeze(1)
        encoding = torch.bmm(x_lstm, x)
        encoding = encoding.squeeze(1)
        return encoding

In [42]:
class PredictionLayer(nn.Module):
    def __init__(self, p_size, q_size):
        super().__init__()
        self.linear = nn.Linear(q_size, p_size)

    def forward(self, paragraph, question, paragraph_mask):
        Wq = self.linear(question)
        Wq = Wq.unsqueeze(2)
        pWq = paragraph.bmm(Wq)
        pWq = pWq.squeeze(2)
        pWq = pWq.masked_fill(paragraph_mask == 1, -np.inf)


In [43]:
def fix_embedding(grad):
    grad[1000:] = 0
    return grad

class DocumentReader(nn.Module):
    def __init__(self, hidden_size, embedding_size, nlayers, paragraph_encoding_num, dropout, device):
        super().__init__()
        self.device = device

        self.p_word_embedding_layer = nn.Embedding.from_pretrained(torch.FloatTensor(word_emb_table).to(device), freeze=False)
        self.p_word_embedding_layer.register_backward_hook(fix_embedding)
        self.aligned_embedding_layer = AlignedQuestionEmbedding(embedding_size)
        self.paragraph_lstm = MultiLayerBiLSTM(embedding_size * 2 + 3, hidden_size, nlayers, dropout)
        # self.paragraph_lstm = MultiLayerBiLSTM(embedding_size * 2, hidden_size, nlayers, dropout)

        self.question_encoder = QuestionEncoding(embedding_size, hidden_size, nlayers, dropout)

        self.prediction_layer_start = PredictionLayer(hidden_size * nlayers * paragraph_encoding_num,
                                                          hidden_size * nlayers * paragraph_encoding_num)
        self.prediction_layer_end = PredictionLayer(hidden_size * nlayers * paragraph_encoding_num,
                                                        hidden_size * nlayers * paragraph_encoding_num)

        self.dropout = nn.Dropout(dropout)
    def forward(self, paragraph, question, paragraph_mask, question_mask):
        em_embedding = exact_match(paragraph, question, vocab.itos)
        p_word_embedding = self.p_word_embedding_layer(paragraph)
        aligned_embedding = self.aligned_embedding_layer(paragraph, question, paragraph_mask)

        paragraph_embeddings = torch.cat([em_embedding, p_word_embedding, aligned_embedding])
        paragraph_encoding = self.paragraph_lstm(paragraph_embeddings)

        question_encoding = self.question_encoder(question, question_mask)

        prediction_start = self.prediction_layer_start(paragraph_encoding, question_encoding, paragraph_mask)
        prediction_end = self.prediction_layer_end(paragraph_encoding, question_encoding, paragraph_mask)

        return prediction_start, prediction_end

In [44]:
HIDDEN_SIZE = 128
EMB_SIZE = 300
NLAYERS = 3
PARAGRAPH_EMBEDDING_NUM = 2
DROPOUT = 0.3
# device = torch.device('cpu')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = DocumentReader(HIDDEN_SIZE,
                       EMB_SIZE,
                       NLAYERS,
                       PARAGRAPH_EMBEDDING_NUM,
                       DROPOUT,
                       device).to(device)


In [45]:
optimizer = torch.optim.Adamax(model.parameters())

In [46]:
def train(model, train_dataset):
    '''
    Trains the model.
    '''

    print("Start training ........")

    train_loss = 0.

    # put the model in training mode
    model.train()

    # iterate through training data
    for i, (paragraphs, questions, span_list, answer_list,
            paragraph_mask, question_mask) in enumerate(train_dataset):

        if i % 500 == 0:
            print(f"Starting batch: {i}")


        # place the tensors on GPU
        paragraphs = paragraphs.to(device)
        paragraph_mask = paragraph_mask.to(device)
        questions = questions.to(device)
        question_mask = question_mask.to(device)
        # span_list = span_list.to(device)

        # forward pass, get the predictions
        preds = model(paragraphs, questions, paragraph_mask, question_mask)

        start_pred, end_pred = preds

        # separate labels for start and end position
        start_span, end_span = span_list[:,0], span_list[:,1]

        # calculate loss
        loss = F.cross_entropy(start_pred, start_span) + F.cross_entropy(end_pred, end_span)

        # backward pass, calculates the gradients
        loss.backward()

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 10)

        # update the gradients
        optimizer.step()

        # zero the gradients to prevent them from accumulating
        optimizer.zero_grad()

        train_loss += loss.item()

    return train_loss/len(train_dataset)

In [47]:
train_loss = train(model, trainloader)

Start training ........
Starting batch: 0
tensor([[  243,  1319,     9,  ...,     1,     1,     1],
        [    2, 14318,   557,  ...,     1,     1,     1],
        [    7,     0,  1819,  ...,     1,     1,     1],
        ...,
        [    2,   363,   101,  ...,     1,     1,     1],
        [    0,    72,  3418,  ...,     1,     1,     1],
        [    2,   164,   265,  ...,     1,     1,     1]], device='cuda:0')
torch.Size([32, 301])


TypeError: only integer tensors of a single element can be converted to an index