In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
from evaluate import *

In [3]:
import pandas as pd
import numpy as np
import time
import spacy
import random
from pathlib import Path
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data 
import torchtext
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import csv
from itertools import islice
import nltk
nltk.download('punkt')
from nltk.tokenize import wordpunct_tokenize
import regex as re
import pickle
from collections import Counter
from tqdm import tqdm
import itertools
import torch.utils.data as data
from sklearn.model_selection import train_test_split
import math
from datetime import datetime

# Setting device on GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

torch.backends.cudnn.deterministic = True

# Set the random seed manually for reproducibility.
torch.manual_seed(1234)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Using device: cuda

Tesla P100-PCIE-16GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


<torch._C.Generator at 0x7f84404b7df0>

In [0]:
path = "drive/My Drive/fiqa/"

In [0]:
def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))
    
def pre_process(doc):
    doc = str(doc)
    x = re.sub('[…“”%!&"@#()\-\*\+,/:;<=>?@[\]\^_`{\}~]', ' ', doc)
    y = re.sub('[\.\']', "", x)
    z = y.lower()
    return z

def remove_empty(test_set):
    for index, row in enumerate(test_set):
        for doc in row[1]:
            if doc in empty_docs:
                del test_set[index]
    return test_set

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def save_pickle(path, data):
    with open(path, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def pad_seq(seq, max_seq_len):
    # Pad each seq to be the same length to process in batch.
    # pad_token = 0
    if len(seq) >= max_seq_len:
        seq = seq[:max_seq_len]
    else:
        seq += [0]*(max_seq_len - len(seq))
    return seq

def vectorize(seq, vocab, max_seq_len):
    # Map tokens in seq to idx
    seq_idx = [vocab[token] for token in seq]
    # Pad seq idx
    # padded_seq_idx = [pad_seq(seq_idx, max_seq_len)]
    padded_seq_idx = pad_seq(seq_idx, max_seq_len)

    # return torch.tensor(padded_seq_idx)
    return padded_seq_idx

**Load pickle files**

In [0]:
# dict mapping of token to idx
vocab = load_pickle(path + 'data/word2index.pickle')
# dict mapping of docid to doc text
docid_to_tokenized_text = load_pickle(path + 'data/docid_to_tokenized_text.pickle')

# dict mapping of qid to question text
qid_to_tokenized_text = load_pickle(path + 'data/qid_to_tokenized_text.pickle')

test_qid_rel = load_pickle(path + "data/qid_rel_test.pickle")

train_set = load_pickle(path + 'data/train_set_50.pickle')
valid_set = load_pickle(path + 'data/valid_set_50.pickle')
test_set = load_pickle(path + 'data/test_set_50.pickle')
test_set_500 = load_pickle(path + 'data/test_set_500.pickle')

In [7]:
print("Vocabulary size: {}".format(len(vocab)))

Vocabulary size: 85034


In [8]:
print("Number of training samples: {}".format(len(train_set)))
print("Number of validation samples: {}".format(len(valid_set)))
print("Number of test samples: {}".format(len(test_set)))

Number of training samples: 5676
Number of validation samples: 631
Number of test samples: 333


**Model**

In [0]:
emb_dim = 100
vocab_size = len(vocab)
# n_epochs = 20
# batch_size = 64
hidden_size = 256
# max_seq_len = 128
dropout = 0.2

In [10]:
emb = torchtext.vocab.GloVe("6B", dim=emb_dim)
# dictionary mapping of word idx to glove vectors
emb_weights = np.zeros((vocab_size, emb_dim))
words_found = 0
print("Embedding dim: {}".format(emb_weights.shape))

for token, idx in vocab.items():
    # emb.stoi is a dict of token to idx mapping
    if token in emb.stoi:
        emb_weights[idx] = emb[token]
        words_found += 1

print("vocab size: ", vocab_size)
print(words_found, " words are found in GloVe")

# Convert numpy matrix to tensor
emb_weights = torch.from_numpy(emb_weights).float()

emb_weights.shape

Embedding dim: (85034, 100)
vocab size:  85034
50456  words are found in GloVe


torch.Size([85034, 100])

In [0]:
def create_emb_layer(emb_weights):
    vocab_size, emb_dim = emb_weights.shape
    emb_layer = nn.Embedding(vocab_size, emb_dim)
    emb_layer.load_state_dict({'weight': emb_weights})

    return emb_layer

def hinge_loss(pos_sim, neg_sim):
    margin = 0.2

    loss = torch.max(torch.tensor(0, dtype=torch.float).to(device), margin - pos_sim + neg_sim)

    return loss

In [0]:
class QA_LSTM(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, dropout):

        super(QA_LSTM, self).__init__()

        # Shape - (max_seq_len, emb_dim)
        self.embedding = create_emb_layer(emb_weights)

        self.shared_lstm = nn.LSTM(emb_size, hidden_size, num_layers=1, batch_first=True, bidirectional=True)
        self.cos = nn.CosineSimilarity(dim=1)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, q, a):
        # embedding
        q = self.embedding(q) # (bs, L, E)
        a = self.embedding(a) # (bs, L, E)

        # LSTM
        q, (hidden, cell) = self.shared_lstm(q) # (bs, L, 2H)
        a, (hidden, cell) = self.shared_lstm(a) # (bs, L, 2H)

        # Output shape (batch size, seq_len, num_direction * hidden_size)
        # There are n of word level biLSTM representations for the seq where n is the number of seq len
        # Use max pooling to generate the best representation
        q = torch.max(q, 1)[0]  
        a = torch.max(a, 1)[0] # (bs, 2H)

        q = self.dropout(q)
        a = self.dropout(a)

        return self.cos(q, a) # (bs,)

In [0]:
def get_lstm_input_data(dataset, max_seq_len):
    q_input_ids = []
    pos_input_ids = []
    neg_input_ids = []

    for i, seq in enumerate(tqdm(dataset)):
        qid, ans_labels, cands = seq[0], seq[1], seq[2]

        filtered_cands = list(set(cands)-set(ans_labels))

        pos_docid = random.choice(ans_labels)

        # Map question id to text
        q_text = qid_to_tokenized_text[qid]
        q_input_id = vectorize(q_text, vocab, max_seq_len)

        for neg_docid in filtered_cands:

            # Map the docid to text
            pos_ans_text = docid_to_tokenized_text[pos_docid]
            neg_ans_text = docid_to_tokenized_text[neg_docid]

            pos_input_id = vectorize(pos_ans_text, vocab, max_seq_len)
            neg_input_id = vectorize(neg_ans_text, vocab, max_seq_len)

            q_input_ids.append(q_input_id)
            pos_input_ids.append(pos_input_id)
            neg_input_ids.append(neg_input_id)

    return q_input_ids, pos_input_ids, neg_input_ids

In [14]:
train_q_input, train_pos_input, train_neg_input = get_lstm_input_data(train_set, 256)
valid_q_input, valid_pos_input, valid_neg_input = get_lstm_input_data(valid_set, 256)

100%|██████████| 5676/5676 [00:15<00:00, 360.06it/s]
100%|██████████| 631/631 [00:01<00:00, 449.25it/s]


In [15]:
print(len(train_q_input))
print(len(valid_q_input))

277827
30874


In [0]:
# valid_q_input = valid_q_input[50:55]
# valid_pos_input = valid_pos_input[50:55]
# valid_neg_input = valid_neg_input[50:55]

In [0]:
train_q_inputs = torch.tensor(train_q_input)
train_pos_inputs = torch.tensor(train_pos_input)
train_neg_inputs = torch.tensor(train_neg_input)

valid_q_inputs = torch.tensor(valid_q_input)
valid_pos_inputs = torch.tensor(valid_pos_input)
valid_neg_inputs = torch.tensor(valid_neg_input)

In [0]:
batch_size = 64

# Create the DataLoader for our training set.
train_data = TensorDataset(train_q_inputs, train_pos_inputs, train_neg_inputs)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(valid_q_inputs, valid_pos_inputs, valid_neg_inputs)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [0]:
def train(model, train_dataloader, optimizer):

    # Cumulated Training loss
    train_loss = 0.0

    # Set model to training mode
    model.train()
 
    # For each batch of training data...
    for step, batch in enumerate(tqdm(train_dataloader)):

        # batch contains eight PyTorch tensors:
        question = batch[0].to(device)
        pos_ans = batch[1].to(device)
        neg_ans = batch[2].to(device)

        # 1. Zero gradients
        model.zero_grad()
            
        # 2. Compute predictions
        pos_sim = model(question, pos_ans)    
        neg_sim = model(question, neg_ans)

        # 3. Compute loss
        loss = hinge_loss(pos_sim, neg_sim).mean()

        # 4. Use loss to compute gradients
        loss.backward()

        # 5. Use optimizer to take gradient step
        optimizer.step()
            
        train_loss += loss.item()

    avg_loss = train_loss/len(train_dataloader)
            
    return avg_loss

In [0]:
def validate(model, validation_dataloader):

    # Cumulated Training loss
    valid_loss = 0.0

    # Set model to evaluation mode
    model.eval()

    # Evaluate data for one epoch
    for batch in tqdm(validation_dataloader):
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        question, pos_ans, neg_ans = batch

        # Don't calculate the gradients
        with torch.no_grad():
                
            pos_sim = model(question, pos_ans)    
            neg_sim = model(question, neg_ans)

            loss = hinge_loss(pos_sim, neg_sim).mean()
                
            valid_loss += loss.item()
        
    avg_loss = valid_loss/len(validation_dataloader)
                
    return avg_loss

In [21]:
model = QA_LSTM(vocab_size, emb_dim, hidden_size, dropout)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

n_epochs = 4

# Lowest validation lost
best_valid_loss = float('inf')

for epoch in range(n_epochs):

    # Evaluate training loss
    train_loss = train(model, train_dataloader, optimizer)
    # Evaluate validation loss
    valid_loss = validate(model, validation_dataloader)
    
    # At each epoch, if the validation loss is the best
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # Save the parameters of the model
        torch.save(model.state_dict(), path + 'model/'+str(epoch+1)+'_lstm50_256_64_1e3.pt')

    print("\n\n Epoch {}:".format(epoch+1))
    print("\t Train Loss: {}".format(round(train_loss, 3)))
    print("\t Validation Loss: {}\n".format(round(valid_loss, 3)))

100%|██████████| 4342/4342 [11:51<00:00,  6.10it/s]
100%|██████████| 483/483 [00:31<00:00, 15.21it/s]
  0%|          | 1/4342 [00:00<12:26,  5.82it/s]



 Epoch 1:
	 Train Loss: 0.103
	 Validation Loss: 0.276



100%|██████████| 4342/4342 [11:53<00:00,  6.08it/s]
100%|██████████| 483/483 [00:31<00:00, 15.15it/s]
  0%|          | 1/4342 [00:00<13:17,  5.44it/s]



 Epoch 2:
	 Train Loss: 0.037
	 Validation Loss: 0.284



100%|██████████| 4342/4342 [11:52<00:00,  6.09it/s]
100%|██████████| 483/483 [00:31<00:00, 15.39it/s]
  0%|          | 1/4342 [00:00<13:08,  5.51it/s]



 Epoch 3:
	 Train Loss: 0.018
	 Validation Loss: 0.262



100%|██████████| 4342/4342 [11:49<00:00,  6.12it/s]
100%|██████████| 483/483 [00:31<00:00, 15.35it/s]



 Epoch 4:
	 Train Loss: 0.012
	 Validation Loss: 0.263






In [0]:
def get_lstm_rank(model, test_set, qid_rel, max_seq_len):
    
    qid_pred_rank = {}

    model.eval()

    for i, seq in enumerate(tqdm(test_set)):

        ques, pos_ans, cands = seq[0], seq[1], seq[2]

        q_text = qid_to_tokenized_text[ques]
        q_vec = torch.tensor([vectorize(q_text, vocab, max_seq_len)]).to(device)

        cands_text = [docid_to_tokenized_text[c] for c in cands]

        scores = []

        cands_id = np.array(cands)

        for cand in cands_text:
            a_vec = torch.tensor([vectorize(cand, vocab, max_seq_len)]).to(device)
            scores.append(model(q_vec, a_vec).item())

        # Get the indices of the sorted similarity scores
        sorted_index = np.argsort(scores)[::-1]

        # Get the docid from the sorted indices
        ranked_ans = cands_id[sorted_index]

        # Dict - key: qid, value: ranked list of docids
        qid_pred_rank[ques] = ranked_ans

    return qid_pred_rank

**Test**

In [0]:
toy_test_label = dict(itertools.islice(test_qid_rel.items(), 2))
toy_test = test_set[:2]

In [26]:
# Load the model with the best validation loss
model.load_state_dict(torch.load(path+'model/3_lstm50_256_64_1e3.pt'))

qid_pred_rank = get_lstm_rank(model, test_set, test_qid_rel, max_seq_len=128)
# qid_pred_rank = get_lstm_rank(model, toy_test, toy_test_label, max_seq_len=128)

100%|██████████| 333/333 [01:57<00:00,  2.83it/s]


In [27]:
k = 10

num_q = len(test_set)

MRR, average_ndcg, precision, rank_pos = evaluate(qid_pred_rank, test_qid_rel, k)
# MRR, average_ndcg, precision, rank_pos = evaluate(qid_pred_rank, toy_test_label, k)

print("\n\nAverage nDCG@{} for {} queries: {}\n".format(k, num_q, average_ndcg))

print("MRR@{} for {} queries: {}\n".format(k, num_q, MRR))

print("Average Precision@{}: {}".format(1, precision))



Average nDCG@10 for 333 queries: 0.11789992950838662

MRR@10 for 333 queries: 0.07625840125840123

Average Precision@1: 0.036036036036036036
