In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
from evaluate import *

In [3]:
import pandas as pd
import numpy as np
import time
import spacy
import random
from pathlib import Path
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data 
import torchtext
from nltk.tokenize.treebank import TreebankWordDetokenizer
import csv
from itertools import islice
import nltk
nltk.download('punkt')

from nltk import word_tokenize,sent_tokenize
from nltk.tokenize import wordpunct_tokenize
import regex as re
import pickle
from collections import Counter
from tqdm import tqdm
import itertools

# Setting device on GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

torch.backends.cudnn.deterministic = True

# Set the random seed manually for reproducibility.
torch.manual_seed(1234)

path = "drive/My Drive/FiQA/"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Using device: cuda

Tesla T4
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [0]:
def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))
    
def pre_process(doc):
    doc = str(doc)
    x = re.sub('[…“”%!&"@#()\-\*\+,/:;<=>?@[\]\^_`{\}~]', ' ', doc)
    y = re.sub('[\.\']', "", x)
    z = y.lower()
    return z

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def save_pickle(path, data):
    with open(path, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def pad_seq(seq, max_seq_len):
    # Pad each seq to be the same length to process in batch.
    # pad_token = 0
    if len(seq) >= max_seq_len:
        seq = seq[:max_seq_len]
    else:
        seq += [0]*(max_seq_len - len(seq))
    return seq

def vectorize(seq, vocab, max_seq_len):
    # Map tokens in seq to idx
    seq_idx = [vocab[token] for token in seq]
    # Pad seq idx
    padded_seq_idx = [pad_seq(seq_idx, max_seq_len)]

    return torch.tensor(padded_seq_idx)

**Load pickle files**

In [0]:
# dict mapping of token to idx
vocab = load_pickle(path + 'vocab_full.pickle')
# dict mapping of docid to doc text
docid_to_text = load_pickle(path + 'label_ans.pickle')
# dict mapping of qid to question text
qid_to_text = load_pickle(path + 'qid_text.pickle')
# dict mapping of qid to relevant docs
qid_rel = load_pickle(path + 'qid_rel.pickle')
# dict mapping of qid to ranked candidates
qid_ranked_docs = load_pickle(path+'qid_ranked_docs_100.pickle')

**Example data**

In [19]:
toy_label = dict(itertools.islice(qid_rel.items(), 10))
toy_cand = dict(itertools.islice(qid_ranked_docs.items(), 10))

print(toy_label)

{0: [18850], 1: [14255], 2: [308938], 3: [296717, 100764, 314352, 146317], 4: [196463], 5: [69306], 6: [560251, 188530, 564488], 7: [411063], 8: [566392, 65404], 9: [509122, 184698]}


In [20]:
neg_ans = {}

for qid, pos_ans_lst in tqdm(toy_label.items()):
    for i, cand_lst in toy_cand.items():
        trimed_cand = [x for x in cand_lst if x not in pos_ans_lst]
    neg_ans_lst = random.sample(trimed_cand, len(pos_ans_lst))
    neg_ans[qid] = neg_ans_lst

# neg_ans

100%|██████████| 10/10 [00:00<00:00, 1487.55it/s]


In [0]:
# test = qid_docid[:177]
test = qid_docid[:16]

In [23]:
train_set = []

for index, row in test.iterrows():
    for k, v in neg_ans.items():
        if k == row['qid']:
            tmp = []
            tmp.append(row['qid'])
            tmp.append(row['docid'])
            tmp.append(v)
            train_set.append(tmp)

train_set

[[0, 18850, [25439]],
 [1, 14255, [562685]],
 [2, 308938, [55500]],
 [3, 296717, [547941, 356743, 35810, 250498]],
 [3, 100764, [547941, 356743, 35810, 250498]],
 [3, 314352, [547941, 356743, 35810, 250498]],
 [3, 146317, [547941, 356743, 35810, 250498]],
 [4, 196463, [492371]],
 [5, 69306, [562685]],
 [6, 560251, [50992, 465680, 547941]],
 [6, 188530, [50992, 465680, 547941]],
 [6, 564488, [50992, 465680, 547941]],
 [7, 411063, [588253]],
 [8, 566392, [100655, 244484]],
 [8, 65404, [100655, 244484]],
 [9, 509122, [466718, 35810]]]

**Model**

In [0]:
emb_dim = 100
vocab_size = len(vocab)
n_epochs = 2
batch_size = 256
hidden_size = 141
max_seq_len = 200

In [0]:
emb = torchtext.vocab.GloVe("6B", dim=emb_dim)

In [26]:
# dictionary mapping of word idx to glove vectors
emb_weights = np.zeros((vocab_size, emb_dim))
words_found = 0
print("Embedding dim: {}".format(emb_weights.shape))

for token, idx in vocab.items():
    # emb.stoi is a dict of token to idx mapping
    if token in emb.stoi:
        emb_weights[idx] = emb[token]
        words_found += 1

print("vocab size: ", vocab_size)
print(words_found, " words are found in GloVe")

# Convert numpy matrix to tensor
emb_weights = torch.from_numpy(emb_weights).float()

emb_weights.shape

Embedding dim: (85034, 100)
vocab size:  85034
50456  words are found in GloVe


torch.Size([85034, 100])

In [0]:
def create_emb_layer(emb_weights):
    vocab_size, emb_dim = emb_weights.shape
    emb_layer = nn.Embedding(vocab_size, emb_dim)
    emb_layer.load_state_dict({'weight': emb_weights})

    return emb_layer

def loss_fn(pos_sim, neg_sim):
    margin = 0.2

    loss = margin - pos_sim + neg_sim
    if loss.data[0] < 0:
        loss.data[0] = 0
    return loss

In [0]:
class QA_LSTM(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size):

        super(QA_LSTM, self).__init__()

        # Shape - (max_seq_len, emb_dim)
        self.embedding = create_emb_layer(emb_weights)

        self.shared_lstm = nn.LSTM(emb_size, hidden_size, num_layers=1, batch_first=True, bidirectional=True)
        self.cos = nn.CosineSimilarity(dim=1)

    def forward(self, q, a):
        # embedding
        q = self.embedding(q) # (bs, L, E)
        a = self.embedding(a) # (bs, L, E)

        # LSTM
        q, (hidden, cell) = self.shared_lstm(q) # (bs, L, 2H)
        a, (hidden, cell) = self.shared_lstm(a) # (bs, L, 2H)

        # Output shape (batch size, seq_len, num_direction * hidden_size)
        # There are n of word level biLSTM representations for the seq where n is the number of seq len
        # Use max pooling to generate the best representation
        q = torch.max(q, 1)[0] 
        a = torch.max(a, 1)[0] # (bs, 2H)

        return self.cos(q, a) # (bs,)

In [0]:
def train(model, train_set, optimizer, n_epochs=2, batch_size=256):

    for epoch in range(n_epochs):

        model.train()
        print('epoch', epoch)
        
        losses = []

        for i, seq in enumerate(tqdm(train_set)):

            ques, pos_ans, neg_ans = seq[0], seq[1], seq[2]

            q_text = qid_to_text[ques]
            q_vec = vectorize(q_text, vocab, max_seq_len).to(device)

            pos_ans_text = docid_to_text[pos_ans]
            pos_ans_vec = vectorize(pos_ans_text, vocab, max_seq_len).to(device)

            # sim score of pos q and a
            pos_sim = model(q_vec, pos_ans_vec)

            for docid in neg_ans:
                neg_ans_text = docid_to_text[docid]
                neg_ans_vec = vectorize(neg_ans_text, vocab, max_seq_len).to(device)
                
                neg_sim = model(q_vec, neg_ans_vec)

                loss = loss_fn(pos_sim, neg_sim)
                if loss.item() != 0:
                    losses.append(loss)
                    break

            if len(losses) == batch_size or i == len(train_set) - 1:
                loss = torch.mean(torch.stack(losses, 0).squeeze(), 0)
                # print(loss.data)
                print("\nepoch: {}, iteration: {}, loss: {}".format(epoch, i, loss.item()))
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                losses = []

        # filename = '{}/Epoch-{}.model'.format('.', epoch)
        # save_checkpoint({
        #     'epoch': epoch + 1,
        #     'state_dict': model.state_dict(),
        #     'optimizer' : optimizer.state_dict(),
        # }, filename=filename)
        # test(model, test_data)

In [31]:
model = QA_LSTM(vocab_size, emb_dim, hidden_size)

model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=2e-2)

train(model, train_set, optimizer)

 31%|███▏      | 5/16 [00:00<00:00, 43.84it/s]

epoch 0


 94%|█████████▍| 15/16 [00:00<00:00, 43.05it/s]

epoch: 0, iteration: 15, loss: 0.19582203030586243


100%|██████████| 16/16 [00:00<00:00, 21.94it/s]
 31%|███▏      | 5/16 [00:00<00:00, 45.53it/s]

epoch 1


 94%|█████████▍| 15/16 [00:00<00:00, 45.67it/s]

epoch: 1, iteration: 15, loss: 0.09310953319072723


100%|██████████| 16/16 [00:00<00:00, 22.48it/s]


In [32]:
test_set = []

for qid, docid in toy_label.items():
    for k, v in toy_cand.items():
        if k == qid:
            tmp = []
            tmp.append(qid)
            tmp.append(docid)
            tmp.append(v)
            test_set.append(tmp)

print(test_set)

[[0, [18850], [531578, 417981, 324911, 524879, 397608, 216077, 173212, 104464, 326261, 434846, 528838, 234436, 571062, 481692, 207449, 338700, 196374, 153377, 327002, 421301, 11538, 375748, 406418, 238271, 322893, 130631, 73427, 560087, 483385, 156554, 531442, 541809, 192843, 553328, 562777, 209224, 351672, 324513, 18850, 283505, 55200, 367754, 297841, 455984, 540395, 160340, 577284, 565935, 354716, 552845, 287474, 179144, 292748, 310612, 194308, 76618, 100764, 534997, 392484, 155490, 83059, 11132, 557186, 348787, 136071, 192516, 234743, 391619, 468741, 12729, 219313, 365558, 396056, 462831, 146657, 178942, 79411, 292919, 309909, 447231, 400230, 540325, 74688, 354511, 245447, 79397, 120500, 237207, 32072, 588509, 308472, 258155, 388042, 18934, 358631, 381151, 145148, 594531, 81599, 195207]], [1, [14255], [231279, 470066, 392484, 14255, 31117, 146657, 257168, 156063, 354716, 365456, 100280, 183612, 208989, 349672, 156444, 81599, 141738, 560776, 216783, 528838, 216077, 47260, 18850, 4452

In [0]:
def test(model, test_set, qid_rel, k):
    max_sent_len = 200
    qid_pred_rank = {}

    for i, seq in enumerate(tqdm(test_set)):

        ques, pos_ans, cands = seq[0], [seq[1]], seq[2]

        q_text = qid_to_text[ques]
        q_vec = vectorize(q_text, vocab, max_seq_len).to(device)

        cands_text = [docid_to_text[c] if c is not 0 else "" for c in cands]

        scores = []

        cands_id = np.array(cands)

        for cand in cands_text:
            a_vec = vectorize(cand, vocab, max_seq_len).to(device)
            scores.append(model(q_vec, a_vec).item())

        sorted_index = np.argsort(scores)[::-1]

        ranked_ans = cands_id[sorted_index]

        qid_pred_rank[ques] = ranked_ans

    MRR, average_ndcg, precision = evaluate(qid_ranked_docs, qid_rel, k)

    return MRR, average_ndcg, precision

In [35]:
MRR, average_ndcg, precision = test(model, test_set, qid_rel, k=10)

100%|██████████| 10/10 [00:11<00:00,  1.14s/it]


In [36]:
num_q = len(qid_rel)
k = 10

print("Average nDCG@{} for {} queries: {}\n".format(k, num_q, average_ndcg))

print("MRR@{} for {} queries: {}\n".format(k, num_q, MRR))

print("Average Precision@{}: {}".format(1, precision))

Average nDCG@10 for 6648 queries: 0.36504981770986045

MRR@10 for 6648 queries: 0.3104752234828942

Average Precision@1: 0.23901925391095066
