In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
from evaluate import *

In [18]:
import pandas as pd
import numpy as np
import time
import spacy
import random
from pathlib import Path
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data 
import torchtext
import csv
from itertools import islice
import nltk
nltk.download('punkt')
from nltk.tokenize import wordpunct_tokenize
import regex as re
import pickle
from collections import Counter
from tqdm import tqdm
import itertools
import torch.utils.data as data
from sklearn.model_selection import train_test_split
import math

# Setting device on GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

torch.backends.cudnn.deterministic = True

# Set the random seed manually for reproducibility.
torch.manual_seed(1234)

path = "drive/My Drive/FiQA/"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Using device: cuda

Tesla P100-PCIE-16GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [0]:
def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))
    
def pre_process(doc):
    doc = str(doc)
    x = re.sub('[…“”%!&"@#()\-\*\+,/:;<=>?@[\]\^_`{\}~]', ' ', doc)
    y = re.sub('[\.\']', "", x)
    z = y.lower()
    return z

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def save_pickle(path, data):
    with open(path, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def timer(start_time, end_time):
    """
    Returns the minutes and seconds.
    """

    time = end_time - start_time
    mins = int(time / 60)
    secs = int(time - (mins * 60))

    return mins, secs

def pad_seq(seq, max_seq_len):
    # Pad each seq to be the same length to process in batch.
    # pad_token = 0
    if len(seq) >= max_seq_len:
        seq = seq[:max_seq_len]
    else:
        seq += [0]*(max_seq_len - len(seq))
    return seq

def vectorize(seq, vocab, max_seq_len):
    # Map tokens in seq to idx
    seq_idx = [vocab[token] for token in seq]
    # Pad seq idx
    padded_seq_idx = [pad_seq(seq_idx, max_seq_len)]
    # padded_seq_idx = pad_seq(seq_idx, max_seq_len)

    # return torch.tensor(padded_seq_idx)
    return padded_seq_idx

In [0]:
qid_docid = pd.read_csv(path + "FiQA_train_question_doc_final.tsv", sep="\t")
qid_docid = qid_docid [['qid', 'docid']]

**Load pickle files**

In [0]:
# dict mapping of token to idx
vocab = load_pickle(path + 'vocab_full.pickle')
# dict mapping of docid to doc text
docid_to_text = load_pickle(path + 'label_ans.pickle')
# dict mapping of qid to question text
qid_to_text = load_pickle(path + 'qid_text.pickle')
# dict mapping of qid to relevant docs
qid_rel = load_pickle(path + 'qid_rel.pickle')
# dict mapping of qid to ranked candidates
qid_ranked_docs = load_pickle(path+'qid_ranked_docs_100.pickle')

empty_docs = load_pickle(path+'empty_docs.pickle')

**Example data**

In [102]:
toy_label = dict(itertools.islice(qid_rel.items(), 100))
toy_cand = dict(itertools.islice(qid_ranked_docs.items(), 100))

print(toy_label)

{0: [18850], 1: [14255], 2: [308938], 3: [296717, 100764, 314352, 146317], 4: [196463], 5: [69306], 6: [560251, 188530, 564488], 7: [411063], 8: [566392, 65404], 9: [509122, 184698], 11: [596427], 12: [192516, 338700, 158738], 13: [503678], 14: [398960], 15: [325273], 16: [60590], 17: [146657], 18: [88124], 19: [315086, 142623], 20: [447231], 21: [497642], 23: [550624, 32102], 25: [107584, 562777], 26: [285255, 350819], 27: [537326], 28: [250640], 29: [274832, 114494, 189642, 103662], 30: [551175, 434082, 336922, 19233], 31: [156554], 32: [279480, 69623, 84645], 33: [519798, 425387], 34: [599545], 35: [498681, 80913], 36: [275249, 368649], 37: [523564], 38: [85517, 195207, 357037, 233751], 41: [176229], 42: [272709, 327263, 331981], 43: [76662], 44: [385881], 45: [284610, 261220], 46: [91325], 47: [133299], 48: [108062, 401260, 329810, 512151], 49: [352927], 51: [107817, 257168, 75195], 52: [566417, 125111], 53: [84077, 562798, 102362, 323269, 119308, 184852, 119210, 176196], 54: [5907

In [103]:
# Get train set

neg_ans = {}

train_set = []

cand_size = 50

for qid, pos_ans_lst in tqdm(toy_label.items()):
    num_sample = math.floor(cand_size/len(pos_ans_lst))
    for i, cand_lst in toy_cand.items():
        trimed_cand = [x for x in cand_lst if x not in pos_ans_lst]

    # If there is only 1 rel doc
    if num_sample == cand_size:
        for _ in range(cand_size):
            tmp = []
            tmp.append(qid)
            tmp.append(pos_ans_lst[0])
            neg_doc = random.choice(trimed_cand)
            tmp.append(neg_doc)
            train_set.append(tmp)
    else:
        for _ in range(num_sample):
            for j in range(len(pos_ans_lst)):
                tmp = []
                tmp.append(qid)
                tmp.append(pos_ans_lst[j])
                neg_doc = random.choice(trimed_cand)
                tmp.append(neg_doc)
                train_set.append(tmp)
        for k in range(cand_size % len(pos_ans_lst)):
            tmp = []
            tmp.append(qid)
            tmp.append(pos_ans_lst[k])
            neg_doc = random.choice(trimed_cand)
            tmp.append(neg_doc)
            train_set.append(tmp)

for row in train_set:
    assert len(row) == 3, "Train set len is incorrect!"


  0%|          | 0/100 [00:00<?, ?it/s][A
 28%|██▊       | 28/100 [00:00<00:00, 272.60it/s][A
 55%|█████▌    | 55/100 [00:00<00:00, 270.95it/s][A
 91%|█████████ | 91/100 [00:00<00:00, 290.51it/s][A
100%|██████████| 100/100 [00:00<00:00, 291.79it/s][A

In [104]:
print(len(train_set))

5000


In [105]:
c = 0
for row in train_set:
    if row[0] == 9:
        # print(row)
        c+=1
print(c)

50


In [0]:
# Question ID and Answer ID pair
qid_docid = pd.read_csv(path + "FiQA_train_question_doc_final.tsv", sep="\t")

qid_docid = qid_docid [['qid', 'docid']]

# test = qid_docid[:117]
test = qid_docid[:16]

In [0]:
# test_df = qid_docid
# test_df["neg_docid"] = test_df['qid'].map(neg_ans)
# test_df = test_df.explode('neg_docid')

# test_df.head(5)

In [46]:
test.head(5)

Unnamed: 0,qid,docid
0,0,18850
1,1,14255
2,2,308938
3,3,296717
4,3,100764


In [0]:
train_set = []

for index, row in test.iterrows():
    for qid, neg_doc in neg_ans.items():
        if len(neg_doc) == cand_size:
            if row['qid'] == qid:
                tmp = []
                tmp.append(row['qid'])
                tmp.append(row['docid'])
                tmp.append(neg_doc)
                train_set.append(tmp)
            else:
                tmp = []
                tmp.append(row['qid'])
                tmp.append(row['docid'])
                tmp.append(neg_doc)


# # train_set = load_pickle(path+'train_set_1000.pickle')

# # for idx, sample in enumerate(train_set):
# #     if len(sample[2]) > 1:
# #         sample[2] = random.choice(sample[2])
# #     else:
# #         sample[2] = sample[2][0]

# # count = 0

In [15]:
for row in train_set:
    if row[0] == 3:
        count += 1

count 

NameError: ignored

In [0]:
train_set = [x for x in train_set if x[1] not in empty_docs]

print(len(train_set))

In [0]:
train_set, valid_set = train_test_split(train_set, test_size=0.1)

print("Number of train data: {}".format(len(train_set)))
print("Number of validation data: {}".format(len(valid_set)))

**Model**

In [0]:
emb_dim = 100
vocab_size = len(vocab)
n_epochs = 2
batch_size = 8
hidden_size = 141
max_seq_len = 200


In [17]:
emb = torchtext.vocab.GloVe("6B", dim=emb_dim)
# dictionary mapping of word idx to glove vectors
emb_weights = np.zeros((vocab_size, emb_dim))
words_found = 0
print("Embedding dim: {}".format(emb_weights.shape))

for token, idx in vocab.items():
    # emb.stoi is a dict of token to idx mapping
    if token in emb.stoi:
        emb_weights[idx] = emb[token]
        words_found += 1

print("vocab size: ", vocab_size)
print(words_found, " words are found in GloVe")

# Convert numpy matrix to tensor
emb_weights = torch.from_numpy(emb_weights).float()

emb_weights.shape

.vector_cache/glove.6B.zip: 862MB [06:30, 2.21MB/s]                           
100%|█████████▉| 398600/400000 [00:14<00:00, 28345.76it/s]

Embedding dim: (85034, 100)
vocab size:  85034
50456  words are found in GloVe


torch.Size([85034, 100])

In [0]:
def create_emb_layer(emb_weights):
    vocab_size, emb_dim = emb_weights.shape
    emb_layer = nn.Embedding(vocab_size, emb_dim)
    emb_layer.load_state_dict({'weight': emb_weights})

    return emb_layer

def loss_fn(pos_sim, neg_sim):
    margin = 0.05

    loss = margin - pos_sim + neg_sim
    if loss.data[0] < 0:
        loss.data[0] = 0
    return loss

In [0]:
class QA_LSTM(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size):

        super(QA_LSTM, self).__init__()

        # Shape - (max_seq_len, emb_dim)
        self.embedding = create_emb_layer(emb_weights)

        self.shared_lstm = nn.LSTM(emb_size, hidden_size, num_layers=1, batch_first=True, bidirectional=True)
        self.cos = nn.CosineSimilarity(dim=1)

    def forward(self, q, a):
        # embedding
        q = self.embedding(q) # (bs, L, E)
        a = self.embedding(a) # (bs, L, E)

        # LSTM
        q, (hidden, cell) = self.shared_lstm(q) # (bs, L, 2H)
        a, (hidden, cell) = self.shared_lstm(a) # (bs, L, 2H)

        # Output shape (batch size, seq_len, num_direction * hidden_size)
        # There are n of word level biLSTM representations for the seq where n is the number of seq len
        # Use max pooling to generate the best representation
        q = torch.max(q, 1)[0] 
        a = torch.max(a, 1)[0] # (bs, 2H)

        return self.cos(q, a) # (bs,)

In [0]:
import torch
from torch.utils import data

class Dataset(data.Dataset):
  'Characterizes a dataset for PyTorch'
  def __init__(self, q_lst, pos_ans, neg_ans):
        'Initialization'
        self.q_lst = q_lst
        self.pos_ans_lst = pos_ans
        self.neg_ans_lst = neg_ans

  def __len__(self):
        'Denotes the total number of samples'
        return len(self.q_lst)

  def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        ID = self.q_lst[index]

        # Load data and get label
        q = self.q_lst[index]
        pos_ans = self.pos_ans_lst[index]
        neg_ans = self.neg_ans_lst[index]

        return q, pos_ans, neg_ans

In [0]:
def train(model, train_set, optimizer, batch_size):

    # Cumulated Training loss
    training_loss = 0.0

    q_lst = []
    pos_lst = []
    neg_lst = []

    # Set model to training mode
    model.train()
 
    for i, seq in enumerate(train_set):

        ques, pos_ans, neg_ans = seq[0], seq[1], seq[2]

        q_text = qid_to_text[ques]
        q_vec = vectorize(q_text, vocab, max_seq_len)

        q_lst.append(q_vec)

        pos_ans_text = docid_to_text[pos_ans]
        pos_ans_vec = vectorize(pos_ans_text, vocab, max_seq_len)

        pos_lst.append(pos_ans_vec)

        for docid in neg_ans:
            neg_ans_text = docid_to_text[neg_ans]
            neg_ans_vec = vectorize(neg_ans_text, vocab, max_seq_len)
            neg_lst.append(neg_ans_vec)

    q_lst = torch.tensor(q_lst)
    pos_lst = torch.tensor(pos_lst)
    neg_lst = torch.tensor(neg_lst)

    train_data = Dataset(q_lst, pos_lst, neg_lst)

    train_loader = data.DataLoader(train_data, batch_size=batch_size)

    for ques, pos_ans, neg_ans in tqdm(train_loader):
        # 1. Zero the gradients
        optimizer.zero_grad()

        for q in ques:
            batch_q = q.to(device)

        for p in pos_ans:
            batch_pos = p.to(device)

        for n in neg_ans:
            batch_neg = n.to(device)
            
        # 2. Compute predictions
        pos_sim = model(batch_q, batch_pos)    
        neg_sim = model(batch_q, batch_neg)

        # 3. Compute loss
        loss = loss_fn(pos_sim, neg_sim)

        # 4. Use loss to compute gradients
        loss.backward()

        # 5. Use optimizer to take gradient step
        optimizer.step()
            
        training_loss += loss.item()
            
    return training_loss / len(train_loader)

In [0]:
def validate(model, valid_set, batch_size):

    # Cumulated Training loss
    valid_loss = 0.0

    q_lst = []
    pos_lst = []
    neg_lst = []

    # Set model to evaluation mode
    model.eval()
 
    for i, seq in enumerate(train_set):

        ques, pos_ans, neg_ans = seq[0], seq[1], seq[2]

        q_text = qid_to_text[ques]
        q_vec = vectorize(q_text, vocab, max_seq_len)

        q_lst.append(q_vec)

        pos_ans_text = docid_to_text[pos_ans]
        pos_ans_vec = vectorize(pos_ans_text, vocab, max_seq_len)

        pos_lst.append(pos_ans_vec)

        neg_ans_text = docid_to_text[neg_ans]
        neg_ans_vec = vectorize(neg_ans_text, vocab, max_seq_len)

        neg_lst.append(neg_ans_vec)

    q_lst = torch.tensor(q_lst)
    pos_lst = torch.tensor(pos_lst)
    neg_lst = torch.tensor(neg_lst)

    train_data = Dataset(q_lst, pos_lst, neg_lst)

    train_loader = data.DataLoader(train_data, batch_size=batch_size)

        
    # Don't calculate the gradients
    with torch.no_grad():

        for ques, pos_ans, neg_ans in tqdm(train_loader):

            for q in ques:
                batch_q = q.to(device)

            for p in pos_ans:
                batch_pos = p.to(device)

            for n in neg_ans:
                batch_neg = n.to(device)
                
            pos_sim = model(batch_q, batch_pos)    
            neg_sim = model(batch_q, batch_neg)

            loss = loss_fn(pos_sim, neg_sim)
                
            valid_loss += loss.item()
                
        return valid_loss / len(train_loader)

In [0]:
model = QA_LSTM(vocab_size, emb_dim, hidden_size)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-2)

seq = train_set[0]
ques, pos_ans, neg_ans = seq[0], seq[1], seq[2]

q_text = qid_to_text[ques]
q_vec = torch.tensor(vectorize(q_text, vocab, max_seq_len)).to(device)

pos_ans_text = docid_to_text[pos_ans]
pos_ans_vec = torch.tensor(vectorize(pos_ans_text, vocab, max_seq_len)).to(device)

for docid in neg_ans:
    neg_ans_text = docid_to_text[docid]
    neg_ans_vec = torch.tensor(vectorize(neg_ans_text, vocab, max_seq_len)).to(device)

    pos_sim = model(q_vec, pos_ans_vec)
    neg_sim = model(q_vec, neg_ans_vec)

In [0]:
model = QA_LSTM(vocab_size, emb_dim, hidden_size)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-2)

# Lowest validation lost
best_valid_loss = float('inf')

for epoch in range(n_epochs):

    # Evaluate training loss
    train_loss = train(model, train_set, optimizer, batch_size)
    # Evaluate validation loss
    valid_loss = validate(model, valid_set, batch_size)
    
    # At each epoch, if the validation loss is the best
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # Save the parameters of the model
        torch.save(model.state_dict(), 'model-train100.pt')

    print("\n\n Epoch {}:".format(epoch+1))
    print("\t Train Loss: {}".format(round(train_loss, 3)))
    print("\t Validation Loss: {}\n".format(round(valid_loss, 3)))

In [0]:
print(take(5, toy_label.items()))

# TODO:
# 1. Get cand for test data
# 2. Process test data
# 3. Get test_data

In [0]:
test_set = []

for qid, docid in toy_label.items():
    for k, v in toy_cand.items():
        if k == qid:
            tmp = []
            tmp.append(qid)
            tmp.append(docid)
            tmp.append(v)
            test_set.append(tmp)

test_set = test_set[:10]

# print(test_set)

In [0]:
def eval(model, test_set, qid_rel, max_seq_len, k):
    
    qid_pred_rank = {}

    for i, seq in enumerate(tqdm(test_set)):

        ques, pos_ans, cands = seq[0], seq[1], seq[2]

        q_text = qid_to_text[ques]
        q_vec = torch.tensor(vectorize(q_text, vocab, max_seq_len)).to(device)

        cands_text = [docid_to_text[c] if c is not 0 else "" for c in cands]

        scores = []

        cands_id = np.array(cands)

        for cand in cands_text:
            a_vec = torch.tensor(vectorize(cand, vocab, max_seq_len)).to(device)
            scores.append(model(q_vec, a_vec).item())

        sorted_index = np.argsort(scores)[::-1]

        ranked_ans = cands_id[sorted_index]

        qid_pred_rank[ques] = ranked_ans

    # return qid_pred_rank
    MRR, average_ndcg, precision = evaluate(qid_pred_rank, qid_rel, k)

    return MRR, average_ndcg, precision

**Test**

In [0]:
toy_test_label = dict(itertools.islice(qid_rel.items(), 10))

print(toy_test_label)

In [0]:
# Load the model with the best validation loss
model.load_state_dict(torch.load('model-train100.pt'))

rank = eval(model, test_set, toy_test_label, max_seq_len, k=10)

In [0]:
print(toy_test_label[0])

In [0]:
# Load the model with the best validation loss
model.load_state_dict(torch.load('model-train100.pt'))

k = 10

MRR, average_ndcg, precision = eval(model, test_set, toy_test_label, max_seq_len, k=10)

num_q = len(test_set)

print("\n\nAverage nDCG@{} for {} queries: {}\n".format(k, num_q, average_ndcg))

print("MRR@{} for {} queries: {}\n".format(k, num_q, MRR))

print("Average Precision@{}: {}".format(1, precision))