In [1]:
# Mount Google Drive at /content/drive so the notebook can read and write project files there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pickle
import random
from collections import Counter
from tqdm import tqdm
import itertools
import pandas as pd
from itertools import islice
import numpy as np
import random
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.functional import softmax

# Setting device on GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    # torch.cuda.memory_cached is deprecated in favour of memory_reserved;
    # fall back to the old name only when the new one is not available.
    _cuda_reserved = getattr(torch.cuda, 'memory_reserved', None) or torch.cuda.memory_cached
    print('Cached:   ', round(_cuda_reserved(0)/1024**3,1), 'GB')

# Ask cuDNN for deterministic kernels and disable its auto-tuner, which may
# otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Set the random seeds manually for reproducibility.
# The data-preparation code samples with Python's `random` module, so seeding
# torch alone is not enough; numpy is seeded as well for completeness.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

Using device: cuda

Tesla P100-PCIE-16GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


<torch._C.Generator at 0x7fafa8b835d0>

In [3]:
!pip install transformers
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup



In [0]:
# Project root directory on the mounted Drive (relative path; the trailing slash lets file names be appended directly).
path = "drive/My Drive/fiqa/"

In [0]:
from evaluate import *
from utils import *

In [0]:
from transformers import AutoModelWithLMHead, AutoTokenizer, pipeline

# Load the tokenizer saved under model/lm_model.
# NOTE(review): a later cell rebinds `tokenizer` to the 'bert-base-uncased' BertTokenizer,
# so whichever of the two cells ran last decides which tokenizer the data-preparation functions use.
# model = AutoModelWithLMHead.from_pretrained(path+"model/lm_model")
tokenizer = AutoTokenizer.from_pretrained(path+"model/lm_model")

# Optional sanity check of the language model with a fill-mask pipeline (disabled).
# fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)
# fill_mask("In both cases, [MASK] arrangements are designed to make a large profit for the owner.")

In [0]:
def load_cands(cands_path, num_cands=50):
    """
    Load the ranked candidate documents of every query from a tab-separated file.

    Each non-blank line must start with three tab-separated integers:
    query id, document id and 1-based rank.

    Returns qid_ranked_docs: dictionary - key: qid, value: list of num_cands
    docids ordered by rank; slots for which the file has no row keep the
    placeholder 0.
    ----------
    cands_path: path of the candidate file
    num_cands: number of candidate slots kept per query (default 50); rows
               whose rank is outside 1..num_cands are ignored
    """

    qid_ranked_docs = {}

    with open(cands_path,'r') as f:
        for line in f:

            line = line.strip()
            # Blank lines (e.g. a trailing newline) carry no candidate
            if not line:
                continue

            fields = line.split('\t')
            qid = int(fields[0])
            doc_id = int(fields[1])
            rank = int(fields[2])

            # Ranks beyond the requested depth would overflow the list and
            # rank 0 would silently overwrite the last slot, so skip both.
            if not 1 <= rank <= num_cands:
                continue

            if qid not in qid_ranked_docs:
                # Create a list for each query to store the candidates
                qid_ranked_docs[qid] = [0]*num_cands
            qid_ranked_docs[qid][rank-1] = doc_id

    return qid_ranked_docs

In [0]:
def create_dataset(qid_rel, cands, num_cands=50):
    """
    Join relevance labels with ranked candidates into a list of dataset rows.

    A query is kept only if it has an entry in cands and that candidate list
    contains no placeholder 0 (i.e. every slot was filled).

    Returns test_set: list of lists [qid, value stored in qid_rel for that
    qid, candidate list], in the iteration order of qid_rel
    ----------
    qid_rel: dictionary - key: qid, value: positive docid(s)
    cands: dictionary - key: qid, value: list of candidate docids
    num_cands: expected number of candidates per row (default 50)
    """
    test_set = []

    for qid, docid in qid_rel.items():
        # Direct dictionary lookup instead of scanning every candidate entry
        cand_ans = cands.get(qid)

        # Skip queries without candidates or with unfilled candidate slots
        if cand_ans is None or 0 in cand_ans:
            continue

        test_set.append([qid, docid, cand_ans])

    for row in test_set:
        assert len(row[2]) == num_cands, "Dataset size is incorrect!"

    return test_set

In [0]:
# Relevance labels of the train, test and validation splits.
# Dictionary - key: qid, value: list of positive docid
# NOTE(review): load_pickle is not defined in this notebook; presumably it comes from `from utils import *` — confirm.
train_qid_rel = load_pickle(path + "data/qid_rel_train.pickle")
test_qid_rel = load_pickle(path + "data/qid_rel_test.pickle")
valid_qid_rel = load_pickle(path + "data/qid_rel_valid.pickle")

# List of lists:
# Each element is a list containing [qid, positive docid, negative docid]
# train_set = load_pickle(path + 'new-data/data_75/train_set_75.pickle')
# valid_set = load_pickle(path + 'new-data/data_75/valid_set_75.pickle')
# train_set = load_pickle(path + 'new-data/data_50/train_set_50.pickle')
# valid_set = load_pickle(path + 'new-data/data_50/valid_set_50.pickle')
# train_set = load_pickle(path + 'new-data/data_25/train_set_25.pickle')
# valid_set = load_pickle(path + 'new-data/data_25/valid_set_25.pickle')
# train_set = load_pickle(path + 'new-data/data_10/train_set_10.pickle')
# valid_set = load_pickle(path + 'new-data/data_10/valid_set_10.pickle')
# train_set = load_pickle(path + 'new-data/train_set.pickle')
# valid_set = load_pickle(path + 'new-data/valid_set.pickle')

# List of lists:
# Each element is a list containing [qid, list of pos docid, list of candidate docid]
# Contains candidates with all pos docids
# test_set = load_pickle(path + 'data/data_50/test_set_50.pickle')
# Contains candidates retrieved by BM25
# May be missing pos docids in candidates
# test_set_full = load_pickle(path + 'new-data/data_50/test_set_full_50.pickle')

# Dictionary mapping docid and qid to raw text
# (both are read as globals by get_pairwise_input_data and get_input_data)
docid_to_text = load_pickle(path + 'data/docid_to_text.pickle')
qid_to_text = load_pickle(path + 'data/qid_to_text.pickle')

# train_cands = load_cands(path + 'data/cands_train_100.tsv')
# valid_cands = load_cands(path + 'data/cands_valid_100.tsv')
# test_cands = load_cands(path + 'data/cands_test_100.tsv')

# train_cands = load_cands(path + 'data/cands_train_50.tsv')
# valid_cands = load_cands(path + 'data/cands_valid_50.tsv')
# test_cands = load_cands(path + 'data/cands_test_50.tsv')

# train_set = load_pickle(path + 'data/train_set.pickle')
# valid_set = load_pickle(path + 'data/valid_set.pickle')
# train_set = create_dataset(train_qid_rel, train_cands)
# valid_set = create_dataset(valid_qid_rel, valid_cands)
# test_set = create_dataset(test_qid_rel, test_cands)


# Load the pre-built train/validation/test sets from pickles.
# Each element is [qid, list of pos docid, list of candidate docid], which is
# the layout the data-preparation functions in this notebook unpack.
train_set = load_pickle(path + 'data/train_set_50.pickle')
valid_set = load_pickle(path + 'data/valid_set_50.pickle')
test_set = load_pickle(path + 'data/test_set_50.pickle')
# Alternative: rebuild the sets from the relevance labels and candidate files (disabled).
# train_set = create_dataset(train_qid_rel, train_cands)
# valid_set = create_dataset(valid_qid_rel, valid_cands)

In [0]:
# Persist the datasets to Drive.
# NOTE(review): train_set was loaded from this same file in the previous cell, so running
# this cell as-is only rewrites the pickle with the data just read from it.
# save_pickle(path+'data/test_set_50.pickle', test_set)
# save_pickle(path+'data/valid_set_50.pickle', valid_set)
save_pickle(path+'data/train_set_50.pickle', train_set)

In [8]:
# Report the number of entries in each split
for message, split_rows in (
    ("Number of training samples: {}", train_set),
    ("Number of validation samples: {}", valid_set),
    ("Number of test samples: {}", test_set),
):
    print(message.format(len(split_rows)))

Number of training samples: 5676
Number of validation samples: 631
Number of test samples: 333


In [12]:
# Load the BERT tokenizer.
# NOTE(review): this rebinds `tokenizer`, replacing the one loaded earlier from model/lm_model;
# the data-preparation functions use whichever tokenizer was bound last.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

Loading BERT tokenizer...


HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [0]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """
    Fraction of rows whose highest-scoring column equals the label.

    Returns the accuracy as a number between 0 and 1
    ----------
    preds: 2-D array of scores, one row per example and one column per class
    labels: numpy array with one class index per example
    """
    # Index of the best-scoring class for every row
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()

    num_correct = np.sum(predicted_classes == true_classes)
    return num_correct / len(true_classes)

In [19]:
# Example of a training set entry: [qid, list of pos docid, list of candidate docid]
# (the candidate list can contain the positive docid itself, as in the entry printed below)
print(train_set[0])

[0, [18850], [531578, 417981, 324911, 524879, 397608, 216077, 173212, 434846, 104464, 326261, 528838, 234436, 571062, 196374, 481692, 207449, 338700, 153377, 406418, 327002, 421301, 11538, 375748, 238271, 322893, 130631, 483385, 73427, 560087, 531442, 156554, 541809, 562777, 192843, 553328, 283505, 209224, 351672, 324513, 18850, 55200, 540395, 297841, 367754, 455984, 160340, 577284, 287474, 565935, 354716]]


## **Pairwise**

In [0]:
def _encode_qa_pair(q_text, ans_text, max_seq_len):
    # Tokenize one question/answer pair with the global tokenizer, padded to max_seq_len.
    return tokenizer.encode_plus(q_text, ans_text,
                                 max_length=max_seq_len,
                                 pad_to_max_length=True,
                                 return_token_type_ids=True,
                                 return_attention_mask = True)


def get_pairwise_input_data(dataset, max_seq_len):
    """
    Build the model inputs for pairwise training.

    For every question one positive answer is sampled with random.choice and
    paired with each candidate that is not labelled positive, giving one
    (positive, negative) pair per negative candidate.

    Reads the globals tokenizer, qid_to_text and docid_to_text.

    Returns pos_input_ids, pos_type_ids, pos_masks, pos_labels,
            neg_input_ids, neg_type_ids, neg_masks, neg_labels:
            eight parallel lists with one entry per pair; positive labels are
            always 1 and negative labels always 0
    ----------
    dataset: list of lists [qid, list of pos docid, list of candidate docid]
    max_seq_len: value passed to the tokenizer as max_length
    """
    pos_input_ids = []
    neg_input_ids = []

    pos_type_ids = []
    neg_type_ids = []

    pos_masks = []
    neg_masks = []

    pos_labels = []
    neg_labels = []

    for seq in tqdm(dataset):
        qid, ans_labels, cands = seq[0], seq[1], seq[2]

        # Candidates that are not labelled as positive answers
        filtered_cands = list(set(cands)-set(ans_labels))

        pos_docid = random.choice(ans_labels)

        # Map question id to text
        q_text = qid_to_text[qid]

        # No negative candidate means no pair to build for this question
        if not filtered_cands:
            continue

        # The positive pair is identical for every negative of this question,
        # so it is tokenized once here instead of once per negative.
        pos_encoded_seq = _encode_qa_pair(q_text, docid_to_text[pos_docid], max_seq_len)
        pos_input_id = pos_encoded_seq['input_ids']
        pos_type_id = pos_encoded_seq['token_type_ids']
        pos_mask = pos_encoded_seq['attention_mask']

        for neg_docid in filtered_cands:

            # Map the docid to text and tokenize the negative pair
            neg_encoded_seq = _encode_qa_pair(q_text, docid_to_text[neg_docid], max_seq_len)

            # Append copies so every pair keeps its own independent lists
            pos_input_ids.append(list(pos_input_id))
            pos_type_ids.append(list(pos_type_id))
            pos_masks.append(list(pos_mask))
            pos_labels.append(1)

            neg_input_ids.append(neg_encoded_seq['input_ids'])
            neg_type_ids.append(neg_encoded_seq['token_type_ids'])
            neg_masks.append(neg_encoded_seq['attention_mask'])
            neg_labels.append(0)

    return pos_input_ids, pos_type_ids, pos_masks, pos_labels, neg_input_ids, neg_type_ids, neg_masks, neg_labels

In [27]:
# Tokenize the training set into (positive, negative) pairs with max_seq_len=128.
# This is slow (about 37 minutes in the recorded run below).
train_pos_input, train_pos_type_id, train_pos_att_mask, train_pos_labels, \
train_neg_input, train_neg_type_id, train_neg_att_mask, train_neg_labels  = get_pairwise_input_data(train_set, 128)

100%|██████████| 5676/5676 [37:18<00:00,  2.54it/s]


In [36]:
# Tokenize the validation set into (positive, negative) pairs with max_seq_len=128.
valid_pos_input, valid_pos_type_id, valid_pos_att_mask, valid_pos_labels, \
valid_neg_input, valid_neg_type_id, valid_neg_att_mask, valid_neg_labels = get_pairwise_input_data(valid_set, 128)

100%|██████████| 631/631 [04:01<00:00,  2.61it/s]


In [0]:
# Cache the tokenized pairwise data on Drive so the slow tokenization cells need not be re-run.
# File-name suffixes: 128 is the max_seq_len used above; 50 presumably refers to the
# number of candidates per question — TODO confirm.

# Labels
save_pickle(path+'pairwise-data/train_pos_labels_50.pickle', train_pos_labels)
save_pickle(path+'pairwise-data/train_neg_labels_50.pickle', train_neg_labels)
save_pickle(path+'pairwise-data/valid_pos_labels_50.pickle', valid_pos_labels)
save_pickle(path+'pairwise-data/valid_neg_labels_50.pickle', valid_neg_labels)

# Input ids
save_pickle(path+'pairwise-data/train_pos_input_128_50.pickle', train_pos_input)
save_pickle(path+'pairwise-data/train_neg_input_128_50.pickle', train_neg_input)
save_pickle(path+'pairwise-data/valid_pos_input_128_50.pickle', valid_pos_input)
save_pickle(path+'pairwise-data/valid_neg_input_128_50.pickle', valid_neg_input)

# Token type ids
save_pickle(path+'pairwise-data/train_pos_type_id_128_50.pickle', train_pos_type_id)
save_pickle(path+'pairwise-data/train_neg_type_id_128_50.pickle', train_neg_type_id)
save_pickle(path+'pairwise-data/valid_pos_type_id_128_50.pickle', valid_pos_type_id)
save_pickle(path+'pairwise-data/valid_neg_type_id_128_50.pickle', valid_neg_type_id)

# Attention masks
save_pickle(path+'pairwise-data/train_pos_mask_128_50.pickle', train_pos_att_mask)
save_pickle(path+'pairwise-data/train_neg_mask_128_50.pickle', train_neg_att_mask)
save_pickle(path+'pairwise-data/valid_pos_mask_128_50.pickle', valid_pos_att_mask)
save_pickle(path+'pairwise-data/valid_neg_mask_128_50.pickle', valid_neg_att_mask)

In [0]:
# Reload the cached pairwise data from Drive.
# NOTE: the attention masks are bound to train_pos_mask / train_neg_mask / valid_pos_mask /
# valid_neg_mask here, whereas the tokenization cells name them *_att_mask. The tensor
# conversion cell reads the *_mask names, so this cell must be run before it.

# Labels
train_pos_labels = load_pickle(path+'pairwise-data/train_pos_labels_50.pickle')
train_neg_labels = load_pickle(path+'pairwise-data/train_neg_labels_50.pickle')
valid_pos_labels = load_pickle(path+'pairwise-data/valid_pos_labels_50.pickle')
valid_neg_labels = load_pickle(path+'pairwise-data/valid_neg_labels_50.pickle')

# Input ids
train_pos_input = load_pickle(path+'pairwise-data/train_pos_input_128_50.pickle')
train_neg_input = load_pickle(path+'pairwise-data/train_neg_input_128_50.pickle')
valid_pos_input = load_pickle(path+'pairwise-data/valid_pos_input_128_50.pickle')
valid_neg_input = load_pickle(path+'pairwise-data/valid_neg_input_128_50.pickle')

# Token type ids
train_pos_type_id = load_pickle(path+'pairwise-data/train_pos_type_id_128_50.pickle')
train_neg_type_id = load_pickle(path+'pairwise-data/train_neg_type_id_128_50.pickle')
valid_pos_type_id = load_pickle(path+'pairwise-data/valid_pos_type_id_128_50.pickle')
valid_neg_type_id = load_pickle(path+'pairwise-data/valid_neg_type_id_128_50.pickle')

# Attention masks
train_pos_mask = load_pickle(path+'pairwise-data/train_pos_mask_128_50.pickle')
train_neg_mask = load_pickle(path+'pairwise-data/train_neg_mask_128_50.pickle')
valid_pos_mask = load_pickle(path+'pairwise-data/valid_pos_mask_128_50.pickle')
valid_neg_mask = load_pickle(path+'pairwise-data/valid_neg_mask_128_50.pickle')

In [39]:
# Number of (positive, negative) pairs in the training and validation data
print(len(train_pos_input))
print(len(valid_pos_input))

277827
30874


In [0]:
# train_pos_labels = train_pos_labels[:100]
# train_neg_labels = train_neg_labels[:100]
# train_pos_input = train_pos_input[:100]
# train_neg_input = train_neg_input[:100]
# train_pos_type_id = train_pos_type_id[:100]
# train_neg_type_id = train_neg_type_id[:100]
# train_pos_mask = train_pos_mask[:100]
# train_neg_mask = train_neg_mask[:100]

# valid_pos_labels = valid_pos_labels[:10]
# valid_neg_labels = valid_neg_labels[:10]
# valid_pos_input = valid_pos_input[:10]
# valid_neg_input = valid_neg_input[:10]
# valid_pos_type_id = valid_pos_type_id[:10]
# valid_neg_type_id = valid_neg_type_id[:10]
# valid_pos_mask = valid_pos_mask[:10]
# valid_neg_mask = valid_neg_mask[:10]

In [0]:
# Convert lists to PyTorch tensors
# Inputs, type ids and masks get new plural names (*_inputs, *_type_ids, *_masks);
# the label variables are rebound in place from lists to tensors.
train_pos_inputs = torch.tensor(train_pos_input)
train_neg_inputs = torch.tensor(train_neg_input)
valid_pos_inputs = torch.tensor(valid_pos_input)
valid_neg_inputs = torch.tensor(valid_neg_input)

train_pos_labels = torch.tensor(train_pos_labels)
train_neg_labels = torch.tensor(train_neg_labels)
valid_pos_labels = torch.tensor(valid_pos_labels)
valid_neg_labels = torch.tensor(valid_neg_labels)

train_pos_type_ids = torch.tensor(train_pos_type_id)
train_neg_type_ids = torch.tensor(train_neg_type_id)
valid_pos_type_ids = torch.tensor(valid_pos_type_id)
valid_neg_type_ids = torch.tensor(valid_neg_type_id)

# NOTE: the *_mask names below are only bound by the pickle-loading cell
# (the tokenization cells bind *_att_mask instead).
train_pos_masks = torch.tensor(train_pos_mask)
train_neg_masks = torch.tensor(train_neg_mask)
valid_pos_masks = torch.tensor(valid_pos_mask)
valid_neg_masks = torch.tensor(valid_neg_mask)

In [0]:
# Create DataLoaders to train the model in batches
# Each dataset row holds the eight tensors of one (positive, negative) pair, in the
# order that train_pairwise and validate_pairwise unpack them.

batch_size = 32

# Create the DataLoader for our training set.
# Training batches are drawn in random order.
train_data = TensorDataset(train_pos_inputs, train_pos_type_ids, train_pos_masks, train_pos_labels, train_neg_inputs, train_neg_type_ids, train_neg_masks, train_neg_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
# Validation batches are read sequentially.
validation_data = TensorDataset(valid_pos_inputs, valid_pos_type_ids, valid_pos_masks, valid_pos_labels, valid_neg_inputs, valid_neg_type_ids, valid_neg_masks, valid_neg_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [0]:
def pairwise_loss(pos_scores, neg_scores, margin=1, eps=1e-7):
    """
    Pairwise learning approach introduced in https://arxiv.org/pdf/1905.07588.pdf

    Equal-weight sum of a cross-entropy term, -log(pos) - log(1 - neg), and a
    hinge term, max(0, margin - pos + neg).

    Returns loss: tensor with the element-wise (unreduced) loss of every pair
    ----------
    pos_scores: tensor with the relevance probability of each positive pair
    neg_scores: tensor with the relevance probability of each negative pair
    margin: margin of the hinge term (default 1)
    eps: lower bound applied to the arguments of the logarithms, so that a
         saturated probability (exactly 0 or 1) produces a large finite loss
         instead of inf/NaN
    """

    cross_entropy_loss = -torch.log(pos_scores.clamp(min=eps)) \
                         - torch.log((1 - neg_scores).clamp(min=eps))

    # clamp keeps the result on the same device as the scores, so the loss
    # no longer depends on the notebook-level `device` variable
    hinge_loss = torch.clamp(margin - pos_scores + neg_scores, min=0)

    loss = (0.5 * cross_entropy_loss + 0.5 * hinge_loss)

    return loss

In [0]:
def train_pairwise(model, train_dataloader, optimizer, scheduler):
    """
    Train the model for one epoch with the pairwise loss.

    Every batch carries eight tensors: input ids, token type ids, attention
    masks and labels for the positive pairs, followed by the same four for
    the negative pairs.

    Returns avg_train_loss, acc: loss averaged over the batches, and accuracy
    averaged over the positive and negative halves of every batch
    ----------
    model: classification model, called with input ids, token_type_ids,
           attention_mask and labels
    train_dataloader: DataLoader yielding the eight-tensor batches
    optimizer: optimizer stepped once per batch
    scheduler: learning-rate scheduler stepped once per batch
    """
    # Training mode
    model.train()

    # Running totals for the epoch
    running_loss = 0
    accuracy_sum = 0
    accuracy_terms = 0

    for batch in tqdm(train_dataloader):

        # Move the eight tensors of the batch to the device
        (pos_input, pos_type_id, pos_mask, pos_labels,
         neg_input, neg_type_id, neg_mask, neg_labels) = (t.to(device) for t in batch[:8])

        # Clear gradients left over from the previous step
        model.zero_grad()

        # Forward pass for the positive and the negative QA pairs
        pos_outputs = model(pos_input, token_type_ids=pos_type_id, attention_mask=pos_mask, labels=pos_labels)
        neg_outputs = model(neg_input, token_type_ids=neg_type_id, attention_mask=neg_mask, labels=neg_labels)

        # The logits are the second element of each output
        pos_logits, neg_logits = pos_outputs[1], neg_outputs[1]

        # Probability of the "relevant" class (column 1) for both halves
        pos_scores = softmax(pos_logits, dim=1)[:,1]
        neg_scores = softmax(neg_logits, dim=1)[:,1]

        # Mean pairwise loss of the batch
        loss = pairwise_loss(pos_scores, neg_scores).mean()

        # Accuracy of each half, computed on CPU copies of logits and labels
        pos_batch_accuracy = flat_accuracy(pos_logits.detach().cpu().numpy(),
                                           pos_labels.to('cpu').numpy())
        neg_batch_accuracy = flat_accuracy(neg_logits.detach().cpu().numpy(),
                                           neg_labels.to('cpu').numpy())

        accuracy_sum += pos_batch_accuracy
        accuracy_sum += neg_batch_accuracy

        # Two accuracy values were added (pos and neg)
        accuracy_terms += 2

        running_loss += loss.item()

        # Backward pass
        loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Parameter update followed by the learning-rate schedule update
        optimizer.step()
        scheduler.step()

    # Epoch averages
    avg_train_loss = running_loss / len(train_dataloader)
    acc = accuracy_sum/accuracy_terms

    return avg_train_loss, acc

In [0]:
def validate_pairwise(model, validation_dataloader):
    """
    Evaluate the model on the validation pairs with the pairwise loss.

    Returns avg_loss, acc: loss averaged over the batches, and accuracy
    averaged over the positive and negative halves of every batch
    ----------
    model: classification model, called with input ids, token_type_ids,
           attention_mask and labels
    validation_dataloader: DataLoader yielding batches of eight tensors
                           (four for the positive pairs, four for the negative)
    """
    # Evaluation mode
    model.eval()

    running_loss = 0
    accuracy_sum = 0
    accuracy_terms = 0

    for batch in tqdm(validation_dataloader):

        # Move the batch to the device and unpack its eight tensors
        (pos_input, pos_type_id, pos_mask, pos_labels,
         neg_input, neg_type_id, neg_mask, neg_labels) = [t.to(device) for t in batch]

        # No gradients are needed for validation
        with torch.no_grad():
            # Forward pass for the positive and the negative QA pairs
            pos_outputs = model(pos_input, token_type_ids=pos_type_id, attention_mask=pos_mask, labels=pos_labels)
            neg_outputs = model(neg_input, token_type_ids=neg_type_id, attention_mask=neg_mask, labels=neg_labels)

            # The logits are the second element of each output
            pos_logits, neg_logits = pos_outputs[1], neg_outputs[1]

            # Probability of the "relevant" class (column 1)
            pos_scores = softmax(pos_logits, dim=1)[:,1]
            neg_scores = softmax(neg_logits, dim=1)[:,1]

            # Mean pairwise loss of the batch
            loss = pairwise_loss(pos_scores, neg_scores).mean()

        # Accuracy of each half, computed on CPU copies of logits and labels
        pos_batch_accuracy = flat_accuracy(pos_logits.detach().cpu().numpy(),
                                           pos_labels.to('cpu').numpy())
        neg_batch_accuracy = flat_accuracy(neg_logits.detach().cpu().numpy(),
                                           neg_labels.to('cpu').numpy())

        accuracy_sum += pos_batch_accuracy
        accuracy_sum += neg_batch_accuracy

        # Two accuracy values were added (pos and neg)
        accuracy_terms += 2

        running_loss += loss.item()

    avg_loss = running_loss / len(validation_dataloader)
    acc = accuracy_sum/accuracy_terms

    return avg_loss, acc

## **Pointwise**

In [0]:
def get_input_data(dataset, max_seq_len):
    """
    Build the model inputs for pointwise training: one example per
    (question, candidate) pair, labelled 1 when the candidate is a positive
    answer and 0 otherwise.

    Reads the globals tokenizer, qid_to_text and docid_to_text.

    Returns input_ids, token_type_ids, att_masks, labels: four parallel lists
    with one entry per (question, candidate) pair
    ----------
    dataset: list of lists [qid, list of pos docid, list of candidate docid]
    max_seq_len: value passed to the tokenizer as max_length; every encoded
                 sequence is asserted to have exactly this length
    """
    input_ids, token_type_ids, att_masks, labels = [], [], [], []

    for seq in tqdm(dataset):
        qid, ans_labels, cands = seq[0], seq[1], seq[2]

        # Question text is shared by all candidates of this entry
        q_text = qid_to_text[qid]

        for docid in cands:

            # Tokenize the question together with the candidate answer text
            encoded_seq = tokenizer.encode_plus(q_text, docid_to_text[docid],
                                                max_length=max_seq_len,
                                                pad_to_max_length=True,
                                                return_token_type_ids=True,
                                                return_attention_mask = True)

            input_id = encoded_seq['input_ids']
            token_type_id = encoded_seq['token_type_ids']
            att_mask = encoded_seq['attention_mask']

            # 1 for a positive answer, 0 for any other candidate
            label = 1 if docid in ans_labels else 0

            # Every feature must be padded to exactly max_seq_len
            assert len(input_id) == max_seq_len, "Input id dimension incorrect!"
            assert len(token_type_id) == max_seq_len, "Token type id dimension incorrect!"
            assert len(att_mask) == max_seq_len, "Attention mask dimension incorrect!"

            input_ids.append(input_id)
            token_type_ids.append(token_type_id)
            att_masks.append(att_mask)
            labels.append(label)

    return input_ids, token_type_ids, att_masks, labels

In [13]:
# Tokenize every (question, candidate) pair of the training and validation sets with max_seq_len=128.
train_input, train_type_id, train_att_mask, train_label = get_input_data(train_set, 128)
valid_input, valid_type_id, valid_att_mask, valid_label = get_input_data(valid_set, 128)

100%|██████████| 5676/5676 [18:33<00:00,  5.10it/s]
100%|██████████| 631/631 [01:57<00:00,  5.38it/s]


In [0]:
# Cache the tokenized pointwise data under the "_lm" file names.
# NOTE: `path` already ends with "/", so these file names contain a double slash.

# Training split
save_pickle(path+'/pointwise-data/train_input_128_lm.pickle', train_input)
save_pickle(path+'/pointwise-data/train_type_id_128_lm.pickle', train_type_id)
save_pickle(path+'/pointwise-data/train_mask_128_lm.pickle', train_att_mask)
save_pickle(path+'/pointwise-data/train_labels_128_lm.pickle', train_label)

# Validation split
save_pickle(path+'/pointwise-data/valid_input_128_lm.pickle', valid_input)
save_pickle(path+'/pointwise-data/valid_type_id_128_lm.pickle', valid_type_id)
save_pickle(path+'/pointwise-data/valid_mask_128_lm.pickle', valid_att_mask)
save_pickle(path+'/pointwise-data/valid_labels_128_lm.pickle', valid_label)

In [0]:
# Reload the cached pointwise data saved under the "_lm" file names.

# Training split
train_input = load_pickle(path+'/pointwise-data/train_input_128_lm.pickle')
train_type_id = load_pickle(path+'/pointwise-data/train_type_id_128_lm.pickle')
train_att_mask = load_pickle(path+'/pointwise-data/train_mask_128_lm.pickle')
train_label = load_pickle(path+'/pointwise-data/train_labels_128_lm.pickle')

# Validation split
valid_input = load_pickle(path+'/pointwise-data/valid_input_128_lm.pickle')
valid_type_id = load_pickle(path+'/pointwise-data/valid_type_id_128_lm.pickle')
valid_att_mask = load_pickle(path+'/pointwise-data/valid_mask_128_lm.pickle')
valid_label = load_pickle(path+'/pointwise-data/valid_labels_128_lm.pickle')

In [0]:
# Load the cached pointwise data saved under the "_50" file names.
# NOTE(review): this binds the same variables as the "_lm" loading cell, so if both cells
# are run the data loaded last is the one used downstream.

# Training split
train_input = load_pickle(path+'/pointwise-data/train_input_128_50.pickle')
train_type_id = load_pickle(path+'/pointwise-data/train_type_id_128_50.pickle')
train_att_mask = load_pickle(path+'/pointwise-data/train_mask_128_50.pickle')
train_label = load_pickle(path+'/pointwise-data/train_labels_128_50.pickle')

# Validation split
valid_input = load_pickle(path+'/pointwise-data/valid_input_128_50.pickle')
valid_type_id = load_pickle(path+'/pointwise-data/valid_type_id_128_50.pickle')
valid_att_mask = load_pickle(path+'/pointwise-data/valid_mask_128_50.pickle')
valid_label = load_pickle(path+'/pointwise-data/valid_labels_128_50.pickle')

In [0]:
# def get_sequence_df(dataset):
#     """
#     Converts training and validation data into a df with relevancy labels
#     and map the qid and docid to text.
    
#     Returns data_df: df with columns qid, docid, label, question (text), answer (text)
#     ---------------
#     dataset: train or validation set in the form of list of lists
#     """
#     # Load list into a dataframe
#     df = pd.DataFrame(dataset)
#     df = df.rename(columns={0: 'qid', 1: 'pos', 2:'neg'})
#     # Construct new df with positive docids
#     df_pos = df[['qid', 'pos']]
#     df_pos = df_pos.rename(columns={'pos': 'docid'})
#     # Add new column and assign positive label
#     df_pos['label'] = df_pos.apply(lambda x: 1, axis=1)
#     df_pos = df_pos.drop_duplicates()

#     # Construct new df with negative docids
#     df_neg = df[['qid', 'neg']]
#     df_neg = df_neg.rename(columns={'neg': 'docid'})
#     # Add new column and assign negative label
#     df_neg['label'] = df_neg.apply(lambda x: 0, axis=1)

#     # Concatenate the positive and negative df
#     data_df = pd.concat([df_pos, df_neg]).sort_values(by=['qid'])

#     # Map id to text
#     data_df['question'] = data_df['qid'].apply(lambda x: qid_to_text[x])
#     data_df['ans_cand'] = data_df['docid'].apply(lambda x: docid_to_text[x])

#     return data_df

In [0]:
# # Create train/validation data
# trainset = get_sequence_df(train_set)
# train_questions = trainset.question.values
# train_answers = trainset.ans_cand.values
# train_labels = trainset.label.values

# train_input, train_type_id, train_att_mask = get_input(train_questions, train_answers, 128)

# validset = get_sequence_df(valid_set)
# valid_questions = validset.question.values
# valid_answers = validset.ans_cand.values
# valid_labels = validset.label.values

# valid_input, valid_type_id, valid_att_mask = get_input(valid_questions, valid_answers, 128)

# save_pickle(path+'/pointwise-data/train_labels_128_75.pickle', train_labels)
# save_pickle(path+'/pointwise-data/valid_labels_128_75.pickle', valid_labels)

# save_pickle(path+'/pointwise-data/train_input_128_75.pickle', train_input)
# save_pickle(path+'/pointwise-data/valid_input_128_75.pickle', valid_input)

# save_pickle(path+'/pointwise-data/train_type_id_128_75.pickle', train_type_id)
# save_pickle(path+'/pointwise-data/valid_type_id_128_75.pickle', valid_type_id)

# save_pickle(path+'/pointwise-data/train_mask_128_75.pickle', train_att_mask)
# save_pickle(path+'/pointwise-data/valid_mask_128_75.pickle', valid_att_mask)

In [0]:
# # Load train/validation data
# train_label = load_pickle(path+'/pointwise-data/train_labels_128_50.pickle')
# valid_label = load_pickle(path+'/pointwise-data/valid_labels_128_50.pickle')

# # bert-base tokenizer
# train_input = load_pickle(path+'/pointwise-data/train_input_128_50.pickle')
# valid_input = load_pickle(path+'/pointwise-data/valid_input_128_50.pickle')
# train_type_id = load_pickle(path+'/pointwise-data/train_type_id_128_50.pickle')
# valid_type_id = load_pickle(path+'/pointwise-data/valid_type_id_128_50.pickle')
# train_att_mask = load_pickle(path+'/pointwise-data/train_mask_128_50.pickle')
# valid_att_mask = load_pickle(path+'/pointwise-data/valid_mask_128_50.pickle')

# train_input = load_pickle(path+'/pointwise-data/train_input_256_10.pickle')
# valid_input = load_pickle(path+'/pointwise-data/valid_input_256_10.pickle')
# train_type_id = load_pickle(path+'/pointwise-data/train_type_id_256_10.pickle')
# valid_type_id = load_pickle(path+'/pointwise-data/valid_type_id_256_10.pickle')
# train_att_mask = load_pickle(path+'/pointwise-data/train_mask_256_10.pickle')
# valid_att_mask = load_pickle(path+'/pointwise-data/valid_mask_256_10.pickle')


# bert-large tokenizer
# train_input = load_pickle(path+'/pointwise-data/train_input_256_large.pickle')
# valid_input = load_pickle(path+'/pointwise-data/valid_input_256_large.pickle')
# train_type_id = load_pickle(path+'/pointwise-data/train_type_id_256_large.pickle')
# valid_type_id = load_pickle(path+'/pointwise-data/valid_type_id_256_large.pickle')
# train_att_mask = load_pickle(path+'/pointwise-data/train_mask_256_large.pickle')
# valid_att_mask = load_pickle(path+'/pointwise-data/valid_mask_256_large.pickle')

In [11]:
# Number of (question, candidate) examples in the training and validation data
print(len(train_input))
print(len(valid_input))

283800
31550


In [12]:
# Convert all inputs and labels into torch tensors
train_inputs = torch.tensor(train_input)
train_type_ids = torch.tensor(train_type_id)
train_masks = torch.tensor(train_att_mask)
train_labels = torch.tensor(train_label)

validation_inputs = torch.tensor(valid_input)
validation_type_ids = torch.tensor(valid_type_id)
validation_masks = torch.tensor(valid_att_mask)
validation_labels = torch.tensor(valid_label)

# Create DataLoader to train in batches
# NOTE(review): batch_size, train_data, train_sampler, train_dataloader, validation_data,
# validation_sampler and validation_dataloader are also bound by the pairwise section;
# whichever cell ran last determines what the training functions receive.

batch_size = 32

# Create the DataLoader for our training set.
# Each row holds four tensors: input ids, token type ids, attention mask, label.
train_data = TensorDataset(train_inputs, train_type_ids, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_type_ids, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# len() of a DataLoader is its number of batches
print("Size of the DataLoader for the training set: {}".format(len(train_dataloader)))
print("Size of the DataLoader for the validation set: {}".format(len(validation_dataloader)))

Size of the DataLoader for the training set: 8869
Size of the DataLoader for the validation set: 986


In [0]:
def train(model, train_dataloader, optimizer, scheduler):
    """
    Train the model for one epoch using the loss returned by the model itself.

    Returns avg_train_loss, acc: loss and accuracy averaged over the batches
    ----------
    model: classification model, called with input ids, token_type_ids,
           attention_mask and labels; it returns the loss and the logits
    train_dataloader: DataLoader yielding batches of four tensors
                      (input ids, token type ids, attention masks, labels)
    optimizer: optimizer stepped once per batch
    scheduler: learning-rate scheduler stepped once per batch
    """
    # Train mode
    model.train()

    # Running totals for the epoch
    running_loss = 0
    accuracy_sum = 0
    batches_seen = 0

    for batch in tqdm(train_dataloader):

        # Move the four tensors of the batch to the device
        b_input_ids, b_token_type_ids, b_input_mask, b_labels = (t.to(device) for t in batch[:4])

        # Clear gradients left over from the previous step
        model.zero_grad()

        # Forward pass: the model returns the loss first, then the logits
        outputs = model(b_input_ids,
                        token_type_ids=b_token_type_ids,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss, logits = outputs[0], outputs[1]

        # Batch accuracy, computed on CPU copies of logits and labels
        accuracy_sum += flat_accuracy(logits.detach().cpu().numpy(),
                                      b_labels.to('cpu').numpy())
        batches_seen += 1

        running_loss += loss.item()

        # Backward pass
        loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Parameter update followed by the learning-rate schedule update
        optimizer.step()
        scheduler.step()

    # Epoch averages
    avg_train_loss = running_loss / len(train_dataloader)
    acc = accuracy_sum/batches_seen

    return avg_train_loss, acc

In [0]:
def validate(model, validation_dataloader):
    """
    Run one pass over the validation data without updating the model.

    -------------------
    model - PyTorch model
            Called with labels, so it returns the loss first and the
            logits second
    validation_dataloader - iterable of batches
            Each batch unpacks into four tensors:
            input ids, token type ids, attention masks, labels

    Returns
    -------------------
    avg_loss - summed batch loss divided by len(validation_dataloader)
    acc - mean of the per-batch flat_accuracy values
    """

    # Switch to evaluation mode
    model.eval()

    running_loss = 0
    running_accuracy = 0
    batches_seen = 0

    for batch in tqdm(validation_dataloader):

        # Move every tensor of the batch to the device while unpacking it
        input_ids, segment_ids, input_masks, labels = tuple(
            tensor.to(device) for tensor in batch
        )

        # Gradients are neither computed nor stored for the forward pass
        with torch.no_grad():
            outputs = model(
                input_ids,
                token_type_ids=segment_ids,
                attention_mask=input_masks,
                labels=labels,
            )

        batch_loss, batch_logits = outputs[0], outputs[1]

        # flat_accuracy is given numpy arrays, so bring both to the CPU first
        predictions = batch_logits.detach().cpu().numpy()
        gold_labels = labels.to('cpu').numpy()

        running_accuracy += flat_accuracy(predictions, gold_labels)
        batches_seen += 1
        running_loss += batch_loss.item()

    # Average both quantities over the batches
    acc = running_accuracy / batches_seen
    avg_loss = running_loss / len(validation_dataloader)

    return avg_loss, acc

## **Training**

In [15]:
# Build the classifier: the pretrained BERT model with a single linear
# classification layer on top, configured with two output labels.
#
# Alternative starting checkpoints, kept for reference:
#   model_path = path + "model/fin_model"
#   model = BertForSequenceClassification.from_pretrained("bert-base-uncased", cache_dir=None, num_labels=2)
#   model = BertForSequenceClassification.from_pretrained("bert-large-uncased", cache_dir=None, num_labels=2)
model_path = path + "model/lm_model"
model = BertForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
    cache_dir=None,
)

# Move the parameters to the selected device. As the last expression of the
# cell, this call also displays the module structure.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28989, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [0]:
# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# Total number of training steps is number of batches * number of epochs.
total_steps = epochs * len(train_dataloader)

# AdamW over all model parameters
optimizer = AdamW(
    model.parameters(),
    lr=3e-6,
    weight_decay=0.01,
)

# Learning rate scheduler: warmup phase followed by the rest of the
# total_steps training steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=10000,
    num_training_steps=total_steps,
)

In [0]:
# Lowest validation loss seen so far
best_valid_loss = float('inf')

# Per-epoch history. The lists are created once, before the loop, so that
# they keep one entry per epoch instead of being reset at every epoch.
train_loss_values = []
train_acc_values = []

valid_loss_values = []
valid_acc_values = []

for epoch in range(epochs):

    # Train for one epoch and record the training loss and accuracy
    train_loss, train_acc = train(model, train_dataloader, optimizer, scheduler)
    # train_loss, train_acc = train_pairwise(model, train_dataloader, optimizer, scheduler)
    train_loss_values.append(train_loss)
    train_acc_values.append(train_acc)

    # Evaluate on the validation data and record the loss and accuracy
    valid_loss, valid_acc = validate(model, validation_dataloader)
    # valid_loss, valid_acc = validate_pairwise(model, validation_dataloader)
    valid_loss_values.append(valid_loss)
    valid_acc_values.append(valid_acc)

    # Checkpoint only when this epoch reaches a new best validation loss.
    # The epoch number is part of the file name, so earlier checkpoints are
    # not overwritten.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), path + 'model/' + str(epoch+1)+'_pointwise_lm_128_32_3e6.pt')

    print("\n\n Epoch {}:".format(epoch+1))
    print("\t Train Loss: {} | Train Accuracy: {}%".format(round(train_loss, 3), round(train_acc*100, 2)))
    print("\t Validation Loss: {} | Validation Accuracy: {}%\n".format(round(valid_loss, 3), round(valid_acc*100, 2)))

  1%|          | 110/8869 [00:44<59:39,  2.45it/s]

## **Evaluation**

In [0]:
def get_rank(model, test_set, qid_rel, max_seq_len):
    """
    Rank the candidate answers of every question by predicted relevance.

    Returns a dictionary - key: qid, value: numpy array of the candidate
    docids, ordered from the highest to the lowest relevance score
    -------------------
    model - PyTorch model
    test_set - List of lists:
            Each element is a list containing
            [qid, list of pos docid, list of candidate docid]
    qid_rel: Dictionary
            key: qid, value: list of relevant answer id
            (accepted for the caller's convenience; not read here)
    max_seq_len: int
            Maximum sequence length

    Relies on the notebook-level names tokenizer, qid_to_text,
    docid_to_text and device.
    """

    # Evaluation mode for the whole ranking pass
    model.eval()

    ranking = {}

    for example in tqdm(test_set):

        # Only the question id and its candidate docids are needed
        qid, cands = example[0], example[2]
        question = qid_to_text[qid]

        # One relevance score per candidate, in candidate order
        relevance = []

        for docid in cands:

            # Numericalized, padded, clipped QA pair with special tokens
            encoding = tokenizer.encode_plus(
                question,
                docid_to_text[docid],
                max_length=max_seq_len,
                pad_to_max_length=True,
                return_token_type_ids=True,
                return_attention_mask=True,
            )

            # Wrap each field in a batch of size one and move it to the device
            input_ids, token_type_ids, att_mask = (
                torch.tensor([encoding[field]]).to(device)
                for field in ('input_ids', 'token_type_ids', 'attention_mask')
            )

            # Forward pass without gradient tracking
            with torch.no_grad():
                outputs = model(
                    input_ids,
                    token_type_ids=token_type_ids,
                    attention_mask=att_mask,
                )

            # Without labels, the logits are the first output
            probabilities = softmax(outputs[0], dim=1)
            # probabilities = torch.sigmoid(outputs[0])
            probabilities = probabilities.detach().cpu().numpy()

            # Keep the probability of label 1 for the single pair in the batch
            relevance.append(probabilities[0, 1])

        # Candidate positions ordered from highest to lowest score
        descending = np.argsort(relevance)[::-1]

        # Reorder the docids accordingly
        ranking[qid] = np.array(cands)[descending]

    return ranking

In [0]:
# Small subset for quick checks: the first three test examples and the
# first three entries of the relevance dictionary.
# NOTE(review): the two slices are taken independently; confirm that the
# first three qids of test_qid_rel match the qids of test_set[:3].
toy_test = test_set[:3]
toy_test_label = {
    qid: relevant_ids
    for qid, relevant_ids in itertools.islice(test_qid_rel.items(), 3)
}
# Hand-written example with the same structure:
# toy_test = [[14, [398960], [84963, 14255, 398960]],
#             [68, [19183], [107584, 562777, 19183]],
#             [70, [327002], [107584, 327002, 19183]]]

In [19]:
# Restore the fine-tuned weights into the model built above.
# map_location=device remaps the stored tensors onto the device selected at
# the top of the notebook, so a checkpoint saved on a GPU also loads when
# the runtime falls back to the CPU.
# NOTE(review): the checkpoint name says "pairwisewise50" while the ranking
# is later saved as "pointwise50" - confirm the intended checkpoint.
model.load_state_dict(
    torch.load(path+'model/1_pairwisewise50_128_32_3e6_05.pt', map_location=device)
)

# Rank the candidates of every test question with the restored model
qid_pred_rank = get_rank(model, test_set, test_qid_rel, max_seq_len=128)
# qid_pred_rank = get_rank(model, toy_test, toy_test_label, max_seq_len=128)

100%|██████████| 333/333 [05:29<00:00,  1.01it/s]


In [21]:
# Cut-off passed to evaluate and shown in the nDCG / MRR lines
k = 10

# Number of test questions, used only in the printed report
num_q = len(test_set)

MRR, average_ndcg, precision, rank_pos = evaluate(qid_pred_rank, test_qid_rel, k)
# MRR, average_ndcg, precision, rank_pos = evaluate(qid_pred_rank, toy_test_label, k)

# Each entry pairs a message template with the values that fill it
report_lines = (
    ("\n\nAverage nDCG@{} for {} queries: {}\n", (k, num_q, average_ndcg)),
    ("MRR@{} for {} queries: {}\n", (k, num_q, MRR)),
    ("Average Precision@{}: {}", (1, precision)),
)

for template, values in report_lines:
    print(template.format(*values))



Average nDCG@10 for 333 queries: 0.40536723247075324

MRR@10 for 333 queries: 0.3471876638543306

Average Precision@1: 0.2702702702702703


In [0]:
# Persist the predicted ranking (qid -> ranked docids) under the rank/ folder
# so it can be reused without re-running the model.
# NOTE(review): save_pickle is not defined in this notebook; presumably it
# comes from the `from utils import *` star import - confirm.
save_pickle(path+'rank/rank_1_pointwise50_128_32_3e6.pickle', qid_pred_rank)