In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import pickle
import random
from collections import Counter
from tqdm import tqdm
import itertools
import pandas as pd
from itertools import islice
import numpy as np
import random
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.functional import softmax

# Setting device on GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

torch.backends.cudnn.deterministic = True

# Set the random seed manually for reproducibility.
torch.manual_seed(1234)

Using device: cuda

Tesla K80
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


<torch._C.Generator at 0x7fe6961502f0>

In [0]:
!pip install transformers
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup



In [0]:
path = "drive/My Drive/FiQA/"

In [0]:
from evaluate import *
from utils import *

In [0]:
# Dictionary - key: qid, value: list of positive docid
train_qid_rel = load_pickle(path + "new-data/qid_rel_train.pickle")
test_qid_rel = load_pickle(path + "new-data/qid_rel_test.pickle")
valid_qid_rel = load_pickle(path + "new-data/qid_rel_valid.pickle")

# List of lists:
# Each element is a list containing [qid, positive docid, negative docid]
train_set = load_pickle(path + 'new-data/data_50/train_set_50.pickle')
valid_set = load_pickle(path + 'new-data/data_50/valid_set_50.pickle')
# train_set = load_pickle(path + 'new-data/data_25/train_set_25.pickle')
# valid_set = load_pickle(path + 'new-data/data_25/valid_set_25.pickle')
# train_set = load_pickle(path + 'new-data/data_10/train_set_10.pickle')
# valid_set = load_pickle(path + 'new-data/data_10/valid_set_10.pickle')
# train_set = load_pickle(path + 'new-data/train_set.pickle')
# valid_set = load_pickle(path + 'new-data/valid_set.pickle')

# List of lists:
# Each element is a list contraining [qid, list of pos docid, list of candidate docid]
# Contains candidates with all pos docids
test_set = load_pickle(path + 'new-data/data_50/test_set_50.pickle')
# Contains candidates retrieved by BM25
# May be missing pos docids in candidates
test_set_full = load_pickle(path + 'new-data/data_50/test_set_full_50.pickle')

# Dictionary mapping docid and qid to raw text
docid_to_text = load_pickle(path + 'new-data/docid_to_text.pickle')
qid_to_text = load_pickle(path + 'new-data/qid_to_text.pickle')

In [0]:
print("Number of training samples: {}".format(len(train_set)))
print("Number of validation samples: {}".format(len(valid_set)))
print("Number of test samples: {}".format(len(test_set)))

Number of training samples: 142025
Number of validation samples: 15800
Number of test samples: 333


In [0]:
# Example of the training set [qid, pos docid, neg docid]
print(train_set[:10])

[[0, 18850, 378523], [0, 18850, 403025], [0, 18850, 173088], [0, 18850, 142631], [0, 18850, 59638], [0, 18850, 592891], [0, 18850, 53244], [0, 18850, 481339], [0, 18850, 22916], [0, 18850, 8891]]


In [0]:
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

Loading BERT tokenizer...


In [0]:
def get_input(questions, answers, max_seq_len):
    """
    Returns input objects for training:
        input_ids: List of lists
                Each element contains a list of padded/clipped numericalized
                tokens of the sequences including [CLS] and [SEP] tokens
                e.g. [[101, 2054, 2003, 102, 2449, 1029, 102], ...]
        token_type_ids: List of lists
                Each element contains a list of segment token indices to 
                indicate first and second portions of the inputs. 
                0 corresponds to a question token, 1 corresponds an answer token
                e.g. [[0, 0, 0, 0, 1, 1, 1], ...]
        att_masks: List of lists
                Each element contains a list of mask values
                Mask to avoid performing attention on padding token indices. 
                1 for tokens that are NOT MASKED, 0 for MASKED tokens.
                e.g. [[1, 1, 1, 1, 1, 1, 1], ...]
    -----------------
    questions: List of strings
            Each element contains a question string
    answers: List of strings
            Each element contains an asnwer string
    max_seq_len: int
            Maximum sequence length
    """
    input_ids = []
    token_type_ids = []
    att_masks = []

    for i in tqdm(range(len(questions))):
        a = questions[i]
        b = answers[i]

        # Tokenize the questions and answers, apply padding, and trim the vectors
        # to the max_seq_len
        encoded_seq = tokenizer.encode_plus(a, b, 
                                            max_length=max_seq_len, 
                                            pad_to_max_length=True, 
                                            return_token_type_ids=True,
                                            return_attention_mask = True)

        input_id = encoded_seq['input_ids']
        token_type_id = encoded_seq['token_type_ids']
        att_mask = encoded_seq['attention_mask']

        assert len(input_id) == max_seq_len, "Input id dimension incorrect!"
        assert len(token_type_id) == max_seq_len, "Token type id dimension incorrect!"
        assert len(att_mask) == max_seq_len, "Attention mask dimension incorrect!"

        input_ids.append(input_id)
        token_type_ids.append(token_type_id)
        att_masks.append(att_mask)

    return input_ids, token_type_ids, att_masks

In [0]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    # Get the column with the higher probability
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

## **Pairwise**

In [0]:
# def get_pairwise_sequence_df(dataset):
#     """
#     Converts training and validation data into a df with relevancy labels
#     and map the qid and docid to text.
    
#     Returns data_df: df with columns qid, pos docid,
#             neg docid, pos label, neg_label, question (text), 
#             pos answer (text), neg answer (text)
#     ---------------
#     dataset: train or validation set in the form of list of lists
#     """
#     df = pd.DataFrame(dataset)
#     df = df.rename(columns={0: 'qid', 1: 'pos_id', 2:'neg_id'})
#     df['pos_label'] = df.apply(lambda x: 1, axis=1)
#     df['neg_label'] = df.apply(lambda x: 0, axis=1)

#     df['question'] = df['qid'].apply(lambda x: qid_to_text[x])
#     df['pos_ans'] = df['pos_id'].apply(lambda x: docid_to_text[x])
#     df['neg_ans'] = df['neg_id'].apply(lambda x: docid_to_text[x])

#     return df

In [0]:
# trainset = get_pairwise_sequence_df(train_set)
# train_questions = trainset.question.values
# train_pos_answers = trainset.pos_ans.values
# train_neg_answers = trainset.neg_ans.values

# train_pos_labels = trainset.pos_label.values
# train_neg_labels = trainset.neg_label.values

# train_pos_input, train_pos_type_id, train_pos_att_mask = get_input(train_questions, train_pos_answers, 256)
# train_neg_input, train_neg_type_id, train_neg_att_mask = get_input(train_questions, train_neg_answers, 256)

In [0]:
# validset = get_pairwise_sequence_df(valid_set)
# valid_questions = validset.question.values
# valid_pos_answers = validset.pos_ans.values
# valid_neg_answers = validset.neg_ans.values

# valid_pos_labels = validset.pos_label.values
# valid_neg_labels = validset.neg_label.values

# valid_pos_input, valid_pos_type_id, valid_pos_att_mask = get_input(valid_questions, valid_pos_answers, 256)
# valid_neg_input, valid_neg_type_id, valid_neg_att_mask = get_input(valid_questions, valid_neg_answers, 256)

In [0]:
# save_pickle(path+'/data-bert/train_pos_labels_small.pickle', train_pos_labels)
# save_pickle(path+'/data-bert/train_neg_labels_small.pickle', train_neg_labels)
# save_pickle(path+'/data-bert/valid_pos_labels_small.pickle', valid_pos_labels)
# save_pickle(path+'/data-bert/valid_neg_labels_small.pickle', valid_neg_labels)

# save_pickle(path+'/data-bert/train_pos_input_256_small.pickle', train_pos_input)
# save_pickle(path+'/data-bert/train_neg_input_256_small.pickle', train_neg_input)
# save_pickle(path+'/data-bert/valid_pos_input_256_small.pickle', valid_pos_input)
# save_pickle(path+'/data-bert/valid_neg_input_256_small.pickle', valid_neg_input)

# save_pickle(path+'/data-bert/train_pos_type_id_256_small.pickle', train_pos_type_id)
# save_pickle(path+'/data-bert/train_neg_type_id_256_small.pickle', train_neg_type_id)
# save_pickle(path+'/data-bert/valid_pos_type_id_256_small.pickle', valid_pos_type_id)
# save_pickle(path+'/data-bert/valid_neg_type_id_256_small.pickle', valid_neg_type_id)

# save_pickle(path+'/data-bert/train_pos_mask_256_small.pickle', train_pos_att_mask)
# save_pickle(path+'/data-bert/train_neg_mask_256_small.pickle', train_neg_att_mask)
# save_pickle(path+'/data-bert/valid_pos_mask_256_small.pickle', valid_pos_att_mask)
# save_pickle(path+'/data-bert/valid_neg_mask_256_small.pickle', valid_neg_att_mask)

In [0]:
# train_pos_labels = load_pickle(path+'/data-bert/train_pos_labels_small.pickle')
# train_neg_labels = load_pickle(path+'/data-bert/train_neg_labels_small.pickle')
# valid_pos_labels = load_pickle(path+'/data-bert/valid_pos_labels_small.pickle')
# valid_neg_labels = load_pickle(path+'/data-bert/valid_neg_labels_small.pickle')

# train_pos_input = load_pickle(path+'/data-bert/train_pos_input_256_small.pickle')
# train_neg_input = load_pickle(path+'/data-bert/train_neg_input_256_small.pickle')
# valid_pos_input = load_pickle(path+'/data-bert/valid_pos_input_256_small.pickle')
# valid_neg_input = load_pickle(path+'/data-bert/valid_neg_input_256_small.pickle')

# train_pos_type_id = load_pickle(path+'/data-bert/train_pos_type_id_256_small.pickle')
# train_neg_type_id = load_pickle(path+'/data-bert/train_neg_type_id_256_small.pickle')
# valid_pos_type_id = load_pickle(path+'/data-bert/valid_pos_type_id_256_small.pickle')
# valid_neg_type_id = load_pickle(path+'/data-bert/valid_neg_type_id_256_small.pickle')

# train_pos_mask = load_pickle(path+'/data-bert/train_pos_mask_256_small.pickle')
# train_neg_mask = load_pickle(path+'/data-bert/train_neg_mask_256_small.pickle')
# valid_pos_mask = load_pickle(path+'/data-bert/valid_pos_mask_256_small.pickle')
# valid_neg_mask = load_pickle(path+'/data-bert/valid_neg_mask_256_small.pickle')

In [0]:
# print(len(train_pos_input))
# print(len(valid_pos_input))

In [0]:
# train_pos_labels = train_pos_labels[:100]
# train_neg_labels = train_neg_labels[:100]
# train_pos_input = train_pos_input[:100]
# train_neg_input = train_neg_input[:100]
# train_pos_type_id = train_pos_type_id[:100]
# train_neg_type_id = train_neg_type_id[:100]
# train_pos_mask = train_pos_mask[:100]
# train_neg_mask = train_neg_mask[:100]

# valid_pos_labels = valid_pos_labels[:10]
# valid_neg_labels = valid_neg_labels[:10]
# valid_pos_input = valid_pos_input[:10]
# valid_neg_input = valid_neg_input[:10]
# valid_pos_type_id = valid_pos_type_id[:10]
# valid_neg_type_id = valid_neg_type_id[:10]
# valid_pos_mask = valid_pos_mask[:10]
# valid_neg_mask = valid_neg_mask[:10]

In [0]:
# # Convert lists to PyTorch tensors
# train_pos_inputs = torch.tensor(train_pos_input)
# train_neg_inputs = torch.tensor(train_neg_input)
# valid_pos_inputs = torch.tensor(valid_pos_input)
# valid_neg_inputs = torch.tensor(valid_neg_input)

# train_pos_labels = torch.tensor(train_pos_labels)
# train_neg_labels = torch.tensor(train_neg_labels)
# valid_pos_labels = torch.tensor(valid_pos_labels)
# valid_neg_labels = torch.tensor(valid_neg_labels)

# train_pos_type_ids = torch.tensor(train_pos_type_id)
# train_neg_type_ids = torch.tensor(train_neg_type_id)
# valid_pos_type_ids = torch.tensor(valid_pos_type_id)
# valid_neg_type_ids = torch.tensor(valid_neg_type_id)

# train_pos_masks = torch.tensor(train_pos_mask)
# train_neg_masks = torch.tensor(train_neg_mask)
# valid_pos_masks = torch.tensor(valid_pos_mask)
# valid_neg_masks = torch.tensor(valid_neg_mask)

In [0]:
# # Create DataLoaders to train the model in batches

# batch_size = 16

# # Create the DataLoader for our training set.
# train_data = TensorDataset(train_pos_inputs, train_pos_type_ids, train_pos_masks, train_pos_labels, train_neg_inputs, train_neg_type_ids, train_neg_masks, train_neg_labels)
# train_sampler = RandomSampler(train_data)
# train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# # Create the DataLoader for our validation set.
# validation_data = TensorDataset(valid_pos_inputs, valid_pos_type_ids, valid_pos_masks, valid_pos_labels, valid_neg_inputs, valid_neg_type_ids, valid_neg_masks, valid_neg_labels)
# validation_sampler = SequentialSampler(validation_data)
# validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [0]:
# def pairwise_loss(pos_scores, neg_scores):
#     """
#     Pairwise learning approach introduced in https://arxiv.org/pdf/1905.07588.pdf
#     """

#     cross_entropy_loss = -torch.log(pos_scores) - torch.log(1 - neg_scores)

#     margin = 0.5

#     hinge_loss = torch.max(torch.tensor(0, dtype=torch.float).to(device), margin - pos_scores + neg_scores)

#     loss = (0.5 * cross_entropy_loss + 0.5 * hinge_loss)

#     return loss

In [0]:
# def train_pairwise(model, train_dataloader, optimizer, scheduler):

#     # Store the average loss after each epoch so we can plot them.
#     loss_values = []

#     # Reset the loss and accuracy for each epoch
#     total_loss = 0
#     nb_train_steps = 0
#     train_accuracy = 0

#     # Set model in training mode
#     model.train()

#     # For each batch of training data...
#     for step, batch in enumerate(tqdm(train_dataloader)):

#         # batch contains eight PyTorch tensors:
#         pos_input = batch[0].to(device)
#         pos_type_id = batch[1].to(device)
#         pos_mask = batch[2].to(device)
#         pos_labels = batch[3].to(device)

#         neg_input = batch[4].to(device)
#         neg_type_id = batch[5].to(device)
#         neg_mask = batch[6].to(device)
#         neg_labels = batch[7].to(device)

#         # Zero gradients
#         model.zero_grad()

#         # Compute predictinos for postive and negative QA pairs
#         pos_outputs = model(pos_input, token_type_ids=pos_type_id, attention_mask=pos_mask, labels=pos_labels)
#         neg_outputs = model(neg_input, token_type_ids=neg_type_id, attention_mask=neg_mask, labels=neg_labels)

#         # Get the logits from the model for positive and negative QA pairs
#         pos_logits = pos_outputs[1]
#         neg_logits = neg_outputs[1]

#         # Get the column of the relevant scores and apply activation function
#         pos_scores = softmax(pos_logits, dim=1)[:,1]
#         neg_scores = softmax(neg_logits, dim=1)[:,1]
        
#         # Compute pairwise loss and get the mean of each batch
#         loss = pairwise_loss(pos_scores, neg_scores).mean()

#         # Move logits and labels to CPU
#         p_logits = pos_logits.detach().cpu().numpy()
#         p_labels = pos_labels.to('cpu').numpy()
#         n_logits = neg_logits.detach().cpu().numpy()
#         n_labels = neg_labels.to('cpu').numpy()

#         # Calculate the accuracy for each batch
#         tmp_pos_accuracy = flat_accuracy(p_logits, p_labels)
#         tmp_neg_accuracy = flat_accuracy(n_logits, n_labels)

#         # Accumulate the total accuracy.
#         train_accuracy += tmp_pos_accuracy
#         train_accuracy += tmp_neg_accuracy
        
#         # Track the number of batches (2 for pos and neg accuracies)
#         nb_train_steps += 2

#         # Accumulate the training loss over all of the batches
#         total_loss += loss.item()
    
#         # Perform a backward pass to calculate the gradients.
#         loss.backward()

#         # Clip the norm of the gradients to 1.0.
#         # This is to help prevent the "exploding gradients" problem.
#         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

#         # Update parameters and take a step using the computed gradient.
#         optimizer.step()

#         # Update scheduler
#         scheduler.step()

#     # Calculate the average loss over the training data.
#     avg_train_loss = total_loss / len(train_dataloader)            
    
#     # Store the loss value for plotting the learning curve.
#     loss_values.append(avg_train_loss)

#     # Compute accuracy for each epoch
#     acc = train_accuracy/nb_train_steps

#     return avg_train_loss, acc, loss_values

In [0]:
# def validate_pairwise(model, validation_dataloader):

#     # Set model in evaluation mode
#     model.eval()

#     # Tracking variables 
#     total_loss = 0
#     nb_eval_steps = 0
#     eval_accuracy = 0

#     # Evaluate data for one epoch
#     for batch in tqdm(validation_dataloader):
        
#         # Add batch to GPU
#         batch = tuple(t.to(device) for t in batch)
        
#         # Unpack the inputs from our dataloader
#         pos_input, pos_type_id, pos_mask, pos_labels, neg_input, neg_type_id, neg_mask, neg_labels = batch
        
#         # Telling the model not to compute or store gradients, saving memory and
#         # speeding up validation
#         with torch.no_grad():
#             # Compute predictinos for postive and negative QA pairs
#             pos_outputs = model(pos_input, token_type_ids=pos_type_id, attention_mask=pos_mask, labels=pos_labels)
#             neg_outputs = model(neg_input, token_type_ids=neg_type_id, attention_mask=neg_mask, labels=neg_labels)

#             # Get logits
#             pos_logits = pos_outputs[1]
#             neg_logits = neg_outputs[1]

#             # Apply activation function
#             pos_scores = softmax(pos_logits, dim=1)[:,1]
#             neg_scores = softmax(neg_logits, dim=1)[:,1]
        
#         loss = pairwise_loss(pos_scores, neg_scores).mean()

#         # Move logits and labels to CPU
#         p_logits = pos_logits.detach().cpu().numpy()
#         p_labels = pos_labels.to('cpu').numpy()
#         n_logits = neg_logits.detach().cpu().numpy()
#         n_labels = neg_labels.to('cpu').numpy()

#         # Calculate the accuracy for this batch of test sentences.
#         tmp_pos_accuracy = flat_accuracy(p_logits, p_labels)
#         tmp_neg_accuracy = flat_accuracy(n_logits, n_labels)

#         # Accumulate the total accuracy.
#         eval_accuracy += tmp_pos_accuracy
#         eval_accuracy += tmp_neg_accuracy

#         # Track the number of batches
#         nb_eval_steps += 2

#         total_loss += loss.item()

#     avg_loss = total_loss / len(validation_dataloader)
#     acc = eval_accuracy/nb_eval_steps

#     return avg_loss, acc

## **Pointwise**

In [0]:
def get_sequence_df(dataset):
    """
    Converts training and validation data into a df with relevancy labels
    and map the qid and docid to text.
    
    Returns data_df: df with columns qid, docid, label, question (text), answer (text)
    ---------------
    dataset: train or validation set in the form of list of lists
    """
    # Load list into a dataframe
    df = pd.DataFrame(dataset)
    df = df.rename(columns={0: 'qid', 1: 'pos', 2:'neg'})
    # Construct new df with positive docids
    df_pos = df[['qid', 'pos']]
    df_pos = df_pos.rename(columns={'pos': 'docid'})
    # Add new column and assign positive label
    df_pos['label'] = df_pos.apply(lambda x: 1, axis=1)
    df_pos = df_pos.drop_duplicates()

    # Construct new df with negative docids
    df_neg = df[['qid', 'neg']]
    df_neg = df_neg.rename(columns={'neg': 'docid'})
    # Add new column and assign negative label
    df_neg['label'] = df_neg.apply(lambda x: 0, axis=1)

    # Concatenate the positive and negative df
    data_df = pd.concat([df_pos, df_neg]).sort_values(by=['qid'])

    # Map id to text
    data_df['question'] = data_df['qid'].apply(lambda x: qid_to_text[x])
    data_df['ans_cand'] = data_df['docid'].apply(lambda x: docid_to_text[x])

    return data_df

In [0]:
# # Create train/validation data
# trainset = get_sequence_df(train_set)
# train_questions = trainset.question.values
# train_answers = trainset.ans_cand.values
# train_labels = trainset.label.values

# train_input, train_type_id, train_att_mask = get_input(train_questions, train_answers, 128)

# validset = get_sequence_df(valid_set)
# valid_questions = validset.question.values
# valid_answers = validset.ans_cand.values
# valid_labels = validset.label.values

# valid_input, valid_type_id, valid_att_mask = get_input(valid_questions, valid_answers, 128)

# save_pickle(path+'/pointwise-data/train_labels_128_50.pickle', train_labels)
# save_pickle(path+'/pointwise-data/valid_labels_128_50.pickle', valid_labels)

# save_pickle(path+'/pointwise-data/train_input_128_50.pickle', train_input)
# save_pickle(path+'/pointwise-data/valid_input_128_50.pickle', valid_input)

# save_pickle(path+'/pointwise-data/train_type_id_128_50.pickle', train_type_id)
# save_pickle(path+'/pointwise-data/valid_type_id_128_50.pickle', valid_att_mask)

# save_pickle(path+'/pointwise-data/train_mask_128_50.pickle', train_att_mask)
# save_pickle(path+'/pointwise-data/valid_mask_128_50.pickle', valid_att_mask)

100%|██████████| 298662/298662 [29:18<00:00, 169.81it/s]
100%|██████████| 33195/33195 [02:35<00:00, 213.79it/s]


In [0]:
# Load train/validation data
train_label = load_pickle(path+'/pointwise-data/train_labels_128_50.pickle')
valid_label = load_pickle(path+'/pointwise-data/valid_labels_128_50.pickle')

# bert-base tokenizer
train_input = load_pickle(path+'/pointwise-data/train_input_128_50.pickle')
valid_input = load_pickle(path+'/pointwise-data/valid_input_128_50.pickle')
train_type_id = load_pickle(path+'/pointwise-data/train_type_id_128_50.pickle')
valid_type_id = load_pickle(path+'/pointwise-data/valid_type_id_128_50.pickle')
train_att_mask = load_pickle(path+'/pointwise-data/train_mask_128_50.pickle')
valid_att_mask = load_pickle(path+'/pointwise-data/valid_mask_128_50.pickle')

# train_input = load_pickle(path+'/pointwise-data/train_input_256_10.pickle')
# valid_input = load_pickle(path+'/pointwise-data/valid_input_256_10.pickle')
# train_type_id = load_pickle(path+'/pointwise-data/train_type_id_256_10.pickle')
# valid_type_id = load_pickle(path+'/pointwise-data/valid_type_id_256_10.pickle')
# train_att_mask = load_pickle(path+'/pointwise-data/train_mask_256_10.pickle')
# valid_att_mask = load_pickle(path+'/pointwise-data/valid_mask_256_10.pickle')


# bert-large tokenizer
# train_input = load_pickle(path+'/pointwise-data/train_input_256_large.pickle')
# valid_input = load_pickle(path+'/pointwise-data/valid_input_256_large.pickle')
# train_type_id = load_pickle(path+'/pointwise-data/train_type_id_256_large.pickle')
# valid_type_id = load_pickle(path+'/pointwise-data/valid_type_id_256_large.pickle')
# train_att_mask = load_pickle(path+'/pointwise-data/train_mask_256_large.pickle')
# valid_att_mask = load_pickle(path+'/pointwise-data/valid_mask_256_large.pickle')

In [0]:
print(len(train_input))
print(len(valid_input))

298662
33195


In [0]:
# Convert all inputs and labels into torch tensors
train_labels = torch.tensor(train_label)
validation_labels = torch.tensor(valid_label)

train_inputs = torch.tensor(train_input)
validation_inputs = torch.tensor(valid_input)

train_type_ids = torch.tensor(train_type_id)
validation_type_ids = torch.tensor(valid_type_id)

train_masks = torch.tensor(train_att_mask)
validation_masks = torch.tensor(valid_att_mask)

# Create DataLoader to train in bacthes

batch_size = 256

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_type_ids, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_type_ids, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

print("Size of the DataLoader for the training set: {}".format(len(train_dataloader)))
print("Size of the DataLoader for the validation set: {}".format(len(validation_dataloader)))

Size of the DataLoader for the training set: 1167
Size of the DataLoader for the validation set: 130


In [0]:
def train(model, train_dataloader, optimizer, scheduler):

    # Reset the total loss each epoch
    total_loss = 0
    train_accuracy = 0
    # Track the number of batches
    num_steps = 0

    # Set model in train mode
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(tqdm(train_dataloader)):

        # batch contains four PyTorch tensors:
        #   [0]: input ids
        #   [1]: token_type_ids
        #   [2]: attention masks
        #   [3]: labels 
        b_input_ids = batch[0].to(device)
        b_token_type_ids = batch[1].to(device)
        b_input_mask = batch[2].to(device)
        b_labels = batch[3].to(device)

        # Zero the gradients
        model.zero_grad()        

        # Forward pass
        # The model will return the loss and the logits
        outputs = model(b_input_ids, 
                    token_type_ids = b_token_type_ids, 
                    attention_mask = b_input_mask, 
                    labels = b_labels)

        loss = outputs[0]
        logits = outputs[1]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch
        tmp_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        train_accuracy += tmp_accuracy

        # Track the number of batches
        num_steps += 1

        # Accumulate the training loss over all of the batches
        total_loss += loss.item()
    
        # Perform a backward pass to calculate the gradients
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        # Update scheduler
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            

    acc = train_accuracy/num_steps

    return avg_train_loss, acc

In [0]:
def validate(model, validation_dataloader):

    # Evaluation mode
    model.eval()

    total_loss = 0
    eval_accuracy = 0
    nb_eval_steps = 0

    # For each batch of the validation data
    for batch in tqdm(validation_dataloader):
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from the dataloader
        b_input_ids, b_token_type_ids, b_input_masks, b_labels = batch
        
        # Don't to compute or store gradients
        with torch.no_grad():        
            outputs = model(b_input_ids, 
                            token_type_ids = b_token_type_ids, 
                            attention_mask = b_input_masks,
                            labels= b_labels)
        
        loss = outputs[0]
        logits = outputs[1]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

        total_loss += loss.item()

    acc = eval_accuracy/nb_eval_steps
    avg_loss = total_loss / len(validation_dataloader)

    return avg_loss, acc

## **Training**

In [0]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top

# model_path = "/content/drive/My Drive/FiQA/model/fin_model"
# model = BertForSequenceClassification.from_pretrained(model_path, cache_dir=None, num_labels=2)

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", cache_dir=None, num_labels=2)
# model = BertForSequenceClassification.from_pretrained("bert-large-uncased", cache_dir=None, num_labels=2)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [0]:
optimizer = AdamW(model.parameters(), lr = 2e-7, eps = 1e-8)

# Number of training epochs (authors recommend between 2 and 4)
epochs = 6

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [0]:
# Lowest validation lost
best_valid_loss = float('inf')

for epoch in range(epochs):

    train_loss_values = []
    train_acc_values = []

    valid_loss_values = []
    valid_acc_values = []

    # Evaluate training loss
    train_loss, train_acc = train(model, train_dataloader, optimizer, scheduler)
    # train_loss, train_acc = train_pairwise(model, train_dataloader, optimizer, scheduler)
    train_loss_values.append(train_loss)
    train_acc_values.append(train_acc)

    # Evaluate validation loss
    valid_loss, valid_acc = validate(model, validation_dataloader)
    # valid_loss, valid_acc = validate_pairwise(model, validation_dataloader)
    valid_loss_values.append(valid_loss)
    valid_acc_values.append(valid_acc)
    
    # At each epoch, if the validation loss is the best
    # if valid_loss < best_valid_loss:
    #     best_valid_loss = valid_loss
    # torch.save(model.state_dict(), path + 'model/' + str(epoch+1)+'_pointwise_128_50_2e7.pt')

    print("\n\n Epoch {}:".format(epoch+1))
    print("\t Train Loss: {} | Train Accuracy: {}%".format(round(train_loss, 3), round(train_acc*100, 2)))
    print("\t Validation Loss: {} | Validation Accuracy: {}%\n".format(round(valid_loss, 3), round(valid_acc*100, 2)))

  0%|          | 0/1167 [00:00<?, ?it/s]


RuntimeError: ignored

## **Evalulation**

In [0]:
def get_rank(model, test_set, qid_rel, max_seq_len):
    """
    Returns a dictionary - key: qid, value: list of ranked candidates
    -------------------
    model - PyTorch model
    test_set - List of lists:
            Each element is a list contraining 
            [qid, list of pos docid, list of candidate docid]
    qid_rel: Dictionary
            key: qid, value: list of relevant answer id
    max_seq_len: int
            Maximum sequence length
    """

    # Initiate empty dictionary
    qid_pred_rank = {}

    # Set model to evaluation mode
    model.eval()

    # For each element in the test set
    for i, seq in enumerate(tqdm(test_set)):
        
        # question id, list of rel answers, list of candidates
        qid, label, cands = seq[0], seq[1], seq[2]

        # Map question id to text
        q_text = qid_to_text[qid]

        # Convert list to numpy array
        cands_id = np.array(cands)

        # Empty list for the probability scores of relevancy
        scores = []

        # For each answer in the candidates
        for docid in cands:

            # Map the docid to text
            ans_text = docid_to_text[docid]

            # Create inputs for the model
            encoded_seq = tokenizer.encode_plus(q_text, ans_text, 
                                            max_length=max_seq_len, 
                                            pad_to_max_length=True, 
                                            return_token_type_ids=True,
                                            return_attention_mask = True)

            # Numericalized, padded, clipped seq with special tokens
            input_ids = torch.tensor([encoded_seq['input_ids']]).to(device)
            # Specify question seq and answer seq
            token_type_ids = torch.tensor([encoded_seq['token_type_ids']]).to(device)
            # Sepecify which position is part of the seq which is padded
            att_mask = torch.tensor([encoded_seq['attention_mask']]).to(device)

            # Don't calculate gradients
            with torch.no_grad():
            # Forward pass, calculate logit predictions for each QA pair
                outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=att_mask)

            # Get the predictions
            logits = outputs[0]

            # Apply activation function
            pred = softmax(logits, dim=1)
            # pred = torch.sigmoid(logits)

            # Move logits and labels to CPU
            pred = pred.detach().cpu().numpy()

            # Append relevant scores to list (where label = 1)
            scores.append(pred[:,1][0])

        print(scores)

        # Get the indices of the sorted similarity scores
        sorted_index = np.argsort(scores)[::-1]

        # Get the list of docid from the sorted indices
        ranked_ans = cands_id[sorted_index]

        # Dict - key: qid, value: ranked list of docids
        qid_pred_rank[qid] = ranked_ans

    return qid_pred_rank

In [0]:
toy_test_label = dict(itertools.islice(test_qid_rel.items(), 3))
toy_test = test_set[:3]
# toy_test = [[14, [398960], [84963, 14255, 398960]],
#             [68, [19183], [107584, 562777, 19183]],
#             [70, [327002], [107584, 327002, 19183]]]

In [0]:
model.load_state_dict(torch.load(path+'model/2_pointwise_128_50_2e7.pt'))

# qid_pred_rank = get_rank(model, test_set, test_qid_rel, max_seq_len=256)
qid_pred_rank = get_rank(model, toy_test, toy_test_label, max_seq_len=128)

 33%|███▎      | 1/3 [00:09<00:18,  9.39s/it]

[0.8286438, 0.72305965, 0.011733182, 0.66405743, 0.0017544054, 0.6940986, 0.61294454, 0.81399083, 0.77299845, 0.7044675, 0.14875504, 0.41618046, 0.007451423, 0.013855331, 0.5748976, 0.7867979, 0.45340636, 0.76467717, 0.27105865, 0.008734003, 0.034240425, 0.7437239, 0.69202936, 0.8479345, 0.2737445, 0.8497187, 0.6374513, 0.0065103024, 0.45250896, 0.80903476, 0.0055502118, 0.68835175, 0.6404603, 0.71484506, 0.82673657, 0.3671019, 0.045627806, 0.554348, 0.00242398, 0.011698064, 0.2303327, 0.18343966, 0.019879868, 0.7884072, 0.6814348, 0.22711255, 0.23317909, 0.5559166, 0.007007909, 0.005729991, 0.6709125, 0.82528657, 0.06396706, 0.8109578, 0.34104696, 0.33541587, 0.40592748, 0.0077982196, 0.6434781, 0.04711554, 0.8567093, 0.013972649, 0.01646979, 0.8189694, 0.029761527, 0.8610558, 0.008540276, 0.38938007, 0.84518677, 0.71175677, 0.83031505, 0.06795997, 0.5186739, 0.01461953, 0.6108138, 0.17782144, 0.033869997, 0.018984089, 0.045874123, 0.7540156, 0.004031275, 0.85207224, 0.76301336, 0.480

 67%|██████▋   | 2/3 [00:19<00:09,  9.52s/it]

[0.86213356, 0.003980151, 0.035144392, 0.6132624, 0.75796264, 0.83089435, 0.6005238, 0.8118866, 0.46177718, 0.06186548, 0.8687662, 0.8243675, 0.14816026, 0.6996984, 0.8429698, 0.0076438594, 0.04756771, 0.3202796, 0.83714217, 0.13839525, 0.8545097, 0.38070533, 0.7365635, 0.8578779, 0.8620319, 0.051974375, 0.5307797, 0.8057818, 0.34916183, 0.2959169, 0.84927046, 0.21821316, 0.036304943, 0.6785242, 0.659052, 0.8550491, 0.8547522, 0.008818584, 0.8287596, 0.7810859, 0.8441255, 0.05264803, 0.8324131, 0.7346262, 0.8373968, 0.85325235, 0.043774836, 0.024039563, 0.82511085, 0.01290045, 0.8545681, 0.83262146, 0.024370007, 0.105772905, 0.80041295, 0.8532373, 0.8424274, 0.70840776, 0.6116571, 0.8212868, 0.8559321, 0.022429608, 0.012032435, 0.21483144, 0.080795385, 0.8026127, 0.81140584, 0.42879778, 0.16733481, 0.8472511, 0.861845, 0.3102774, 0.8437901, 0.8605962, 0.8456415, 0.86160094, 0.62944347, 0.0016501349, 0.8620463, 0.5898789, 0.85533804, 0.79665655, 0.0957264, 0.78953516, 0.023331624, 0.272

100%|██████████| 3/3 [00:29<00:00,  9.70s/it]

[0.8341422, 0.12613793, 0.8576452, 0.8566885, 0.84671885, 0.8575311, 0.84274924, 0.0793846, 0.83219993, 0.8535068, 0.85551965, 0.8499258, 0.8434709, 0.8499898, 0.85103434, 0.84640324, 0.85477054, 0.79664224, 0.8550273, 0.8506198, 0.015806949, 0.010430862, 0.36927533, 0.009509663, 0.029927423, 0.8507745, 0.8631282, 0.8429468, 0.0021748375, 0.8420089, 0.08560479, 0.8641538, 0.48489487, 0.25142258, 0.12776917, 0.8396991, 0.85695004, 0.8445097, 0.84370077, 0.8541948, 0.8562248, 0.8623709, 0.8517543, 0.84793603, 0.84275573, 0.84331775, 0.7375057, 0.25290534, 0.7485661, 0.84953004, 0.8533376, 0.7493143, 0.30629075, 0.8611337, 0.856114, 0.8476673, 0.8200375, 0.85774875, 0.86326253, 0.5735194, 0.85880405, 0.8196411, 0.09433267, 0.8213588, 0.84555787, 0.013839574, 0.83858967, 0.85905945, 0.8420442, 0.8289877, 0.8625771, 0.4052716, 0.36470538, 0.85907024, 0.856382, 0.8167418, 0.6382231, 0.8611037, 0.030615874, 0.8408913, 0.7989988, 0.8332019, 0.8608829, 0.8528213, 0.7518338, 0.8302951, 0.8466927




In [0]:
k = 10

num_q = len(test_set)

# MRR, average_ndcg, precision, rank_pos = evaluate(qid_pred_rank, test_qid_rel, k)
MRR, average_ndcg, precision, rank_pos = evaluate(qid_pred_rank, toy_test_label, k)

print("\n\nAverage nDCG@{} for {} queries: {}\n".format(k, num_q, average_ndcg))

print("MRR@{} for {} queries: {}\n".format(k, num_q, MRR))

print("Average Precision@{}: {}".format(1, precision))



Average nDCG@10 for 333 queries: 0.16666666666666666

MRR@10 for 333 queries: 0.08333333333333333

Average Precision@1: 0.0


In [0]:
save_pickle(path+'rank/rank_pairwise_001.pickle', qid_pred_rank)