In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········


In [0]:
import pickle
import random
from collections import Counter
from tqdm import tqdm
import itertools
import pandas as pd
from itertools import islice
import numpy as np
import random
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.functional import softmax

# Setting device on GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

torch.backends.cudnn.deterministic = True

# Set the random seed manually for reproducibility.
torch.manual_seed(1234)

Using device: cuda

Tesla K80
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


<torch._C.Generator at 0x7fb81509ea90>

In [0]:
!pip install transformers
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

In [0]:
path = "drive/My Drive/FiQA/"

In [0]:
from evaluate import *
from utils import *

In [0]:
# Dictionary - key: qid, value: list of positive docid
train_qid_rel = load_pickle(path + "new-data/qid_rel_train.pickle")
test_qid_rel = load_pickle(path + "new-data/qid_rel_test.pickle")
valid_qid_rel = load_pickle(path + "new-data/qid_rel_valid.pickle")

# List of lists:
# Each element is a list containing [qid, positive docid, negative docid]
# train_set = load_pickle(path + 'new-data/data_50/train_set_50.pickle')
# valid_set = load_pickle(path + 'new-data/data_50/valid_set_50.pickle')
train_set = load_pickle(path + 'new-data/data_25/train_set_25.pickle')
valid_set = load_pickle(path + 'new-data/data_25/valid_set_25.pickle')

# List of lists:
# Each element is a list contraining [qid, list of pos docid, list of candidate docid]
# Contains candidates with all pos docids
test_set = load_pickle(path + 'new-data/data_50/test_set_50.pickle')
# Contains candidates retrieved by BM25
# May be missing pos docids in candidates
test_set_full = load_pickle(path + 'new-data/data_50/test_set_full_50.pickle')

# Dictionary mapping docid and qid to raw text
docid_to_text = load_pickle(path + 'new-data/docid_to_text.pickle')
qid_to_text = load_pickle(path + 'new-data/qid_to_text.pickle')

In [0]:
print("Number of training samples: {}".format(len(train_set)))
print("Number of validation samples: {}".format(len(valid_set)))
print("Number of test samples: {}".format(len(test_set)))

Number of training samples: 142025
Number of validation samples: 15800
Number of test samples: 333


In [0]:
# Example of the training set [qid, pos docid, neg docid]
print(train_set[:10])

[[0, 18850, 378523], [0, 18850, 403025], [0, 18850, 173088], [0, 18850, 142631], [0, 18850, 59638], [0, 18850, 592891], [0, 18850, 53244], [0, 18850, 481339], [0, 18850, 22916], [0, 18850, 8891]]


In [0]:
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

Loading BERT tokenizer...


In [0]:
def get_sequence_df(dataset):
    """
    Converts training and validation data into a df with relevancy labels
    and map the qid and docid to text.
    
    Returns data_df: df with columns qid, docid, label, question (text), answer (text)
    ---------------
    dataset: train or validation set in the form of list of lists
    """
    # Load list into a dataframe
    df = pd.DataFrame(dataset)
    df = df.rename(columns={0: 'qid', 1: 'pos', 2:'neg'})
    # Construct new df with positive docids
    df_pos = df[['qid', 'pos']]
    df_pos = df_pos.rename(columns={'pos': 'docid'})
    # Add new column and assign positive label
    df_pos['label'] = df_pos.apply(lambda x: 1, axis=1)
    df_pos = df_pos.drop_duplicates()

    # Construct new df with negative docids
    df_neg = df[['qid', 'neg']]
    df_neg = df_neg.rename(columns={'neg': 'docid'})
    # Add new column and assign negative label
    df_neg['label'] = df_neg.apply(lambda x: 0, axis=1)

    # Concatenate the positive and negative df
    data_df = pd.concat([df_pos, df_neg]).sort_values(by=['qid'])

    # Map id to text
    data_df['question'] = data_df['qid'].apply(lambda x: qid_to_text[x])
    data_df['ans_cand'] = data_df['docid'].apply(lambda x: docid_to_text[x])

    return data_df

In [0]:
def get_input(questions, answers, max_seq_len):
    """
    Returns input objects for training:
        input_ids: List of lists
                Each element contains a list of padded/clipped numericalized
                tokens of the sequences including [CLS] and [SEP] tokens
                e.g. [[101, 2054, 2003, 102, 2449, 1029, 102], ...]
        token_type_ids: List of lists
                Each element contains a list of segment token indices to 
                indicate first and second portions of the inputs. 
                0 corresponds to a question token, 1 corresponds an answer token
                e.g. [[0, 0, 0, 0, 1, 1, 1], ...]
        att_masks: List of lists
                Each element contains a list of mask values
                Mask to avoid performing attention on padding token indices. 
                1 for tokens that are NOT MASKED, 0 for MASKED tokens.
                e.g. [[1, 1, 1, 1, 1, 1, 1], ...]
    -----------------
    questions: List of strings
            Each element contains a question string
    answers: List of strings
            Each element contains an asnwer string
    max_seq_len: int
            Maximum sequence length
    """
    input_ids = []
    token_type_ids = []
    att_masks = []

    for i in tqdm(range(len(questions))):
        a = questions[i]
        b = answers[i]

        # Tokenize the questions and answers, apply padding, and trim the vectors
        # to the max_seq_len
        encoded_seq = tokenizer.encode_plus(a, b, 
                                            max_length=max_seq_len, 
                                            pad_to_max_length=True, 
                                            return_token_type_ids=True,
                                            return_attention_mask = True)

        input_id = encoded_seq['input_ids']
        token_type_id = encoded_seq['token_type_ids']
        att_mask = encoded_seq['attention_mask']

        assert len(input_id) == max_seq_len, "Input id dimension incorrect!"
        assert len(token_type_id) == max_seq_len, "Token type id dimension incorrect!"
        assert len(att_mask) == max_seq_len, "Attention mask dimension incorrect!"

        input_ids.append(input_id)
        token_type_ids.append(token_type_id)
        att_masks.append(att_mask)

    return input_ids, token_type_ids, att_masks

In [0]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    # Get the column with the higher probability
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

## **Pointwise**

In [0]:
# Create train/validation data
trainset = get_sequence_df(train_set)
train_questions = trainset.question.values
train_answers = trainset.ans_cand.values
train_labels = trainset.label.values

train_input, train_type_id, train_att_mask = get_input(train_questions, train_answers, 256)

validset = get_sequence_df(valid_set)
valid_questions = validset.question.values
valid_answers = validset.ans_cand.values
valid_labels = validset.label.values

valid_input, valid_type_id, valid_att_mask = get_input(valid_questions, valid_answers, 256)

# save_pickle(path+'/pointwise-data/train_labels.pickle', train_labels)
# save_pickle(path+'/pointwise-data/valid_labels.pickle', valid_labels)

save_pickle(path+'/pointwise-data/train_input_256_large.pickle', train_input)
save_pickle(path+'/pointwise-data/valid_input_256_large.pickle', valid_input)

save_pickle(path+'/pointwise-data/train_type_id_256_large.pickle', train_type_id)
save_pickle(path+'/pointwise-data/valid_type_id_256_large.pickle', valid_att_mask)

save_pickle(path+'/pointwise-data/train_mask_256_large.pickle', train_att_mask)
save_pickle(path+'/pointwise-data/valid_mask_256_large.pickle', valid_att_mask)

100%|██████████| 298662/298662 [23:08<00:00, 215.10it/s]
100%|██████████| 33195/33195 [02:00<00:00, 275.83it/s]


In [0]:
# Load train/validation data
train_label = load_pickle(path+'/pointwise-data/train_labels.pickle')
valid_label = load_pickle(path+'/pointwise-data/valid_labels.pickle')

## bert-base tokenizer
# train_input = load_pickle(path+'/pointwise-data/train_input_256.pickle')
# valid_input = load_pickle(path+'/pointwise-data/valid_input_256.pickle')
# train_type_id = load_pickle(path+'/pointwise-data/train_type_id_256.pickle')
# valid_type_id = load_pickle(path+'/pointwise-data/valid_type_id_256.pickle')
# train_att_mask = load_pickle(path+'/pointwise-data/train_mask_256.pickle')
# valid_att_mask = load_pickle(path+'/pointwise-data/valid_mask_256.pickle')

# bert-large tokenizer
train_input = load_pickle(path+'/pointwise-data/train_input_256_large.pickle')
valid_input = load_pickle(path+'/pointwise-data/valid_input_256_large.pickle')
train_type_id = load_pickle(path+'/pointwise-data/train_type_id_256_large.pickle')
valid_type_id = load_pickle(path+'/pointwise-data/valid_type_id_256_large.pickle')
train_att_mask = load_pickle(path+'/pointwise-data/train_mask_256_large.pickle')
valid_att_mask = load_pickle(path+'/pointwise-data/valid_mask_256_large.pickle')

In [0]:
print("Number of training samples: {}".format(len(train_input)))
print("Number of validation samples: {}".format(len(valid_input)))

Number of training samples: 298662
Number of validation samples: 33195


In [0]:
print("Example of the train_input\n")
print(train_input[0])

print("\nExample of the train_type_id\n")
print(train_type_id[0])

print("\nExample of the train_att_mask\n")
print(train_att_mask[0])

Example of the train_input

[101, 2054, 2003, 2641, 1037, 2449, 10961, 2006, 1037, 2449, 4440, 1029, 102, 1996, 25760, 8606, 20246, 2000, 1996, 3395, 1012, 1999, 2236, 1996, 2190, 1045, 2064, 2360, 2003, 2115, 2449, 10961, 2089, 2022, 2139, 8566, 6593, 7028, 1012, 2021, 2009, 9041, 2006, 1996, 6214, 1998, 2054, 2009, 2003, 2017, 2215, 2000, 2139, 8566, 6593, 1012, 3604, 26457, 2040, 3604, 2185, 2013, 2188, 2006, 2449, 2089, 2139, 8566, 6593, 3141, 11727, 1010, 2164, 1996, 3465, 1997, 4285, 2037, 7688, 1010, 1996, 3465, 1997, 26859, 1998, 12278, 1998, 2060, 6623, 1998, 4072, 11727, 1012, 26457, 2024, 2641, 1523, 7118, 2185, 2013, 2188, 1524, 2065, 2037, 5704, 5478, 2068, 2000, 2022, 2185, 2013, 2188, 12381, 2936, 2084, 2019, 6623, 2154, 1521, 1055, 2147, 1998, 2027, 2342, 2000, 3637, 2030, 2717, 2000, 3113, 1996, 7670, 1997, 2037, 2147, 1012, 1996, 5025, 3465, 1997, 12278, 1998, 5043, 2389, 11727, 2089, 2022, 2139, 29510, 2030, 1996, 26980, 2089, 2224, 1037, 3115, 7954, 21447, 1998, 435

In [0]:
# train_labels = train_labels[:10000]
# train_input = train_input[:10000]
# train_type_id = train_type_id[:10000]
# train_att_mask = train_att_mask[:10000]

# valid_labels = valid_labels[:1000]
# valid_input = valid_input[:1000]
# valid_type_id = valid_type_id[:1000]
# valid_att_mask = valid_att_mask[:1000]

# print(len(train_input))
# print(len(valid_input))

In [0]:
# Convert all inputs and labels into torch tensors
train_labels = torch.tensor(train_label)
validation_labels = torch.tensor(valid_label)

train_inputs = torch.tensor(train_input)
validation_inputs = torch.tensor(valid_input)

train_type_ids = torch.tensor(train_type_id)
validation_type_ids = torch.tensor(valid_type_id)

train_masks = torch.tensor(train_att_mask)
validation_masks = torch.tensor(valid_att_mask)

In [0]:
# Create DataLoader to train in bacthes

batch_size = 32

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_type_ids, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_type_ids, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

print("Size of the DataLoader for the training set: {}".format(len(train_dataloader)))
print("Size of the DataLoader for the validation set: {}".format(len(validation_dataloader)))

Size of the DataLoader for the training set: 37333
Size of the DataLoader for the validation set: 4150


In [0]:
def train(model, train_dataloader, optimizer, scheduler):

    # Store the average loss after each epoch for plotting
    loss_values = []

    # Reset the total loss each epoch
    total_loss = 0
    train_accuracy = 0
    # Track the number of batches
    num_steps = 0

    # Set model in train mode
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(tqdm(train_dataloader)):

        # batch contains four PyTorch tensors:
        #   [0]: input ids
        #   [1]: token_type_ids
        #   [2]: attention masks
        #   [3]: labels 
        b_input_ids = batch[0].to(device)
        b_token_type_ids = batch[1].to(device)
        b_input_mask = batch[2].to(device)
        b_labels = batch[3].to(device)

        # Zero the gradients
        model.zero_grad()        

        # Forward pass
        # The model will return the loss and the logits
        outputs = model(b_input_ids, 
                    token_type_ids = b_token_type_ids, 
                    attention_mask = b_input_mask, 
                    labels = b_labels)

        loss = outputs[0]
        logits = outputs[1]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch
        tmp_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        train_accuracy += tmp_accuracy

        # Track the number of batches
        num_steps += 1

        # Accumulate the training loss over all of the batches
        total_loss += loss.item()
    
        # Perform a backward pass to calculate the gradients
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        # Update scheduler
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    acc = _accuracy/nb_eval_steps

    return avg_train_loss, acc, loss_values

In [0]:
def validate(model, validation_dataloader):

    # Evaluation mode
    model.eval()

    total_loss = 0
    eval_accuracy = 0
    nb_eval_steps = 0

    # For each batch of the validation data
    for batch in tqdm(validation_dataloader):
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from the dataloader
        b_input_ids, b_token_type_ids, b_input_masks, b_labels = batch
        
        # Don't to compute or store gradients
        with torch.no_grad():        
            outputs = model(b_input_ids, 
                            token_type_ids = b_token_type_ids, 
                            attention_mask = b_input_masks,
                            labels= b_labels)
        
        loss = outputs[0]
        logits = outputs[1]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

        total_loss += loss.item()

    acc = eval_accuracy/nb_eval_steps
    avg_loss = total_loss / len(validation_dataloader) 

    return avg_loss, acc

## **Training**

In [0]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top

# model_path = "/content/drive/My Drive/FiQA/model/fin_model"
# model = BertForSequenceClassification.from_pretrained(model_path, cache_dir=None, num_labels=2)

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", cache_dir=None, num_labels=2)
# model = BertForSequenceClassification.from_pretrained("bert-large-uncased", cache_dir=None, num_labels=2)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [0]:
optimizer = AdamW(model.parameters(), lr = 2e-6, eps = 1e-8)

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [0]:
# Lowest validation lost
best_valid_loss = float('inf')

for epoch in range(epochs):

    # Evaluate training loss
    train_loss, train_acc, loss_values = train(model, train_dataloader, optimizer, scheduler)
    # Evaluate validation loss
    valid_loss, valid_acc = validate(model, validation_dataloader)
    
    # At each epoch, if the validation loss is the best
    # if valid_loss < best_valid_loss:
    #     best_valid_loss = valid_loss
    torch.save(model.state_dict(), path + 'model/' + str(epoch+1)+'_pointwise_25.pt')

    print("\n\n Epoch {}:".format(epoch+1))
    print("\t Train Loss: {} | Train Accuracy: {}%".format(round(train_loss, 3), round(train_acc*100, 2)))
    print("\t Validation Loss: {} | Validation Accuracy: {}%\n".format(round(valid_loss, 3), round(valid_acc*100, 2)))

## **Evalulation**

In [0]:
def get_rank(model, test_set, qid_rel, max_seq_len):
    """
    Returns a dictionary - key: qid, value: list of ranked candidates
    -------------------
    model - PyTorch model
    test_set - List of lists:
            Each element is a list contraining 
            [qid, list of pos docid, list of candidate docid]
    qid_rel: Dictionary
            key: qid, value: list of relevant answer id
    max_seq_len: int
            Maximum sequence length
    """

    # Initiate empty dictionary
    qid_pred_rank = {}

    # Set model to evaluation mode
    model.eval()

    # For each element in the test set
    for i, seq in enumerate(tqdm(test_set)):
        
        # question id, list of rel answers, list of candidates
        qid, label, cands = seq[0], seq[1], seq[2]

        # Map question id to text
        q_text = qid_to_text[qid]

        # Convert list to numpy array
        cands_id = np.array(cands)

        # Empty list for the probability scores of relevancy
        scores = []

        # For each answer in the candidates
        for docid in cands:

            # Map the docid to text
            ans_text = docid_to_text[docid]

            # Create inputs for the model
            encoded_seq = tokenizer.encode_plus(q_text, ans_text, 
                                            max_length=max_seq_len, 
                                            pad_to_max_length=True, 
                                            return_token_type_ids=True,
                                            return_attention_mask = True)

            # Numericalized, padded, clipped seq with special tokens
            input_ids = torch.tensor([encoded_seq['input_ids']]).to(device)
            # Specify question seq and answer seq
            token_type_ids = torch.tensor([encoded_seq['token_type_ids']]).to(device)
            # Sepecify which position is part of the seq which is padded
            att_mask = torch.tensor([encoded_seq['attention_mask']]).to(device)

            # Don't calculate gradients
            with torch.no_grad():
            # Forward pass, calculate logit predictions for each QA pair
                outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=att_mask)

            # Get the predictions
            logits = outputs[0]

            # Apply activation function
            pred = softmax(logits, dim=1)
            # pred = torch.sigmoid(logits)

            # Move logits and labels to CPU
            pred = pred.detach().cpu().numpy()

            # Append relevant scores to list (where label = 1)
            scores.append(pred[:,1][0])

        print(scores)

        # Get the indices of the sorted similarity scores
        sorted_index = np.argsort(scores)[::-1]

        # Get the list of docid from the sorted indices
        ranked_ans = cands_id[sorted_index]

        # Dict - key: qid, value: ranked list of docids
        qid_pred_rank[qid] = ranked_ans

    return qid_pred_rank

In [0]:
toy_test_label = dict(itertools.islice(test_qid_rel.items(), 2))
toy_test = test_set[:2]
# toy_test = [[14, [398960], [84963, 14255, 398960]],
#             [68, [19183], [107584, 562777, 19183]],
#             [70, [327002], [107584, 327002, 19183]]]

In [0]:
model.load_state_dict(torch.load(path+'model/1_pairwise_25.pt'))

# qid_pred_rank = get_rank(model, test_set, test_qid_rel, max_seq_len=512)
qid_pred_rank = get_rank(model, toy_test, toy_test_label, max_seq_len=256)


  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:07<00:07,  7.39s/it][A

[0.9999527, 0.999949, 0.18187124, 0.999884, 0.000103418584, 0.99862635, 0.9999441, 0.99994326, 0.9999306, 0.999944, 0.9999107, 0.011065468, 0.99990654, 0.9999043, 0.9996063, 0.99993455, 0.999925, 0.9999478, 0.9997577, 0.9284304, 0.124180555, 0.9999361, 0.9197168, 0.9999304, 0.999887, 0.99994993, 0.9999486, 0.00038539234, 0.999936, 0.9999198, 0.98532575, 0.9999027, 0.9998845, 0.99889565, 0.99994695, 0.89547914, 0.9347405, 0.99992275, 0.004050158, 0.0004175653, 0.70191604, 0.9997336, 0.61686075, 0.9996581, 0.99994326, 0.992488, 0.9998895, 0.99988997, 0.0010666391, 0.8459604, 0.9999151, 0.9999553, 0.99987435, 0.99994063, 0.9998692, 0.9999106, 0.77538884, 0.99957675, 0.9996093, 0.9997527, 0.99994993, 0.9999403, 0.8784555, 0.99993014, 0.99838126, 0.9999491, 0.9998518, 0.9999471, 0.9999548, 0.999882, 0.99994826, 0.99995434, 0.99989235, 0.7867769, 0.99981743, 0.74969816, 0.9998274, 0.9997148, 0.00039994254, 0.99994624, 0.087119706, 0.9999229, 0.99973744, 0.99985087, 0.9978452, 0.99994147, 0.9


100%|██████████| 2/2 [00:15<00:00,  7.49s/it][A
[A

[0.9999287, 0.0072370716, 0.99994373, 0.9999317, 0.9999299, 0.99995506, 0.99957365, 0.9999552, 0.00057716575, 0.9998795, 0.9999534, 0.99996006, 0.9999405, 0.9999261, 0.99993515, 0.7964893, 0.99969685, 0.9999058, 0.9999336, 0.8458747, 0.9999441, 0.9999472, 0.99995685, 0.99995244, 0.9999547, 0.99401265, 0.9999547, 0.99992085, 0.9999304, 0.9998442, 0.99995935, 0.99789006, 0.99994314, 0.999928, 0.9999423, 0.99995923, 0.99995387, 0.9999374, 0.9999492, 0.9999448, 0.999943, 0.99992156, 0.99994683, 0.9996908, 0.99995124, 0.9999485, 0.9999329, 0.00013012807, 0.99995685, 0.0009288644, 0.99994457, 0.9998565, 0.022961553, 0.99992335, 0.99994254, 0.9998714, 0.9999542, 0.9999484, 0.9996592, 0.99994934, 0.9999589, 0.99993706, 0.9998497, 0.99993575, 0.9999453, 0.99995613, 0.9999505, 0.9998099, 0.011141944, 0.99994516, 0.9999552, 0.9999478, 0.99994814, 0.9999567, 0.99995184, 0.99995375, 0.9999583, 0.00016959306, 0.9999527, 0.9999472, 0.9999584, 0.9999287, 0.99984765, 0.9999304, 0.00031214024, 0.9999368

In [0]:
k = 10

num_q = len(test_set)

# MRR, average_ndcg, precision, rank_pos = evaluate(qid_pred_rank, test_qid_rel, k)
MRR, average_ndcg, precision, rank_pos = evaluate(qid_pred_rank, toy_test_label, k)

print("\n\nAverage nDCG@{} for {} queries: {}\n".format(k, num_q, average_ndcg))

print("MRR@{} for {} queries: {}\n".format(k, num_q, MRR))

print("Average Precision@{}: {}".format(1, precision))



Average nDCG@10 for 333 queries: 0.4510568402073561

MRR@10 for 333 queries: 0.44166666666666665

Average Precision@1: 0.4


In [0]:
save_pickle(path+'rank/rank_bert_256_full.pickle', qid_pred_rank)