In [None]:
def evaluate(qid_ranked_docs, qid_rel, k):
    """
    Evaluate a ranking: computes MRR@k, average nDCG@k, and average precision@1.

    Only queries that appear in both ``qid_ranked_docs`` and ``qid_rel`` are
    scored. The MRR is nevertheless divided by the total number of queries in
    ``qid_ranked_docs``, so queries without relevance labels add nothing to
    the numerator but still count in the denominator.

    Returns:
        MRR: float
        average_ndcg: float
        avg_precision: float
        r_pos: rank positions as returned by compute_RR (the empty
            ``rank_pos`` list when no query could be scored)
    ----------
    Arguments:
        qid_ranked_docs: dictionary
            key - qid
            value - list of candidate answers, best first
        qid_rel: dictionary
            key - qid
            value - list of relevant answers
        k: int
            rank cutoff used for MRR@k and nDCG@k
    """
    cumulated_reciprocal_rank = 0
    rank_pos = []
    # Fallback so the return value is defined even when no query in
    # qid_ranked_docs has relevance labels (the loop body never runs).
    r_pos = rank_pos
    # Binary relevance (1/0) of the top-k candidates, keyed by qid
    rel_scores = {}

    for qid, cand_docs in qid_ranked_docs.items():
        # Queries without relevance labels cannot be scored
        if qid not in qid_rel:
            continue
        rel_docs = qid_rel[qid]

        # A query may have fewer than k candidates; score the ones that exist
        # and pad with 0 (non-relevant) so every entry keeps length k.
        top_k = cand_docs[:max(k, 0)]
        hits = [1 if doc in rel_docs else 0 for doc in top_k]
        hits.extend([0] * (k - len(hits)))
        rel_scores[qid] = hits

        # Accumulate the reciprocal rank and collect the rank positions
        cumulated_reciprocal_rank, r_pos = compute_RR(
            cand_docs, rel_docs, cumulated_reciprocal_rank, rank_pos, k)

    # Average the reciprocal ranks over all ranked queries
    MRR = cumulated_reciprocal_rank/len(qid_ranked_docs)
    # Average nDCG@k over the scored queries
    average_ndcg = avg_ndcg(rel_scores, k)

    # Precision@1: 1.0 when the top-ranked candidate is relevant, else 0.0
    precision_at_k = [float(score[0] == 1) for score in rel_scores.values()]
    avg_precision = mean(precision_at_k)

    return MRR, average_ndcg, avg_precision, r_pos


In [None]:
def predict(model, q_text, cands, max_seq_len):
    """Re-ranks the candidate answers for a single question.

    Each (question, answer) pair is scored independently with one forward
    pass; candidates are then ordered by descending score.

    Returns:
        ranked_ans: list of re-ranked candidate docids (best first)
        sorted_scores: list of the matching relevancy scores, rounded to
            3 decimals, in descending order
        Both lists are empty when ``cands`` is empty.
    -------------------
    Arguments:
        model - PyTorch model
        q_text - str - query
        cands - List of retrieved candidate docids
        max_seq_len - int
    """
    # Convert list to numpy array so it can be indexed by the sorted indices
    cands_id = np.array(cands)
    # Probability scores of relevancy, one per candidate, in input order
    scores = []
    # For each answer in the candidates
    for docid in cands:
        # Map the docid to text
        ans_text = docid_to_text[docid]
        # Create inputs for the model
        encoded_seq = tokenizer.encode_plus(q_text, ans_text,
                                            max_length=max_seq_len,
                                            pad_to_max_length=True,
                                            return_token_type_ids=True,
                                            return_attention_mask = True)

        # Numericalized, padded, clipped seq with special tokens
        input_ids = torch.tensor([encoded_seq['input_ids']]).to(device)
        # Specify question seq and answer seq
        token_type_ids = torch.tensor([encoded_seq['token_type_ids']]).to(device)
        # Specify which positions are real tokens and which are padding
        att_mask = torch.tensor([encoded_seq['attention_mask']]).to(device)
        # Don't calculate gradients
        with torch.no_grad():
            # Forward pass, calculate logit predictions for the QA pair
            outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=att_mask)
        # Get the predictions
        logits = outputs[0]
        # Apply activation function
        pred = softmax(logits, dim=1)
        # Move the probabilities to CPU
        pred = pred.detach().cpu().numpy()
        # Keep the score of class index 1 (the "relevant" label)
        scores.append(pred[:, 1][0])

    # Rank once, after every candidate has been scored. Doing this inside the
    # loop re-sorted on every iteration and left the results undefined when
    # there were no candidates.
    sorted_index = np.argsort(scores)[::-1]
    # Get the list of docids from the sorted indices
    ranked_ans = list(cands_id[sorted_index])
    sorted_scores = list(np.around(sorted(scores, reverse=True), decimals=3))

    return ranked_ans, sorted_scores

In [None]:
def get_rank(model, test_set, max_seq_len):
    """Builds the re-ranked candidate list for every question in the test set.

    Returns:
        qid_pred_rank: Dictionary
            key - qid
            value - list of re-ranked candidates
    -------------------
    Arguments:
        model - PyTorch model
        test_set - List of lists; each entry holds the question id at
            index 0 and the list of candidate docids at index 2
        max_seq_len - int
    """
    # Switch the model to evaluation mode before scoring anything
    model.eval()

    qid_pred_rank = {}
    for sample in tqdm(test_set):
        # Index 0 is the question id, index 2 the retrieved candidates
        question_id = sample[0]
        candidates = sample[2]
        # Only the ranked docids are kept; the scores are not needed here
        reranked = predict(model, qid_to_text[question_id], candidates, max_seq_len)[0]
        qid_pred_rank[question_id] = reranked

    return qid_pred_rank


get_trained_model() downloads a fine-tuned model. The available model_name values are:

    finbert-qa: 'bert-qa' fine-tuned on FiQA
    finbert-domain: 'finbert-domain' fine-tuned on FiQA
    finbert-task: 'finbert-task' fine-tuned on FiQA
    bert-pointwise: 'bert-base-uncased' fine-tuned on FiQA using the cross-entropy loss
    bert-pairwise: 'bert-base-uncased' fine-tuned on FiQA using a pairwise loss



In [1]:
# Name of the fine-tuned model to evaluate
model_name = 'finbert-qa'
# get_trained_model returns the checkpoint name used as the last path component
checkpoint = get_trained_model(model_name)

# Resulting layout: model/trained/<model_name>/<checkpoint>
trained_model_path = "/".join(["model/trained", model_name, checkpoint])

NameError: name 'get_trained_model' is not defined

In [None]:
print("Evaluating:\n")
# Load the fine-tuned weights. map_location remaps tensors that were saved on
# another device (e.g. a GPU checkpoint opened on a CPU-only machine) onto the
# device used for inference, instead of failing during deserialization.
# NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
model.load_state_dict(torch.load(trained_model_path, map_location=device))

# Re-rank the candidates of every test question
qid_pred_rank = get_rank(model, test_set, config['max_seq_len'])

# Rank cutoff for MRR@k and nDCG@k
k = 10
num_q = len(test_set)

# Evaluate the re-ranked candidates against the relevance labels
MRR, average_ndcg, precision, rank_pos = evaluate(qid_pred_rank, labels, k)

print(f"\n\nAverage nDCG@{k} for {num_q} queries: {average_ndcg:.3f}")
print(f"MRR@{k} for {num_q} queries: {MRR:.3f}")
print(f"Average Precision@1 for {num_q} queries: {precision:.3f}")