Imports:

In [1]:
import json
import math
import numpy as np
import os

Parse the documents:

In [2]:
def parse_qrel(doc):
    rels = {}
    with open(doc, 'r') as f:
        for line in f.readlines():
            query, iteration, document, relevancy = line.split()
            if int(query) not in rels:
                rels[int(query)] = {document: int(relevancy)}
            else:
                rels[int(query)][document] = int(relevancy)
    return rels

def parse_results(doc, rels):
    with open(doc, 'r') as f:
        current_query, _, document, rank, _, _ = next(f).split()
        rank_label_list = [(int(rank), rels[int(current_query)][document])]
        sorted_labels = []
        for line in f:
            query, _, document, rank, _, _ = line.split()
            if int(query) != int(current_query):
                sorted_labels = [x[1] for x in sorted(rank_label_list)]
                yield np.array(sorted_labels, dtype=np.int32)
                
                current_query = query
                rank_label_list = []
                
            rank_label_list.append((int(rank), rels[int(current_query)][document]))
    sorted_labels = [x[1] for x in sorted(rank_label_list)]
    return np.array(sorted_labels, dtype=np.int32)

Evaluation Metrics:

In [3]:
def evaluate(qrl, k_sizes=[1, 5, 10, 25]):
    qrl = [i for i in qrl]
    results = {}
    for k in k_sizes:
        results['p'+str(k)] = np.mean([precision(i, k) for i in qrl])
        results['r'+str(k)] = np.mean([recall(i, k) for i in qrl])
        results['f'+str(k)] = np.mean([F_score(i, k) for i in qrl])
        results['d'+str(k)] = np.mean([DCG(i, k) for i in qrl])
        results['n'+str(k)] = np.mean([NDCG(i, k) for i in qrl])
    results['map'] = np.mean([MAP(i) for i in qrl])
    results['mrr'] = np.mean([MRR(i) for i in qrl])

    return results

def precision(query_relevancy_labels, k):
    score = 0
    for i, relevance in enumerate(query_relevancy_labels, 1):
        if i <= k: score += relevance
    if i<k: score += 0*(k-i)
    return score/k

def recall(query_relevancy_labels, k):
    score = 0
    doc = 0
    for i, relevance in enumerate(query_relevancy_labels, 1):
        if i <= k: score += relevance
        doc += relevance
    
    if doc == 0: return 0
    if i<k: score += 0*(k-i)
    
    return score/doc

def F_score(query_relevancy_labels, k):
    p = precision(query_relevancy_labels, k)
    r = recall(query_relevancy_labels, k)
    
    if p == 0 and r == 0: return 0
    return (2*p*r)/(p+r)

def DCG(query_relevancy_labels, k):
    # Use log with base 2
    score = 0
    for i, relevance in enumerate(query_relevancy_labels, 1):
        if i <= k: score += ((2**relevance)-1)/(np.log2(1+i))
    return score

def NDCG(query_relevancy_labels, k):
    score = DCG(query_relevancy_labels, k)
    
    max_qrl = [i for i in query_relevancy_labels if i!=0]
    max_score = DCG(max_qrl, k)
    
    if max_score == 0: return 0
    return score/max_score

def MAP(query_relevancy_labels):
    score = 0
    doc = 0
    for i, relevance in enumerate(query_relevancy_labels, 1):
        score += relevance*precision(query_relevancy_labels, i)
        doc += relevance
    if doc == 0: return 0
    return score/doc

def MRR(query_relevancy_labels):
    score = 0
    for i, relevance in enumerate(query_relevancy_labels, 1):
        if relevance == 1 and (i/relevance < score or score == 0):
            score = i/relevance
    if score == 0: return 0
    return 1/score

In [5]:
def evaluate_all_results(res_dir='./output/results', qrel_doc='../test_documents/qrels.robust2004.txt', k_sizes=[1, 5, 10, 25]):
    rels = parse_qrel(qrel_doc)
    
    for doc in os.scandir(res_dir):
        res_doc = os.path.join(res_dir, doc.name)
        sorted_labels = [i for i in parse_results(res_doc, rels)]
        metrics =  evaluate(sorted_labels, k_sizes)
        return metrics
    
evaluate_all_results()

KeyError: 'FBIS4-44962'