# Leaderboards for all 32 Benchmarks in the IR Experiment Platform


### Import Libraries

In [1]:
from tira.rest_api_client import Client
from tqdm import tqdm
import json
from trectools import TrecQrel, TrecRun, TrecEval
from statistics import mean
import os

# Client for the TIRA REST API. The cells below use it to list submissions
# (tira.submissions), download run archives (tira.download_zip_to_cache_directory),
# and schedule software executions (tira.run_software).
tira = Client()

# We organize the pilot benchmarks in the IR Experiment platform inside one "virtual pseudo shared task"
# TASK is the identifier of that task; it is passed to tira.submissions(...) for every dataset.
TASK = 'ir-benchmarks'


### Create the Table


In [18]:
import json
from statistics import mean, median

def _read_json(path):
    """Parse the JSON file at *path*, closing the handle as soon as parsing is done."""
    with open(path) as json_file:
        return json.load(json_file)


# Corpus name -> collection of benchmark ids that belong to the corpus.
corpus_to_benchmarks = _read_json('corpus-to-benchmark.json')
# Benchmark id -> approach name -> evaluation dict keyed by measure (or null/None when
# no run was available for that pair).
all_evaluations = _read_json('all_evaluations.json')
# Group name (e.g. 'Lexical', 'Bi-Encoder', 'PyGaggle') -> software names in that group.
types_of_retrieval_softwares = _read_json('type-of-retrieval-softwares.json')

# Filled as a side effect of format_s: approach label -> corpus -> aggregated score.
# format_avg reads it to build the "Avg." row of the table.
scores_per_approach = {}

# Per-corpus threshold: format_s prints a score in bold when it is >= this value.
# NOTE(review): the values look like hand-entered best scores per corpus (rounded);
# their origin is not visible in this notebook - confirm before changing them.
best_scores = {
    'Antique': 0.54,
    'Args.me': 0.56,
    'CORD-19': 0.695,
    'ClueWeb09': 0.23,
    'ClueWeb12': 0.36,
    'Cranfield': 0.011,
    'Disks4+5': 0.56,
    'Gov': 0.28,
    'Gov2': 0.50,
    'MARCO': 0.73,
    'Medline': 0.41,
    'NFCorpus': 0.31,
    'Vaswani': 0.47,
    'WaPo': 0.49
}

def aggregate_score_for_corpus(measure, corpus, approach):
    """Average one measure of one approach over all benchmarks of a corpus.

    Args:
        measure: Key of the measure inside an evaluation dict (e.g. 'ndcg@10').
        corpus: Key into ``corpus_to_benchmarks``.
        approach: Software name as used in ``all_evaluations``.

    Returns:
        A tuple ``(score, included, expected)``: the mean over the benchmarks that
        contributed a score (``None`` when none did), the number of contributing
        benchmarks, and ``len(corpus_to_benchmarks[corpus])``.

    NOTE(review): a falsy score (e.g. exactly 0) is skipped just like a missing
    evaluation - confirm that this is intended.
    """
    benchmarks = corpus_to_benchmarks[corpus]
    collected = []

    # Duplicated benchmark ids are visited once, but `expected` below still counts them.
    for benchmark in set(benchmarks):
        evaluation = all_evaluations[benchmark].get(approach)
        if evaluation and evaluation[measure]:
            collected.append(evaluation[measure])

    if not collected:
        return (None, 0, len(benchmarks))

    return (mean(collected), len(collected), len(benchmarks))

def format_s(i, corpus, approach):
    """Render one aggregated score as a LaTeX table cell and record it for the average row.

    Args:
        i: Triple ``(scores, included, expected)``. ``scores`` is a number, ``None``,
            or a list of numbers; for a list, ``included`` and ``expected`` must be
            lists as well and the triple is collapsed to (mean, sum, sum) first.
        corpus: Corpus the score belongs to; also the key into ``best_scores``.
        approach: Label under which the score is stored in ``scores_per_approach``.

    Returns:
        ``'---'`` when there is no score; the score with two decimals wrapped in
        ``{\\color{red} ... }`` when ``included != expected``; wrapped in
        ``\\textbf{...}`` when it reaches ``best_scores[corpus]``; plain otherwise.

    Raises:
        AssertionError: if a score for (approach, corpus) was already recorded.
        KeyError: if the score is complete and ``corpus`` is not in ``best_scores``.
    """
    scores, included, expected = i
    if isinstance(scores, list):
        # The original called an undefined name `f` here (NameError); collapsing the
        # lists and formatting the resulting single score is the evident intent.
        return format_s((mean(scores), sum(included), sum(expected)), corpus, approach)

    if scores is None:
        return '---'

    recorded = scores_per_approach.setdefault(approach, {})

    # Each (approach, corpus) cell must be rendered exactly once, otherwise the
    # averages computed from scores_per_approach would be ambiguous.
    assert corpus not in recorded

    recorded[corpus] = scores

    ret = '{0:.2f}'.format(scores)

    if included != expected:
        # Not all expected evaluations contributed: flag the cell in red.
        return '{\\color{red} ' + ret + ' }'
    if scores >= best_scores[corpus]:
        return '\\textbf{' + ret + '}'
    return ret

def report_best_median_worst(corpus, measure, group):
    """Render the best, median, and worst score of a software group as three table cells.

    Args:
        corpus: Key into ``corpus_to_benchmarks``.
        measure: Key of the measure inside an evaluation dict (e.g. 'ndcg@10').
        group: Key into ``types_of_retrieval_softwares``; also the prefix of the
            labels ``<group>-best``, ``<group>-median`` and ``<group>-worst`` under
            which format_s records the three scores.

    Returns:
        Three cells joined by ``' & '``. When no software of the group has a score
        for the corpus, the cells are ``'---'`` (the original raised ValueError from
        ``max([])`` in that case) and nothing is recorded.
    """
    scores = []
    included = 0
    expected = 0
    incomplete = False

    for software in types_of_retrieval_softwares[group]:
        aggregated = aggregate_score_for_corpus(measure, corpus, software)
        if aggregated is None or aggregated[0] is None:
            # A software without any score makes the whole group incomplete.
            incomplete = True
            continue

        scores.append(aggregated[0])
        included += aggregated[1]
        expected += aggregated[2]

    if not scores:
        return ' & '.join(['---'] * 3)

    if incomplete or included != expected:
        # Any pair of unequal counts makes format_s flag the cells in red.
        included, expected = 1, 2

    # format_s unpacks (score, included, expected); the original passed the last two
    # swapped, which was harmless only because they are compared for equality.
    return ' & '.join(
        format_s((pick(scores), included, expected), corpus, f'{group}-{label}')
        for label, pick in (('best', max), ('median', median), ('worst', min))
    )

def table_line(corpus, measure):
    """Build the LaTeX table row for one corpus.

    Args:
        corpus: Key into ``corpus_to_benchmarks``; printed as the first cell.
        measure: Key of the measure inside an evaluation dict (e.g. 'ndcg@10').

    Returns:
        The row as a string: cells joined by ``' & '`` and terminated by
        ``' \\\\'`` plus a newline. Rendering the cells records the scores in
        ``scores_per_approach`` (via format_s).
    """
    def cell(software, label):
        # One cell: aggregate the software's scores on this corpus and format them.
        return format_s(aggregate_score_for_corpus(measure, corpus, software), corpus, label)

    cells = [corpus]

    # ChatNoir is only reported for corpora whose name contains 'clueweb'; other
    # rows get a placeholder (the leading space keeps the output byte-identical).
    if 'clueweb' in corpus.lower():
        cells.append(cell('ChatNoir', 'ChatNoir'))
    else:
        cells.append(' ---')

    # The order of the calls below is the column order of the table.
    cells.append(cell('BM25 Re-Rank (tira-ir-starter-pyterrier)', 'BM25'))
    cells.append(report_best_median_worst(corpus, measure, 'Lexical'))

    cells.append(cell('ColBERT Re-Rank (tira-ir-starter-pyterrier)', 'ColBERT'))

    cells.append(cell('TASB msmarco-distilbert-base-cos (tira-ir-starter-beir)', 'TASB'))
    cells.append(report_best_median_worst(corpus, measure, 'Bi-Encoder'))

    cells.append(cell("DuoT5 base-10k-ms-marco Top-25 (tira-ir-starter-pyterrier)", 'DT5-base'))
    cells.append(cell("DuoT5 Top-25 (tira-ir-starter-pyterrier)", 'DT5'))
    cells.append(cell("DuoT5 3b-ms-marco Top-25 (tira-ir-starter-pyterrier)", 'DT5-3b'))

    # NOTE(review): 'gygaggle' may be a typo for 'pygaggle', but it must match the
    # software name stored in all_evaluations - verify before changing it.
    cells.append(cell('MonoT5 Base (tira-ir-starter-gygaggle)', 'MonoT5'))
    cells.append(report_best_median_worst(corpus, measure, 'PyGaggle'))

    return ' & '.join(cells) + ' \\\\\n'

def format_avg(approach):
    """Return the mean of all per-corpus scores recorded for *approach*, with two decimals.

    Reads ``scores_per_approach[approach]``, so the table rows must have been
    rendered before; raises KeyError when nothing was recorded for the label.
    """
    per_corpus_scores = scores_per_approach[approach]
    average = mean(per_corpus_scores.values())
    return f'{average:.2f}'

def create_table(measure):
    """Assemble the complete LaTeX ``table*`` with one row per corpus and an average row.

    Args:
        measure: Key of the measure inside an evaluation dict; passed to table_line
            for every corpus. Note that the caption text always says nDCG@10.

    Returns:
        The LaTeX source of the table as a single string.
    """
    header = """\\begin{table*}
\\centering
\\small
\\renewcommand{\\arraystretch}{0.8}%
\\setlength{\\tabcolsep}{3.2pt}%
\\caption{The effectiveness of the 50~retrieval models on the 31~benchmarks (Touch{\\'e}~23 is excluded as this shared task is still ongoing). We report the nDCG@10 scores for selected models (BM25, ColBERT, TAS-B, monoT5, and three duoT5 variants) and the best, median, and worst submission for the groups of the 20~lexical models and the 17~bi-encoder models. For corpora that have multiple benchmarks, we report the macro average.}
\\label{table-retrieval-effectiveness}
\\vspace{-2ex}
\\begin{tabular}{@{}lrlcllllllllllllll@{}}
\\toprule
\\bfseries Corpus & \\bfseries ChatNoir & \\multicolumn{4}{@{}c@{}}{\\bfseries Lexical} & \\bfseries Late Int. & \\multicolumn{4}{@{}c@{}}{\\bfseries Bi-Encoder} & \\multicolumn{3}{@{}c@{}}{\\bfseries duoT5} & \\multicolumn{4}{@{}c@{}}{\\bfseries PyGaggle}\\\\

\\cmidrule(r@{\\tabcolsep}){2-2}
\\cmidrule(r@{\\tabcolsep}){3-6}
\\cmidrule(r@{\\tabcolsep}){7-7}
\\cmidrule(r@{\\tabcolsep}){8-11}
\\cmidrule(r@{\\tabcolsep}){12-14}
\\cmidrule{15-18}
&  & BM25 & Best & Median & Worst & ColBERT & TAS-B & Best & Median & Worst & Base & Large & 3b & MonoT5 & Best & Median & Worst \\\\

\\midrule

"""

    # The rows have to be rendered first: table_line fills scores_per_approach,
    # which format_avg reads for the average row below.
    rows = "\n\n".join(table_line(corpus, measure) for corpus in sorted(corpus_to_benchmarks))

    # Labels in column order; the ChatNoir column has no average (empty cell).
    averaged_labels = [
        'BM25', 'Lexical-best', 'Lexical-median', 'Lexical-worst',
        'ColBERT',
        'TASB', 'Bi-Encoder-best', 'Bi-Encoder-median', 'Bi-Encoder-worst',
        'DT5-base', 'DT5', 'DT5-3b',
        'MonoT5', 'PyGaggle-best', 'PyGaggle-median', 'PyGaggle-worst',
    ]
    averages = ' & '.join(format_avg(label) for label in averaged_labels)

    average_row_start = "\n\n\\midrule\n\nAvg. &  & "
    footer = "\\\\\n\n\\bottomrule\n\\end{tabular}\n\\end{table*}\n \n"

    return header + rows + average_row_start + averages + footer

# Print the LaTeX table built from the 'ndcg@10' scores.
print(create_table('ndcg@10'))

\begin{table*}
\centering
\small
\renewcommand{\arraystretch}{0.8}%
\setlength{\tabcolsep}{3.2pt}%
\caption{The effectiveness of the 50~retrieval models on the 31~benchmarks (Touch{\'e}~23 is excluded as this shared task is still ongoing). We report the nDCG@10 scores for selected models (BM25, ColBERT, TAS-B, monoT5, and three duoT5 variants) and the best, median, and worst submission for the groups of the 20~lexical models and the 17~bi-encoder models. For corpora that have multiple benchmarks, we report the macro average.}
\label{table-retrieval-effectiveness}
\vspace{-2ex}
\begin{tabular}{@{}lrlcllllllllllllll@{}}
\toprule
\bfseries Corpus & \bfseries ChatNoir & \multicolumn{4}{@{}c@{}}{\bfseries Lexical} & \bfseries Late Int. & \multicolumn{4}{@{}c@{}}{\bfseries Bi-Encoder} & \multicolumn{3}{@{}c@{}}{\bfseries duoT5} & \multicolumn{4}{@{}c@{}}{\bfseries PyGaggle}\\

\cmidrule(r@{\tabcolsep}){2-2}
\cmidrule(r@{\tabcolsep}){3-6}
\cmidrule(r@{\tabcolsep}){7-7}
\cmidrule(r@{\tabcolsep

In [7]:
# Inspect which approach labels format_s recorded while the table was built.
scores_per_approach.keys()

dict_keys(['BM25', 'Lexical-best', 'Lexical-median', 'Lexical-worst', 'ColBERT', 'TASB', 'Bi-Encoder-best', 'Bi-Encoder-median', 'Bi-Encoder-worst', 'DT5-base', 'DT5', 'DT5-3b', 'MonoT5', 'PyGaggle-best', 'PyGaggle-median', 'PyGaggle-worst', 'ChatNoir'])

### Download all submissions

In [3]:
import shutil

# dry_run=True only collects the paths of existing run files instead of parsing them;
# dry_run=False parses every run file with TrecRun.
dry_run = False
dataset_to_submissions = {}

# Parse each JSON input once and close the file handles right away.
with open('benchmarks-in-pilot-study.json') as benchmarks_file:
    benchmark_ids = list(json.load(benchmarks_file).keys())
with open('retrieval-softwares-in-pilot-study.json') as softwares_file:
    approach_names = list(json.load(softwares_file).keys())

for dataset in tqdm(benchmark_ids, 'Load all Submissions'):
    dataset_to_submissions[dataset] = tira.submissions(TASK, dataset)

qrels = {}

for dataset in tqdm(benchmark_ids, 'Load all Qrels'):
    qrels[dataset] = TrecQrel(f'../data/qrels/{dataset}-qrels.txt')

# All (dataset, approach) pairs, grouped by approach.
evaluations = [(i, j) for j in approach_names for i in dataset_to_submissions.keys()]
# (approach, dataset) -> parsed runs (or run-file paths in dry-run mode).
# Note that the key order is swapped relative to the tuples in `evaluations`.
evaluations_with_parsed_runs = {}

for dataset, approach in tqdm(evaluations):
    parsed_runs = evaluations_with_parsed_runs.setdefault((approach, dataset), [])

    # Presumably a pandas DataFrame (it is filtered with boolean masks and iterated
    # with .iterrows()); `== False` is kept because it compares element-wise.
    submissions = dataset_to_submissions[dataset]
    run_submissions = submissions[(submissions['software'] == approach) & (submissions['is_evaluation'] == False)]  # noqa: E712

    for _, submission in run_submissions.iterrows():
        src_dir = tira.download_zip_to_cache_directory(submission['task'], submission['dataset'], submission['team'], submission['run_id'])

        if dry_run:
            # A cached download without run.txt is removed together with its parent
            # directory (everything before '/output').
            if os.path.isdir(src_dir.split('/output')[0]) and not os.path.isfile(src_dir + '/run.txt'):
                src_dir = src_dir.split('/output')[0]
                # Replaces the IPython shell-out `!rm -rf {src_dir}`: no shell, no
                # quoting problems with unusual paths, and valid plain Python.
                shutil.rmtree(src_dir, ignore_errors=True)

            if os.path.isfile(src_dir + '/run.txt'):
                parsed_runs.append(src_dir + '/run.txt')
        else:
            try:
                run = TrecRun(src_dir + '/run.txt')
            except Exception:
                # Best effort, as before: an unreadable run is skipped and its cached
                # download is deleted (presumably so that it is fetched again later).
                # Unlike the former bare `except:`, Ctrl-C still interrupts the loop.
                shutil.rmtree(src_dir, ignore_errors=True)
            else:
                parsed_runs.append(run)


Load all Submissions: 100%|███████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:22<00:00,  1.40it/s]
Load all Qrels: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:02<00:00, 12.07it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1581/1581 [17:39<00:00,  1.49it/s]


In [4]:
def evaluate_run(approach, dataset):
    """Evaluate the first parsed run of *approach* on *dataset* against the dataset's qrels.

    Returns:
        ``None`` when no parsed run exists for the pair; otherwise a dict with the
        keys 'approach', 'dataset', 'ndcg@10', 'precision@10' and 'unjudged@10'
        (all three measures at depth 10).
    """
    parsed_runs = evaluations_with_parsed_runs.get((approach, dataset))
    if not parsed_runs:
        return None

    # Only the first parsed run is evaluated, even if several were collected.
    trec_eval = TrecEval(parsed_runs[0], qrels[dataset])

    result = {'approach': approach, 'dataset': dataset}
    result['ndcg@10'] = trec_eval.get_ndcg(depth=10)
    result['precision@10'] = trec_eval.get_precision(depth=10)
    result['unjudged@10'] = trec_eval.get_unjudged(depth=10)
    return result

# Evaluate every (dataset, approach) pair. Pairs without a parsed run are stored as
# None (null in the JSON file) so that the table code can tell them apart.
all_evaluations = {}
for dataset, approach in tqdm(evaluations):
    all_evaluations.setdefault(dataset, {})[approach] = evaluate_run(approach, dataset)

# Write through a context manager so the file is flushed and closed deterministically;
# the previous `json.dump(..., open(..., 'w'))` never closed the handle explicitly.
with open('all_evaluations.json', 'w') as evaluations_file:
    json.dump(all_evaluations, evaluations_file)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1581/1581 [09:12<00:00,  2.86it/s]


In [7]:
# NOTE(review): `df` is not used in the visible cells; kept in case a later cell reads it.
df = {}

# (approach, dataset) pairs for which no run could be collected in the download cell.
tmp_to_execute = [pair for pair, runs in evaluations_with_parsed_runs.items() if not runs]
# Software name -> resource class; the file handle is closed right after parsing.
with open('retrieval-softwares-in-pilot-study.json') as softwares_file:
    resources = json.load(softwares_file)
to_execute = []

for approach, dataset in tqdm(tmp_to_execute):
    # Earlier variant of the filter, kept for reference:
    #if resources[i[0]] == 'small-resources-gpu' and (i[0] not in {'senior-platform', 'claret-fortress', 'ectilinear-credits'}):
    if resources[approach] != 'small-resources-gpu':
        continue

    # NOTE(review): 244/242 look like ids of the first-stage runs to re-rank (242 for
    # the ClueWeb datasets) - confirm against the TIRA task configuration.
    prefix = 'docker-id-244-on-' if 'clueweb' not in dataset else 'docker-id-242-on-'

    to_execute.append({
        'approach': f'ir-benchmarks/tira-ir-starter/{approach}',
        'dataset': dataset,
        'rerank_dataset': prefix + dataset,
        'resources': 'small-resources-gpu',
    })


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 23/23 [00:00<00:00, 221259.16it/s]


In [9]:
# Only executions whose approach path contains one of these names are scheduled.
softwares_to_run = ('rectilinear-credits', 'DuoT5 base-10k-ms-marco Top-25 (tira-ir-starter-pyterrier)')

for i in tqdm(to_execute):
    if not any(name in i['approach'] for name in softwares_to_run):
        continue

    try:
        tira.run_software(approach=i['approach'], dataset=i['dataset'], rerank_dataset=i['rerank_dataset'], resources=i['resources'])
    except Exception as error:
        # Still best effort (one failure must not stop the remaining executions), but
        # the failure is reported instead of being dropped by a bare `except: pass`.
        print(f"Could not schedule {i['approach']} on {i['dataset']}: {error!r}")

0it [00:00, ?it/s]
