In [2]:
!pip3 install trectools --no-deps
!pip3 install sarge --no-deps

Collecting trectools
  Downloading trectools-0.0.49.tar.gz (28 kB)
Building wheels for collected packages: trectools
  Building wheel for trectools (setup.py) ... done
  Created wheel for trectools: filename=trectools-0.0.49-py3-none-any.whl size=27139 sha256=050c6b04c5007a86a1acf7961858a823f9220cda585ee02520d911c0bb021d21
  Stored in directory: /root/.cache/pip/wheels/6a/cd/17/9a6b28af70445d948c97018b43b9181acd2fdd23e115ee2055
Successfully built trectools
Installing collected packages: trectools
Successfully installed trectools-0.0.49
You should consider upgrading via the '/usr/bin/python3.8 -m pip install --upgrade pip' command.
Collecting sarge
  Downloading sarge-0.1.7.post1-py2.py3-none-any.whl (18 kB)
Installing collected packages: sarge
Successfully installed sarge-0.1.7.post1
You should consider upgrading via the '/usr/bin/python3.8 -m pip install --upgrade pip' command.


In [12]:
from tira.rest_api_client import Client
import pandas as pd
from tqdm import tqdm
import ir_datasets
import json
from tqdm import tqdm
from trectools import TrecRun, TrecEval, TrecQrel

# TIRA REST client; used below to resolve the output directory of a run.
tira = Client()

# Task and team that prefix the approach name passed to tira.get_run_output.
TASK, TEAM = 'ir-benchmarks', 'ows'

# Dataset ids that are evaluated (these are also the keys of QRELS).
datasets = [
    'longeval-heldout-20230513-training',
    'longeval-long-september-20230513-training',
    'longeval-short-july-20230513-training',
    'longeval-train-20230513-training',
]


In [5]:
def load_qrels(dataset):
    """Load the relevance judgments of an ir_datasets dataset into a ``TrecQrel``.

    Args:
        dataset: Identifier handed to ``ir_datasets.load``.

    Returns:
        A ``TrecQrel`` whose ``qrels_data`` is a DataFrame built from one record
        per judgment, with the keys ``query``, ``q0``, ``docid`` and ``rel``.
    """
    judgments = []
    for qrel in ir_datasets.load(dataset).qrels_iter():
        judgments.append({
            "query": qrel.query_id,
            "q0": "0",  # constant value for every row
            "docid": qrel.doc_id,
            "rel": qrel.relevance,
        })
    judgments_frame = pd.DataFrame(judgments)

    # TrecQrel is created without arguments and its qrels_data is assigned directly.
    qrels = TrecQrel()
    qrels.qrels_data = judgments_frame

    return qrels

# Qrels keyed by the dataset id used in `datasets`; each entry is loaded from the
# paired ir_datasets identifier.
QRELS = {
    dataset_id: load_qrels(ir_datasets_id)
    for dataset_id, ir_datasets_id in (
        ('longeval-heldout-20230513-training', 'longeval/heldout'),
        ('longeval-long-september-20230513-training', 'longeval/b-long-september'),
        ('longeval-short-july-20230513-training', 'longeval/a-short-july'),
        ('longeval-train-20230513-training', 'longeval/train'),
    )
}


In [6]:
def report_effectiveness(name, run, dataset):
    """Evaluate ``run`` against the qrels registered for ``dataset``.

    Args:
        name: Label stored under the ``name`` key of the result.
        run: Run object passed to ``TrecEval``.
        dataset: Key into ``QRELS``; also stored under the ``Dataset`` key.

    Returns:
        A dict with the keys ``name``, ``Unjudged@10``, ``nDCG@10``, ``MRR`` and
        ``Dataset``. nDCG@10 and MRR are requested with ``removeUnjudged=True``.

    Raises:
        KeyError: If ``dataset`` is not a key of ``QRELS``.
    """
    evaluator = TrecEval(run, QRELS[dataset])

    unjudged_at_10 = evaluator.get_unjudged(depth=10)
    ndcg_at_10 = evaluator.get_ndcg(depth=10, removeUnjudged=True)
    mrr = evaluator.get_reciprocal_rank(removeUnjudged=True)

    return {
        'name': name,
        'Unjudged@10': unjudged_at_10,
        'nDCG@10': ndcg_at_10,
        'MRR': mrr,
        'Dataset': dataset,
    }

In [17]:
# Gather one metrics record per evaluated run, then tabulate them once at the end.
records = []

for dataset in datasets:
    # Baseline: the BM25 run whose output directory is resolved through the TIRA client.
    baseline_dir = tira.get_run_output(f'{TASK}/{TEAM}/PyTerrier-Index >> BM25', dataset)
    records.append(report_effectiveness('bm25', TrecRun(baseline_dir + '/run.txt'), dataset))

    # Fused runs read from all-results/: every wmodel x prompt x variants combination.
    for wmodel in tqdm(['BM25', 'DPH', 'DirichletLM', 'LGD', 'PL2'], desc=f'Evaluation on {dataset}'):
        for prompt in ('1', '2'):
            for variants in ('3', '5', '10'):
                fused_run = TrecRun(f'all-results/{dataset}/{wmodel}-{variants}-variants-prompt-{prompt}-fused-run.txt.gz')
                records.append(
                    report_effectiveness(f'{wmodel}-{variants}-variants-prompt-{prompt}', fused_run, dataset)
                )

df = pd.DataFrame(records)

Evaluation on longeval-heldout-20230513-training: 100%|█████████████████████████████████████████████████████████████| 5/5 [00:46<00:00,  9.32s/it]
Evaluation on longeval-long-september-20230513-training: 100%|██████████████████████████████████████████████████████| 5/5 [07:21<00:00, 88.24s/it]
Evaluation on longeval-short-july-20230513-training: 100%|██████████████████████████████████████████████████████████| 5/5 [07:29<00:00, 89.94s/it]
Evaluation on longeval-train-20230513-training: 100%|███████████████████████████████████████████████████████████████| 5/5 [05:24<00:00, 64.86s/it]


In [18]:
# Persist the evaluation table as JSON Lines: one record per line.
df.to_json('complete-evaluation.jsonl', orient='records', lines=True)

In [19]:
# List the exported file to confirm it was written and to show its size.
!ls -lh complete-evaluation.jsonl

-rw-rw-r--    1 root     users      18.6K Jun  6 09:21 complete-evaluation.jsonl
