In [14]:
# Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt

In [15]:
# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
ensure_pyterrier_is_loaded()
tira = Client()

# The dataset: the union of the IR Anthology and the ACL Anthology
# This line creates an IRDSDataset object and registers it under the name provided as an argument.
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

# A (pre-built) PyTerrier index loaded from TIRA
index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)

In [16]:
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

# Baseline Query Expansions
bo1 = pt.rewrite.Bo1QueryExpansion(index)
kl = pt.rewrite.KLQueryExpansion(index)
rm3 = pt.rewrite.RM3(index)

# Example pipeline
pipeline_bo1 = bm25 >> bo1 >> bm25
pipeline_kl = bm25 >> kl >> bm25
pipeline_rm3 = bm25 >> rm3 >> bm25

In [17]:
run_bo1 = pipeline_bo1(pt_dataset.get_topics('text'))
run_kl = pipeline_kl(pt_dataset.get_topics('text'))
run_rm3 = pipeline_rm3(pt_dataset.get_topics('text'))

persist_and_normalize_run(run_bo1, system_name='bo1', default_output='../runs/bo1')
persist_and_normalize_run(run_kl, system_name='kl', default_output='../runs/kl')
persist_and_normalize_run(run_rm3, system_name='rm3', default_output='../runs/rm3')


19:31:14.322 [main] WARN org.terrier.querying.RM1 - Did not identify any usable candidate expansion terms from docid 125137 among 5 possibilities
19:31:14.949 [main] WARN org.terrier.querying.RM1 - Did not identify any usable candidate expansion terms from docid 116910 among 4 possibilities
The run file is normalized outside the TIRA sandbox, I will store it at "../runs/bo1".
Done. run file is stored under "../runs/bo1/run.txt".
The run file is normalized outside the TIRA sandbox, I will store it at "../runs/kl".
Done. run file is stored under "../runs/kl/run.txt".
The run file is normalized outside the TIRA sandbox, I will store it at "../runs/rm3".
Done. run file is stored under "../runs/rm3/run.txt".


In [18]:
# Some baselines that were executed in TIRA

bm25_baseline = tira.pt.from_submission('ir-benchmarks/tira-ir-starter/BM25 (tira-ir-starter-pyterrier)', pt_dataset)
sparse_cross_encoder = tira.pt.from_submission('ir-benchmarks/fschlatt/sparse-cross-encoder-4-512', pt_dataset)
rank_zephyr = tira.pt.from_submission('workshop-on-open-web-search/fschlatt/rank-zephyr', pt_dataset)

# This assumes we have execited the ../baseline-retrieval-system/baseline-retrieval-system.ipynb notebook before.
bo1_result = pt.io.read_results('../runs/bo1/run.txt')
kl_result = pt.io.read_results('../runs/kl/run.txt')
rm3_result = pt.io.read_results('../runs/rm3/run.txt')

pt.Experiment(
    [bo1_result, kl_result, rm3_result, bm25_baseline, sparse_cross_encoder, rank_zephyr],
    pt_dataset.get_topics(),
    pt_dataset.get_qrels(),
    ["ndcg_cut.10", "recip_rank", "recall_100"],
    names=["BO1", "KL", "RM3", "BM 25 (Baseline)", "Sparse Cross Encoder", "RankZephyr"]
)


There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.


Unnamed: 0,name,ndcg_cut.10,recip_rank,recall_100
0,BO1,0.37822,0.571535,0.595545
1,KL,0.379452,0.56509,0.583871
2,RM3,0.333132,0.556066,0.558675
3,BM 25 (Baseline),0.374041,0.579877,0.601333
4,Sparse Cross Encoder,0.36646,0.61298,0.601333
5,RankZephyr,0.34707,0.568413,0.601333
