In [1]:
# Start/initialize PyTerrier.
from pyterrier import started, init

# Guard the start-up so that re-running this cell does not call init() again
# when PyTerrier reports that it has already been started.
terrier_is_up = started()
if not terrier_is_up:
    init()

PyTerrier 0.7.2 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)


No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [2]:
# Import everything.
from pathlib import Path

from pyterrier import IndexRef
from pyterrier.batchretrieve import BatchRetrieve
from pyterrier.datasets import get_dataset, Dataset
from pyterrier.index import IterDictIndexer
from pyterrier.pipelines import Experiment
from pyterrier.text import get_text

from ir_axioms.axiom.term_frequency import TFC1
from ir_axioms.backend.pyterrier.transformers import AxiomaticReranker

In [3]:
# Identifier of the dataset used throughout this notebook.
DATASET_ID = "irds:antique/test"

dataset: Dataset = get_dataset(DATASET_ID)
dataset

IRDSDataset('antique/test')

In [4]:
# Location of the index, given relative to the notebook's working directory
# and then made absolute.
relative_index_dir = Path('./data/indices/antique')
index_dir = relative_index_dir.absolute()

In [5]:
# Build the index once and reuse it on later runs.
#
# The reuse check looks for the index's `data.properties` file rather than
# for the bare directory: the directory can exist without holding a complete
# index (e.g. created by hand, or left behind by an interrupted indexing run).
# Checking only the directory would then skip indexing forever and hand an
# unusable reference to the retrieval cells below.
index_ref: IndexRef
index_properties = index_dir / "data.properties"
if index_properties.exists():
    # A finished index is present: just reference it.
    index_ref = IndexRef.of(str(index_dir))
else:
    # No finished index yet: index the corpus' `text` field.
    indexer = IterDictIndexer(str(index_dir))
    index_ref = indexer.index(dataset.get_corpus_iter(), fields=['text'])

In [6]:
# Topics (queries) of the dataset; they are fed to every pipeline below and to
# the final experiment.
topics = dataset.get_topics()

In [7]:
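# Disabled (kept for reference): original run 'RmitLm' from TREC 18, cut off at 10 documents per query.
# To re-enable, first import `get_transformer` and `read_results` (they are not imported in the import cell)
# and make sure the run file below is reachable.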
# pipeline_run = get_transformer(read_results(
#     "/mnt/ceph/storage/data-in-progress/data-research/web-search/web-search-trec/trec-system-runs/trec18/web.adhoc/input.RmitLm.gz"
# )) % 10
# pipeline_run(topics)

In [8]:
# BM25 baseline: retrieve from the index first, then apply the `% 10` cutoff
# so that only 10 documents per query are kept.
retrieve_bm25 = BatchRetrieve(index_ref, wmodel='BM25')
pipeline_bm25 = retrieve_bm25 % 10
pipeline_bm25(topics)

Unnamed: 0,qid,docid,docno,rank,score,query
0,3990512,102622,3077638_1,0,15.887435,how can we get concentration onsomething
1,3990512,30676,3931664_0,1,15.621619,how can we get concentration onsomething
2,3990512,173781,4366141_0,2,15.395085,how can we get concentration onsomething
3,3990512,179429,1011598_10,3,15.134176,how can we get concentration onsomething
4,3990512,194913,4222212_0,4,15.134176,how can we get concentration onsomething
...,...,...,...,...,...,...
187638,1340574,87627,1972970_3,7,29.386170,why do some people only go to church on easter...
187639,1340574,357982,2980015_23,8,29.323234,why do some people only go to church on easter...
187640,1340574,116656,2036141_7,9,29.157311,why do some people only go to church on easter...
188631,1971899,234068,1971899_6,0,28.234530,what is masturbat


In [9]:
# Axiomatic re-ranking of the BM25 results with the TFC1 axiom.
# The stages are composed left to right: baseline retrieval, then the
# `get_text` stage for the "text" field, then the re-ranker itself.
add_text = get_text(dataset, "text")
rerank_tfc1 = AxiomaticReranker(TFC1(), index_ref)
pipeline_axiom = pipeline_bm25 >> add_text >> rerank_tfc1
pipeline_axiom(topics)

Reranking axiomatically: 100%|██████████| 198/198 [00:02<00:00, 96.28 topics/s] 


Unnamed: 0,docno,rank,score,qid,docid,query,text
0,3186954_1,1,10,100653,237982,how do i go about getting copies of letters of...,San Diego has much better weather.
1,221513_0,2,9,100653,61554,how do i go about getting copies of letters of...,San Diego doesn't have a supreme court. The su...
2,2335842_1,3,8,100653,49847,how do i go about getting copies of letters of...,"Call a freight forwarder, preferrably in your ..."
3,1948316_0,4,7,100653,151703,how do i go about getting copies of letters of...,San Diego! I told yer sis about this one.
4,1000063_0,5,6,100653,403508,how do i go about getting copies of letters of...,How would you like to have San Diego underwate...
...,...,...,...,...,...,...,...
1947,2144392_0,6,5,953489,45292,why do i have to dumb myself down all the time...,IM DUMB IM DUMB IM DUMB!!!!!!!!!!!!!!!! hehe :...
1948,3528380_5,7,4,953489,116709,why do i have to dumb myself down all the time...,I am a Democrat and am not afraid of free spee...
1949,1378072_14,8,3,953489,388739,why do i have to dumb myself down all the time...,Because......... I am answering dumb questions...
1950,3577456_1,9,2,953489,275766,why do i have to dumb myself down all the time...,libs are dumb i bet ima get thumbs down for th...


In [10]:
# Relevance judgments of the dataset, used as ground truth by the experiment
# below.
qrels = dataset.get_qrels()

In [11]:
# Compare the BM25 baseline against its axiomatically re-ranked variant and
# show the best system (by nDCG@10) first.
# NOTE(review): both pipelines are cut off at 10 documents per query, so
# 'ndcg_cut_20' is computed on at most 10 results per query. Confirm that this
# is intended.
measures = ['ndcg_cut_10', 'ndcg_cut_20', 'recip_rank', 'P.10', 'P.5']
comparison = Experiment(
    [pipeline_bm25, pipeline_axiom],
    topics,
    qrels,
    measures,
    names=["bm25", "axiom"],
)
comparison.sort_values('ndcg_cut_10', ascending=False)

Reranking axiomatically: 100%|██████████| 198/198 [00:01<00:00, 109.24 topics/s]


Unnamed: 0,name,ndcg_cut_10,ndcg_cut_20,recip_rank,P.10,P.5
1,axiom,0.51025,0.380903,0.937042,0.7455,0.827
0,bm25,0.509908,0.380645,0.934708,0.7455,0.836
