In [28]:
from sys import modules

IN_COLAB = 'google.colab' in modules
if IN_COLAB:
    !pip install -q ir_axioms[examples] python-terrier

In [29]:
# Start/initialize PyTerrier.
from pyterrier import started, init

# Initialize only once: the guard makes this cell safe to re-run in the same
# kernel, skipping `init` when PyTerrier reports it has already been started.
if not started():
    # tqdm="auto" is forwarded to PyTerrier's init as its progress-bar setting.
    init(tqdm="auto")

In [30]:
from pyterrier.datasets import get_dataset, Dataset

# Resolve the test collection through PyTerrier's dataset registry.
# `dataset_name` is kept as a plain identifier because later cells reuse it
# (index directory name, reranker argument); the "irds:" prefix is only added
# for the lookup itself.
dataset_name = "antique/test"
dataset: Dataset = get_dataset("irds:" + dataset_name)

In [31]:
from pathlib import Path
from pyterrier.index import IterDictIndexer
from ir_axioms.backend.pyterrier import IndexRef, IndexFactory

# Load documents and build index.
# The index is cached on disk under cache/indices/<dataset name with '/' -> '-'>
# so that re-running the notebook does not re-index the corpus.
index_dir = Path(f"cache/indices/{dataset_name.replace('/', '-')}").absolute()
index_ref: IndexRef
# Check for the index properties file rather than just the directory: a bare
# (empty or partially written) directory left behind by an interrupted run
# would otherwise be mistaken for a finished index and fail on load.
# NOTE(review): assumes the default Terrier index prefix "data" — confirm.
if (index_dir / "data.properties").exists():
    index_ref = IndexRef.of(str(index_dir))
else:
    # Only the "text" field of each document is passed via `fields`.
    # NOTE(review): no `meta` argument is given here, so whether the text also
    # ends up in the meta index depends on the indexer defaults — confirm.
    indexer = IterDictIndexer(str(index_dir))
    index_ref = indexer.index(dataset.get_corpus_iter(), fields=["text"])
index = IndexFactory.of(index_ref)

In [32]:
# Fetch the queries (topics) and the relevance judgments (qrels) of the dataset;
# both are needed by the retrieval and evaluation cells below.
topics, qrels = dataset.get_topics(), dataset.get_qrels()

In [33]:
from pyterrier.batchretrieve import BatchRetrieve

# BM25 baseline retrieval, cut off at NUM_RESULTS documents per query.
# The cutoff is a named constant so the comment and the code cannot drift apart
# (the output below shows ranks 0-49, i.e. 50 documents per query).
NUM_RESULTS = 50
pipeline_bm25 = BatchRetrieve(index, wmodel="BM25", num_results=NUM_RESULTS)
# Run the baseline on all topics; the resulting frame is shown as cell output.
pipeline_bm25(topics)

Unnamed: 0,qid,docid,docno,rank,score,query
0,3990512,102622,3077638_1,0,15.887435,how can we get concentration onsomething
1,3990512,30676,3931664_0,1,15.621619,how can we get concentration onsomething
2,3990512,173781,4366141_0,2,15.395085,how can we get concentration onsomething
3,3990512,179429,1011598_10,3,15.134176,how can we get concentration onsomething
4,3990512,194913,4222212_0,4,15.134176,how can we get concentration onsomething
...,...,...,...,...,...,...
9667,1340574,228996,1357684_2,47,22.565443,why do some people only go to church on easter...
9668,1340574,352987,3604708_13,48,22.333112,why do some people only go to church on easter...
9669,1340574,167132,603435_12,49,22.245608,why do some people only go to church on easter...
9670,1971899,234068,1971899_6,0,28.234530,what is masturbat


In [34]:
from ir_axioms.axiom import ORIG, ASPECT_REG_fastText, PROX1, PROX2, PROX4, STMC2
from ir_axioms.backend.pyterrier.transformers import AxiomaticReranker

# Rerank the baseline retrieval using KwikSort with preferences from the
# combined axiom defined below.
# NOTE(review): the meaning of the `%`, `|` and `~` operators is defined by
# ir_axioms' axiom classes and is not visible in this notebook — confirm
# against the ir_axioms documentation before changing the combination.
combined_axiom = ~(
    (ASPECT_REG_fastText() % PROX1() % PROX2() % PROX4() % STMC2()) | ORIG()
)
axiomatic_reranker = AxiomaticReranker(
    combined_axiom,
    index,
    dataset_name,
    cache_dir=Path("cache/"),
    verbose=True,
)
# Chain the reranker behind the BM25 retrieval stage.
pipeline_axiom = pipeline_bm25 >> axiomatic_reranker
pipeline_axiom(topics)

Reranking query axiomatically:   0%|          | 0/198 [00:00<?, ?query/s]

Unnamed: 0,docno,rank,score,qid,docid,query
0,3077638_1,1,50,3990512,102622,how can we get concentration onsomething
1,3931664_0,2,49,3990512,30676,how can we get concentration onsomething
2,4366141_0,3,48,3990512,173781,how can we get concentration onsomething
3,2295947_1,4,47,3990512,188552,how can we get concentration onsomething
4,1011598_10,5,46,3990512,179429,how can we get concentration onsomething
...,...,...,...,...,...,...
9667,1357684_2,48,3,1340574,228996,why do some people only go to church on easter...
9668,3604708_13,49,2,1340574,352987,why do some people only go to church on easter...
9669,603435_12,50,1,1340574,167132,why do some people only go to church on easter...
9670,1971899_6,1,2,1971899,234068,what is masturbat


In [35]:
from ir_measures import nDCG, MAP
from pyterrier.pipelines import Experiment

# Compare the baseline retrieval with the reranked pipeline in an experiment.
# Both pipelines are evaluated on the same topics/qrels with nDCG at cutoffs
# 5, 10 and 20 plus MAP. The table is ordered by nDCG@10, best system first.
# Sorting is chained (no `inplace=True`) so `results` is bound exactly once.
results = Experiment(
    [pipeline_bm25, pipeline_axiom],
    topics,
    qrels,
    [nDCG @ 5, nDCG @ 10, nDCG @ 20, MAP],
    ["BM25", "BM25 + Axiomatic Reranking"],
).sort_values("nDCG@10", ascending=False)

Reranking query axiomatically:   0%|          | 0/198 [00:00<?, ?query/s]

In [36]:
# Show the experiment table; as the last expression of the cell it is rendered
# as the cell's output.
results

Unnamed: 0,name,nDCG@5,nDCG@10,nDCG@20,AP
1,BM25 + Axiomatic Reranking,0.530281,0.511388,0.478413,0.396841
0,BM25,0.529428,0.510402,0.478976,0.399066
