In [1]:
from sys import modules

IN_COLAB = 'google.colab' in modules
if IN_COLAB:
    !pip install -q ir_axioms python-terrier

In [2]:
# Bring up PyTerrier, unless this kernel has already started it.
from pyterrier import init, started

if not started():
    # Request notebook-style tqdm progress bars.
    init(tqdm="notebook")

PyTerrier 0.8.0 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [3]:
from pyterrier.datasets import get_dataset, Dataset

# Pick the evaluation dataset; the name is reused further down for the index
# directory and handed to the axiomatic transformers.
dataset_name = "antique/test"
dataset: Dataset = get_dataset(
    f"irds:{dataset_name}"
)

In [4]:
from pathlib import Path
from pyterrier.index import IterDictIndexer
from ir_axioms.backend.pyterrier import IndexRef, IndexFactory

# Build the index for the corpus once, then reuse it from disk on later runs.
index_dir = Path(f"./data/indices/{dataset_name}").absolute()
if not index_dir.exists():
    # No index directory yet: index the corpus iterator, using its "text" field.
    # NOTE(review): only `fields=["text"]` is passed; if 'text' must also be
    # stored in the meta index, a `meta=` argument is presumably needed — confirm.
    indexer = IterDictIndexer(str(index_dir))
    index_ref: IndexRef = indexer.index(
        dataset.get_corpus_iter(),
        fields=["text"],
    )
else:
    # Index directory already present: just reference it.
    index_ref = IndexRef.of(str(index_dir))
index = IndexFactory.of(index_ref)

In [5]:
# Fetch the dataset's topics (queries) and qrels (relevance judgments);
# both are reused by the oracle axiom and by the final experiment.
topics, qrels = dataset.get_topics(), dataset.get_qrels()

In [6]:
from pyterrier.batchretrieve import BatchRetrieve

# First-stage ranker: BM25 over the index, keeping the top 10 documents per query.
pipeline_bm25 = BatchRetrieve(
    index,
    wmodel="BM25",
    num_results=10,
)
# Run it on all topics to inspect the baseline result list.
pipeline_bm25(topics)

Unnamed: 0,qid,docid,docno,rank,score,query
0,3990512,102622,3077638_1,0,15.887435,how can we get concentration onsomething
1,3990512,30676,3931664_0,1,15.621619,how can we get concentration onsomething
2,3990512,173781,4366141_0,2,15.395085,how can we get concentration onsomething
3,3990512,179429,1011598_10,3,15.134176,how can we get concentration onsomething
4,3990512,194913,4222212_0,4,15.134176,how can we get concentration onsomething
...,...,...,...,...,...,...
1947,1340574,87627,1972970_3,7,29.386170,why do some people only go to church on easter...
1948,1340574,357982,2980015_23,8,29.323234,why do some people only go to church on easter...
1949,1340574,116656,2036141_7,9,29.157311,why do some people only go to church on easter...
1950,1971899,234068,1971899_6,0,28.234530,what is masturbat


In [7]:
from ir_axioms.backend.pyterrier.axiom import OracleAxiom
from ir_axioms.axiom import TFC1, LNC1, OriginalAxiom

# Instantiate the axioms used for the preference table and the re-ranker below.
tfc1_axiom = TFC1()  # Term frequency constraint: prefers documents with more query term occurrences.
lnc1_axiom = LNC1()  # Length normalization constraint: penalizes extra non-query terms in a document.
original_axiom = OriginalAxiom()  # Keeps the order of the original (here: BM25) ranking.
oracle_axiom = OracleAxiom(topics, qrels)  # Built from topics and qrels; presumably prefers the more relevant document (confirm).

In [8]:
from ir_axioms.backend.pyterrier.transformers import AxiomaticPreferences

# For every pair of documents (a, b) that BM25 retrieved for a query, add one
# `<axiom>_preference` column per axiom holding that axiom's pairwise preference.
# The order of the axioms in the list is the order of the resulting columns.
pipeline_bm25_preferences = pipeline_bm25 >> AxiomaticPreferences(
    [
        original_axiom,
        oracle_axiom,
        tfc1_axiom,
        lnc1_axiom,
    ],
    index,
    dataset_name,
    verbose=True,
)
pipeline_bm25_preferences(topics)

AxiomaticPreferences: 100%|██████████| 198/198 [00:06<00:00, 31.86query/s]


Unnamed: 0,qid,docid_a,docno_a,rank_a,score_a,query,docid_b,docno_b,rank_b,score_b,original_preference,oracle_preference,TFC1_preference,LNC1_preference
0,3990512,102622,3077638_1,0,15.887435,how can we get concentration onsomething,102622,3077638_1,0,15.887435,0,,0,0
1,3990512,102622,3077638_1,0,15.887435,how can we get concentration onsomething,30676,3931664_0,1,15.621619,1,,0,0
2,3990512,102622,3077638_1,0,15.887435,how can we get concentration onsomething,173781,4366141_0,2,15.395085,1,,0,0
3,3990512,102622,3077638_1,0,15.887435,how can we get concentration onsomething,179429,1011598_10,3,15.134176,1,,0,0
4,3990512,102622,3077638_1,0,15.887435,how can we get concentration onsomething,194913,4222212_0,4,15.134176,1,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19449,1340574,116656,2036141_7,9,29.157311,why do some people only go to church on easter...,116656,2036141_7,9,29.157311,0,0.0,0,0
19450,1971899,234068,1971899_6,0,28.234530,what is masturbat,234068,1971899_6,0,28.234530,0,0.0,0,0
19451,1971899,234068,1971899_6,0,28.234530,what is masturbat,87610,2667237_0,1,12.560643,1,0.0,0,0
19452,1971899,87610,2667237_0,1,12.560643,what is masturbat,234068,1971899_6,0,28.234530,-1,0.0,0,0


In [9]:
from ir_axioms.backend.pyterrier.transformers import AxiomaticReranker

# Re-rank each BM25 result list with KwikSort, driven by the axiom combination
# `TFC1 + LNC1 + 2.0 * original`, i.e. the original ranking is weighted double.
pipeline_axiom = pipeline_bm25 >> AxiomaticReranker(
    tfc1_axiom + lnc1_axiom + original_axiom * 2.0,
    index,
    dataset_name,
    verbose=True,
)
pipeline_axiom(topics)

AxiomaticReranker: 100%|██████████| 198/198 [00:03<00:00, 64.76query/s]


Unnamed: 0,docno,rank,score,qid,docid,query
0,3077638_1,1,10,3990512,102622,how can we get concentration onsomething
1,3931664_0,2,9,3990512,30676,how can we get concentration onsomething
2,4366141_0,3,8,3990512,173781,how can we get concentration onsomething
3,1011598_10,4,7,3990512,179429,how can we get concentration onsomething
4,4222212_0,5,6,3990512,194913,how can we get concentration onsomething
...,...,...,...,...,...,...
1947,1972970_3,8,3,1340574,87627,why do some people only go to church on easter...
1948,2980015_23,9,2,1340574,357982,why do some people only go to church on easter...
1949,2036141_7,10,1,1340574,116656,why do some people only go to church on easter...
1950,1971899_6,1,2,1971899,234068,what is masturbat


In [10]:
from ir_measures import nDCG, RR, P
from pyterrier.pipelines import Experiment

# Evaluate the BM25 baseline against the axiomatically re-ranked pipeline on the
# same topics and qrels, and list the system with the highest nDCG@10 first.
# `sort_values` is chained without `inplace=True`, so `results` is bound once to
# the sorted frame and no frame is mutated after creation.
results = Experiment(
    [pipeline_bm25, pipeline_axiom],
    topics,
    qrels,
    [nDCG @ 5, nDCG @ 10, RR, P @ 5, P @ 10],
    ["BM25", "BM25 + Axiomatic Reranking"],
).sort_values("nDCG@10", ascending=False)
results

AxiomaticReranker: 100%|██████████| 198/198 [00:02<00:00, 72.39query/s]


Unnamed: 0,name,nDCG@5,nDCG@10,RR,P@5,P@10
1,BM25 + Axiomatic Reranking,0.529076,0.510466,0.937875,0.833,0.7455
0,BM25,0.529428,0.509908,0.934708,0.836,0.7455
