In [1]:
# Start/initialize PyTerrier.
# The started() guard makes this cell safe to re-run in a live kernel:
# init() is only called when PyTerrier has not been started yet.
from pyterrier import started, init

if not started():
    # NOTE(review): tqdm="notebook" presumably selects notebook-style progress bars — confirm against the PyTerrier docs.
    init(tqdm="notebook")

PyTerrier 0.7.2 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)


No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [2]:
# Imports
from pathlib import Path

from ir_measures import nDCG, RR, P
from pyterrier import IndexRef, IndexFactory
from pyterrier.batchretrieve import BatchRetrieve
from pyterrier.datasets import get_dataset, Dataset
from pyterrier.index import IterDictIndexer
from pyterrier.pipelines import Experiment
from pyterrier.text import get_text

from ir_axioms.axiom import Axiom
from ir_axioms.axiom.term_frequency import TFC1
from ir_axioms.axiom.length_norm import LNC1
from ir_axioms.backend.pyterrier.transformers import AxiomaticReranker, AxiomaticPermutationsCount, AxiomaticPreferences

In [3]:
# Resolve the ANTIQUE test split through its "irds:" dataset identifier.
DATASET_ID = "irds:antique/test"
dataset: Dataset = get_dataset(DATASET_ID)

In [4]:
# Load documents and build index.
# The on-disk index is reused only when its `data.properties` file is present.
# Checking the bare directory is not enough: a directory left behind by an
# interrupted indexing run would be mistaken for a finished index and fail
# when opened. In that case the index is rebuilt from the corpus instead.
index_dir = Path("./data/indices/antique").absolute()
index_ref: IndexRef
if (index_dir / "data.properties").exists():
    # A finished index is already on disk: just reference it.
    index_ref = IndexRef.of(str(index_dir))
else:
    # No usable index yet: index the "text" field of every corpus document.
    indexer = IterDictIndexer(str(index_dir))
    index_ref = indexer.index(dataset.get_corpus_iter(), fields=["text"])
# Open the index once; the BM25 retriever below reads from this object.
index = IndexFactory.of(index_ref)

In [5]:
# Load topics.
# Every pipeline below is invoked on these topics, and the experiment cell
# passes them to Experiment together with the qrels.
topics = dataset.get_topics()

In [6]:
# BM25 baseline: retrieve from the index, then keep only the top 10 documents per query.
bm25_retriever = BatchRetrieve(index, wmodel="BM25")
pipeline_bm25 = bm25_retriever % 10
# pipeline_bm25(topics)  # Uncomment to inspect the raw baseline ranking.

In [7]:
# Build the axiom shared by the preference, permutation-count and reranking cells.
tfc1_axiom = TFC1()  # Term-frequency axiom (from ir_axioms.axiom.term_frequency).
lnc1_axiom = LNC1()  # Length-normalization axiom (from ir_axioms.axiom.length_norm).
weighted_tfc1 = tfc1_axiom * 1.5  # Scaling an axiom yields a weighted axiom.
axiom: Axiom = weighted_tfc1 + lnc1_axiom  # Adding axioms aggregates them.
# The same axiom can be built from the constructors directly:
# axiom: Axiom = (TFC1() * 1.5) + LNC1()
axiom  # The combination of both axioms is itself an axiom.

AggregatedAxiom(axioms=[WeightedAxiom(axiom=<ir_axioms.axiom.term_frequency.TFC1 object at 0x7ff779aa81c0>, weight=1.5), <ir_axioms.axiom.length_norm.LNC1 object at 0x7ff779aa8130>])

In [8]:
# Compute the axiom's pairwise preferences between the documents of each BM25
# baseline ranking; the result has one row per document pair with a
# `preference` column.
pipeline_bm25_preferences = (
    pipeline_bm25
    >> get_text(dataset, "text")
    >> AxiomaticPreferences(axiom, index_ref)
)
pipeline_bm25_preferences(topics)

Compute axiomatic preferences: 100%|██████████| 198/198 [00:02<00:00, 88.88 topics/s] 


Unnamed: 0,qid,docid_a,docno_a,rank_a,score_a,query,text_a,docid_b,docno_b,rank_b,score_b,text_b,preference
0,3990512,102622,3077638_1,0,15.887435,how can we get concentration onsomething,Learn to concentrate.. First you really must w...,102622,3077638_1,0,15.887435,Learn to concentrate.. First you really must w...,0.0
1,3990512,102622,3077638_1,0,15.887435,how can we get concentration onsomething,Learn to concentrate.. First you really must w...,30676,3931664_0,1,15.621619,"When u ""love"" the subject u can concentrate.",0.0
2,3990512,102622,3077638_1,0,15.887435,how can we get concentration onsomething,Learn to concentrate.. First you really must w...,173781,4366141_0,2,15.395085,You can improve your ability to concentrate by...,0.0
3,3990512,102622,3077638_1,0,15.887435,how can we get concentration onsomething,Learn to concentrate.. First you really must w...,179429,1011598_10,3,15.134176,"its green, and has many vitamins that are conc...",0.0
4,3990512,102622,3077638_1,0,15.887435,how can we get concentration onsomething,Learn to concentrate.. First you really must w...,194913,4222212_0,4,15.134176,"The less you talk, the more you think. Then y...",0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19449,1340574,116656,2036141_7,9,29.157311,why do some people only go to church on easter...,We do celebrate Easter. Many churches and peop...,116656,2036141_7,9,29.157311,We do celebrate Easter. Many churches and peop...,0.0
19450,1971899,234068,1971899_6,0,28.234530,what is masturbat,what is masturbat***?,234068,1971899_6,0,28.234530,what is masturbat***?,0.0
19451,1971899,234068,1971899_6,0,28.234530,what is masturbat,what is masturbat***?,87610,2667237_0,1,12.560643,Sperm only comes out either when a man is arou...,0.0
19452,1971899,87610,2667237_0,1,12.560643,what is masturbat,Sperm only comes out either when a man is arou...,234068,1971899_6,0,28.234530,what is masturbat***?,0.0


In [9]:
# Extend the BM25 baseline ranking with a `permutations_count` column that
# counts permutations of the ranking compared to the axiom's preferences.
pipeline_bm25_permutations = (
    pipeline_bm25
    >> get_text(dataset, "text")
    >> AxiomaticPermutationsCount(axiom, index_ref)
)
pipeline_bm25_permutations(topics)

Counting permutations compared to axiom preferences: 100%|██████████| 198/198 [00:01<00:00, 132.56 topics/s]


Unnamed: 0,qid,docid,docno,rank,score,query,text,permutations_count
0,3990512,102622,3077638_1,0,15.887435,how can we get concentration onsomething,Learn to concentrate.. First you really must w...,0.7
1,3990512,30676,3931664_0,1,15.621619,how can we get concentration onsomething,"When u ""love"" the subject u can concentrate.",0.8
2,3990512,173781,4366141_0,2,15.395085,how can we get concentration onsomething,You can improve your ability to concentrate by...,0.6
3,3990512,179429,1011598_10,3,15.134176,how can we get concentration onsomething,"its green, and has many vitamins that are conc...",0.8
4,3990512,194913,4222212_0,4,15.134176,how can we get concentration onsomething,"The less you talk, the more you think. Then y...",0.8
...,...,...,...,...,...,...,...,...
1947,1340574,87627,1972970_3,7,29.386170,why do some people only go to church on easter...,Easter is this Sunday in April!!!!,0.8
1948,1340574,357982,2980015_23,8,29.323234,why do some people only go to church on easter...,Mine is...but why would it stop people from go...,0.8
1949,1340574,116656,2036141_7,9,29.157311,why do some people only go to church on easter...,We do celebrate Easter. Many churches and peop...,0.8
1950,1971899,234068,1971899_6,0,28.234530,what is masturbat,what is masturbat***?,0.0


In [10]:
# Axiomatic reranking: reorder the BM25 baseline ranking with KwikSort,
# driven by the preferences of the axiom defined above.
pipeline_axiom = (
    pipeline_bm25
    >> get_text(dataset, "text")
    >> AxiomaticReranker(axiom, index_ref)
)
pipeline_axiom(topics)

Reranking with axiom preferences: 100%|██████████| 198/198 [00:01<00:00, 109.42 topics/s]


Unnamed: 0,docno,rank,score,qid,docid,query,text
0,3077638_1,1,10,3990512,102622,how can we get concentration onsomething,Learn to concentrate.. First you really must w...
1,3931664_0,2,9,3990512,30676,how can we get concentration onsomething,"When u ""love"" the subject u can concentrate."
2,4366141_0,3,8,3990512,173781,how can we get concentration onsomething,You can improve your ability to concentrate by...
3,1011598_10,4,7,3990512,179429,how can we get concentration onsomething,"its green, and has many vitamins that are conc..."
4,4222212_0,5,6,3990512,194913,how can we get concentration onsomething,"The less you talk, the more you think. Then y..."
...,...,...,...,...,...,...,...
1947,1972970_3,8,3,1340574,87627,why do some people only go to church on easter...,Easter is this Sunday in April!!!!
1948,2980015_23,9,2,1340574,357982,why do some people only go to church on easter...,Mine is...but why would it stop people from go...
1949,2036141_7,10,1,1340574,116656,why do some people only go to church on easter...,We do celebrate Easter. Many churches and peop...
1950,1971899_6,1,2,1971899,234068,what is masturbat,what is masturbat***?


In [11]:
# Load qrels.
# These are passed to Experiment below, together with the topics, to score
# each pipeline.
qrels = dataset.get_qrels()

In [12]:
# Compare the baseline retrieval with the reranked pipeline in an experiment.
# Both pipelines are evaluated on the same topics and qrels, with nDCG and
# precision at cutoffs 5 and 10 plus reciprocal rank.
# The sort is chained onto the experiment result instead of using
# `inplace=True`, so `results` is only ever bound to the final, sorted table
# and the cell produces the same value however often it is re-run.
results = Experiment(
    [pipeline_bm25, pipeline_axiom],
    topics,
    qrels,
    [nDCG @ 5, nDCG @ 10, RR, P @ 5, P @ 10],
    names=["BM25", "BM25 + Axiomatic Reranking"],
).sort_values("nDCG@10", ascending=False)
results

Reranking with axiom preferences: 100%|██████████| 198/198 [00:01<00:00, 101.54 topics/s]


Unnamed: 0,name,nDCG@5,nDCG@10,RR,P@5,P@10
0,BM25,0.529428,0.509908,0.934708,0.836,0.7455
1,BM25 + Axiomatic Reranking,0.525695,0.50762,0.930375,0.831,0.7455
