In [1]:
from sys import modules

IN_COLAB = 'google.colab' in modules
if IN_COLAB:
    !pip install -q ir_axioms[examples] python-terrier

In [2]:
from pyterrier import init, started

# Bring up PyTerrier, but only when started() reports that it is not running
# yet, so that re-executing this cell does not call init() a second time.
if not started():
    init(
        tqdm="auto",
    )

PyTerrier 0.8.0 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [3]:
from pyterrier.datasets import Dataset, get_dataset

# Name of the corpus; it is reused by later cells (index path, re-ranker).
dataset_name = "msmarco-passage"

# Both handles are resolved through the "irds:" prefix of get_dataset():
# `dataset` is the base corpus, `dataset_test` the TREC DL 2020 judged subset
# whose topics and qrels are used for evaluation further down.
dataset: Dataset = get_dataset("irds:" + dataset_name)
dataset_test: Dataset = get_dataset("irds:" + dataset_name + "/trec-dl-2020/judged")

In [4]:
from pathlib import Path

# Root folder for everything this notebook caches on disk (relative to the
# current working directory).
cache_dir = Path("cache/")

# The index lives under cache/indices/<corpus>, where <corpus> is the part of
# the dataset name before the first "/" (the whole name if there is none).
index_dir = cache_dir.joinpath("indices", dataset_name.partition("/")[0])

In [5]:
from pyterrier.index import IterDictIndexer

# Build the index only when no finished index is present.
# The guard looks for `data.properties` inside the index directory rather than
# for the directory itself: an interrupted indexing run leaves the directory
# behind, and checking only the directory would then skip indexing and make
# the retrieval cells fail on a broken index.
# NOTE(review): `data.properties` as the "index is complete" marker follows
# the PyTerrier indexing docs — confirm for the installed Terrier version.
if not (index_dir / "data.properties").exists():
    indexer = IterDictIndexer(str(index_dir.absolute()))
    # Index the corpus documents using their "text" field.
    indexer.index(
        dataset.get_corpus_iter(),
        fields=["text"]
    )

In [6]:
from pyterrier.batchretrieve import BatchRetrieve

# Baseline ranker: BM25 retrieval over the index stored at `index_dir`
# (passed as an absolute path string).
bm25 = BatchRetrieve(
    str(index_dir.absolute()),
    wmodel="BM25",
)

In [7]:
from ir_axioms.axiom import (
    AndAxiom,
    LB1,
    LEN_AND, LEN_DIV, LEN_M_TDC,
    LNC1,
    ORIG,
    PROX1, PROX2, PROX3, PROX4, PROX5,
    REG_f,
    STMC1_f, STMC2_f,
    TF_LNC, TFC1, TFC3,
)

# Combined axiom used for re-ranking: the conjunction (AndAxiom) of all axioms
# listed below, joined with ORIG through the `|` operator.
# NOTE(review): `|` presumably falls back to ORIG (the original ranking) when
# the conjunction does not decide a pair — confirm against the ir_axioms docs.
axiom = (
    AndAxiom([
        TFC1(),
        TFC3(),
        LEN_M_TDC(),
        LNC1(),
        TF_LNC(),
        LB1(),
        REG_f(),
        LEN_AND(),
        LEN_DIV(),
        STMC1_f(),
        STMC2_f(),
        PROX1(),
        PROX2(),
        PROX3(),
        PROX4(),
        PROX5(),
    ])
    | ORIG()
)

In [8]:
from ir_axioms.backend.pyterrier.transformers import AxiomaticReranker

# Re-ranking pipeline built on top of the BM25 baseline.
# The parentheses only spell out Python's operator precedence
# (`%` binds tighter than `>>`, which binds tighter than `^`):
#   1. bm25 % 50              -> BM25 results limited by the cutoff 50,
#   2. ... >> AxiomaticReranker -> those results passed to the axiomatic re-ranker,
#   3. ... ^ bm25             -> combined with the plain BM25 results.
# NOTE(review): reading %, >> and ^ as rank cutoff, composition and
# concatenation follows PyTerrier's operator notation — confirm in its docs.
kwiksort = (
    (
        (bm25 % 50)
        >> AxiomaticReranker(
            axiom=axiom,
            index=index_dir,
            dataset=dataset_name,
            cache_dir=cache_dir,
            verbose=True
        )
    )
    ^ bm25
)

In [9]:
from pyterrier.pipelines import Experiment
from ir_measures import nDCG, MAP

# Evaluate the BM25 baseline against the axiomatic re-ranking pipeline on the
# test topics and relevance judgments of `dataset_test`.
# The system labels are passed via the `names` keyword so the call does not
# depend on the position of that optional parameter.
experiment = Experiment(
    [bm25, kwiksort],
    dataset_test.get_topics(),
    dataset_test.get_qrels(),
    [nDCG @ 5, nDCG @ 10, nDCG @ 20, MAP],
    names=["BM25", "KwikSort"],
    verbose=True,
)

# Best system first, ordered by nDCG@10. Reassigning the sorted frame (rather
# than sorting with inplace=True) keeps the step a plain expression that
# yields a new frame from the one produced above.
experiment = experiment.sort_values(by="nDCG@10", ascending=False)

pt.Experiment:   0%|          | 0/2 [00:00<?, ?system/s]

Reranking query axiomatically:   0%|          | 0/54 [00:00<?, ?query/s]

In [10]:
# Show the evaluation table (one row per system, one column per measure).
experiment

Unnamed: 0,name,nDCG@5,nDCG@10,nDCG@20,AP
0,BM25,0.496557,0.493627,0.479866,0.358724
1,KwikSort,0.495741,0.491858,0.479045,0.358607
