# Example: Measure Indexing and Retrieval Effectiveness of Terrier, Anserini, and PISA

## Imports

In [None]:
from itertools import islice
from pathlib import Path
from shutil import rmtree

from ir_measures import nDCG
from pandas import DataFrame, concat
from pyterrier import Indexer, IterDictIndexer, IndexRef, Experiment
from pyterrier_anserini import AnseriniIndexer, AnseriniIndex, AnseriniRetriever
from pyterrier.datasets import get_dataset
from pyterrier.terrier import Retriever
from pyterrier_pisa import PisaIndexer, PisaIndex, PisaRetrieve
from seaborn import FacetGrid, lineplot

from tirex_tracker import tracking, TrackingHandle, Measure
from tirex_tracker.pyterrier import TrackedTransformer

## Data

Load the dataset from `ir_datasets`.

In [45]:
# Look up the ANTIQUE test split in PyTerrier's dataset registry; the "irds:" prefix
# selects the ir_datasets-backed variant (see the markdown cell above).
dataset = get_dataset("irds:antique/test")

Show some documents.

In [None]:
# islice stops after the first 5 documents, so the full corpus is never materialized here.
list(islice(dataset.get_corpus_iter(), 5))

Show the topics.

In [None]:
# Bare last expression: the notebook renders the returned topics as the cell output.
dataset.get_topics()

Show the qrels.

In [None]:
# Bare last expression: the notebook renders the returned relevance judgments as the cell output.
dataset.get_qrels()

## Indexing

We will consume the document iterator once to warm the document cache, so that the indexers are not slowed down by document downloads.

In [38]:
# Drain the corpus iterator once and discard every document: only the side effect
# (the corpus being fetched and cached locally) matters, so that the tracked
# indexing runs further down are not slowed by downloads.
for _ in dataset.get_corpus_iter():
    pass







[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





antique/test documents: 100%|██████████| 403666/403666 [00:00<00:00, 476953.31it/s]


### Terrier Indexing

In [None]:
# Build the Terrier index in a directory that is recreated on every run.
terrier_index_path = Path("./index-terrier").resolve()
# Delete any index left behind by a previous run before constructing the indexer.
if terrier_index_path.exists():
    rmtree(terrier_index_path)
terrier_indexer = IterDictIndexer(str(terrier_index_path))

# Only the indexing call itself sits inside the tracking context, so directory
# cleanup and indexer construction are not part of what is measured.
with tracking() as terrier_indexing_tracking_results:
    terrier_index: IndexRef = terrier_indexer.index(dataset.get_corpus_iter()) # type: ignore

### Anserini Indexing

In [None]:
# Build the Anserini index in a directory that is recreated on every run.
anserini_index_path = Path("./index-anserini").resolve()
# Delete any index left behind by a previous run before constructing the indexer.
if anserini_index_path.exists():
    rmtree(anserini_index_path)
anserini_indexer: Indexer = AnseriniIndexer(str(anserini_index_path))

# Only the indexing call itself sits inside the tracking context, so directory
# cleanup and indexer construction are not part of what is measured.
with tracking() as anserini_indexing_tracking_results:
    anserini_index: AnseriniIndex = anserini_indexer.index(dataset.get_corpus_iter())

### PISA Indexing

In [None]:
# Build the PISA index in a directory that is recreated on every run.
pisa_index_path = Path("./index-pisa").resolve()
# Delete any index left behind by a previous run before constructing the indexer.
if pisa_index_path.exists():
    rmtree(pisa_index_path)
pisa_indexer = PisaIndexer(str(pisa_index_path))
# Only the indexing call itself sits inside the tracking context, so directory
# cleanup and indexer construction are not part of what is measured.
with tracking() as pisa_indexing_tracking_results:
    pisa_index: PisaIndex = pisa_indexer.index(dataset.get_corpus_iter())

### Results

In [None]:
def tracking_results_to_df(
    tracking_results: TrackingHandle,
    measure: Measure,
) -> DataFrame:
    """Print the tracked value of one measure and return an empty frame.

    Args:
        tracking_results: Handle produced by a ``with tracking() as ...`` block;
            it is indexed with ``measure``.
        measure: The measure whose tracked value is looked up.

    Returns:
        An empty ``DataFrame``. The looked-up value is only printed and is not
        part of the returned frame.

    NOTE(review): this is a placeholder. Concatenating its results always
    yields an empty frame until the value is converted into rows.
    """
    print(f"Tracking results for {measure}: {tracking_results[measure]}")
    return DataFrame()

In [None]:
# Tracking handles of the three indexing runs, in the order they were executed.
indexing_tracking_handles = [
    terrier_indexing_tracking_results,
    anserini_indexing_tracking_results,
    pisa_indexing_tracking_results,
]
# Measures extracted from every handle.
indexing_measures = [
    Measure.CPU_USED_PROCESS_PERCENT,
    Measure.CPU_USED_SYSTEM_PERCENT,
    Measure.RAM_USED_PROCESS_KB,
    Measure.RAM_USED_SYSTEM_MB,
]
# One frame per (handle, measure) pair, handle-major, stacked into a single frame.
df_indexing = concat([
    tracking_results_to_df(handle, measure)
    for handle in indexing_tracking_handles
    for measure in indexing_measures
])

## Retrieval

Create the BM25 retrievers for Terrier, Anserini, and PISA. Then wrap the retrievers with `TrackedTransformer` to track their execution.

In [56]:
# One BM25 retriever per engine, each wrapped in TrackedTransformer so that its
# execution is tracked.
# NOTE(review): the annotations on the Anserini and PISA lines name the wrapped
# retriever classes, while the assigned values are whatever TrackedTransformer
# returns; confirm that the annotations are accurate.
terrier_bm25 = TrackedTransformer(Retriever(terrier_index, wmodel="BM25"))
anserini_bm25: AnseriniRetriever = TrackedTransformer(anserini_index.bm25())
pisa_bm25: PisaRetrieve = TrackedTransformer(pisa_index.bm25())

## Experiment

In [70]:
# Run the three tracked BM25 retrievers and score them against the dataset's qrels.
results = Experiment(
    retr_systems=[
        terrier_bm25,
        anserini_bm25,
        pisa_bm25,
    ],
    # Only the first three topics are evaluated, which keeps this example run short.
    topics=dataset.get_topics()[:3],
    qrels=dataset.get_qrels(),
    eval_metrics=[nDCG @ 10, nDCG @ 5],
    # Display names, listed in the same order as retr_systems.
    names=[
        "Terrier BM25",
        "Anserini BM25",
        "PISA BM25",
    ],
    # Per-query output: one row per system, query and measure (see the table below).
    perquery=True,
)
# Bare last expression so the notebook renders the result frame.
results

Give read access to /sys/class/powercap/intel-rapl/intel-rapl:1/energy_uj
Give read access to /sys/class/powercap/intel-rapl/intel-rapl:1/energy_uj
Give read access to /sys/class/powercap/intel-rapl/intel-rapl:1/energy_uj
Give read access to /sys/class/powercap/intel-rapl/intel-rapl:1/energy_uj
Give read access to /sys/class/powercap/intel-rapl/intel-rapl:1/energy_uj
Give read access to /sys/class/powercap/intel-rapl/intel-rapl:1/energy_uj


Unnamed: 0,name,qid,measure,value
10,Anserini BM25,2528767,nDCG@5,0.946503
11,Anserini BM25,2528767,nDCG@10,0.805926
6,Anserini BM25,3990512,nDCG@5,0.0
7,Anserini BM25,3990512,nDCG@10,0.041172
8,Anserini BM25,714612,nDCG@5,0.50874
9,Anserini BM25,714612,nDCG@10,0.412978
16,PISA BM25,2528767,nDCG@5,0.868795
17,PISA BM25,2528767,nDCG@10,0.763464
12,PISA BM25,3990512,nDCG@5,0.067827
13,PISA BM25,3990512,nDCG@10,0.065634


In [None]:
def tracking_results_to_df(tracking_results: TrackingHandle):
    """Build a frame with ``name``, ``time``, ``type`` and ``value`` columns.

    Each column is read from the attribute of the same name on
    ``tracking_results``.

    NOTE(review): this redefines ``tracking_results_to_df`` with a single
    parameter and shadows the earlier two-parameter function of the same name.
    Re-running the earlier indexing-results cell after this one would call this
    version with two arguments. It also assumes the handle exposes
    ``name``/``time``/``type``/``value`` attributes; verify this against the
    tracker API.
    """
    column_names = ("name", "time", "type", "value")
    return DataFrame(
        {column: getattr(tracking_results, column) for column in column_names}
    )