# Small Retrieval Baseline with PyTerrier

This is a simple submission of a retrieval approach that uses a prepared PyTerrier index to create and output a BM25 ranking.

### Step 1: Import All Libraries


In [3]:
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client

# Make sure PyTerrier is loaded before it is imported below, so that the notebook also works in the TIRA sandbox.
ensure_pyterrier_is_loaded()
import pyterrier as pt
from tqdm import tqdm

# TIRA REST client; used further down (via `tira.pt.index`) to fetch an index for the dataset.
tira = Client()

### Step 2: Load the data

In [4]:
# Name of the dataset inside the `ir-lab-padua-2024` task.
dataset_id = 'longeval-tiny-train-20240315-training'
# Load the dataset through the ir_datasets ("irds:") prefix. The identifier must contain both the
# task and the dataset name; the task prefix alone ('irds:ir-lab-padua-2024/') does not name a dataset.
data = pt.get_dataset(f'irds:ir-lab-padua-2024/{dataset_id}')

### Step 3: Build the Index

In [11]:
# Fetch the index published by the tira-ir-starter PyTerrier indexing software for this dataset.
# `dataset_id` is reused here so the index always matches the dataset loaded in Step 2.
# Note: `index` is reassigned in Step 4 to the index that is built locally in the next cell.
index = tira.pt.index(
    'ir-benchmarks/tira-ir-starter/Index (tira-ir-starter-pyterrier)',
    dataset_id,
)

Download: 39.7MiB [00:06, 6.11MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_runs/ir-benchmarks/longeval-tiny-train-20240315-training/tira-ir-starter


In [5]:
print('Build index:')
# Both the indexer and batch retrieve use terriers default porter stemmer and a default stopword list (englisch)
iter_indexer = pt.IterDictIndexer("/tmp/index", overwrite = True, blocks = True,meta = {'docno':100, 'text': 20480}, stemmer = 'PorterStemmer')
!rm -Rf /tmp/index
index_ref = iter_indexer.index(data.get_corpus_iter())

print('Done. Index is created')

Build index:


Download: 83.2MiB [00:20, 4.20MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/ir-lab-padua-2024/longeval-tiny-train-20240315-training/


ir-lab-padua-2024/longeval-tiny-train-20240315-training documents: 100%|██████████| 47064/47064 [00:46<00:00, 1001.89it/s]


Done. Index is created


### Step 4: Create the Retrieval Pipeline


In [7]:
# Open the locally built index from its reference and set up a BM25 retriever on top of it.
index = pt.IndexFactory.of(index_ref)

bm25 = pt.BatchRetrieve(
    index,
    wmodel="BM25",
    verbose=True,
)

### Step 5: Create the Run and Persist the Run


In [8]:
print('Create run')
# Fetch the "title" variant of the dataset's topics, then rank documents for them with BM25.
title_topics = data.get_topics("title")
run = bm25(title_topics)
print('Done, run was created')


Create run


Download: 22.0iB [00:00, 80.7kiB/s]

Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/ir-lab-padua-2024/longeval-tiny-train-20240315-training/





FileNotFoundError: [Errno 2] No such file or directory: '/root/.tira/extracted_datasets/ir-lab-padua-2024/longeval-tiny-train-20240315-training//longeval-tiny-train-20240315-training' -> '/root/.tira/extracted_datasets/ir-lab-padua-2024/longeval-tiny-train-20240315-training/truth-data'

In [None]:
# Normalize and persist the BM25 run via the TIRA helper.
# NOTE(review): 'bm25-no-weighing' is presumably the system/run name written into the run file — confirm against the tira API.
persist_and_normalize_run(run, 'bm25-no-weighing')