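# BM25 Retrieval Pipeline with PyTerrier and TIRA

This notebook loads the queries and documents, builds a Terrier index, retrieves with BM25, and persists the resulting run.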

### Step 1: Import libraries and resolve the input and output directories

In [1]:
import json
import shutil

import pandas as pd
import pyterrier as pt
from tira.third_party_integrations import ensure_pyterrier_is_loaded, get_input_directory_and_output_directory, persist_and_normalize_run
from tqdm import tqdm

# Start PyTerrier before any `pt.*` call further down in the notebook.
ensure_pyterrier_is_loaded()

# Resolve where the data is read from and where results are written to.
# In a local run the sample directory named here is the one that gets used
# (see the message printed by this cell).
DEFAULT_INPUT_DIRECTORY = './sample-input-full-rank'
input_directory, output_directory = get_input_directory_and_output_directory(
    DEFAULT_INPUT_DIRECTORY
)


Start PyTerrier with version=5.7, helper_version=0.0.7, no_download=True


PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


I will use a small hardcoded example located in ./sample-input-full-rank.
The output directory is /tmp/


### Step 2: Load the Data

In [2]:
print('Step 2: Load the data.')

queries = pt.io.read_topics(input_directory + '/queries.xml', format='trecxml')


def read_documents(path):
    """Lazily yield one parsed JSON object per non-blank line of ``path``.

    The file is opened in a ``with`` block, so the handle is closed once the
    generator is exhausted or discarded instead of being left open. The file
    is decoded as UTF-8 regardless of the platform default, and blank lines
    (for example a trailing empty line) are skipped rather than passed to
    ``json.loads``, which would raise on them.
    """
    with open(path, 'r', encoding='utf-8') as documents_file:
        for line in documents_file:
            if line.strip():
                yield json.loads(line)


# NOTE: `documents` is a one-shot iterator. Once it has been consumed (e.g. by
# the indexer), re-run this cell to get a fresh one before iterating again.
documents = read_documents(input_directory + '/documents.jsonl')


Step 2: Load the data.


### Step 3: Create the Index

In [3]:
print('Step 3: Create the Index.')

# Start from a clean index directory so that re-running this cell never
# collides with an index left behind by an earlier run. `ignore_errors=True`
# mirrors `rm -Rf`: a missing directory is not an error. Unlike the shell
# escape, this does not depend on a POSIX `rm` being available.
shutil.rmtree('./index', ignore_errors=True)

# `meta` declares which document fields are stored as metadata and how many
# characters to reserve for each (here: up to 100 characters for `docno`).
iter_indexer = pt.IterDictIndexer("./index", meta={'docno' : 100})

# tqdm only wraps the iterable to report indexing progress.
index_ref = iter_indexer.index(tqdm(documents))

Step 3: Create the Index.


5it [00:00, 53.63it/s]


### Step 4: Create Run

In [4]:
print('Step 4: Create Run.')

# Build the BM25 retriever on top of the index first, then apply it to the
# queries. verbose=True shows a progress bar while the queries are processed.
bm25 = pt.BatchRetrieve(index_ref, wmodel="BM25", verbose=True)
run = bm25(queries)

Step 4: Create Run.


BR(BM25): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 18.71q/s]


In [5]:
# Preview only the first rows: the run holds one row per (query, document)
# pair and grows quickly on real collections.
run.head(10)

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,2,pangram-03,0,-0.491918,fox jumps above animal
1,1,0,pangram-01,1,-0.527167,fox jumps above animal
2,1,3,pangram-04,2,-0.983837,fox jumps above animal
3,1,1,pangram-02,3,-1.054335,fox jumps above animal
4,2,4,pangram-05,0,-0.409729,multiple animals including a zebra
5,2,2,pangram-03,1,-0.491918,multiple animals including a zebra
6,2,0,pangram-01,2,-0.527167,multiple animals including a zebra


### Step 5: Persist Run

In [6]:
print('Step 5: Persist Run.')

# Hand the ranking to the TIRA helper, targeting the output directory resolved
# in Step 1 and tagging it with the system name.
# NOTE(review): `depth` presumably limits how many results are kept per
# query; confirm against the tira documentation.
persist_and_normalize_run(
    run,
    output_file=output_directory,
    system_name='BM25',
    depth=1000,
)

Step 5: Persist Run.
