# BM25 Retrieval Pipeline with PyTerrier

### Step 1: Import everything and load variables

In [3]:
import json
import os
import shutil
from pathlib import Path

import pandas as pd
import pyterrier as pt
from tqdm import tqdm

# Pipeline settings.
SYSTEM_NAME = 'my-retrieval-system'  # value stored in the 'system' column of the normalized run
DEPTH = 1000  # maximum number of results kept per query when the run is normalized

# Initialise PyTerrier only if it is not already running in this kernel.
# The Terrier and helper versions are pinned and downloads are disabled
# (presumably the jars must already be available locally -- confirm).
pyterrier_running = pt.started()
if not pyterrier_running:
    pt.init(version='5.7', helper_version='0.0.7', no_download=True)

# Input directory: taken from TIRA_INPUT_DIRECTORY when that variable is set
# to a non-empty value, otherwise the bundled example directory is used.
input_data = os.getenv('TIRA_INPUT_DIRECTORY')

if not input_data:
    input_data = './cranfield'
    print('I will use a small hardcoded example.')
else:
    print(f'I will read the input data from {input_data}.')

# Output directory (despite the name, this is a directory, not a file):
# taken from TIRA_OUTPUT_DIRECTORY, defaulting to /tmp/.
output_file = os.getenv('TIRA_OUTPUT_DIRECTORY', '/tmp/')

print(f'I will write the run file to {output_file}')


I will use a small hardcoded example.
I will write the run file to /tmp/


### Step 2: Load the Data

In [3]:
print('Step 2: Load the data.')

# One query per line; read into a DataFrame.
queries = pd.read_json(input_data + '/queries.jsonl', lines=True)

# https://github.com/terrier-org/pyterrier/issues/62
# Workaround for the issue linked above: every non-alphanumeric character in
# the query text is replaced by a single space.
queries['query'] = queries['query'].apply(lambda i: "".join([x if x.isalnum() else " " for x in i]))


def _iter_documents(path):
    """Yield one parsed JSON object per line of the JSONL file at ``path``.

    The file is opened lazily on first iteration and is closed as soon as the
    generator is exhausted or discarded, so no file handle is leaked.
    It is decoded as UTF-8, matching the default encoding pandas uses for the
    queries file, instead of depending on the locale default.
    """
    with open(path, 'r', encoding='utf-8') as documents_file:
        for line in documents_file:
            yield json.loads(line)


# One-shot stream of document dicts; it can be consumed only once.
documents = _iter_documents(input_data + '/documents.jsonl')


Step 2: Load the data.


ValueError: Expected object or value

### Step 3: Create the Index

In [4]:
print('Step 3: Create the Index.')

# Remove any index left over from a previous run so the cell can be re-run.
# shutil.rmtree is used instead of a shell `rm` so the cell is plain Python
# and does not depend on a POSIX shell; a missing directory is ignored.
shutil.rmtree('./index', ignore_errors=True)

# `meta` lists the per-document fields kept in the index together with a
# number for each (presumably the maximum stored length -- confirm against
# the PyTerrier documentation).
iter_indexer = pt.IterDictIndexer("./index", meta={'docno': 20, 'text': 4096})

# tqdm wraps the document stream only to display indexing progress.
index_ref = iter_indexer.index(tqdm(documents))

Step 3: Create the Index.


490it [00:01, 582.45it/s]



1400it [00:02, 570.59it/s] 


19:11:44.204 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 2 empty documents


### Step 4: Create Run

In [5]:
print('Step 4: Create Run.')

# Build the BM25 retriever over the index, then apply it to all queries.
bm25 = pt.BatchRetrieve(index_ref, wmodel="BM25", verbose=True)
run = bm25(queries)

Step 4: Create Run.


BR(BM25): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 225/225 [00:31<00:00,  7.18q/s]


In [6]:
# Display the retrieval results produced in Step 4.
run

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,50,51,0,29.469679,what similarity laws must be obeyed when const...
1,1,485,486,1,28.934086,what similarity laws must be obeyed when const...
2,1,11,12,2,25.145340,what similarity laws must be obeyed when const...
3,1,183,184,3,25.069377,what similarity laws must be obeyed when const...
4,1,877,878,4,22.940738,what similarity laws must be obeyed when const...
...,...,...,...,...,...,...
190757,365,299,300,988,0.436652,what design factors can be used to control lif...
190758,365,645,646,989,0.435173,what design factors can be used to control lif...
190759,365,1143,1144,990,0.418178,what design factors can be used to control lif...
190760,365,1391,1392,991,0.416821,what design factors can be used to control lif...


### Step 5: Persist Run

In [7]:
# Progress marker for the notebook output.
print('Step 5: Persist Run.')

def normalize_run(run, system_name=None, depth=None):
    """Return a normalized copy of a retrieval run.

    The input frame is left untouched. In the returned frame:

    * ``qid`` is cast to ``int`` when every value allows it, otherwise kept as is;
    * a ``system`` column holding the system name is added;
    * rows are ordered by ``qid`` ascending, then ``score`` descending, with
      ties broken by ``docno`` descending;
    * at most ``depth`` rows are kept per query;
    * ``rank`` is recomputed per query so that it starts at 1.

    Args:
        run: DataFrame with at least the columns ``qid``, ``docno`` and ``score``.
        system_name: Value for the ``system`` column. Defaults to the
            module-level ``SYSTEM_NAME``.
        depth: Maximum number of rows kept per query. Defaults to the
            module-level ``DEPTH``.

    Returns:
        A new DataFrame with the columns ``qid``, ``docno``, ``rank``,
        ``score`` and ``system``.
    """
    if system_name is None:
        system_name = SYSTEM_NAME
    if depth is None:
        depth = DEPTH

    # Work on a copy so that the caller's frame is never modified.
    normalized = run.copy()
    try:
        normalized['qid'] = normalized['qid'].astype(int)
    except (ValueError, TypeError):
        # Non-numeric query ids are legitimate; keep them unchanged.
        pass
    normalized['system'] = system_name

    normalized = normalized.sort_values(
        ["qid", "score", "docno"], ascending=[True, False, False]
    ).reset_index(drop=True)

    # Keep the best `depth` rows of each query (rows are already sorted).
    top = normalized.groupby("qid").head(depth).copy()

    # Make sure that rank position starts by 1
    top["rank"] = top.groupby("qid").cumcount() + 1

    return top[["qid", "docno", "rank", "score", "system"]]

# Make sure the output directory exists before writing into it.
output_dir = Path(output_file)
output_dir.mkdir(parents=True, exist_ok=True)

# Write the normalized run to run.txt inside the output directory.
# run_name must be the SYSTEM_NAME variable; the quoted string 'SYSTEM_NAME'
# would label the run with that literal text instead of the configured name.
pt.io.write_results(normalize_run(run), str(output_dir / 'run.txt'), run_name=SYSTEM_NAME)

print('Done...')

Step 5: Persist Run.
Done...
