# This is my cool Pipeline

### Step 1: Import everything and load variables

In [1]:
import pyterrier as pt
import pandas as pd
import os
import json
from tqdm import tqdm
from pathlib import Path

DEPTH = 1000
SYSTEM_NAME = 'my-retrieval-system'

if not pt.started():
    pt.init(version=os.environ['PYTERRIER_VERSION'], helper_version=os.environ['PYTERRIER_HELPER_VERSION'], no_download=True)

input_data = os.environ.get('TIRA_INPUT_DIRECTORY', None)

if input_data:
    print(f'I will read the input data from {input_data}.')
else:
    input_data = '/workspace/sample-input'
    print('I will use a small hardcoded example.')

output_file = os.environ.get('TIRA_OUTPUT_DIRECTORY', '/tmp/')

print(f'I will write the run file to {output_file}')

from chatnoir_pyterrier import ChatNoirRetrieve
from chatnoir_api import Index as ChatNoirIndex, html_contents


chatnoir_config = json.load(open(input_data + '/chatnoir-credentials.json'))

chatnoir = ChatNoirRetrieve(api_key=chatnoir_config['apikey'])
chatnoir.index = getattr(ChatNoirIndex, chatnoir_config['index'])

print(f'ChatNoir Client will retrieve from index {chatnoir_config["index"]}')

PyTerrier 0.8.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


I will use a small hardcoded example.
I will write the run file to /tmp/
ChatNoir Client will retrieve from index ClueWeb12


### Step 2: Load the Data

In [2]:
print('Step 2: Load the data.')

queries = pd.read_json(input_data + '/queries.jsonl', lines=True)


Step 2: Load the data.


### Step 3: Define the Pipeline

In [3]:
print('Step 3: Define the Pipeline.')

def add_text_field(i):
    i['text'] = i['docno'].apply(lambda j: html_contents("j", chatnoir.index, plain=True))
    return i

chatnoir_pipeline = (chatnoir >> pt.apply.generic(lambda i: add_text_field(i)))

Step 3: Define the Pipeline.


### Step 4: Create Run

In [4]:
print('Step 4: Create Run.')
run = chatnoir_pipeline(queries)

Step 4: Create Run.


In [5]:
run

Unnamed: 0,qid,query,docno,score,rank,text
0,1111,does computer work increase eye pressure?,clueweb12-1106wb-16-17437,1182.2903,0,<!DOCTYPE html>\n<html>\n<head>\n <title>Oo...
1,1111,does computer work increase eye pressure?,clueweb12-0302wb-19-28258,1168.2012,1,<!DOCTYPE html>\n<html>\n<head>\n <title>Oo...
2,1111,does computer work increase eye pressure?,clueweb12-1212wb-00-02238,1167.9583,2,<!DOCTYPE html>\n<html>\n<head>\n <title>Oo...
3,1111,does computer work increase eye pressure?,clueweb12-0204wb-22-27404,1073.4702,3,<!DOCTYPE html>\n<html>\n<head>\n <title>Oo...
4,1111,does computer work increase eye pressure?,clueweb12-0200wb-41-19355,1072.9404,4,<!DOCTYPE html>\n<html>\n<head>\n <title>Oo...
5,1111,does computer work increase eye pressure?,clueweb12-0310wb-61-05771,907.32837,5,<!DOCTYPE html>\n<html>\n<head>\n <title>Oo...
6,1111,does computer work increase eye pressure?,clueweb12-1515wb-05-13926,905.2084,6,<!DOCTYPE html>\n<html>\n<head>\n <title>Oo...
7,1111,does computer work increase eye pressure?,clueweb12-1709wb-16-10264,872.4196,7,<!DOCTYPE html>\n<html>\n<head>\n <title>Oo...
8,1111,does computer work increase eye pressure?,clueweb12-1303wb-99-19032,846.1579,8,<!DOCTYPE html>\n<html>\n<head>\n <title>Oo...
9,1111,does computer work increase eye pressure?,clueweb12-1413wb-68-06977,846.1234,9,<!DOCTYPE html>\n<html>\n<head>\n <title>Oo...


### Step 5: Persist Run

In [6]:
print('Step 5: Persist Run.')

def normalize_run(run):
    try:
        run['qid'] = run['qid'].astype(int)
    except:
        pass
    run['system'] = SYSTEM_NAME
    run = run.copy().sort_values(["qid", "score", "docno"], ascending=[True, False, False]).reset_index()
    run = run.groupby("qid")[["qid", "docno", "rank", "score", "system"]].head(DEPTH)

    # Make sure that rank position starts by 1
    run["rank"] = 1
    run["rank"] = run.groupby("qid")["rank"].cumsum()
    
    return run

Path(output_file).mkdir(parents=True, exist_ok=True)
pt.io.write_results(normalize_run(run), output_file + '/run.txt', run_name='SYSTEM_NAME')

print('Done...')

Step 5: Persist Run.
Done...
