# ChatNoir Retrieval Pipeline

### Step 1: Import everything and load variables

In [2]:
import pyterrier as pt
import pandas as pd
import os
import json
from tqdm import tqdm
from pathlib import Path

# Number of results kept per query and the name stored in the run's `system` column.
DEPTH = 1000
SYSTEM_NAME = 'my-retrieval-system'

# Start PyTerrier only if it is not running yet; versions are taken from the environment.
if not pt.started():
    pt.init(
        version=os.environ['PYTERRIER_VERSION'],
        helper_version=os.environ['PYTERRIER_HELPER_VERSION'],
        no_download=True,
    )

input_data = os.environ.get('TIRA_INPUT_DIRECTORY')

# Fall back to the bundled sample input when no input directory is configured.
if not input_data:
    input_data = '/workspace/sample-input/full-rank'
    print('I will use a small hardcoded example.')
else:
    print(f'I will read the input data from {input_data}.')

# Despite its name, this is the directory that the run file is written into.
output_file = os.environ.get('TIRA_OUTPUT_DIRECTORY', '/tmp/')

print(f'I will write the run file to {output_file}')

from chatnoir_pyterrier import ChatNoirRetrieve
from chatnoir_api import Index as ChatNoirIndex, html_contents
from chatnoir_pyterrier.feature import Feature

# Read the ChatNoir settings ('apikey', 'index') shipped with the input data.
# The context manager closes the file handle as soon as parsing is done.
with open(input_data + '/chatnoir-credentials.json') as credentials_file:
    chatnoir_config = json.load(credentials_file)

chatnoir = ChatNoirRetrieve(api_key=chatnoir_config['apikey'])
# Additional fields requested for every hit; each feature is listed exactly once.
chatnoir.features = [Feature.TARGET_URI, Feature.TITLE_TEXT, Feature.HTML_PLAIN]
chatnoir.verbose = True
# The configured index name is resolved to the matching ChatNoirIndex member.
chatnoir.index = getattr(ChatNoirIndex, chatnoir_config['index'])

print(f'ChatNoir Client will retrieve from index {chatnoir_config["index"]}')

I will use a small hardcoded example.
I will write the run file to /tmp/
ChatNoir Client will retrieve from index ClueWeb12


### Step 2: Load the Data

In [3]:
print('Step 2: Load the data.')

# The queries file holds one JSON object per line.
queries = pd.read_json(f'{input_data}/queries.jsonl', lines=True)


Step 2: Load the data.


### Step 3: Define the Pipeline

In [4]:
print('Step 3: Define the Pipeline.')

# The pipeline is the ChatNoir retriever configured in Step 1, used as-is
# without any further stages.
retrieval_pipeline = chatnoir

Step 3: Define the Pipeline.


### Step 4: Create Run

In [5]:
print('Step 4: Create Run.')
# Apply the pipeline to the query frame loaded in Step 2; the result is the
# run that is displayed and persisted in the following cells.
run = retrieval_pipeline(queries)

Step 4: Create Run.


Searching with ChatNoir:   0%|                                                                                           | 0/3 [00:00<?, ?query/s]ChatNoir API internal server error. Retrying in 1 seconds.
ChatNoir API internal server error. Retrying in 1 seconds.
ChatNoir API internal server error. Retrying in 2 seconds.
Searching with ChatNoir: 100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [01:37<00:00, 32.54s/query]


In [6]:
# Show only a bounded preview; the full run carries whole HTML documents per row.
run.head(10)

Unnamed: 0,qid,query,docno,score,target_uri,title_text,html_plain,rank
0,1111,does computer work increase eye pressure?,clueweb12-1106wb-16-17437,1182.2903,http://www.pingueculae.com/eye-strain-informat...,Eyes hurt looking computer screen,"<!doctype html>\n<meta charset=""utf-8"">\n<titl...",0
1,1111,does computer work increase eye pressure?,clueweb12-0302wb-19-28258,1168.2012,https://www.vsp.com/cms/edc/topics/how-the-eye...,How the Eye Works,"<!doctype html>\n<meta charset=""utf-8"">\n<titl...",1
2,1111,does computer work increase eye pressure?,clueweb12-1212wb-00-02238,1167.9583,https://vsp.com/cms/edc/topics/how-the-eye-wor...,How the Eye Works,"<!doctype html>\n<meta charset=""utf-8"">\n<titl...",2
3,1111,does computer work increase eye pressure?,clueweb12-0204wb-22-27404,1073.4702,http://www.marlev.com/HowItWorks.htm,How Does Homeopathy Work?,"<!doctype html>\n<meta charset=""utf-8"">\n<titl...",3
4,1111,does computer work increase eye pressure?,clueweb12-0200wb-41-19355,1072.9404,http://www.homeopathyplanet.com/AudeSapere/How...,How Does Homeopathy Work?,"<!doctype html>\n<meta charset=""utf-8"">\n<titl...",4
5,1111,does computer work increase eye pressure?,clueweb12-0310wb-61-05771,907.32837,http://jslhr.asha.org/cgi/content/full/50/3/621,Effect of Cues to Increase Sound Pressure Leve...,"<!doctype html>\n<meta charset=""utf-8"">\n<titl...",5
6,1111,does computer work increase eye pressure?,clueweb12-1515wb-05-13926,905.2084,http://www.shroffeye.org/eyecare.htm,"Eye Care Center, Eye Clinics, Eye Care","<!doctype html>\n<meta charset=""utf-8"">\n<titl...",6
7,1111,does computer work increase eye pressure?,clueweb12-1709wb-16-10264,872.4196,http://www.sensorprod.com/news/white-papers/ac...,Film | Pressure,"<!doctype html>\n<meta charset=""utf-8"">\n<titl...",7
8,1111,does computer work increase eye pressure?,clueweb12-1303wb-99-19032,846.1579,http://theanarchistlibrary.org/HTML/Solidarity...,Does work make you sick?,"<!doctype html>\n<meta charset=""utf-8"">\n<titl...",8
9,1111,does computer work increase eye pressure?,clueweb12-1413wb-68-06977,846.1234,http://www.control-your-blood-pressure.com/ind...,Lower blood pressure naturally,"<!doctype html>\n<meta charset=""utf-8"">\n<titl...",9


### Step 5: Persist Run

In [7]:
# Announce the step in the cell output before the run file is written.
print('Step 5: Persist Run.')

def normalize_run(run, depth=None, system_name=None):
    """Return a cleaned copy of a run, truncated per query and re-ranked from 1.

    The input frame is never modified; all work happens on a copy.

    Parameters
    ----------
    run : pandas.DataFrame
        Retrieval results with at least the columns ``qid``, ``docno`` and
        ``score``. Any other columns are ignored.
    depth : int, optional
        Maximum number of rows kept per ``qid``. Defaults to the notebook
        constant ``DEPTH``.
    system_name : str, optional
        Value written to the ``system`` column. Defaults to the notebook
        constant ``SYSTEM_NAME``.

    Returns
    -------
    pandas.DataFrame
        Columns ``qid``, ``docno``, ``rank``, ``score``, ``system``. Rows are
        ordered by ``qid`` ascending, then ``score`` and ``docno`` descending,
        and ``rank`` counts 1, 2, 3, ... within every ``qid``.
    """
    if depth is None:
        depth = DEPTH
    if system_name is None:
        system_name = SYSTEM_NAME

    # Copy first so that the caller's frame keeps its original columns and dtypes.
    normalized = run.copy()
    try:
        normalized['qid'] = normalized['qid'].astype(int)
    except (ValueError, TypeError, OverflowError):
        # Best effort only: query ids that are not integers are kept unchanged.
        pass
    normalized['system'] = system_name

    # docno is the final sort key so that ties in score are ordered deterministically.
    normalized = normalized.sort_values(
        ["qid", "score", "docno"], ascending=[True, False, False]
    ).reset_index(drop=True)
    top = normalized.groupby("qid").head(depth)

    # Rank positions start at 1 within each query.
    top = top.assign(rank=top.groupby("qid").cumcount() + 1)

    return top[["qid", "docno", "rank", "score", "system"]]

# output_file holds a directory path; make sure it exists before writing into it.
Path(output_file).mkdir(parents=True, exist_ok=True)
# Tag the run with the configured system name (the variable, not a quoted literal).
pt.io.write_results(normalize_run(run), output_file + '/run.txt', run_name=SYSTEM_NAME)

print('Done...')

Step 5: Persist Run.
Done...


### Create the Re-Ranking Input

In [8]:
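# TODO: Delete this cell. The commented-out code below was only used to create the sample re-ranking input and is not part of the pipeline.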

#from bs4 import BeautifulSoup
#
#
#with open ('/workspace/sample-input/rerank.jsonl', 'w') as out:
#    for _, i in run.iterrows():
#        soup = BeautifulSoup(i['html_plain'], 'html.parser')
#        out.write(json.dumps({
#            'qid': i['qid'],
#            'query': i['query'],
#            'docno': i['docno'],
#            'text': soup.title(string=True)[0] + '\n\n' + soup.get_text(),
#            'original_doc': {
#                'uri': i['target_uri'],
#                'title': i['title_text'],
#                'html_plain': i['html_plain'],
#            }
#        }) + '\n')