# This is my cool Pipeline

### Step 1: Import everything and load variables

In [1]:
import pyterrier as pt
import pandas as pd
import os
import json
from tqdm import tqdm
from glob import glob
from pathlib import Path

# Maximum number of results kept per query when the run is normalized.
DEPTH = 1000
# Name written into the last column of the run file; overridable via the environment.
SYSTEM_NAME = os.environ.get('TIRA_SYSTEM_NAME' ,'my-retrieval-system')

# Start PyTerrier only once per kernel. Both version variables are read with
# os.environ[...], so a missing variable raises KeyError here.
# NOTE(review): no_download=True presumably requires Terrier to be available locally - confirm.
if not pt.started():
    pt.init(version=os.environ['PYTERRIER_VERSION'], helper_version=os.environ['PYTERRIER_HELPER_VERSION'], no_download=True)

# Directory holding queries.jsonl and the images/ tree; None when the variable is unset.
input_data = os.environ.get('TIRA_INPUT_DIRECTORY', None)

if input_data:
    print(f'I will read the input data from {input_data}.')
else:
    # Fall back to the sample data path when no input directory is configured.
    input_data = '/workspace/sample-input/full-rank'
    print('I will use a small hardcoded example.')

# Despite its name this is a DIRECTORY: later cells create it and write
# '<output_file>/run.txt' into it.
output_file = os.environ.get('TIRA_OUTPUT_DIRECTORY', '/tmp/')

print(f'I will write the run file to {output_file}')


PyTerrier 0.9.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


I will use a small hardcoded example.
I will write the run file to /tmp/


### Step 2: Load the Queries

In [3]:
print('Step 2: Load the queries.')

def load_queries():
    """Read `<input_data>/queries.jsonl` into a DataFrame with sanitized query text.

    Every character of the `query` column that is not alphanumeric is replaced
    by a single space, see https://github.com/terrier-org/pyterrier/issues/62

    Returns:
        The DataFrame parsed from the JSON-lines file, with the cleaned `query` column.

    Raises:
        ValueError: if the queries file does not exist; the message lists the
            entries found in the input directory.
    """
    file_name = input_data + '/queries.jsonl'

    if os.path.exists(file_name):
        frame = pd.read_json(file_name, lines=True)
    else:
        raise ValueError(f'Could not find the file "{file_name}". Got: {glob(input_data + "/*")}')

    def _blank_out_symbols(raw_query):
        # Keep letters and digits, turn everything else into a space.
        return "".join(char if char.isalnum() else " " for char in raw_query)

    frame['query'] = frame['query'].map(_blank_out_symbols)

    return frame

# Load the queries once; `queries` is the input of the retrieval pipeline below.
queries =  load_queries()  
# Display the loaded frame as the cell output.
queries

Step 2: Load the queries.


Unnamed: 0,qid,query
0,34,Are social networking sites good for our society
1,48,Should the voting age be lowered


### Step 3: Index the images

In [4]:
print('Step 3: Create the Index.')

def all_images():
    """Yield one indexable document per image directory below `<input_data>/images/*/`.

    For every path matching `<input_data>/images/*/*`, the contents of all
    `text.txt` files located three directory levels below it are joined with
    blank lines and stripped of surrounding whitespace.

    Paths are visited in sorted order because `glob` gives no ordering
    guarantee; sorting makes both the document order and the concatenated
    text reproducible across machines. Each text file is closed right after
    reading instead of leaking an open handle per file.

    Yields:
        dict: `{'docno': <last path component>, 'text': <joined text>}`.
        `text` is the empty string when no `text.txt` file is found.
    """
    for image_dir in sorted(glob(input_data + '/images/*/*')):
        image_id = image_dir.split('/')[-1]
        texts = []
        for txt_file in sorted(glob(image_dir + '/*/*/*/text.txt')):
            with open(txt_file) as handle:
                texts.append(handle.read())

        yield {'docno': image_id, 'text': '\n\n'.join(texts).strip()}


# Remove any index left over from a previous run so this cell can be re-executed.
!rm -Rf ./index
# Index the generated documents into ./index, keeping `docno` and `text` as metadata.
# NOTE(review): 20 and 4096 are presumably the maximum stored lengths of these fields - confirm.
iter_indexer = pt.IterDictIndexer("./index", meta={'docno': 20, 'text': 4096})
# tqdm wraps the generator so progress is reported while documents are consumed.
index_ref = iter_indexer.index(tqdm(all_images()))


Step 3: Create the Index.


50it [00:00, 54.68it/s]


### Step 5: Define the Pipeline

In [6]:
print('Step 5: Define the Pipeline.')

# BM25 retrieval over the index built in the indexing step (`index_ref`).
retrieval_pipeline = pt.BatchRetrieve(index_ref, wmodel="BM25", verbose=True)

Step 5: Define the Pipeline.


### Step 6: Create Run

In [10]:
print('Step 6: Create Run.')
# Apply the retrieval pipeline to the query frame; `run` is normalized and persisted later.
run = retrieval_pipeline(queries)

Step 6: Create Run.


BR(BM25): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  9.79q/s]


In [11]:
# Display the raw retrieval result as the cell output.
run

Unnamed: 0,qid,docid,docno,rank,score,query
0,34,39,Iad17912610912ffd,0,-0.152528,Are social networking sites good for our society
1,34,33,Ia20c1e2e90f832cb,1,-0.307966,Are social networking sites good for our society
2,34,49,I374eede3492beb08,2,-1.046229,Are social networking sites good for our society
3,34,24,I8b51d50df3ca6612,3,-1.059896,Are social networking sites good for our society
4,34,9,I0c02739ff554ca9c,4,-1.241300,Are social networking sites good for our society
...,...,...,...,...,...,...
74,48,8,I7dad15970750f8d4,31,-2.949061,Should the voting age be lowered
75,48,19,I9cca04466f27d923,32,-2.952404,Should the voting age be lowered
76,48,16,I67bbb02abaf26583,33,-2.956738,Should the voting age be lowered
77,48,10,Ia73d445074b4df3d,34,-2.961695,Should the voting age be lowered


### Step 7: Stance Detection

In [12]:
print('Step 7: Define stance detection')

def detect_stance(query_document_pair):
    """Return the stance label for one retrieved query/document pair.

    Args:
        query_document_pair: the run row describing the pair; ignored by this
            baseline.

    Returns:
        str: always 'PRO'.
    """
    # As baseline, we return always pro
    return 'PRO'

Step 7: Define stence detection


### Step 8: Persist Run

In [13]:
print('Step 8: Persist Run.')

def normalize_run(run):
    """Turn a retrieval result into the six-column layout written to the run file.

    Per query, rows are ordered by descending score (ties broken by descending
    docno), cut off after DEPTH rows, and re-ranked starting at 1. The stance
    returned by `detect_stance` for each row is stored in the `Q0` column.

    The input frame is left untouched: all work happens on a copy, so a frame
    displayed in an earlier cell does not silently change.

    Args:
        run: DataFrame with at least the columns `qid`, `docno` and `score`.
            An existing `rank` column is not required; ranks are recomputed.

    Returns:
        DataFrame with the columns `qid`, `Q0`, `docno`, `rank`, `score`, `system`.
    """
    normalized = run.copy()

    try:
        normalized['qid'] = normalized['qid'].astype(int)
    except (ValueError, TypeError):
        # Best effort only: non-numeric query ids are kept as they are.
        pass

    normalized = normalized.sort_values(
        ["qid", "score", "docno"], ascending=[True, False, False]
    ).reset_index(drop=True)
    normalized = normalized.groupby("qid").head(DEPTH)[["qid", "docno", "score"]].copy()

    # Make sure that rank position starts by 1
    normalized["rank"] = normalized.groupby("qid").cumcount() + 1
    normalized["system"] = SYSTEM_NAME
    # A plain loop (instead of DataFrame.apply) also works for an empty run.
    normalized["Q0"] = [detect_stance(row) for _, row in normalized.iterrows()]

    return normalized[['qid', 'Q0', 'docno', 'rank', 'score', 'system']]

# `output_file` holds a directory path; create it (and any parents) if it is missing.
Path(output_file).mkdir(parents=True, exist_ok=True)
# Write the normalized run space-separated, without header or index column, to '<output_file>/run.txt'.
normalize_run(run).to_csv(output_file + '/run.txt', sep=' ', header=False, index=False)

print('Done...')

Step 6: Persist Run.
Done...


In [14]:
!cat /tmp/run.txt

34 PRO Iad17912610912ffd 1 -0.15252834973127183 my-retrieval-system
34 PRO Ia20c1e2e90f832cb 2 -0.30796645663411376 my-retrieval-system
34 PRO I374eede3492beb08 3 -1.046228903596743 my-retrieval-system
34 PRO I8b51d50df3ca6612 4 -1.0598961782264882 my-retrieval-system
34 PRO I0c02739ff554ca9c 5 -1.2412999972368923 my-retrieval-system
34 PRO I3148bc10eaa1db27 6 -1.324361633481474 my-retrieval-system
34 PRO Ia51bd34a65572b5f 7 -1.7183700850377206 my-retrieval-system
34 PRO Icd93895acd1ab732 8 -2.158627225073589 my-retrieval-system
34 PRO I270936e4b9d90dbb 9 -2.4947985179280403 my-retrieval-system
34 PRO I2b62b2335042df6d 10 -2.6692315605255468 my-retrieval-system
34 PRO Ib94f6daf4ab47689 11 -2.803357673757736 my-retrieval-system
34 PRO Icc0f1da10e9b92b9 12 -3.046661112996963 my-retrieval-system
34 PRO If497a0f1c2b730b5 13 -3.1839412967065392 my-retrieval-system
34 PRO I7dad15970750f8d4 14 -3.2251880425051076 my-retrieval-system
34 PRO Ide6d93c2173be1a7 15 -3.234162322647455 my-retrieval-

In [19]:
### Create the Re-Ranking Input

#TODO: Delete this...

#queries = [json.loads(i) for i in open(input_data + '/queries.jsonl')]
#queries = {i['qid']: i['query'] for i in queries}
#
#with open ('/workspace/sample-input/rerank.jsonl', 'w') as out:
#    for _, i in run.iterrows():
#        out.write(json.dumps({
#            'qid': i['qid'],
#            'query': queries[str(i['qid'])],
#            'docno': i['docno'],
#        }) + '\n')