# This is my cool Pipeline

### Step 1: Import everything and load variables

In [17]:
import pyterrier as pt
import pandas as pd
import os
from pathlib import Path
from glob import glob

# Name written into the last column of the run file.
SYSTEM_NAME = 'my-retrieval-system'

# Initialise PyTerrier only once per kernel; the versions come from the
# environment and nothing is downloaded (no_download=True).
if not pt.started():
    pt.init(
        version=os.environ['PYTERRIER_VERSION'],
        helper_version=os.environ['PYTERRIER_HELPER_VERSION'],
        no_download=True,
    )

# Input directory: taken from TIRA_INPUT_DIRECTORY when it is set and
# non-empty, otherwise the bundled sample is used.
input_data = os.environ.get('TIRA_INPUT_DIRECTORY', None)

if not input_data:
    input_data = '/workspace/sample-input/re-rank'
    print('I will use a small hardcoded example.')
else:
    print(f'I will read the input data from {input_data}.')

# Despite its name this is the output *directory*; the run file is created inside it.
output_file = os.environ.get('TIRA_OUTPUT_DIRECTORY', '/tmp/')

print(f'I will write the run file to {output_file}')


I will use a small hardcoded example.
I will write the run file to /tmp/


### Step 2: Load the data

In [26]:
# Progress message: report the directory the input data is read from.
print(f'Step 2: Read input data from {input_data}.')

def load_image_text(image_id, input_dir=None):
    """Collect the text of every ``text.txt`` file stored for an image.

    All files matching
    ``<input_dir>/images/<first 3 chars of image_id>/<image_id>/*/*/*/text.txt``
    are read and their contents joined with a blank line between them.

    Args:
        image_id: Identifier of the image; its first three characters name
            the sub-directory below ``images/`` that holds the image folder.
        input_dir: Root directory of the input data. When omitted, the
            notebook-level ``input_data`` is used.

    Returns:
        A dict with the keys ``docno`` (the image id) and ``text`` (the
        joined text with surrounding whitespace stripped; an empty string
        when no file matched).
    """
    if input_dir is None:
        input_dir = input_data

    pattern = os.path.join(
        input_dir, 'images', image_id[:3], image_id, '*', '*', '*', 'text.txt'
    )

    texts = []
    # glob() returns matches in arbitrary order; sorting keeps the joined
    # text identical from run to run.
    for txt_file in sorted(glob(pattern)):
        # The context manager closes the handle, and the explicit encoding
        # keeps decoding independent of the machine's locale.
        with open(txt_file, encoding='utf-8') as handle:
            texts.append(handle.read())

    return {'docno': image_id, 'text': '\n\n'.join(texts).strip()}

# Read the re-rank candidates (one JSON object per line) and derive the
# columns used downstream in a single chained expression:
#   query - every non-alphanumeric character replaced by a space
#   qid   - cast to string
#   text  - text loaded from disk for each docno
df = pd.read_json(input_data + '/rerank.jsonl', lines=True).assign(
    query=lambda frame: frame['query'].apply(
        lambda raw: ''.join(' ' if not ch.isalnum() else ch for ch in raw)
    ),
    qid=lambda frame: frame['qid'].astype('str'),
    text=lambda frame: frame['docno'].apply(
        lambda image_id: load_image_text(image_id)['text']
    ),
)
print('Done...')

df

Step 2: Read input data from /workspace/sample-input/re-rank.
Done...


Unnamed: 0,qid,query,docno,text
0,34,Are social networking sites good for our society,Iad17912610912ffd,"Instagram\nTwitter\nFacebook\n\nThursday, Octo..."
1,34,Are social networking sites good for our society,Ia20c1e2e90f832cb,Upload Log in\n Search\n \nVideo\nSlideshow\n...
2,34,Are social networking sites good for our society,I374eede3492beb08,Free Trial Login\n\tSearch\nFeatures\n\t\nPSHE...
3,34,Are social networking sites good for our society,I8b51d50df3ca6612,Cookies\n\nThis site uses cookies to offer you...
4,34,Are social networking sites good for our society,I0c02739ff554ca9c,Skip to main content\nUtility menu\nAbout this...
...,...,...,...,...
74,48,Should the voting age be lowered,I7dad15970750f8d4,We value your privacy\n\nWe and our partners s...
75,48,Should the voting age be lowered,I9cca04466f27d923,Jump to main content\nJump to search\nJump to ...
76,48,Should the voting age be lowered,I67bbb02abaf26583,Home\nAbout\nLatest\nElections\nEU Politics\nF...
77,48,Should the voting age be lowered,Ia73d445074b4df3d,Skip to content\nHOW I GOT TO 5 MILLION\nFEATU...


### Step 3: Define the actual retrieval approach

In [27]:
print('Step 3: Define the retrieval approach')

# BM25 text scorer that reads the document body from the "text" column;
# verbose=True enables progress output while scoring.
bm25_scorer = pt.text.scorer(
    wmodel='BM25',
    body_attr="text",
    verbose=True,
)


Step 3: Define the retrieval approach


### Step 4: Run the pipeline

In [28]:
print('Step 4: Run the retrieval pipeline')

# Apply the scorer defined in the previous step to the loaded frame and keep
# the result for the persistence step.
run = bm25_scorer(df)
# Bare last expression so the notebook renders the resulting frame.
run

Step 4: Run the retrieval pipeline


51documents [00:00, 93.30documents/s]                                                                                                             
BR(BM25):   0%|                                                                                                              | 0/2 [00:00<?, ?q/s]



BR(BM25): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 151.81q/s]


Unnamed: 0,qid,docno,text,rank,score,query
0,34,Iad17912610912ffd,"Instagram\nTwitter\nFacebook\n\nThursday, Octo...",0,0.000000,Are social networking sites good for our society
1,34,Ia20c1e2e90f832cb,Upload Log in\n Search\n \nVideo\nSlideshow\n...,2,-0.168664,Are social networking sites good for our society
2,34,I374eede3492beb08,Free Trial Login\n\tSearch\nFeatures\n\t\nPSHE...,3,-0.851783,Are social networking sites good for our society
3,34,I8b51d50df3ca6612,Cookies\n\nThis site uses cookies to offer you...,4,-0.864547,Are social networking sites good for our society
4,34,I0c02739ff554ca9c,Skip to main content\nUtility menu\nAbout this...,5,-1.017909,Are social networking sites good for our society
...,...,...,...,...,...,...
74,48,I7dad15970750f8d4,We value your privacy\n\nWe and our partners s...,32,-2.218303,Should the voting age be lowered
75,48,I9cca04466f27d923,Jump to main content\nJump to search\nJump to ...,31,-2.217087,Should the voting age be lowered
76,48,I67bbb02abaf26583,Home\nAbout\nLatest\nElections\nEU Politics\nF...,34,-2.240367,Should the voting age be lowered
77,48,Ia73d445074b4df3d,Skip to content\nHOW I GOT TO 5 MILLION\nFEATU...,33,-2.225849,Should the voting age be lowered


### Step 5: Stance Detection

In [29]:
print('Step 5: Define stence detection')

def detect_stance(query_document_pair):
    """Return the stance label for one row of the run.

    Baseline implementation: the argument is ignored and every row is
    labelled ``'PRO'``.
    """
    baseline_stance = 'PRO'
    return baseline_stance

Step 5: Define stence detection


### Step 6: Persist results

In [30]:
print('Step 6: Persist Run.')

def normalize_run(run, system_name=None, stance_detector=None, depth=1000):
    """Bring a scored run into the column layout written to the run file.

    The input frame is left untouched; all work happens on a copy.

    Args:
        run: DataFrame with at least the columns ``qid``, ``docno`` and
            ``score``.
        system_name: Value for the ``system`` column. Defaults to the
            notebook-level ``SYSTEM_NAME``.
        stance_detector: Callable that receives one row (including any extra
            columns of ``run`` such as the query, plus the new ``rank``) and
            returns the label stored in the ``Q0`` column. Defaults to the
            notebook-level ``detect_stance``.
        depth: Maximum number of rows kept per ``qid``.

    Returns:
        DataFrame with the columns ``qid``, ``Q0``, ``docno``, ``rank``,
        ``score`` and ``system``, ordered by ``qid`` ascending and, within a
        query, by ``score`` then ``docno`` descending. ``rank`` starts at 1
        for every query.
    """
    if system_name is None:
        system_name = SYSTEM_NAME
    if stance_detector is None:
        stance_detector = detect_stance

    # Copy first so that the caller's frame is never modified.
    ranked = run.copy()
    try:
        ranked['qid'] = ranked['qid'].astype(int)
    except (ValueError, TypeError):
        # Query ids that are not numeric are kept as they are.
        pass

    ranked = ranked.sort_values(
        ['qid', 'score', 'docno'], ascending=[True, False, False]
    ).reset_index(drop=True)
    ranked = ranked.groupby('qid').head(depth).copy()

    # Make sure that rank position starts by 1
    ranked['rank'] = ranked.groupby('qid').cumcount() + 1

    # A list built from iterrows() also works for an empty frame, where
    # DataFrame.apply(axis=1) would not return a column-shaped result.
    ranked['Q0'] = [stance_detector(row) for _, row in ranked.iterrows()]
    ranked['system'] = system_name

    return ranked[['qid', 'Q0', 'docno', 'rank', 'score', 'system']]

# Create the output directory (including missing parents) before writing.
Path(output_file).mkdir(exist_ok=True, parents=True)

# Write the normalised run as space-separated values without header or index.
normalize_run(run).to_csv(
    output_file + '/run.txt',
    sep=' ',
    header=False,
    index=False,
)

print('Done...')

Step 6: Persist Run.
Done...
