# This is my cool Pipeline

### Step 1: Import everything and load variables

In [1]:
import pyterrier as pt
import pandas as pd
import os
from pathlib import Path

# Name written into the last column of the persisted run file.
SYSTEM_NAME = 'my-retrieval-system'

# Initialise PyTerrier only when it has not been started yet. The Terrier and
# helper versions are taken from the environment and no_download=True is passed.
if not pt.started():
    pt.init(
        version=os.environ['PYTERRIER_VERSION'],
        helper_version=os.environ['PYTERRIER_HELPER_VERSION'],
        no_download=True,
    )

# Use the directory named by TIRA_INPUT_DIRECTORY when it is set (and non-empty);
# otherwise fall back to the hardcoded sample file.
tira_input_directory = os.environ.get('TIRA_INPUT_DIRECTORY')
if tira_input_directory:
    input_data = tira_input_directory + '/rerank.jsonl'
    print(f'I will read the input data from {input_data}.')
else:
    input_data = '/workspace/sample-input/re-rank-default-text/rerank.jsonl'
    print('I will use a small hardcoded example.')

# Despite its name, output_file holds the output *directory*; it defaults to /tmp/.
output_file = os.environ.get('TIRA_OUTPUT_DIRECTORY', '/tmp/')
print(f'I will write the run file to {output_file}')


PyTerrier 0.9.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


I will use a small hardcoded example.
I will write the run file to /tmp/


### Step 2: Load the data

In [2]:
# Read the re-rank candidates: the input file holds one JSON object per line.
print(f'Read input data from {input_data}.')
df = pd.read_json(input_data, lines=True)

# Replace every non-alphanumeric character of the query with a single blank.
df['query'] = df['query'].apply(
    lambda raw_query: ''.join(ch if ch.isalnum() else ' ' for ch in raw_query)
)
# Query ids are handled as strings from here on.
df['qid'] = df['qid'].astype('str')
# Lower-case the document text.
df['text'] = df['text'].apply(lambda body: body.lower())
print('Done...')

df

Read input data from /workspace/sample-input/re-rank-default-text/rerank.jsonl.
Done...


Unnamed: 0,qid,query,docno,text
0,1111,does computer work increase eye pressure,clueweb12-1106wb-16-17437,eyes hurt looking computer screen\n\n\n\neyes ...
1,1111,does computer work increase eye pressure,clueweb12-0302wb-19-28258,how the eye works\n\n\n\nhow the eye works\n\n...
2,1111,does computer work increase eye pressure,clueweb12-1212wb-00-02238,how the eye works\n\n\n\nhow the eye works\n\n...
3,1111,does computer work increase eye pressure,clueweb12-0204wb-22-27404,how does homeopathy work?\n\n\n\nhow does home...
4,1111,does computer work increase eye pressure,clueweb12-0200wb-41-19355,how does homeopathy work?\n\n\n\nhow does home...
5,1111,does computer work increase eye pressure,clueweb12-0310wb-61-05771,effect of cues to increase sound pressure leve...
6,1111,does computer work increase eye pressure,clueweb12-1515wb-05-13926,"eye care center, eye clinics, eye care\n\n\n\n..."
7,1111,does computer work increase eye pressure,clueweb12-1709wb-16-10264,pressurex-micro | tactile pressure indicating ...
8,1111,does computer work increase eye pressure,clueweb12-1303wb-99-19032,does work make you sick? then lets change the ...
9,1111,does computer work increase eye pressure,clueweb12-1413wb-68-06977,lower blood pressure naturally\n\n\n\nlower bl...


### Step 3: Define the actual retrieval approach

In [3]:
# Create a PyTerrier text scorer configured with the BM25 weighting model.
# body_attr="text" points the scorer at the "text" column of the input frame;
# verbose=True enables the progress output shown when the scorer is run.
bm25_scorer = pt.text.scorer(body_attr="text", wmodel='BM25', verbose=True)


### Step 4: Run the pipeline

In [4]:
# Apply the BM25 scorer to the loaded candidates and display the scored frame.
# NOTE(review): in the displayed output the "rank" column starts at 0; ranks are
# renumbered to start at 1 when the run is normalised for persisting.
run = bm25_scorer(df)
run

31documents [00:00, 45.13documents/s]                                                                                                             
BR(BM25):   0%|                                                                                                              | 0/3 [00:00<?, ?q/s]



BR(BM25): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 15.63q/s]


Unnamed: 0,qid,docno,text,rank,score,query
0,1111,clueweb12-1106wb-16-17437,eyes hurt looking computer screen\n\n\n\neyes ...,0,-1.692322,does computer work increase eye pressure
1,1111,clueweb12-0302wb-19-28258,how the eye works\n\n\n\nhow the eye works\n\n...,1,-2.76882,does computer work increase eye pressure
2,1111,clueweb12-1212wb-00-02238,how the eye works\n\n\n\nhow the eye works\n\n...,2,-2.76882,does computer work increase eye pressure
3,1111,clueweb12-0204wb-22-27404,how does homeopathy work?\n\n\n\nhow does home...,6,-4.642983,does computer work increase eye pressure
4,1111,clueweb12-0200wb-41-19355,how does homeopathy work?\n\n\n\nhow does home...,7,-4.64454,does computer work increase eye pressure
5,1111,clueweb12-0310wb-61-05771,effect of cues to increase sound pressure leve...,4,-4.050981,does computer work increase eye pressure
6,1111,clueweb12-1515wb-05-13926,"eye care center, eye clinics, eye care\n\n\n\n...",3,-3.937747,does computer work increase eye pressure
7,1111,clueweb12-1709wb-16-10264,pressurex-micro | tactile pressure indicating ...,9,-5.3688,does computer work increase eye pressure
8,1111,clueweb12-1303wb-99-19032,does work make you sick? then lets change the ...,8,-4.792538,does computer work increase eye pressure
9,1111,clueweb12-1413wb-68-06977,lower blood pressure naturally\n\n\n\nlower bl...,5,-4.422439,does computer work increase eye pressure


### Step 5: Stance Detection

In [5]:
# NOTE(review): "stence" below is a typo for "stance"; the message is runtime
# output and is therefore left exactly as it was.
print('Step 5: Define stence detection')

def detect_stance(query_document_pair):
    """Return the stance label for a single query-document pair.

    Baseline behaviour: the argument is not inspected and the neutral label
    'NEU' is returned for every pair.
    """
    return 'NEU'

Step 5: Define stence detection


### Step 6: Persist results

In [7]:
print('Step 6: Persist Run.')

def normalize_run(run):
    """Turn a scored ranking into the six-column layout that is written to disk.

    The input frame is left untouched; all work happens on a copy.

    Steps:
      1. Cast ``qid`` to int when every value allows it (otherwise keep it as is).
      2. Sort by ``qid`` ascending, then ``score`` and ``docno`` descending.
      3. Keep at most 1000 rows per ``qid``.
      4. Renumber ``rank`` so that it starts at 1 within every ``qid``.
      5. Fill ``Q0`` with the label returned by ``detect_stance`` for each row
         and ``system`` with ``SYSTEM_NAME``.

    Args:
        run: DataFrame with at least the columns ``qid``, ``docno`` and
            ``score``. Any further columns (e.g. ``query``, ``text``) are kept
            until stance detection has run, so ``detect_stance`` can use them.

    Returns:
        A new DataFrame with the columns
        ``['qid', 'Q0', 'docno', 'rank', 'score', 'system']`` in that order.
    """
    # Work on a private copy so the caller's frame is never modified.
    run = run.copy()

    try:
        run['qid'] = run['qid'].astype(int)
    except (ValueError, TypeError, OverflowError):
        # Best effort: query ids that cannot be read as integers stay unchanged.
        pass

    run = run.sort_values(
        ["qid", "score", "docno"], ascending=[True, False, False]
    ).reset_index(drop=True)

    # Keep the top 1000 rows per query. All columns are retained at this point
    # so that the stance detector below still sees the query and document text.
    run = run.groupby("qid").head(1000).copy()

    # Make sure that rank position starts by 1
    run["rank"] = run.groupby("qid").cumcount() + 1
    run["system"] = SYSTEM_NAME

    # Labels are collected row by row into a plain list, one label per row.
    run["Q0"] = [detect_stance(row) for _, row in run.iterrows()]

    return run[['qid', 'Q0', 'docno', 'rank', 'score', 'system']]

# Create the output directory (including missing parents) if it does not exist yet.
output_directory = Path(output_file)
output_directory.mkdir(parents=True, exist_ok=True)

# Write the normalised run as space-separated text without header row or index column.
normalized_run = normalize_run(run)
normalized_run.to_csv(output_file + '/run.txt', sep=' ', header=False, index=False)

print('Done...')

Step 6: Persist Run.
Done...
