In [66]:
# Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
ensure_pyterrier_is_loaded()
import pandas as pd
import pyterrier as pt
from tqdm import tqdm
import gzip
import json



In [67]:
# Initialize TIRA client
# Used below to load the pre-built index (tira.pt.index) and to fetch the
# DocT5Query run output (tira.get_run_output).
tira = Client()

In [68]:
# Dataset setup

# The dataset: the union of the IR Anthology and the ACL Anthology.
# pt.get_dataset creates an IRDSDataset object and registers it under the name provided as an argument.
dataset = 'ir-acl-anthology-20240504-training'
pt_dataset = pt.get_dataset(f'irds:ir-lab-sose-2024/{dataset}')
# Load the (pre-built) PyTerrier index for this dataset from TIRA
index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)

# Retrieve topics from the dataset
topics=pt_dataset.get_topics('text')
# Disabled alternative: take the topics from the antique dataset instead.
#dataset2 = 'antique-test-20230107-training'
#pt_dataset2 = pt.get_dataset(f'irds:ir-benchmarks/{dataset2}')
#topics=pt_dataset2.get_topics('text')


In [69]:
# Directly initialize BM25 model using PyTerrier
# NOTE: `index` is rebound in a later cell, so this retriever must be created
# before that cell runs to be backed by the index loaded from TIRA.
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

In [70]:
# Define function to read DocT5Query expanded documents
def doc_t5_query(pt_dataset):
    """Stream the DocT5Query output of a dataset as dicts ready for indexing.

    Reads ``documents.jsonl.gz`` from the run output of
    'ir-benchmarks/seanmacavaney/DocT5Query' and, for every JSON line, moves
    the value of ``querygen`` to ``text`` and the value of ``doc_id`` to
    ``docno``. Any other keys of the record are passed through unchanged.

    Args:
        pt_dataset: Passed as-is to ``tira.get_run_output`` as the dataset
            argument.

    Yields:
        One dict per line of the file, with ``text`` and ``docno`` set and
        ``querygen`` / ``doc_id`` removed.

    Raises:
        KeyError: If a record has no ``querygen`` or no ``doc_id`` key.
    """
    run_dir = tira.get_run_output('ir-benchmarks/seanmacavaney/DocT5Query', pt_dataset)
    docs_path = run_dir + '/documents.jsonl.gz'
    with gzip.open(docs_path, 'rt') as lines:
        for raw_line in tqdm(lines):
            record = json.loads(raw_line)
            # The generated queries replace the document text in the index.
            record['text'] = record.pop('querygen')
            record['docno'] = record.pop('doc_id')
            yield record


# Define function to create index from DocT5Query expanded documents
def doc_t5_query_index(pt_dataset, index_path="/tmp/index2"):
    """Build an index over the DocT5Query-expanded documents of a dataset.

    Args:
        pt_dataset: Forwarded unchanged to ``doc_t5_query``, which streams the
            documents to index.
        index_path: Directory handed to ``pt.IterDictIndexer``. Defaults to
            "/tmp/index2", the location that used to be hard-coded, so
            existing callers behave exactly as before.

    Returns:
        The object returned by ``pt.IndexFactory.of`` for the index reference
        produced by the indexer.
    """
    # overwrite=True is passed so that re-running replaces an index already
    # present at index_path. `meta` names the fields kept in the meta index
    # together with their maximum lengths (see the IterDictIndexer docs).
    indexer = pt.IterDictIndexer(index_path, overwrite=True, meta={'docno': 100, 'text': 20480})
    index_ref = indexer.index(doc_t5_query(pt_dataset))
    return pt.IndexFactory.of(index_ref)

# Create index from expanded documents
#index = doc_t5_query_index(dataset)

In [71]:

# Build the DocT5Query index for the dataset selected above.
# Merely opening '/tmp/index2' fails on a fresh machine (nothing else in this
# notebook builds it) and otherwise silently reuses whatever index an earlier
# session left at that path, which may belong to a different dataset.
# doc_t5_query_index writes to that same path with overwrite=True, so
# re-running this cell is safe; it just repeats the indexing work.
index = doc_t5_query_index(dataset)

In [72]:
# Retrieve documents using the BM25 model and remember, per query id,
# the set of docnos it returned.
docs_retrieved_by_bm25 = {}
bm25_result = bm25(pt_dataset.get_topics('title'))

for _, row in tqdm(bm25_result.iterrows()):
    qid = str(row['qid'])
    docno = str(row['docno'])
    # setdefault creates the per-query set on first sight of a qid.
    docs_retrieved_by_bm25.setdefault(qid, set()).add(docno)

66283it [00:02, 23996.43it/s]


In [73]:
# Define transformer that omits documents already retrieved by plain BM25
def _omit_already_retrieved(run):
    """Drop the rows whose docno plain BM25 already returned for the same qid.

    Args:
        run: DataFrame with at least the columns ``qid`` and ``docno``.

    Returns:
        The rows of ``run`` whose (qid, docno) pair is not recorded in
        ``docs_retrieved_by_bm25``. A qid that BM25 returned nothing for has
        no entry there; all of its rows are kept instead of raising KeyError.
        An empty input frame is returned empty with its columns intact.
    """
    # Build the mask as an explicit boolean Series aligned with run.index:
    # a row-wise DataFrame.apply returns a DataFrame (not a Series) for an
    # empty frame, which is not a valid row mask.
    keep = pd.Series(
        [
            str(docno) not in docs_retrieved_by_bm25.get(str(qid), ())
            for qid, docno in zip(run['qid'], run['docno'])
        ],
        index=run.index,
        dtype=bool,
    )
    return run[keep]


omit_already_retrieved_docs = pt.apply.generic(_omit_already_retrieved)

In [74]:
# Create BM25 model for DocT5Query index and apply omission filter
# `index` was rebound in the previous cell, so this retriever is not backed
# by the index that `bm25` was created on.
bm25_doct5query = pt.BatchRetrieve(index, wmodel="BM25")
# Same retriever followed by the filter: only documents that plain BM25 did
# not retrieve for the query remain in its output.
bm25_doct5query_new = bm25_doct5query >> omit_already_retrieved_docs

In [75]:
# Create and run the retrieval

print('Create run')
# NOTE(review): the run uses the unfiltered retriever; bm25_doct5query_new
# (with the omission filter) is defined above but not used here - confirm
# this is intended.
run = bm25_doct5query(topics)
print('Done, run was created')


Create run
Done, run was created


In [76]:
# Persist and normalize the run
# Outside the TIRA sandbox the run file is written below '../runs'
# (see the log output of this cell).
persist_and_normalize_run(run, system_name = 'doc_T5_Query', default_output='../runs')

# Diagnostic: Check the first few rows of the run
print(run.head())

The run file is normalized outside the TIRA sandbox, I will store it at "../runs".
Done. run file is stored under "../runs/run.txt".
   qid   docid      docno  rank      score  \
0    1   55753  1883395_0     0  19.780018   
1    1   59013  2915275_2     1  19.766216   
2    1   79345  4202345_0     2  19.766216   
3    1  136866  1254685_1     3  19.766216   
4    1  287157  3080874_4     4  19.766216   

                                      query        system  
0  retrieval system improving effectiveness  doc_T5_Query  
1  retrieval system improving effectiveness  doc_T5_Query  
2  retrieval system improving effectiveness  doc_T5_Query  
3  retrieval system improving effectiveness  doc_T5_Query  
4  retrieval system improving effectiveness  doc_T5_Query  
