In [6]:
# Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
import pyterrier as pt
import json
import pandas as pd

# Initialise PyTerrier through TIRA's helper before any `pt.*` call in the cells below.
# NOTE(review): presumably this also makes PyTerrier usable inside the TIRA sandbox — confirm.
ensure_pyterrier_is_loaded()
# No TIRA REST client is created in this notebook; the line below is a disabled stub.
# tira = Client()

In [8]:
# Identifiers of the corpus and of the index variant requested from it.
dataset_name, index_name = 'msmarco_passage', 'terrier_stemmed'

# Resolve the dataset handle first, then ask it for the named index.
dataset = pt.get_dataset(dataset_name)
index_ref = dataset.get_index(index_name)

# BM25 retriever over that index.
# Alternative form left disabled by the original author:
#   pt.BatchRetrieve.from_dataset(dataset_name, index_name, wmodel='BM25')
bm25 = pt.BatchRetrieve(index_ref, wmodel='BM25')

In [10]:
# KL-divergence query expansion over the same index as the retriever:
# fb_docs feedback documents are considered and fb_terms terms are added per query.
kl = pt.rewrite.KLQueryExpansion(index_ref, fb_terms=10, fb_docs=3)

# Pipeline: BM25 retrieves the feedback documents, then KL rewrites each query.
pipeline = bm25 >> kl

# Small hand-written query set (the dataset's own topics are not used here;
# the original author left `dataset.get_topics('train')` disabled).
# qids are strings, matching PyTerrier's data model, so they line up with the
# qids PyTerrier emits if the frames are joined or compared later.
queries = [
    {"qid": "1", "query": "what is rba"},
    {"qid": "2", "query": "was ronald reagan a democrat"},
    {"qid": "3", "query": "how long do you need for sydney and surrounding areas"}
]
queries_df = pd.DataFrame(queries)

# Display the first few queries
print("Original Queries:")
print(queries_df.head())

# Run the pipeline once and keep the full result frame, so the qid and the
# original query text ("query_0") stay available next to the rewritten query.
expansion_results = pipeline.transform(queries_df)

# `expanded_queries` stays a Series of the rewritten query strings. As the
# recorded output shows, each string starts with Terrier's `applypipeline:off`
# control token followed by weighted terms.
expanded_queries = expansion_results['query']

# Label reproduces the header present in the notebook's recorded output.
print("Expanded Queries DataFrame:")
print(expanded_queries)


Original Queries:
   qid                                              query
0    1                                        what is rba
1    2                       was ronald reagan a democrat
2    3  how long do you need for sydney and surroundin...
Expanded Queries DataFrame:
0    applypipeline:off rba^1.846029471 type^0.04316...
1    applypipeline:off ronald^1.246149665 reagan^1....
2    applypipeline:off long^1.000000000 need^1.0000...
Name: query, dtype: object
