In [None]:
# You only need to execute this cell if you are using Google Golab.
# If you use GitHub Codespaces, everything is already installed.
!pip3 install tira ir-datasets python-terrier

In [None]:
# Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client

import nltk
from nltk.stem import PorterStemmer
# Tokenizer models needed by nltk.word_tokenize. Older NLTK releases load
# 'punkt', newer ones load 'punkt_tab'; fetching both keeps the notebook
# working regardless of which NLTK version pip resolved.
nltk.download('punkt')
nltk.download('punkt_tab')

import pyterrier as pt
import pandas as pd

In [None]:
# Set up PyTerrier via the TIRA helper (presumably this initialises PyTerrier
# if it is not loaded yet — see the TIRA docs), then create a REST client to
# the TIRA platform for retrieving the pre-indexed data.
ensure_pyterrier_is_loaded()
tira = Client()

# NLTK Porter stemmer; used further down to stem the tokens of every topic query.
stemmer = PorterStemmer()

In [None]:
# The dataset: the union of the IR Anthology and the ACL Anthology.
# This line creates an IRDSDataset object and registers it under the name provided as an argument.
# `pt_dataset` is the source of the topics and qrels used in the cells below.
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

# A (pre-built) PyTerrier index loaded from TIRA for this dataset.
# The BM25 retriever defined later searches this index.
index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)

In [None]:
# Load the topics (using the 'text' query variant) and stem every query.
topics = pt_dataset.get_topics('text')


def _stem_query(raw_query):
    """Tokenize ``raw_query`` with NLTK and join the stemmed tokens with single spaces."""
    tokens = nltk.word_tokenize(raw_query)
    return ' '.join(stemmer.stem(token) for token in tokens)


# One {'qid', 'query'} record per topic, in the order of the topics frame.
stemmed_topics = [
    {'qid': topic['qid'], 'query': _stem_query(topic['query'])}
    for _, topic in topics.iterrows()
]

In [None]:
# Preview the first few stemmed queries instead of dumping the whole list.
stemmed_topics[:5]

In [83]:
# Define the retrieval pipeline with BM25 on top of the pre-built index.
# NOTE(review): if this retriever/index already stems query terms on its own,
# the pre-stemmed queries built above get stemmed twice — confirm this is intended.
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

# Convert stemmed_topics (a list of {'qid', 'query'} dicts) to a DataFrame
# so it can be passed to the retriever and to pt.Experiment.
df_stemmed_topics = pd.DataFrame(stemmed_topics)

In [84]:
# Apply the BM25 retriever to the stemmed queries; `run` is persisted in the next cell.
run = bm25(df_stemmed_topics)

In [85]:
# Persist the run via the TIRA helper; outside the TIRA sandbox it is stored under '../runs'.
# NOTE(review): the system name says 'bm25-baseline' although `run` was produced
# from the stemmed queries — confirm the label is intended.
persist_and_normalize_run(run, system_name='bm25-baseline', default_output='../runs')

The run file is normalized outside the TIRA sandbox, I will store it at "../runs".
Done. run file is stored under "../runs/run.txt".


In [86]:
# Local test: evaluate plain BM25 on the unstemmed topics.
# The 'text' variant is requested explicitly: it is the same query field the
# stemmed topics are built from, and it avoids PyTerrier's
# "There are multiple query fields available" warning.
# NOTE: with a single system, `baseline=0` leaves the significance columns empty.
pt.Experiment(
    [bm25],
    pt_dataset.get_topics('text'),
    pt_dataset.get_qrels(),
    eval_metrics=['P_1000', 'map', 'recip_rank'],
    names=['BM25'],
    baseline=0,
)

There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.


Unnamed: 0,name,map,recip_rank,P_1000,map +,map -,map p-value,recip_rank +,recip_rank -,recip_rank p-value,P_1000 +,P_1000 -,P_1000 p-value
0,BM25,0.262311,0.579877,0.016191,,,,,,,,,


In [87]:
# Evaluate BM25 on the pre-stemmed queries.
# The queries are stemmed with NLTK's PorterStemmer (the original Porter
# algorithm, not Porter2/Snowball), so the label names that stemmer.
# NOTE: with a single system, `baseline=0` leaves the significance columns empty.
pt.Experiment(
    [bm25],
    df_stemmed_topics,
    pt_dataset.get_qrels(),
    eval_metrics=['P_1000', 'map', 'recip_rank'],
    names=['BM25 + Porter Stemmer'],
    baseline=0,
)

Unnamed: 0,name,map,recip_rank,P_1000,map +,map -,map p-value,recip_rank +,recip_rank -,recip_rank p-value,P_1000 +,P_1000 -,P_1000 p-value
0,BM25 + Porter2 Stemmer,0.243377,0.498491,0.015456,,,,,,,,,
