In [None]:
# You only need to execute this cell if you are using Google Golab.
# If you use GitHub Codespaces, everything is already installed.
!pip3 install tira ir-datasets python-terrier nltk

In [22]:
# Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client

import nltk
from nltk.stem import WordNetLemmatizer

import pyterrier as pt
import pandas as pd

In [23]:
# Ensure necessary NLTK resources are downloaded
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [24]:
ensure_pyterrier_is_loaded()
tira = Client()

In [25]:
# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [4]:
# Function to lemmatize text
# def lemmatize_text(text):
#     return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

In [26]:
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')
index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)

In [27]:
# Preprocess topics by lemmatizing
topics = pt_dataset.get_topics('text')
# print('Columns in topics DataFrame:', topics.columns)

In [28]:
lemmatized_topics = []
for idx, row in topics.iterrows():
    lemmatized_query = ' '.join([lemmatizer.lemmatize(word) for word in nltk.word_tokenize(row['query'])])
    lemmatized_topics.append({'qid': row['qid'], 'query': lemmatized_query})

In [12]:
# text_column = topics.columns[0]
# print('Using column for text:', text_column)
# topics[text_column] = topics[text_column].apply(lemmatize_text)

Using column for text: qid


In [29]:
# BM25 Retrieval Model
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

In [30]:
df_lemmatized_topics = pd.DataFrame(lemmatized_topics)

In [31]:
run = bm25(df_lemmatized_topics)

In [32]:
persist_and_normalize_run(run, system_name='bm25-lemmatizer', default_output='../runs')

The run file is normalized outside the TIRA sandbox, I will store it at "../runs".
Done. run file is stored under "../runs/run.txt".


In [33]:
# Localtest
pt.Experiment(
    [bm25], 
    topics, 
    pt_dataset.get_qrels(), 
    eval_metrics=['P_1000', 'map', 'recip_rank'],
    names=['BM25'],
    baseline=0
)

Unnamed: 0,name,map,recip_rank,P_1000,map +,map -,map p-value,recip_rank +,recip_rank -,recip_rank p-value,P_1000 +,P_1000 -,P_1000 p-value
0,BM25,0.262311,0.579877,0.016191,,,,,,,,,


In [35]:
run.head(10)

Unnamed: 0,qid,docid,docno,rank,score,query,system
0,1,94858,2004.cikm_conference-2004.47,0,15.681777,retrieval system improving effectiveness,bm25-lemmatizer
1,1,125137,1989.ipm_journal-ir0volumeA25A4.2,1,15.04738,retrieval system improving effectiveness,bm25-lemmatizer
2,1,125817,2005.ipm_journal-ir0volumeA41A5.11,2,14.144223,retrieval system improving effectiveness,bm25-lemmatizer
3,1,5868,W05-0704,3,14.025748,retrieval system improving effectiveness,bm25-lemmatizer
4,1,84876,2016.ntcir_conference-2016.90,4,13.947994,retrieval system improving effectiveness,bm25-lemmatizer
5,1,82472,1998.sigirconf_conference-98.15,5,13.901647,retrieval system improving effectiveness,bm25-lemmatizer
6,1,94415,2008.cikm_conference-2008.183,6,13.808208,retrieval system improving effectiveness,bm25-lemmatizer
7,1,17496,O01-2005,7,13.749449,retrieval system improving effectiveness,bm25-lemmatizer
8,1,82490,1998.sigirconf_conference-98.33,8,13.735541,retrieval system improving effectiveness,bm25-lemmatizer
9,1,124801,2006.ipm_journal-ir0volumeA42A3.2,9,13.569263,retrieval system improving effectiveness,bm25-lemmatizer
