In [12]:
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run, ir_datasets
from tira.rest_api_client import Client
import pyterrier as pt
from pyterrier_pisa import PisaIndex

In [13]:
# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
ensure_pyterrier_is_loaded()
tira = Client()

In [14]:
# The dataset: the union of the IR Anthology and the ACL Anthology
# This line creates an IRDSDataset object and registers it under the name provided as an argument.
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)


# We are using Pisa Index to index the dataset
#index = PisaIndex('./index', overwrite=True)
#index.index(pt_dataset.get_corpus_iter())

# get all topics of training dataset
#topics = pt_dataset.get_topics()
#topics

# We now do Query expansion in order to improve retrieval effectiveness 
Query expansion generally improves recall, by adding more terms to the query, it broadens the search scope, potentially retrieving more relevant documents. While it could also have a slight negative effect on precision since query expansion might also introdoces irrelevant results. 

In [15]:
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

#(bm25 >> pt.text.get_text(pt_dataset, 'text')).search('retrieval')
pt.Experiment([bm25], pt_dataset.get_topics(), pt_dataset.get_qrels(), eval_metrics=['ndcg_cut_5'], names=['BM25'])

There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.


Unnamed: 0,name,ndcg_cut_5
0,BM25,0.39365


In [16]:
if not pt.started():
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

bo1_pipe = bm25 >> pt.rewrite.Bo1QueryExpansion(index) >> bm25
pt.Experiment([bm25, bo1_pipe], pt_dataset.get_topics(), pt_dataset.get_qrels(), eval_metrics=['ndcg_cut_5'], names=['BM25', 'BM25 >> Bo1 >> BM25'])

There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.


Unnamed: 0,name,ndcg_cut_5
0,BM25,0.39365
1,BM25 >> Bo1 >> BM25,0.381675


In [17]:
kl_pipe = bm25 >> pt.rewrite.KLQueryExpansion(index) >> bm25

pt.Experiment([bm25, kl_pipe], pt_dataset.get_topics(), pt_dataset.get_qrels(), eval_metrics=['ndcg_cut_5'], names=['BM25', 'BM25 >> KL >> BM25'])

There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.


Unnamed: 0,name,ndcg_cut_5
0,BM25,0.39365
1,BM25 >> KL >> BM25,0.383947


In [19]:
if not pt.started():
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

rm3_pipe = bm25 >> pt.rewrite.RM3(index) >> bm25
print(pt_dataset.get_topics())

pt.Experiment([bm25, rm3_pipe], pt_dataset.get_topics(), pt_dataset.get_qrels(), eval_metrics=['ndcg_cut_5'], names=['BM25', 'BM25 >> RM3 >> BM25'])


There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.
   qid                                      text  \
0    1  retrieval system improving effectiveness   
1    2  machine learning language identification   
2    3             social media detect self-harm   
3    4             stemming for arabic languages   
4    5            audio based animal recognition   
..  ..                                       ...   
63  65         information in different language   
64  66                  Abbreviations in queries   
65  67                  lemmatization algorithms   
66  68                  filter ad rich documents   
67  18     Advancements in Information Retrieval   

                                       title  \
0   retrieval system improving effectiveness   
1   machine learning language identification   
2              social media detect self-harm   
3   

Unnamed: 0,name,ndcg_cut_5
0,BM25,0.39365
1,BM25 >> RM3 >> BM25,0.341725


In [13]:
pt_dataset1 = ir_datasets.load('ir-lab-sose-2024/ir-acl-anthology-20240504-training')
print('The dataset has', len(list(pt_dataset1.docs_iter())), 'documents.')
for query in list(pt_dataset1.queries_iter())[:3]:
    print('\nQuery: ', query.query_id)

    print('\tText:\t\t' + query.default_text())
    print('\tDescrition:\t' + query.description)
    print('\tNarrative:\t' + query.narrative)

The dataset has 126958 documents.

Query:  1
	Text:		retrieval system improving effectiveness
	Descrition:	What papers focus on improving the effectiveness of a retrieval system?
	Narrative:	Relevant papers include research on what makes a retrieval system effective and what improves the effectiveness of a retrieval system. Papers that focus on improving something else or improving the effectiveness of a system that is not a retrieval system are not relevant.

Query:  2
	Text:		machine learning language identification
	Descrition:	What papers are about machine learning for language identification?
	Narrative:	Relevant papers include research on methods of machine learning for language identification or how to improve those methods. Papers that focus on other methods for language identification or the usaged of machine learning not for language identification are not relevant.

Query:  3
	Text:		social media detect self-harm
	Descrition:	Which papers focus on how to recognize signs of s