In [31]:
# Nur benötigt für GoLab
!pip3 install tira ir-datasets python-terrier nltk


[0m

In [32]:
# Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
from nltk.stem import SnowballStemmer

import pyterrier as pt
import pandas as pd


# PyTerrier initialisieren
ensure_pyterrier_is_loaded()
tira = Client()

# PyTerrier starten
if not pt.started():
    pt.init()

# NLTK SnowballStemmer initialisieren
stemmer = SnowballStemmer("english")

# Pandas Display-Einstellungen
pd.set_option('display.max_colwidth', 0)


In [33]:
# Dataset von der TIRA Plattform abrufen
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

# Vorverarbeitung: SnowballStemmer auf den Text anwenden
def stem_text(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

# Iterieren über den Corpus und Texte stemmen
processed_corpus = []
for doc in pt_dataset.get_corpus_iter():
    doc_id = doc['docno']
    text = doc['text']
    stemmed_text = stem_text(text)
    processed_corpus.append({'docno': doc_id, 'text': stemmed_text})

# Konvertieren in DataFrame
processed_corpus_df = pd.DataFrame(processed_corpus)


ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [02:41<00:00, 786.87it/s] 


In [34]:
# Index erstellen
indexer = pt.IterDictIndexer("/tmp/index2", overwrite=True, stemmer='none')  # Kein zusätzlicher Stemmer hier, da wir bereits gestemmt haben
index_ref = indexer.index(processed_corpus)

# Index laden
index = pt.IndexFactory.of(index_ref)

# BM25 Retrieval Modell initialisieren
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

07:34:53.732 [ForkJoinPool-2-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 1 empty documents


In [42]:
def stem_query_dataframe(query_df):
    query_df['text'] = query_df['text'].apply(stem_text)
    return query_df

stemmed_topics_df = stem_query_dataframe(pt_dataset.get_topics())

There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.


In [44]:
#print('Now we do the retrieval...')
run = bm25(stemmed_topics_df)

#print('Done. Here are the first 10 entries of the run')
run.head(10)

Unnamed: 0,qid,docid,docno,rank,score,text,title,query,description,narrative
0,1,36009,T75-2025,0,21.580689,retriev system improv effect,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectiveness of a retrieval system?,Relevant papers include research on what makes a retrieval system effective and what improves the effectiveness of a retrieval system. Papers that focus on improving something else or improving the effectiveness of a system that is not a retrieval system are not relevant.
1,1,126778,1999.tois_journal-i,1,18.178801,retriev system improv effect,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectiveness of a retrieval system?,Relevant papers include research on what makes a retrieval system effective and what improves the effectiveness of a retrieval system. Papers that focus on improving something else or improving the effectiveness of a system that is not a retrieval system are not relevant.
2,1,74020,2008.ntcir_workshop,2,17.75,retriev system improv effect,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectiveness of a retrieval system?,Relevant papers include research on what makes a retrieval system effective and what improves the effectiveness of a retrieval system. Papers that focus on improving something else or improving the effectiveness of a system that is not a retrieval system are not relevant.
3,1,1177,W18-5301,3,17.541705,retriev system improv effect,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectiveness of a retrieval system?,Relevant papers include research on what makes a retrieval system effective and what improves the effectiveness of a retrieval system. Papers that focus on improving something else or improving the effectiveness of a system that is not a retrieval system are not relevant.
4,1,82900,2013.sigirconf_conf,4,17.02876,retriev system improv effect,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectiveness of a retrieval system?,Relevant papers include research on what makes a retrieval system effective and what improves the effectiveness of a retrieval system. Papers that focus on improving something else or improving the effectiveness of a system that is not a retrieval system are not relevant.
5,1,92066,2005.ecir_conferenc,5,16.361147,retriev system improv effect,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectiveness of a retrieval system?,Relevant papers include research on what makes a retrieval system effective and what improves the effectiveness of a retrieval system. Papers that focus on improving something else or improving the effectiveness of a system that is not a retrieval system are not relevant.
6,1,80745,2008.sigirconf_conf,6,15.878637,retriev system improv effect,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectiveness of a retrieval system?,Relevant papers include research on what makes a retrieval system effective and what improves the effectiveness of a retrieval system. Papers that focus on improving something else or improving the effectiveness of a system that is not a retrieval system are not relevant.
7,1,76958,2013.dir_workshop-2,7,15.286855,retriev system improv effect,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectiveness of a retrieval system?,Relevant papers include research on what makes a retrieval system effective and what improves the effectiveness of a retrieval system. Papers that focus on improving something else or improving the effectiveness of a system that is not a retrieval system are not relevant.
8,1,90861,2021.ecir_conferenc,8,15.064785,retriev system improv effect,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectiveness of a retrieval system?,Relevant papers include research on what makes a retrieval system effective and what improves the effectiveness of a retrieval system. Papers that focus on improving something else or improving the effectiveness of a system that is not a retrieval system are not relevant.
9,1,29696,P07-2025,9,14.848132,retriev system improv effect,retrieval system improving effectiveness,retrieval system improving effectiveness,What papers focus on improving the effectiveness of a retrieval system?,Relevant papers include research on what makes a retrieval system effective and what improves the effectiveness of a retrieval system. Papers that focus on improving something else or improving the effectiveness of a system that is not a retrieval system are not relevant.


In [38]:
# Output runfile für das Deployment auf TIRA
persist_and_normalize_run(run, system_name='bm25-baseline', default_output='../../runs')


The run file is normalized outside the TIRA sandbox, I will store it at "../../runs".
Done. run file is stored under "../../runs/run.txt".


In [56]:
# Einzelne gestemmte Abfrage
query = "retrieval system improving effectiveness"
stemmed_query = stem_text(query)
print(f"Gestemmte Abfrage: {stemmed_query}")


Gestemmte Abfrage: retriev system improv effect


In [59]:
# Manuelle Suche
results = bm25.search(stemmed_query)
print(results)

    qid   docid                docno  rank      score  \
0    1   94858   2004.cikm_conferenc  0     16.232194   
1    1   125137  1989.ipm_journal-ir  1     15.678307   
2    1   5868    W05-0704             2     14.409583   
3    1   94415   2008.cikm_conferenc  3     14.341223   
4    1   125817  2005.ipm_journal-ir  4     14.093883   
..  ..      ...                  ... ..           ...   
995  1   80937   2008.sigirconf_conf  995   9.058766    
996  1   80908   2008.sigirconf_conf  996   9.055515    
997  1   111407  2005.trec_conferenc  997   9.055515    
998  1   126143  2015.tois_journal-i  998   9.051541    
999  1   83446   1997.sigirconf_conf  999   9.051440    

                            query  
0    retriev system improv effect  
1    retriev system improv effect  
2    retriev system improv effect  
3    retriev system improv effect  
4    retriev system improv effect  
..                            ...  
995  retriev system improv effect  
996  retriev system improv 

In [52]:
# Prüfen wir die Dokumentanzahl im Index
print(f"Anzahl der Dokumente im Index: {index.getCollectionStatistics().getNumberOfDocuments()}")


Anzahl der Dokumente im Index: 126958


In [51]:
processed_corpus_df.head()

Unnamed: 0,docno,text
0,O02-2002,"a studi on word similar use context vector model there is a need to measur word similar when process natur languages, especi when use generalization, classification, or exampl -base approaches. usually, measur of similar between two word are defin accord to the distanc between their semant class in a semant taxonomi . the taxonomi approach are more or less semant -base that do not consid syntact similarit ies. however, in real applications, both semant and syntact similar are requir and weight differently. word similar base on context vector is a mixtur of syntact and semant similarit ies. in this paper, we propos use onli syntact relat co-occurr as context vector and adopt inform theoret model to solv the problem of data spars and characterist precision. the probabilist distribut of co-occurr context featur is deriv by pars the contextu environ of each word , and all the context featur are adjust accord to their idf (invers document frequency) values. the agglom cluster algorithm is appli to group similar word accord to their similar values. it turn out that word with similar syntact categori and semant class are group together."
1,L02-1310,bootstrap larg sens tag corpora
2,R13-1042,"headerless, quoteless, but not hopeless? use pairwis email classif to disentangl email thread thread disentangl is the task of separ out convers whose thread structur is implicit, distorted, or lost. in this paper, we perform email thread disentangl through pairwis classification, use text similar measur on non-quot text in emails. we show that i) content text similar metric outperform style and structur text similar metric in both a class-balanc and class-imbalanc setting, and ii) although featur perform is depend on the semant similar of the corpus, content featur are still effect even when control for semant similarity. we make avail the enron thread corpus, a newly-extract corpus of 70,178 multiemail thread with email from the enron email corpus."
3,W05-0819,"align word in {e}nglish-{h}indi parallel corpora in this paper, we describ a word align algorithm for english-hindi parallel data. the system was develop to particip in the share task on word align for languag with scarc resourc at the acl 2005 workshop, on ""build and use parallel texts: data driven machin translat and beyond"". our word align algorithm is base on a hybrid method which perform local word group on hindi sentenc and use other method such as dictionari lookup, transliter similarity, expect english word and nearest align neighbours. we train our system on the train data provid to obtain a list of name entiti and cognat and to collect rule for local word group in hindi sentences. the system score 77.03% precis and 60.68% recal on the share task unseen test data."
4,L02-1309,propos of a very-large-corpus acquisit method by cell-form registr


In [61]:
#Localtest
pt.Experiment(
    [bm25], 
    stemmed_topics_df, 
    pt_dataset.get_qrels(), 
    eval_metrics=['P_1000', 'map', 'recip_rank', 'ndcg_cut_5'],
    names=['BM25'],
    baseline=0
    )

Unnamed: 0,name,map,recip_rank,P_1000,ndcg_cut_5,map +,map -,map p-value,recip_rank +,recip_rank -,recip_rank p-value,P_1000 +,P_1000 -,P_1000 p-value,ndcg_cut_5 +,ndcg_cut_5 -,ndcg_cut_5 p-value
0,BM25,0.0,0.0,0.0,0.0,,,,,,,,,,,,


In [62]:
pt_dataset.get_qrels()

Unnamed: 0,qid,docno,label,iteration
0,1,2005.ipm_journal-ir0volumeA41A1.7,1,0
1,1,2019.tois_journal-ir0volumeA37A1.2,1,0
2,1,2008.sigirconf_conference-2008.127,1,0
3,1,2015.ipm_journal-ir0volumeA51A5.7,0,0
4,1,2008.tois_journal-ir0volumeA27A1.1,0,0
...,...,...,...,...
2618,18,1985.jasis_journal-ir0volumeA36A3.9,0,0
2619,18,2010.wwwconf_conference-2010.11,1,0
2620,18,2011.ntcir_workshop-2011evia.3,0,0
2621,18,1988.ipm_journal-ir0volumeA24A3.1,0,0
