In [None]:
# Nur benötigt für GoLab
!pip3 install tira ir-datasets python-terrier nltk


In [None]:
# Imports
import re

from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
from nltk.stem import SnowballStemmer

import pyterrier as pt
import pandas as pd


# Pandas display settings (column width of rendered frames)
pd.set_option("display.max_colwidth", 0)

# English Snowball stemmer from NLTK, shared by the preprocessing below
stemmer = SnowballStemmer("english")

# Set up PyTerrier through the TIRA integration and create the TIRA client
ensure_pyterrier_is_loaded()
tira = Client()

# Start PyTerrier explicitly only when it is not running yet
pyterrier_running = pt.started()
if not pyterrier_running:
    pt.init()


In [None]:
# Fetch the dataset from the TIRA platform
DATASET_ID = 'irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training'
pt_dataset = pt.get_dataset(DATASET_ID)

# Preprocessing: stem a text word by word with the Snowball stemmer.
# A token is a run of letters/digits; punctuation, underscores and whitespace
# only separate tokens, so "retrieval," is stemmed exactly like "retrieval".
_WORD_PATTERN = re.compile(r"[^\W_]+")


def stem_text(text, stem_fn=None):
    """Return ``text`` as a space-separated string of stemmed word tokens.

    Args:
        text: The raw text. ``None`` and the empty string yield ``""``.
        stem_fn: Optional callable mapping one token to its stem. Defaults to
            the ``stem`` method of the notebook-wide ``stemmer``.

    Returns:
        The stemmed tokens joined by single spaces. Punctuation is dropped,
        because it is never part of a token.
    """
    if not text:
        return ""
    if stem_fn is None:
        # Resolved lazily so the notebook-wide stemmer is only needed when no
        # custom stemming function is supplied.
        stem_fn = stemmer.stem
    return " ".join(stem_fn(token) for token in _WORD_PATTERN.findall(text))

# Stem the text of every corpus document, keeping its document number
processed_corpus = [
    {'docno': entry['docno'], 'text': stem_text(entry['text'])}
    for entry in pt_dataset.get_corpus_iter()
]

# Tabular view of the stemmed corpus
processed_corpus_df = pd.DataFrame(processed_corpus)


In [None]:
# Build the index. The documents are already Snowball-stemmed, so no Terrier
# stemmer is configured here. The docno metadata field is given an explicit,
# generous length so that long document ids are stored in full rather than
# being limited by PyTerrier's default field length.
indexer = pt.IterDictIndexer(
    "/tmp/index2",
    overwrite=True,
    stemmer='none',
    meta={'docno': 100},
)
index_ref = indexer.index(processed_corpus_df.to_dict(orient='records'))

# Load the index
index = pt.IndexFactory.of(index_ref)

# BM25 retrieval model. The index was built from pre-stemmed text without a
# Terrier stemmer, so every query is first passed through the same stem_text()
# preprocessing as the documents; otherwise inflected query words would not
# match the stemmed index terms.
stem_queries = pt.apply.query(lambda topic: stem_text(topic['query']))
bm25 = stem_queries >> pt.BatchRetrieve(index, wmodel="BM25")

In [None]:
# Run the retrieval for all topics of the dataset ('text' variant)
topics = pt_dataset.get_topics('text')
run = bm25(topics)

# Preview: the first 10 entries of the run
run.head(10)

In [None]:
# Write the run file used for the deployment on TIRA
persist_and_normalize_run(
    run,
    system_name='bm25-baseline',
    default_output='../../runs',
)
