In [None]:
#Standard Imports für alle Notebooks

!pip3 install tira ir-datasets python-terrier nltk scikit-learn spacy

from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt
import spacy
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import os
import time
import hashlib

# Initialise PyTerrier through the TIRA helper before any pt.* call is made,
# then create the TIRA REST client.
ensure_pyterrier_is_loaded()
tira = Client()

# Training dataset; later cells read its corpus (get_corpus_iter) and topics (get_topics).
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

In [None]:
# Laden der NLTK Ressourcen
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Laden der SpaCy-Ressourcen
!python -m spacy download en_core_web_sm

# Laden des SpaCy-Modells
nlp = spacy.load("en_core_web_sm")

In [None]:
# Map a Treebank POS tag to the POS constant understood by the NLTK WordNet lemmatizer
def get_wordnet_pos_nltk(treebank_tag):
    """Convert a POS tag into the format supported by the WordNet lemmatizer.

    Tags beginning with 'J', 'V', 'N' or 'R' map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively; every other tag falls back to
    wordnet.NOUN.
    """
    # Tag prefix -> name of the matching attribute on `wordnet`.
    attr_by_prefix = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    for prefix, attr_name in attr_by_prefix.items():
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    # Anything unrecognised is treated as a noun.
    return wordnet.NOUN

# Lemmatize a text with the NLTK WordNet lemmatizer
def lemmatize_text_nltk(text):
    """Return `text` with each token replaced by its WordNet lemma.

    The text is tokenized with word_tokenize and tagged with nltk.pos_tag; each
    token is then lemmatized using the WordNet POS derived from its tag via
    get_wordnet_pos_nltk. The resulting lemmas are joined with single spaces.
    """
    wn_lemmatizer = WordNetLemmatizer()
    tagged_tokens = nltk.pos_tag(word_tokenize(text))

    lemmas = []
    for word, treebank_tag in tagged_tokens:
        wordnet_pos = get_wordnet_pos_nltk(treebank_tag)
        lemmas.append(wn_lemmatizer.lemmatize(word, wordnet_pos))
    return ' '.join(lemmas)

# Lemmatize a text with spaCy
def lemmatize_text_spacy(text):
    """Lemmatize the given text with spaCy.

    Runs the module-level `nlp` pipeline on `text` and returns the `lemma_` of
    every token, joined with single spaces.
    """
    return ' '.join(tok.lemma_ for tok in nlp(text))

def preprocess_documents(documents, method):
    """Lemmatize the 'text' entry of every document, yielding documents lazily.

    Args:
        documents: Iterable of mutable mappings that each have a 'text' entry.
        method: 'nltk' to use lemmatize_text_nltk, 'spacy' to use
            lemmatize_text_spacy.

    Returns:
        A generator that yields the same document objects, one at a time, after
        replacing their 'text' value in place with the lemmatized text.

    Raises:
        ValueError: If `method` is neither 'nltk' nor 'spacy'. The check runs
            when this function is called, not when the result is first iterated.
    """
    if method == 'nltk':
        lemmatize_text = lemmatize_text_nltk
    elif method == 'spacy':
        lemmatize_text = lemmatize_text_spacy
    else:
        raise ValueError("Invalid method specified. Use 'nltk' or 'spacy'.")

    # The generator lives in a nested function so that the validation above is
    # executed eagerly; a `yield` in this outer body would defer it until the
    # first item is requested.
    def _lemmatized(docs):
        for doc in docs:
            # The input document is updated in place and then passed on.
            doc['text'] = lemmatize_text(doc['text'])
            yield doc

    return _lemmatized(documents)

# Build a custom index in its own uniquely named directory
def create_index(base_path, documents, stopwords, stemmer):
    """Index `documents` into a new directory below `base_path` and return the index.

    The directory is named ``index_<id>/``, where ``<id>`` is the first eight hex
    digits of the SHA-1 hash of the current timestamp. The documents are indexed
    with a pt.IterDictIndexer configured with the given `stopwords` and
    `stemmer`, and the resulting index reference is opened through
    pt.IndexFactory.of.
    """
    # Derive a short identifier from the current time so each call gets its own folder.
    timestamp_bytes = str(time.time()).encode('utf-8')
    short_hash = hashlib.sha1(timestamp_bytes).hexdigest()[:8]
    target_dir = os.path.join(base_path, f"index_{short_hash}/")

    dict_indexer = pt.IterDictIndexer(
        target_dir,
        overwrite=True,
        meta={'docno': 100, 'text': 20480},
        stopwords=stopwords,
        stemmer=stemmer,
    )
    return pt.IndexFactory.of(dict_indexer.index(documents))

# Read a text file into a Python list with one entry per line
def read_text_file_to_array(file_path):
    """Read a text file and return its lines as a list of stripped strings.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one string per line, with leading and trailing whitespace
        (including the newline) removed; blank lines are kept as empty strings.
        Returns None, after printing a message, if the file does not exist.
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            return [line.strip() for line in file]
    except FileNotFoundError:
        # Best effort: report the missing file and let the caller handle None.
        print(f"File {file_path} not found.")
        return None


In [None]:
# Load custom stopword lists
# read_text_file_to_array returns None when a file is missing, in which case
# None is what gets passed on as the stopword list below.
terrier_custom_stopwords = read_text_file_to_array('../terrier-custom.txt')
# NOTE(review): chatgpt_stopwords is not used in the cells shown here — confirm it is still needed.
chatgpt_stopwords = read_text_file_to_array('../chatgpt-stopwordlist.txt')

# Directory below which create_index creates a new, uniquely named index folder.
# NOTE(review): absolute path tied to one workspace checkout — confirm it exists wherever this notebook runs.
base_path = '/workspaces/ir-lab-sose-2024-ir-sose-24-6/gridsearch/var/tmp/'

# Index the corpus using the custom Terrier stopword list and the English Snowball stemmer.
S_E_T_index = create_index(base_path, pt_dataset.get_corpus_iter(), terrier_custom_stopwords, 'EnglishSnowballStemmer')

# Retriever over that index using the XSqrA_M weighting model.
S_E_T_XSqrA_M = pt.BatchRetrieve(S_E_T_index, wmodel="XSqrA_M")

In [None]:
# Apply the XSqrA_M retriever to the dataset's topics (requested with the 'text' variant).
run = S_E_T_XSqrA_M(pt_dataset.get_topics('text'))

In [None]:
# Hand the run to the TIRA helper, with '../runs' as the default output location.
# NOTE(review): system_name is 'bm25-baseline' although the run was produced by the
# XSqrA_M retriever defined earlier — confirm this label is intended.
persist_and_normalize_run(run, system_name='bm25-baseline', default_output='../runs')