In [None]:
#Standard Imports für alle Notebooks

!pip3 install tira ir-datasets python-terrier nltk scikit-learn spacy

from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt
import spacy
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import os
import time
import hashlib

# Must run before any other PyTerrier call below.
# NOTE(review): presumably starts/initialises PyTerrier via the TIRA helper — confirm in the TIRA docs.
ensure_pyterrier_is_loaded()
# TIRA REST API client (tira.rest_api_client.Client), created with default settings.
tira = Client()

# Training dataset handle; the corpus, topics and qrels used in the cells below are read from it.
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

In [None]:
# Laden der NLTK Ressourcen
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Laden der SpaCy-Ressourcen
!python -m spacy download en_core_web_sm

# Laden des SpaCy-Modells
nlp = spacy.load("en_core_web_sm")

In [3]:
# Map a treebank POS tag to the POS constant expected by the NLTK WordNet lemmatizer
def get_wordnet_pos_nltk(treebank_tag):
    """Convert a POS tag into a format supported by the WordNet lemmatizer.

    The tag is matched by its leading letter: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Every other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unknown tags are treated as nouns
    return wordnet.NOUN

# Lemmatize a text with the NLTK WordNet lemmatizer
def lemmatize_text_nltk(text):
    """Tokenize ``text``, POS-tag the tokens and return their lemmas joined by single spaces.

    Each token is lemmatized with the WordNet POS derived from its tag via
    ``get_wordnet_pos_nltk``.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    lemmas = []
    for word, treebank_tag in tagged_tokens:
        lemmas.append(wordnet_lemmatizer.lemmatize(word, get_wordnet_pos_nltk(treebank_tag)))
    return ' '.join(lemmas)

# Lemmatize a text with spaCy
def lemmatize_text_spacy(text):
    """Run the module-level spaCy pipeline ``nlp`` over ``text`` and return the
    lemma of every token, joined by single spaces."""
    return ' '.join(token.lemma_ for token in nlp(text))

def preprocess_documents(documents, method):
    """Lazily lemmatize the 'text' field of every document.

    Args:
        documents: Iterable of dict-like documents that each have a 'text' entry.
        method: 'nltk' to use ``lemmatize_text_nltk`` or 'spacy' to use
            ``lemmatize_text_spacy``.

    Returns:
        A generator yielding one new dict per input document, with 'text'
        replaced by its lemmatized form. The input documents are left untouched.

    Raises:
        ValueError: Immediately at call time if ``method`` is neither 'nltk'
            nor 'spacy', rather than later on first iteration (e.g. inside an indexer).
    """
    if method == 'nltk':
        lemmatize_text = lemmatize_text_nltk
    elif method == 'spacy':
        lemmatize_text = lemmatize_text_spacy
    else:
        raise ValueError("Invalid method specified. Use 'nltk' or 'spacy'.")

    def _lemmatized(docs):
        # The generator lives in a nested function so the validation above runs eagerly.
        for doc in docs:
            # Shallow copy with a replaced 'text' entry; the caller's dict is not mutated.
            yield {**doc, 'text': lemmatize_text(doc['text'])}

    return _lemmatized(documents)

# Build a custom index in its own, freshly named directory
def create_index(base_path, documents, stopwords, stemmer):
    """Index ``documents`` below ``base_path`` and return the opened index.

    Args:
        base_path: Directory under which the new index directory is created.
        documents: Iterable of documents handed to ``pt.IterDictIndexer.index``.
        stopwords: Stopword setting forwarded to the indexer.
        stemmer: Stemmer setting forwarded to the indexer.

    Returns:
        The result of ``pt.IndexFactory.of`` for the newly built index.
    """
    # Short directory suffix derived from the current timestamp, so each call gets its own directory
    timestamp_digest = hashlib.sha1(str(time.time()).encode('utf-8')).hexdigest()
    index_dir = os.path.join(base_path, f"index_{timestamp_digest[:8]}/")

    iter_indexer = pt.IterDictIndexer(
        index_dir,
        overwrite=True,
        meta={'docno': 100, 'text': 20480},
        stopwords=stopwords,
        stemmer=stemmer,
    )
    return pt.IndexFactory.of(iter_indexer.index(documents))

# Read a text file into a Python list (one entry per line)
def read_text_file_to_array(file_path):
    """Return the lines of ``file_path`` as a list of stripped strings.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A list with one string per line (leading/trailing whitespace removed;
        blank lines are kept as empty strings), or ``None`` if the file does
        not exist, in which case a message is printed.
    """
    try:
        # Explicit encoding so decoding does not depend on the platform's locale default
        with open(file_path, 'r', encoding='utf-8') as file:
            # Strip surrounding whitespace including the newline; entries stay strings
            return [line.strip() for line in file]
    except FileNotFoundError:
        print(f"File {file_path} not found.")
        return None

In [None]:
# Load custom stopword lists
terrier_custom_stopwords = read_text_file_to_array('../terrier-custom.txt')
chatgpt_stopwords = read_text_file_to_array('../chatgpt-stopwordlist.txt')

# read_text_file_to_array returns None for a missing file. Stop here instead of silently
# building an index whose name claims a stopword list that was never applied.
for stopword_file, loaded_stopwords in (
    ('../terrier-custom.txt', terrier_custom_stopwords),
    ('../chatgpt-stopwordlist.txt', chatgpt_stopwords),
):
    if loaded_stopwords is None:
        raise FileNotFoundError(f"Stopword list {stopword_file} could not be loaded.")

base_path = '/workspaces/ir-lab-sose-2024-ir-sose-24-6/gridsearch/var/tmp/'

# Naming: [Stemmer/Lemmatizer]_[Type]__[StopwordList]
LEMMATIZER_METHODS = {"N": "nltk", "S": "spacy"}
STOPWORD_LISTS = {"T": terrier_custom_stopwords, "C": chatgpt_stopwords, "N": []}
WEIGHTING_MODELS = ("XSqrA_M", "BM25", "DPH")

indices = {}
for lemmatizer_key, lemmatizer_method in LEMMATIZER_METHODS.items():
    # Lemmatization does not depend on the stopword list, so the corpus is lemmatized once
    # per lemmatizer and reused for all three indices (held in memory for that time).
    lemmatized_corpus = list(preprocess_documents(pt_dataset.get_corpus_iter(), lemmatizer_method))
    for stopword_key, stopword_list in STOPWORD_LISTS.items():
        indices[f"L_{lemmatizer_key}_{stopword_key}"] = create_index(
            base_path, lemmatized_corpus, stopword_list, 'NoOp'
        )
# Release the last lemmatized corpus copy; only the indices are needed from here on
del lemmatized_corpus

# One retriever per (index, weighting model); keys are "<index key>_<weighting model>"
retrieval_models_nltk = {
    f"{index_key}_{wmodel}": pt.BatchRetrieve(indices[index_key], wmodel=wmodel)
    for index_key in ("L_N_T", "L_N_C", "L_N_N")
    for wmodel in WEIGHTING_MODELS
}

retrieval_models_spacy = {
    f"{index_key}_{wmodel}": pt.BatchRetrieve(indices[index_key], wmodel=wmodel)
    for index_key in ("L_S_T", "L_S_C", "L_S_N")
    for wmodel in WEIGHTING_MODELS
}


In [5]:
# Fetch the topics and lemmatize the queries, one topic frame per lemmatizer
topics_nltk = pt_dataset.get_topics().assign(
    query=lambda topics: topics['query'].apply(lemmatize_text_nltk)
)
topics_spacy = pt_dataset.get_topics().assign(
    query=lambda topics: topics['query'].apply(lemmatize_text_spacy)
)

There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.
There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.


In [6]:
# Evaluate all NLTK-lemmatized retrieval models on the NLTK-lemmatized topics
nltk_model_names, nltk_models = zip(*retrieval_models_nltk.items())
pt.Experiment(
    list(nltk_models),
    topics_nltk,
    pt_dataset.get_qrels(),
    ["ndcg_cut.10", "recip_rank", "recall_100", "P_10"],
    names=list(nltk_model_names),
)

Unnamed: 0,name,ndcg_cut.10,recip_rank,recall_100,P_10
0,L_N_T_XSqrA_M,0.399378,0.604003,0.536173,0.348529
1,L_N_T_BM25,0.333125,0.565849,0.516693,0.292647
2,L_N_T_DPH,0.383537,0.601354,0.526972,0.330882
3,L_N_C_XSqrA_M,0.392602,0.645102,0.53641,0.332353
4,L_N_C_BM25,0.315186,0.568439,0.516743,0.267647
5,L_N_C_DPH,0.378403,0.62561,0.524916,0.323529
6,L_N_N_XSqrA_M,0.388101,0.681389,0.522145,0.326471
7,L_N_N_BM25,0.304109,0.550845,0.499112,0.263235
8,L_N_N_DPH,0.358759,0.616519,0.514664,0.304412


In [7]:
# Evaluate all spaCy-lemmatized retrieval models on the spaCy-lemmatized topics
spacy_model_names, spacy_models = zip(*retrieval_models_spacy.items())
pt.Experiment(
    list(spacy_models),
    topics_spacy,
    pt_dataset.get_qrels(),
    ["ndcg_cut.10", "recip_rank", "recall_100", "P_10"],
    names=list(spacy_model_names),
)

Unnamed: 0,name,ndcg_cut.10,recip_rank,recall_100,P_10
0,L_S_T_XSqrA_M,0.412752,0.629893,0.543502,0.355882
1,L_S_T_BM25,0.340062,0.568778,0.53304,0.297059
2,L_S_T_DPH,0.396113,0.623806,0.539705,0.336765
3,L_S_C_XSqrA_M,0.406269,0.662312,0.548509,0.341176
4,L_S_C_BM25,0.326777,0.572283,0.532044,0.279412
5,L_S_C_DPH,0.389887,0.638468,0.53998,0.329412
6,L_S_N_XSqrA_M,0.397748,0.663278,0.538604,0.333824
7,L_S_N_BM25,0.310113,0.550603,0.512335,0.266176
8,L_S_N_DPH,0.374509,0.631454,0.53068,0.311765
