# IR Lab SoSe 2024 Team 6: Grid Search

This jupyter notebook is a grid search over all the customizations we looked at regarding Stopwords, Stemming and Lemmatization, and different retrieval systems.
For each of these topics we individually tested a few of the typical types and test the best two of each in this grid search.

In [12]:
#Standard Imports für alle Notebooks

!pip3 install tira ir-datasets python-terrier nltk scikit-learn spacy

from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt
import spacy
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

ensure_pyterrier_is_loaded()
tira = Client()

pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

[0m

In [13]:
# Laden der NLTK Ressourcen
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Laden der SpaCy-Ressourcen
!python -m spacy download en_core_web_sm

# Laden des SpaCy-Modells
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [14]:
# Methode um Beschreibung des POS-Tags zu bekommen für den NLTK Lemmatizer
def get_wordnet_pos_nltk(treebank_tag):
    """Konvertiert POS-Tag in ein Format, das vom WordNet-Lemmatizer unterstützt wird."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Funktion um den Text zu lemmatizen für NLTK Lemmatizer
def lemmatize_text_nltk(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos_nltk(tag)) for token, tag in pos_tags]
    return ' '.join(lemmatized_tokens)

# Definition der Funktion zur Lemmatization eines Textes mit SpaCy
def lemmatize_text_spacy(text):
    """Lemmatiziert den gegebenen Text mit SpaCy."""
    doc = nlp(text)
    lemmatized_tokens = [token.lemma_ for token in doc]
    return ' '.join(lemmatized_tokens)

def preprocess_documents(documents, method):
    if method == 'nltk':
        lemmatize_text = lemmatize_text_nltk
    elif method == 'spacy':
        lemmatize_text = lemmatize_text_spacy
    else:
        raise ValueError("Invalid method specified. Use 'nltk' or 'spacy'.")

    for doc in documents:
        doc['text'] = lemmatize_text(doc['text'])
        yield doc

#Funktion um eigene Indecies zu erstellen
def create_index(path, documents, stopwords, stemmer):
    indexer = pt.IterDictIndexer(path, overwrite=True, meta={'docno': 100, 'text': 20480}, stopwords=stopwords, stemmer=stemmer)
    index_ref = indexer.index(documents)
    return pt.IndexFactory.of(index_ref)

#Funktion um aus einem txt-file eine Python Liste zu machen
def read_text_file_to_array(file_path):
    try:
        with open(file_path, 'r') as file:
            lines = file.readlines()
            # Remove newline characters and convert to integers
            array = [(line.strip()) for line in lines]
            return array
    except FileNotFoundError:
        print(f"File {file_path} not found.")
        return None


In [15]:
# Load custom stopword lists
terrier_custom_stopwords = read_text_file_to_array('../terrier-custom.txt')
chatgpt_stopwords = read_text_file_to_array('../chatgpt-stopwordlist.txt')

# Naming: [Stemmer/Lemmatizer]_[Type]__[StopwordList]
indices = {
    "S_L_T": create_index('tmp/index1/', pt_dataset.get_corpus_iter(), terrier_custom_stopwords, 'LemurKrovetzStemmer'),
    "S_L_C": create_index('tmp/index2/', pt_dataset.get_corpus_iter(), chatgpt_stopwords, 'LemurKrovetzStemmer'),
    "S_L_N": create_index('tmp/index3/', pt_dataset.get_corpus_iter(), [], 'LemurKrovetzStemmer'),
    "S_P_T": create_index('tmp/index4/', pt_dataset.get_corpus_iter(), terrier_custom_stopwords, 'TRv2PorterStemmer'),
    "S_P_C": create_index('tmp/index5/', pt_dataset.get_corpus_iter(), chatgpt_stopwords, 'TRv2PorterStemmer'),
    "S_P_N": create_index('tmp/index6/', pt_dataset.get_corpus_iter(), [], 'TRv2PorterStemmer'),
    "L_N_T": create_index('tmp/index7/', preprocess_documents(pt_dataset.get_corpus_iter(), nltk), terrier_custom_stopwords, None),
    "L_N_C": create_index('tmp/index8/', preprocess_documents(pt_dataset.get_corpus_iter(), nltk), chatgpt_stopwords, None),
    "L_N_N": create_index('tmp/index9/', preprocess_documents(pt_dataset.get_corpus_iter(), nltk), [], None),
    "L_S_T": create_index('tmp/index10/', preprocess_documents(pt_dataset.get_corpus_iter(), spacy), terrier_custom_stopwords, None),
    "L_S_C": create_index('tmp/index11/', preprocess_documents(pt_dataset.get_corpus_iter(), spacy), chatgpt_stopwords, None),
    "L_S_N": create_index('tmp/index12/', preprocess_documents(pt_dataset.get_corpus_iter(), spacy), [], None)
}

retrieval_models = {
    "S_L_T_XSqrA_M": pt.BatchRetrieve(indices["S_L_T"], wmodel="XSqrA_M"),
    "S_L_T_BM25": pt.BatchRetrieve(indices["S_L_T"], wmodel="BM25"),
    "S_L_T_DPH": pt.BatchRetrieve(indices["S_L_T"], wmodel="DPH"),
    
    "S_L_C_XSqrA_M": pt.BatchRetrieve(indices["S_L_C"], wmodel="XSqrA_M"),
    "S_L_C_BM25": pt.BatchRetrieve(indices["S_L_C"], wmodel="BM25"),
    "S_L_C_DPH": pt.BatchRetrieve(indices["S_L_C"], wmodel="DPH"),
    
    "S_L_N_XSqrA_M": pt.BatchRetrieve(indices["S_L_N"], wmodel="XSqrA_M"),
    "S_L_N_BM25": pt.BatchRetrieve(indices["S_L_N"], wmodel="BM25"),
    "S_L_N_DPH": pt.BatchRetrieve(indices["S_L_N"], wmodel="DPH"),
    
    "S_P_T_XSqrA_M": pt.BatchRetrieve(indices["S_P_T"], wmodel="XSqrA_M"),
    "S_P_T_BM25": pt.BatchRetrieve(indices["S_P_T"], wmodel="BM25"),
    "S_P_T_DPH": pt.BatchRetrieve(indices["S_P_T"], wmodel="DPH"),
    
    "S_P_C_XSqrA_M": pt.BatchRetrieve(indices["S_P_C"], wmodel="XSqrA_M"),
    "S_P_C_BM25": pt.BatchRetrieve(indices["S_P_C"], wmodel="BM25"),
    "S_P_C_DPH": pt.BatchRetrieve(indices["S_P_C"], wmodel="DPH"),
    
    "S_P_N_XSqrA_M": pt.BatchRetrieve(indices["S_P_N"], wmodel="XSqrA_M"),
    "S_P_N_BM25": pt.BatchRetrieve(indices["S_P_N"], wmodel="BM25"),
    "S_P_N_DPH": pt.BatchRetrieve(indices["S_P_N"], wmodel="DPH"),
    
    "L_N_T_XSqrA_M": pt.BatchRetrieve(indices["L_N_T"], wmodel="XSqrA_M"),
    "L_N_T_BM25": pt.BatchRetrieve(indices["L_N_T"], wmodel="BM25"),
    "L_N_T_DPH": pt.BatchRetrieve(indices["L_N_T"], wmodel="DPH"),
    
    "L_N_C_XSqrA_M": pt.BatchRetrieve(indices["L_N_C"], wmodel="XSqrA_M"),
    "L_N_C_BM25": pt.BatchRetrieve(indices["L_N_C"], wmodel="BM25"),
    "L_N_C_DPH": pt.BatchRetrieve(indices["L_N_C"], wmodel="DPH"),
    
    "L_N_N_XSqrA_M": pt.BatchRetrieve(indices["L_N_N"], wmodel="XSqrA_M"),
    "L_N_N_BM25": pt.BatchRetrieve(indices["L_N_N"], wmodel="BM25"),
    "L_N_N_DPH": pt.BatchRetrieve(indices["L_N_N"], wmodel="DPH"),
    
    "L_S_T_XSqrA_M": pt.BatchRetrieve(indices["L_S_T"], wmodel="XSqrA_M"),
    "L_S_T_BM25": pt.BatchRetrieve(indices["L_S_T"], wmodel="BM25"),
    "L_S_T_DPH": pt.BatchRetrieve(indices["L_S_T"], wmodel="DPH"),
    
    "L_S_C_XSqrA_M": pt.BatchRetrieve(indices["L_S_C"], wmodel="XSqrA_M"),
    "L_S_C_BM25": pt.BatchRetrieve(indices["L_S_C"], wmodel="BM25"),
    "L_S_C_DPH": pt.BatchRetrieve(indices["L_S_C"], wmodel="DPH"),
    
    "L_S_N_XSqrA_M": pt.BatchRetrieve(indices["L_S_N"], wmodel="XSqrA_M"),
    "L_S_N_BM25": pt.BatchRetrieve(indices["L_S_N"], wmodel="BM25"),
    "L_S_N_DPH": pt.BatchRetrieve(indices["L_S_N"], wmodel="DPH"),
}

# Evaluation
pt.Experiment(
    list(retrieval_models.values()),
    pt_dataset.get_topics(),
    pt_dataset.get_qrels(),
    ["ndcg_cut.10", "recip_rank", "recall_100", "P_10"],
    names=list(retrieval_models.keys())
)

ir-lab-sose-2024/ir-acl-anthology-20240504-training documents:   0%|          | 0/126958 [00:00<?, ?it/s]

13:16:30.168 [ForkJoinPool-9-worker-3] WARN org.terrier.structures.indexing.Indexer - TermPipeline object org.terrier.terms.LemurKrovetzStemmer not found: java.lang.ClassNotFoundException: org.terrier.terms.LemurKrovetzStemmer
13:16:30.205 [ForkJoinPool-9-worker-3] ERROR org.terrier.structures.Index - Cannot create new index: path /workspaces/ir-lab-sose-2024-ir-sose-24-6/gridsearch/./var/tmp/index1/ does not exist, or cannot be written to


java.lang.ClassNotFoundException: org.terrier.terms.LemurKrovetzStemmer
	at java.base/jdk.internal.loader.BuiltinClassLoader.loadClass(BuiltinClassLoader.java:581)
	at java.base/jdk.internal.loader.ClassLoaders$AppClassLoader.loadClass(ClassLoaders.java:178)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:527)
	at java.base/java.lang.Class.forName0(Native Method)
	at java.base/java.lang.Class.forName(Class.java:398)
	at org.terrier.utility.ApplicationSetup.getClass(ApplicationSetup.java:416)
	at org.terrier.structures.indexing.Indexer.load_pipeline(Indexer.java:323)
	at org.terrier.structures.indexing.Indexer.init(Indexer.java:197)
	at org.terrier.structures.indexing.classical.BasicIndexer.<init>(BasicIndexer.java:183)
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingCo

JavaException: JVM exception occurred: java.util.concurrent.ExecutionException: java.lang.IllegalArgumentException: java.lang.IllegalArgumentException: Cannot create new index: path /workspaces/ir-lab-sose-2024-ir-sose-24-6/gridsearch/./var/tmp/index1/ does not exist, or cannot be written to java.lang.RuntimeException