# IR Lab SoSe 2024 Team 6: Grid Search

This Jupyter notebook runs a grid search over all the customizations we examined: stopword lists, stemming and lemmatization, and different retrieval models.
For each of these topics we first tested a few of the typical variants individually; the best two of each are then combined and compared in this grid search.

In [1]:
# Standard imports shared by all notebooks

# NOTE(review): consider `%pip install` with pinned versions so the packages are
# installed into the kernel's own environment and reruns are reproducible.
!pip3 install tira ir-datasets python-terrier nltk scikit-learn spacy

from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt
import spacy
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import os
import time
import hashlib

# Start PyTerrier through the TIRA helper before any `pt.` call is made
ensure_pyterrier_is_loaded()
tira = Client()

# Training dataset; provides the corpus, topics and qrels used in the cells below
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting spacy
  Downloading spacy-3.7.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting regex>=2021.8.3
  Downloading regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (775 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m775.1/775.1 KB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting click
  Downloading click-8.1.7-py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.9/97.9 KB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.10-py3-none-any.whl (17 kB)
Collecting spacy-

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [2]:
# Download the NLTK resources used by the NLTK lemmatizer
# (tokenizer data, WordNet, and the POS tagger model)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Download the spaCy English model
!python -m spacy download en_core_web_sm

# Load the spaCy model; `nlp` is used as a global by lemmatize_text_spacy
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
# Translate a POS tag into the WordNet POS constant needed by the NLTK lemmatizer
def get_wordnet_pos_nltk(treebank_tag):
    """Convert a POS tag into a format supported by the WordNet lemmatizer.

    Args:
        treebank_tag: POS tag string; only its leading letter is inspected.

    Returns:
        wordnet.ADJ for tags starting with 'J', wordnet.VERB for 'V',
        wordnet.NOUN for 'N', wordnet.ADV for 'R', and wordnet.NOUN for
        every other tag.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unknown tags are treated as nouns.
    return wordnet.NOUN

# Lemmatize a text with the NLTK WordNet lemmatizer
def lemmatize_text_nltk(text):
    """Tokenize and POS-tag `text`, lemmatize each token, and re-join with spaces.

    Args:
        text: The raw text to lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    lemmas = []
    for word, pos in tagged_tokens:
        # The lemmatizer needs the WordNet POS, not the tagger's tag.
        lemmas.append(wordnet_lemmatizer.lemmatize(word, get_wordnet_pos_nltk(pos)))
    return ' '.join(lemmas)

# Lemmatize a text with spaCy
def lemmatize_text_spacy(text):
    """Lemmatize the given text with spaCy.

    Runs the module-level `nlp` pipeline over `text` and joins the `lemma_`
    of every resulting token with single spaces.
    """
    return ' '.join(token.lemma_ for token in nlp(text))

def preprocess_documents(documents, method):
    """Lazily lemmatize the 'text' field of every document.

    Args:
        documents: Iterable of dict documents, each with a 'text' entry.
        method: 'nltk' to use lemmatize_text_nltk, 'spacy' to use
            lemmatize_text_spacy.

    Returns:
        A generator yielding, for each input document, a shallow copy whose
        'text' value is lemmatized. The input documents are left unmodified,
        so the same collection can safely be preprocessed more than once.

    Raises:
        ValueError: If `method` is neither 'nltk' nor 'spacy'. The check runs
            at call time, not when the first document is consumed.
    """
    # Validate eagerly: inside a generator body this check would be deferred
    # until the first next() call, far away from the faulty call site.
    if method == 'nltk':
        lemmatize_text = lemmatize_text_nltk
    elif method == 'spacy':
        lemmatize_text = lemmatize_text_spacy
    else:
        raise ValueError("Invalid method specified. Use 'nltk' or 'spacy'.")

    def _lemmatized(docs):
        # Yield copies instead of mutating the caller's dicts in place.
        for doc in docs:
            yield {**doc, 'text': lemmatize_text(doc['text'])}

    return _lemmatized(documents)

# Build a custom index in its own directory
def create_index(base_path, documents, stopwords, stemmer):
    """Index `documents` with PyTerrier and return the opened index.

    Args:
        base_path: Directory under which the index folder is created.
        documents: Iterable of documents passed to the indexer.
        stopwords: Stopword setting forwarded to pt.IterDictIndexer.
        stemmer: Stemmer setting forwarded to pt.IterDictIndexer.

    Returns:
        The index obtained from pt.IndexFactory.of for the new index.
    """
    # The first 8 hex digits of the SHA-1 of the current timestamp name the folder,
    # so each call writes to a different directory.
    timestamp_bytes = str(time.time()).encode('utf-8')
    unique_id = hashlib.sha1(timestamp_bytes).hexdigest()[:8]
    index_path = os.path.join(base_path, f"index_{unique_id}/")

    indexer = pt.IterDictIndexer(
        index_path,
        overwrite=True,
        meta={'docno': 100, 'text': 20480},
        stopwords=stopwords,
        stemmer=stemmer,
    )
    return pt.IndexFactory.of(indexer.index(documents))

# Read a text file into a Python list with one entry per line
def read_text_file_to_array(file_path, encoding='utf-8'):
    """Read a text file and return its lines as a list of stripped strings.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale.

    Returns:
        A list with one string per line, surrounding whitespace (including the
        newline) removed; blank lines are kept as empty strings. Returns None
        if the file does not exist, after printing a message.
    """
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            # Entries stay strings; only surrounding whitespace is stripped.
            return [line.strip() for line in file]
    except FileNotFoundError:
        print(f"File {file_path} not found.")
        return None


In [4]:
# Load custom stopword lists (paths are relative to the notebook's working directory)
# NOTE(review): read_text_file_to_array returns None when a file is missing, and that
# None would be passed on as the stopword setting below — confirm both files exist.
terrier_custom_stopwords = read_text_file_to_array('../terrier-custom.txt')
chatgpt_stopwords = read_text_file_to_array('../chatgpt-stopwordlist.txt')

# Directory under which create_index places one index_<id>/ folder per index
base_path = '/workspaces/ir-lab-sose-2024-ir-sose-24-6/gridsearch/var/tmp/'

# Naming: [Stemmer/Lemmatizer]_[Type]_[StopwordList]
#   S_E = English Snowball stemmer, S_P = Porter stemmer,
#   L_N = NLTK lemmatizer,          L_S = spaCy lemmatizer,
#   T = Terrier custom stopwords, C = ChatGPT stopwords, N = no stopwords.

# Key prefix -> (lemmatization method or None, Terrier stemmer name).
# Lemmatized corpora are indexed with the 'NoOp' stemmer.
normalizers = {
    "S_E": (None, 'EnglishSnowballStemmer'),
    "S_P": (None, 'TRv2PorterStemmer'),
    "L_N": ('nltk', 'NoOp'),
    "L_S": ('spacy', 'NoOp'),
}

# Key suffix -> stopword list passed to the indexer.
stopword_lists = {
    "T": terrier_custom_stopwords,
    "C": chatgpt_stopwords,
    "N": [],
}

# Build one index per (normalizer, stopword list) combination. Every index gets
# its own fresh corpus iterator because an iterator can only be consumed once.
indices = {}
for norm_key, (lemmatizer, stemmer) in normalizers.items():
    for stop_key, stopwords in stopword_lists.items():
        documents = pt_dataset.get_corpus_iter()
        if lemmatizer is not None:
            documents = preprocess_documents(documents, lemmatizer)
        indices[f"{norm_key}_{stop_key}"] = create_index(base_path, documents, stopwords, stemmer)

# Weighting models evaluated on every index.
weighting_models = ["XSqrA_M", "BM25", "DPH"]

# One retriever per (index, weighting model) pair, keyed "<index key>_<weighting model>",
# e.g. "S_E_T_BM25". Iteration follows the insertion order of `indices`.
retrieval_models = {
    f"{index_key}_{wmodel}": pt.BatchRetrieve(index, wmodel=wmodel)
    for index_key, index in indices.items()
    for wmodel in weighting_models
}

# Evaluation: score every configuration on the dataset's topics and qrels
evaluation_measures = ["ndcg_cut.10", "recip_rank", "recall_100", "P_10"]
pt.Experiment(
    list(retrieval_models.values()),
    pt_dataset.get_topics(),
    pt_dataset.get_qrels(),
    evaluation_measures,
    names=list(retrieval_models),
)

Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/ir-lab-sose2024/ir-acl-anthology-20240504-inputs.zip?download=1
	This is only used for last spot checks before archival to Zenodo.


Download: 100%|██████████| 39.4M/39.4M [00:00<00:00, 63.2MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/ir-lab-sose-2024/ir-acl-anthology-20240504-training/


ir-lab-sose-2024/ir-acl-anthology-20240504-training documents:  70%|███████   | 89066/126958 [00:25<00:08, 4311.25it/s]



ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:31<00:00, 4031.95it/s] 


17:10:15.056 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 3 empty documents


ir-lab-sose-2024/ir-acl-anthology-20240504-training documents:  70%|███████   | 89407/126958 [00:19<00:07, 5029.26it/s]



ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:25<00:00, 4884.17it/s] 


17:10:46.327 [ForkJoinPool-2-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 2 empty documents


ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:28<00:00, 4515.70it/s] 
ir-lab-sose-2024/ir-acl-anthology-20240504-training documents:  70%|███████   | 89334/126958 [00:16<00:06, 5894.16it/s]



ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:21<00:00, 5898.25it/s] 


17:11:44.708 [ForkJoinPool-4-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 3 empty documents


ir-lab-sose-2024/ir-acl-anthology-20240504-training documents:  70%|███████   | 89081/126958 [00:16<00:06, 5530.62it/s]



ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:21<00:00, 5868.58it/s] 


17:12:10.465 [ForkJoinPool-5-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 2 empty documents


ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:23<00:00, 5397.51it/s] 
ir-lab-sose-2024/ir-acl-anthology-20240504-training documents:  71%|███████   | 89898/126958 [07:07<00:25, 1431.49it/s]



ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [08:59<00:00, 235.50it/s] 


17:21:41.851 [ForkJoinPool-7-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 3 empty documents


ir-lab-sose-2024/ir-acl-anthology-20240504-training documents:  71%|███████   | 90022/126958 [07:08<00:25, 1428.00it/s]



ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [09:02<00:00, 234.02it/s] 


17:30:48.408 [ForkJoinPool-8-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 2 empty documents


ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [08:52<00:00, 238.31it/s] 
ir-lab-sose-2024/ir-acl-anthology-20240504-training documents:  71%|███████   | 90037/126958 [23:11<02:49, 217.94it/s]



ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [29:56<00:00, 70.68it/s] 


18:09:45.938 [ForkJoinPool-10-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 3 empty documents


ir-lab-sose-2024/ir-acl-anthology-20240504-training documents:  71%|███████   | 90045/126958 [22:57<02:55, 210.80it/s]



ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [29:38<00:00, 71.37it/s] 


18:39:29.121 [ForkJoinPool-11-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 2 empty documents


ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [29:44<00:00, 71.13it/s] 


Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/ir-lab-sose2024/ir-acl-anthology-20240504-truth.zip?download=1
	This is only used for last spot checks before archival to Zenodo.


Download: 100%|██████████| 29.6k/29.6k [00:00<00:00, 1.45MiB/s]

Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/ir-lab-sose-2024/ir-acl-anthology-20240504-training/





There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.


Unnamed: 0,name,ndcg_cut.10,recip_rank,recall_100,P_10
0,S_E_T_XSqrA_M,0.4551,0.664893,0.593577,0.407353
1,S_E_T_BM25,0.376976,0.5971,0.600805,0.333824
2,S_E_T_DPH,0.434348,0.642687,0.595102,0.389706
3,S_E_C_XSqrA_M,0.434296,0.658947,0.594299,0.379412
4,S_E_C_BM25,0.36766,0.582085,0.598654,0.327941
5,S_E_C_DPH,0.421294,0.622764,0.580136,0.377941
6,S_E_N_XSqrA_M,0.422635,0.666621,0.584954,0.372059
7,S_E_N_BM25,0.355778,0.562072,0.561727,0.319118
8,S_E_N_DPH,0.405986,0.610658,0.573483,0.364706
9,S_P_T_XSqrA_M,0.450618,0.661251,0.594704,0.402941
