# IR Lab SoSe 2024: Knowledge Knights prototype
This notebook is a first attempt at improving on the baseline retrieval system via stopword removal and 
query expansion. So far only stopword removal is implemented; query expansion is still a TODO.

# Step 1: Importing relevant libraries and building the stopword list

In [9]:
import os
import pandas as pd
import re

# Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client

# Set up PyTerrier for use inside TIRA. This call must run before pyterrier is imported.
ensure_pyterrier_is_loaded()
# Import pyterrier only after ensure_pyterrier_is_loaded() has run (required in TIRA).
import pyterrier as pt

# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
# NOTE(review): `tira` is not used in the cells visible here — confirm it is still needed.
tira = Client()

# Import Stopword-List
import nltk
nltk.download('stopwords')  # Download only the stopwords corpus
from nltk.corpus import stopwords
import spacy
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Load the spaCy model, downloading it only when it is not installed yet.
# spacy.load raises OSError when the requested model package cannot be found,
# so an already-installed model no longer triggers a download on every run.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import importlib
    import subprocess
    import sys

    # Run the download with the kernel's own interpreter so the model is
    # installed into the environment this notebook is actually using.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    # Make the freshly installed package discoverable by the running process.
    importlib.invalidate_caches()
    nlp = spacy.load("en_core_web_sm")

# Generate custom stopword list: the union of the English stopword lists
# shipped with NLTK, spaCy and scikit-learn.
nltk_stopwords = set(stopwords.words('english'))
spacy_stopwords = set(nlp.Defaults.stop_words)
sklearn_stopwords = set(ENGLISH_STOP_WORDS)
combined_stopwords = nltk_stopwords | spacy_stopwords | sklearn_stopwords

# Create and save stopword file (one stopword per line).
file_path = "custom_stopwords.txt"

# Sort the set so the file content is reproducible across runs, and pin the
# encoding so the result does not depend on the platform's locale.
with open(file_path, 'w', encoding='utf-8') as file:
    file.writelines(f"{word}\n" for word in sorted(combined_stopwords))

# Set property for stopword file in PyTerrier. Reuse `file_path` instead of
# repeating the literal, and pass an absolute path so the lookup does not depend
# on which base directory Terrier uses to resolve relative file names.
pt.set_property('stopwords.filename', os.path.abspath(file_path))

Due to execution in TIRA, I have patched ir_datasets to always return the single input dataset mounted to the sandbox.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 24.7 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Step 2: Loading Dataset:

In [10]:
print('Loading Dataset...')
# Look up the training dataset through PyTerrier's ir_datasets ("irds:") integration.
pt_dataset = pt.get_dataset(
    'irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training'
)
print('Dataset loaded.')

# TODO implement Query Expansion

Loading Dataset...
Dataset loaded.


# Step 3: Index Building

In [11]:
# Progress message printed before the index is built.
print('Building Index...')

def create_index(pt_dataset, stopwords, index_path="/tmp/index"):
    """Build a Terrier index from an iterable of documents and open it.

    Args:
        pt_dataset: Despite its name, the iterable of documents handed to
            ``indexer.index`` (the notebook passes ``get_corpus_iter()``).
        stopwords: Stopword configuration forwarded to ``pt.IterDictIndexer``.
            A set, frozenset or tuple of words is converted to a sorted list,
            because the indexer documents custom stopwords as a list of
            strings; any other value is passed through unchanged.
        index_path: Directory the index is written to. An existing index at
            this location is overwritten.

    Returns:
        The index object returned by ``pt.IndexFactory.of`` for the new index.
    """
    # Normalise non-list collections so a custom stopword set is actually
    # recognised as a custom list; sorting keeps the configuration deterministic.
    if isinstance(stopwords, (set, frozenset, tuple)):
        stopwords = sorted(stopwords)

    # NOTE(review): the meta values are presumably the maximum stored length
    # of each metadata field — confirm against the PyTerrier documentation.
    indexer = pt.IterDictIndexer(
        index_path,
        overwrite=True,
        meta={'docno': 100, 'text': 20480},
        stopwords=stopwords,
    )
    index_ref = indexer.index(pt_dataset)
    return pt.IndexFactory.of(index_ref)

# Index the corpus documents using the combined stopword collection.
index = create_index(
    pt_dataset=pt_dataset.get_corpus_iter(),
    stopwords=combined_stopwords,
)
print('Index created.')

Building Index...


ir-lab-sose-2024/ir-acl-anthology-20240504-training documents:   0%|          | 0/126958 [00:00<?, ?it/s]

ir-lab-sose-2024/ir-acl-anthology-20240504-training documents:  65%|██████▌   | 82849/126958 [00:20<00:07, 5639.26it/s]



ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:26<00:00, 4712.75it/s] 


16:12:50.855 [ForkJoinPool-2-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 4 empty documents
Index created.


# Step 4: Create Retrieval Pipeline

In [12]:
# BM25 retrieval pipeline over the index built with the custom stopword list.
# NOTE(review): query-side stopword handling presumably follows Terrier's configured
# term pipeline rather than the list passed to the indexer — confirm.
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

# Step 5: Run

In [13]:
# Retrieve rankings for the dataset's topics, then persist them as the run file.
print('Create run')
topics = pt_dataset.get_topics('text')
run = bm25(topics)
print('Done, run was created')
# NOTE(review): the system name mentions query expansion, but the pipeline used
# for this run is plain BM25 — confirm the intended name.
persist_and_normalize_run(run, 'bm25-stopwords-query-expansion')

Create run


Done, run was created
I use the environment variable "TIRA_OUTPUT_DIR" to determine where I should store the run file using "." as default.
Done. run file is stored under "./run.txt".
