# IR Lab SoSe 2024: Knowledge Knights prototype
This notebook is a first attempt at improving on the baseline retrieval system via stopword removal
and query expansion.

# Step 1: Importing relevant libraries:

In [None]:
import os
import pandas as pd
import re

# Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client


# Load and start PyTerrier for use within TIRA
ensure_pyterrier_is_loaded()
# Importing after the call to ensure_pyterrier_is_loaded in TIRA.
import pyterrier as pt

# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
tira = Client()

# Import the stopword sources (NLTK, spaCy, scikit-learn)
import nltk
from nltk.corpus import stopwords
import spacy
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# generate custom stopword list
nltk.download('stopwords')
nltk_stopwords = set(stopwords.words('english'))
nlp = spacy.load("en_core_web_sm")
spacy_stopwords = set(nlp.Defaults.stop_words)
sklearn_stopwords = set(ENGLISH_STOP_WORDS)
combined_stopwords = set.union(nltk_stopwords, spacy_stopwords, sklearn_stopwords)

!rm -Rf /tmp/index
file_path = "custom_stopwords.txt"

with open(file_path, 'w+') as file:
    for element in combined_stopwords:
        file.write(element+ "\n")

pt.set_property('stopwords.filename','./custom_stopwords.txt')

# Step 2: Loading Dataset:

In [None]:
print('Loading Dataset...')
# Look up the training dataset by its "irds:" identifier. The resulting object
# is handed to create_index() in the index-building step below.
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')
print('Dataset loaded.')

# TODO: implement query expansion (not part of this notebook yet).

# Step 3: Index Building

In [None]:
print('Building Index...')

def create_index(pt_dataset, stopwords):
    """Index the documents of ``pt_dataset`` into ``/tmp/index`` and return the opened index.

    Args:
        pt_dataset: PyTerrier dataset whose corpus is indexed.
        stopwords: Stopword configuration forwarded to ``pt.IterDictIndexer``.
            A set/frozenset/tuple of words is converted to a sorted list first,
            because PyTerrier documents a custom stopword list as a list of
            strings. Any other value is passed through unchanged.

    Returns:
        The index object produced by ``pt.IndexFactory.of`` for the new index.
    """
    if isinstance(stopwords, (set, frozenset, tuple)):
        # Sorted for a deterministic order; the content is unchanged.
        stopwords = sorted(stopwords)

    # An existing index at /tmp/index is overwritten. The meta lengths reserve
    # up to 100 characters for 'docno' and 20480 for 'text'.
    indexer = pt.IterDictIndexer(
        "/tmp/index",
        overwrite=True,
        meta={'docno': 100, 'text': 20480},
        stopwords=stopwords,
    )
    # The indexer consumes an iterator of document dicts, not the dataset
    # object itself, so the corpus iterator is requested explicitly.
    index_ref = indexer.index(pt_dataset.get_corpus_iter())
    return pt.IndexFactory.of(index_ref)

index = create_index(pt_dataset, combined_stopwords)
print('Index created.')

# Step 4: Create Retrieval Pipeline

In [None]:
# BM25 retrieval pipeline over the index built in the previous step.
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

# Step 5: Run

In [None]:
print('Create run')
# The queries come from the same dataset that was indexed; `topics` was not
# defined anywhere earlier in the notebook.
# NOTE(review): 'text' is the topic variant used by the TIRA/PyTerrier
# baseline - confirm it is the intended query field for this dataset.
topics = pt_dataset.get_topics('text')
run = bm25(topics)
print('Done, run was created')
# Store the run under ../runs via TIRA's persist_and_normalize_run helper.
persist_and_normalize_run(run, system_name='bm25-baseline', default_output='../runs')