# PyTerrier Notebook for Full-Rank Submissions

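This notebook serves as a baseline full-rank submission for [TIRA](https://tira.io)/[TIREx](https://tira.io/tirex). It builds a PyTerrier index that uses a custom stopword list and subsequently creates a run with a pipeline that retrieves with BM25, expands the queries with Bo1, and retrieves again with PL2.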

### Step 1: Ensure Libraries are Imported

In [None]:
import os

# Detect if we are in the TIRA sandbox
# Install the required dependencies if we are not in the sandbox.
if 'TIRA_DATASET_ID' not in os.environ:
    !pip3 install python-terrier tira==0.0.88 ir_datasets #nltk spacy
    #!python -m spacy download en_core_web_sm
else:
    print('We are in the TIRA sandbox.')

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run

# Load and start PyTerrier through the TIRA helper so that the notebook also works in TIRA.
ensure_pyterrier_is_loaded()

# In TIRA, PyTerrier must only be imported after ensure_pyterrier_is_loaded() was called.
import pyterrier as pt

# Initialise PyTerrier ourselves only if it has not been started yet.
if not pt.started():
    boot_packages = [
        'mam10eks:custom-terrier-token-processing:0.0.1',
        'com.github.terrierteam:terrier-prf:-SNAPSHOT',
    ]
    pt.init(boot_packages=boot_packages)
    from jnius import autoclass


#### Step 1.1: Load Stopword-List

In [None]:
import nltk
from nltk.corpus import stopwords
import spacy
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# generate custom stopword list
nltk.download('stopwords')
nltk_stopwords = set(stopwords.words('english'))
nlp = spacy.load("en_core_web_sm")
spacy_stopwords = set(nlp.Defaults.stop_words)
sklearn_stopwords = set(ENGLISH_STOP_WORDS)
combined_stopwords = set.union(nltk_stopwords, spacy_stopwords, sklearn_stopwords)

!rm -Rf /tmp/index
file_path = "custom_stopwords.txt"

with open(file_path, 'w+') as file:
    for element in combined_stopwords:
        file.write(element+ "\n")

pt.set_property('stopwords.filename','./custom_stopwords.txt')

### Step 2: Load the data

In [27]:
# Load the dataset through PyTerrier; the 'irds:' prefix selects an ir_datasets-backed dataset.
data = pt.get_dataset('irds:ir-lab-jena-leipzig-wise-2023/validation-20231104-training')


In [None]:
def preprocess_text(text):
    """Hook for custom text pre-processing; currently returns ``text`` unchanged.

    The notebook calls this function for document texts as well as for queries.
    """
    # TODO: (optional) add custom text pre-processing (e.g. acronym expansion)
    return text

def preprocess_corpus(data):
    """Pre-process the text of every document of ``data`` and return the documents as a list.

    ``data`` has to offer ``get_corpus_iter()``, yielding one dict-like document
    with a ``'text'`` entry at a time. Each document is updated in place (its
    ``'text'`` is replaced by ``preprocess_text`` of the old value) and the
    documents are returned in iteration order.
    """
    def _with_preprocessed_text(document):
        # Overwrite the text in place and hand the same object back.
        document['text'] = preprocess_text(document['text'])
        return document

    return [_with_preprocessed_text(document) for document in data.get_corpus_iter()]

# Materialise the pre-processed corpus as a list; it is the input of the indexer in Step 3.
processed_corpus = preprocess_corpus(data)

In [None]:
# Load the topics and run the queries through the same pre-processing hook
# that is applied to the document texts.
topics = data.get_topics('title')
# Assign the mapped column back to the DataFrame: writing to the rows yielded by
# DataFrame.iterrows() is not guaranteed to change the DataFrame itself, so a
# row-wise update could silently leave the queries unprocessed.
topics['query'] = topics['query'].map(preprocess_text)
print('See the first two queries:')
print(topics.head(2))

### Step 3: Build the Index

In [None]:
print('Build index:')
# Both the indexer and batch retrieve use terriers default porter stemmer and a default stopword list (englisch)
# TODO: consider adding french stopwords
iter_indexer = pt.IterDictIndexer("/tmp/index", overwrite = True, blocks = True,meta = {'docno':100, 'text': 20480}, stemmer = 'PorterStemmer')
!rm -Rf /tmp/index
index_ref = iter_indexer.index(processed_corpus)

print('Done. Index is created')

### Step 4: Create the Retrieval Pipeline

In [None]:
# Open the index referenced by index_ref.
index = pt.IndexFactory.of(index_ref)

# One retriever per weighting model on the same index: BM25 first, then PL2.
bm25, pl2 = (
    pt.BatchRetrieve(index, wmodel=weighting_model, verbose=True)
    for weighting_model in ("BM25", "PL2")
)

#### Step 4.1: Add Query Expansion

In [37]:
# Query expansion: Bo1 query rewriter operating on the index opened above.
bo1 = pt.rewrite.Bo1QueryExpansion(index) 

# Pipeline: cut the BM25 ranking off at rank 100, let Bo1 rewrite the query
# based on these results, and retrieve again with PL2 using the rewritten query.
pipe = (bm25 % 100) >> bo1 >> pl2

### Step 5: Create the Run and Persist the Run

In [None]:
print('Create run')

# Apply the retrieval pipeline to the topics; the result is persisted as the run at the end of the notebook.
run = pipe(topics)

print('Done, run was created')

### Step 6: Run Experiments

In [46]:
# Doesn't work in TIRA, only for local testing
#pt.Experiment(
#    [bm25, pipe],
#    data.get_topics()[:50],
#    data.get_qrels(),
#    eval_metrics=["map", "recip_rank", "P_10", "recall_10", "ndcg"],
#    names=["BM25", "Spotted Turtle"],
#    baseline=0
#)

In [None]:
# Persist the run via the TIRA helper imported at the top of the notebook.
# NOTE(review): the second argument (presumably the system name) says 'bm25-custom-stopwords',
# while `run` comes from the BM25 >> Bo1 >> PL2 pipeline - confirm that this name is intended.
persist_and_normalize_run(run, 'bm25-custom-stopwords')