# PyTerrier Notebook for Full-Rank Submissions

This notebook serves as a baseline full-rank submission for [TIRA](https://tira.io)/[TIREx](https://tira.io/tirex) that builds a PyTerrier index and subsequently creates a run with BM25.

### Step 1: Ensure Libraries are Imported

In [3]:
import os
import pandas as pd
import math
import re

# Detect if we are in the TIRA sandbox
# Install the required dependencies if we are not in the sandbox.
if 'TIRA_DATASET_ID' not in os.environ:
    !pip3 install python-terrier tira==0.0.88 ir_datasets
else:
    print('We are in the TIRA sandbox.')

[1;31merror[0m: [1mexternally-managed-environment[0m

[31m×[0m This environment is externally managed
[31m╰─>[0m To install Python packages system-wide, try apt install
[31m   [0m python3-xyz, where xyz is the package you are trying to
[31m   [0m install.
[31m   [0m 
[31m   [0m If you wish to install a non-Debian-packaged Python package,
[31m   [0m create a virtual environment using python3 -m venv path/to/venv.
[31m   [0m Then use path/to/venv/bin/python and path/to/venv/bin/pip. Make
[31m   [0m sure you have python3-full installed.
[31m   [0m 
[31m   [0m If you wish to install a non-Debian packaged Python application,
[31m   [0m it may be easiest to use pipx install xyz, which will manage a
[31m   [0m virtual environment for you. Make sure you have pipx installed.
[31m   [0m 
[31m   [0m See /usr/share/doc/python3.11/README.venv for more information.

[1;35mnote[0m: If you believe this is a mistake, please contact your Python installation or OS dist

In [None]:
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run

# This loads and starts PyTerrier so that it also works in the TIRA sandbox.
ensure_pyterrier_is_loaded()

# PyTerrier must be imported after the call to ensure_pyterrier_is_loaded in TIRA.
import pyterrier as pt

# Fallback: if PyTerrier reports that it has not been started yet, start it here
# with two additional boot packages (custom token processing and terrier-prf).
if not pt.started():
    pt.init(boot_packages=['mam10eks:custom-terrier-token-processing:0.0.1', 'com.github.terrierteam:terrier-prf:-SNAPSHOT'])
    # NOTE(review): autoclass is only imported when this branch runs and is not
    # used anywhere in the visible notebook — confirm whether it is still needed.
    from jnius import autoclass


### Step 2: Load the data

In [None]:
# Dataset handle that supplies the corpus, topics and qrels used by the cells below.
# The 'irds:' prefix presumably selects PyTerrier's ir_datasets integration — TODO confirm.
data = pt.get_dataset('irds:ir-lab-jena-leipzig-wise-2023/validation-20231104-training')

In [None]:
def linear_weight_function(value, max_length):
    """Linearly decreasing weight for the term at position ``value``.

    The first term (``value == 0``) gets weight 1 and each following term gets
    ``1 / max_length`` less, so the last term (``value == max_length - 1``)
    ends up with weight ``1 / max_length``.

    Args:
        value: Zero-based position of the term.
        max_length: Total number of terms.

    Returns:
        ``(max_length - value) / max_length``.
    """
    remaining = max_length - value
    return remaining / max_length

def linear_inverse_weight_function(value, max_length):
    """Linearly increasing weight, the mirror image of ``linear_weight_function``.

    The first term (``value == 0``) gets weight ``1 / max_length`` and the last
    term (``value == max_length - 1``) gets weight 1.

    The position is shifted by one before dividing: with a plain
    ``value / max_length`` the first term would receive weight 0, and a
    single-term query would carry no weight at all.

    Args:
        value: Zero-based position of the term.
        max_length: Total number of terms.

    Returns:
        ``(value + 1) / max_length``.
    """
    return (value + 1) / max_length

def centered_parabola_weight_function(value, max_length):
    """Parabolic weight that is smallest in the middle of the term sequence.

    The weight is ``(0.5 * value - 0.25 * (max_length - 1)) ** 2 + 1``: it
    equals 1 at the middle position ``(max_length - 1) / 2`` and grows
    quadratically towards both ends.

    Args:
        value: Zero-based position of the term.
        max_length: Total number of terms.

    Returns:
        The weight for that position (always >= 1).
    """
    scale = 0.5
    # Scaled position of the sequence centre.
    centre = (max_length - 1) * scale * 0.5
    distance = value * scale - centre
    return distance ** 2 + 1

def log_weight_function(value, max_length):
    """Logarithmically decreasing weight for the term at position ``value``.

    Computes ``-log2((value + 0.1) / (value + max_length - 0.8))``. The 0.1
    offset keeps the logarithm defined at position 0; the weight falls with
    the position and is 1 for the last term (``value == max_length - 1``).

    Args:
        value: Zero-based position of the term.
        max_length: Total number of terms.

    Returns:
        The weight for that position.
    """
    numerator = value + 0.1
    denominator = value + (max_length - 0.9) + 0.1
    return -math.log2(numerator / denominator)

def apply_query_term_weighing(query, weight_function):
    """Attach a position-dependent weight to every whitespace-separated term.

    Args:
        query: Text to split on whitespace.
        weight_function: Callable ``(position, term_count) -> weight`` that is
            invoked once per term with the term's zero-based position.

    Returns:
        The terms re-joined with single spaces, each rendered as
        ``term^weight``. An empty or whitespace-only query yields ``""``.
    """
    terms = query.split() # TODO: PyTerrier Split
    term_count = len(terms)

    weighted_terms = []
    for position, term in enumerate(terms):
        weight = weight_function(position, term_count)
        weighted_terms.append(f"{term}^{weight}")

    return " ".join(weighted_terms)

In [None]:
def preprocess_corpus(data):
    """Return the corpus as a list with every document's text term-weighted.

    Each document dict yielded by ``data.get_corpus_iter()`` has its ``'text'``
    field replaced in place by the output of ``apply_query_term_weighing``
    using ``log_weight_function``; the dicts are collected in iteration order.

    NOTE(review): this writes ``term^weight`` strings into the document text
    that is later indexed — confirm that this is intended for documents and
    not only for queries.

    Args:
        data: Dataset object exposing ``get_corpus_iter()``.

    Returns:
        List of the (mutated) document dicts.
    """
    def _weigh_text(document):
        # Overwrite the text in place, exactly one document at a time.
        document['text'] = apply_query_term_weighing(document['text'], log_weight_function)
        return document

    return [_weigh_text(document) for document in data.get_corpus_iter()]

# Materialise the whole weighted corpus as an in-memory list; it is passed to the indexer later on.
processed_corpus = preprocess_corpus(data)

In [None]:
def weigh_topics(topics, weight_function):
    """Return a copy of ``topics`` whose ``query`` column carries term weights.

    The rows produced by ``DataFrame.iterrows()`` may be copies, so assigning
    to them is not guaranteed to change the DataFrame. Building a new column
    with ``map`` and ``assign`` makes the weighting take effect reliably and
    leaves the input frame untouched.

    Args:
        topics: Topics DataFrame with a ``query`` column of strings.
        weight_function: Callable ``(position, term_count) -> weight`` handed
            to ``apply_query_term_weighing``.

    Returns:
        A new DataFrame with the same rows and a rewritten ``query`` column.
    """
    weighted_queries = topics["query"].map(
        lambda query: apply_query_term_weighing(query, weight_function)
    )
    return topics.assign(query=weighted_queries)


title_topics = data.get_topics('title')

# Earlier query terms weigh more.
weightedTopics = weigh_topics(title_topics, linear_weight_function)

# Later query terms weigh more.
inverseWeightedTopics = weigh_topics(title_topics, linear_inverse_weight_function)

### Step 3: Build the Index

In [None]:
print('Build index:')
# Both the indexer and batch retrieve use Terrier's default Porter stemmer and a default stopword list (English)
# TODO: consider adding French stopwords
# The index is written to /tmp/index; `meta` presumably sets the stored length limits
# for the docno and text metadata fields — TODO confirm.
iter_indexer = pt.IterDictIndexer("/tmp/index", overwrite = True, blocks = True,meta = {'docno':100, 'text': 20480}, stemmer = 'PorterStemmer')
# Remove any index left over from a previous run before indexing.
# NOTE(review): the indexer is already created with overwrite = True, so this may be redundant.
!rm -Rf /tmp/index
index_ref = iter_indexer.index(processed_corpus)

print('Done. Index is created')

### Step 4: Create the Retrieval Pipeline

In [None]:
# Open the index via the reference returned by the indexing step.
index = pt.IndexFactory.of(index_ref)

# BM25 retriever over that index; verbose=True presumably enables progress output — TODO confirm.
bm25 = pt.BatchRetrieve(index, wmodel="BM25", verbose=True)

#### Step 4.1: Add Query Expansion

In [None]:
# Pipeline
# The pipeline is currently just the BM25 retriever; no query-expansion stage is chained onto it here.
pipe = bm25

### Step 5: Create the Run and Persist the Run

In [None]:
print('Create run')

# Execute the pipeline once per topic set: `run` uses the topic set prepared with
# linear_weight_function, `run2` the one prepared with linear_inverse_weight_function.
run = pipe(weightedTopics)
run2 = pipe(inverseWeightedTopics)

print('Done, run was created')

### Step 6: Run Experiments

In [None]:
# Doesn't work in TIRA, only for local testing
# Compares the two pre-computed runs against the dataset's qrels, with the first
# entry (index 0, i.e. `run`) passed as the baseline.
# The topics are sliced to the first 50 rows here, whereas `run` and `run2` were
# created from data.get_topics('title') without slicing.
# NOTE(review): confirm that 'ndcg_5' is an accepted metric name; the trec_eval-style
# spelling for nDCG at cutoff 5 is usually 'ndcg_cut_5'.
# NOTE(review): both systems are BM25 over differently weighted queries, so the
# labels "BM25" and "Spotted Turtle" do not describe the difference — confirm naming.
pt.Experiment(
   [run, run2],
   data.get_topics()[:50],
   data.get_qrels(),
   eval_metrics=["ndcg_5"],
   names=["BM25", "Spotted Turtle"],
   baseline=0
)

In [None]:
# Persist only `run`; `run2` is not written out.
# NOTE(review): the second argument is presumably the system name. 'bm25-custom-stopwords'
# does not mention query-term weighting, and no custom stopword list is configured in the
# visible notebook — confirm the intended label.
persist_and_normalize_run(run, 'bm25-custom-stopwords')