# PyTerrier Notebook for Full-Rank Submissions

This notebook serves as a baseline full-rank submission for [TIRA](https://tira.io)/[TIREx](https://tira.io/tirex). It builds a PyTerrier index, re-weights every query term according to its position in the query, and subsequently creates a run with BM25.

### Step 1: Ensure Libraries are Imported

In [None]:
import os
import math

# Detect whether we are running inside the TIRA sandbox: the check below treats the
# presence of the TIRA_DATASET_ID environment variable as the signal.
# Outside the sandbox the required dependencies are installed via pip;
# inside the sandbox nothing is installed and a notice is printed instead.
if 'TIRA_DATASET_ID' not in os.environ:
    !pip3 install python-terrier tira==0.0.88 ir_datasets
else:
    print('We are in the TIRA sandbox.')

In [None]:
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run

# This loads and starts PyTerrier so that it also works inside TIRA.
ensure_pyterrier_is_loaded()

# PyTerrier must be imported after the call to ensure_pyterrier_is_loaded in TIRA.
import pyterrier as pt
from pyterrier.measures import *

# Fallback: if PyTerrier has not been started by the call above, start it here and
# request the two extra boot packages (custom token processing and terrier-prf).
if not pt.started():
    pt.init(boot_packages=['mam10eks:custom-terrier-token-processing:0.0.1', 'com.github.terrierteam:terrier-prf:-SNAPSHOT'])
    # NOTE(review): `autoclass` is imported only on this branch and is not referenced by name
    # in the visible cells (english_tokenizer uses pt.autoclass) - confirm whether it is needed.
    from jnius import autoclass


### Step 2: Load the data

In [None]:
# Dataset handle; the cells below read the topics (get_topics) and the corpus (get_corpus_iter) from it.
data = pt.get_dataset('irds:ir-lab-jena-leipzig-wise-2023/validation-20231104-training')

In [None]:
def inverse_linear_weight_function(value, max_length):
    """Map a position to a weight that grows linearly with the position.

    Args:
        value: Position of the term (the numerator of the ratio).
        max_length: Largest position that can occur (the denominator).

    Returns:
        ``value / max_length``, or ``1.0`` when ``max_length`` is zero so
        that no division by zero takes place.
    """
    return value / max_length if max_length != 0 else 1.0

def english_tokenizer(string):
    """Split ``string`` into tokens with Terrier's English tokeniser.

    The tokeniser class name is resolved from ``pt.TerrierTokeniser.english``,
    the class ``org.terrier.indexing.tokenisation.<name>`` is loaded through
    ``pt.autoclass`` and instantiated anew on every call, and whatever its
    ``getTokens`` method returns for ``string`` is handed back unchanged.
    """
    tokeniser_choice = pt.TerrierTokeniser._to_obj(pt.TerrierTokeniser.english)
    class_name = pt.TerrierTokeniser._to_class(tokeniser_choice)

    qualified_name = "org.terrier.indexing.tokenisation." + class_name
    tokeniser_instance = pt.autoclass(qualified_name)()
    return tokeniser_instance.getTokens(string)


def apply_query_term_weighing(query, weight_function):
    """Rewrite ``query`` so that every token carries a position-based weight.

    The query is tokenised with ``english_tokenizer``. The token at position
    ``i`` is emitted as ``token^weight`` with
    ``weight = weight_function(i, number_of_tokens - 1)``, and the weighted
    tokens are joined with single spaces.

    Args:
        query: The raw query string.
        weight_function: Callable taking ``(position, last_position)`` and
            returning the weight for the token at that position.

    Returns:
        The weighted query string (empty when the tokeniser yields no tokens).
    """
    tokens = english_tokenizer(query)
    token_count = len(tokens)
    # First pass: one weight per position, computed before any formatting.
    position_weights = []
    for position in range(token_count):
        position_weights.append(weight_function(position, token_count - 1))

    # Second pass: pair each token with its weight and format it as token^weight.
    weighted_terms = []
    for token, position_weight in zip(tokens, position_weights):
        weighted_terms.append(f"{token}^{position_weight}")
    return " ".join(weighted_terms)

In [None]:
# Fetch the "title" topics and replace every query with its position-weighted form.
# The rewrite is applied column-wise with Series.map instead of assigning into the
# frame from inside an iterrows() loop, because pandas advises against modifying a
# frame while iterating over it. This also keeps loop variables out of the
# notebook namespace.
topics = data.get_topics("title")
topics["query"] = topics["query"].map(
    lambda raw_query: apply_query_term_weighing(raw_query, inverse_linear_weight_function)
)
# Last expression of the cell: display the rewritten topics.
topics

### Step 3: Build the Index

In [None]:
print('Build index:')
# Both the indexer and batch retrieve use Terrier's default Porter stemmer and a default stopword list (English).
# The indexer writes to /tmp/index, overwrites an existing index, records blocks, and
# keeps `docno` and `text` as metadata (100 and 20480 are the configured lengths for
# those fields - presumably in characters; confirm against the PyTerrier documentation).
iter_indexer = pt.IterDictIndexer("/tmp/index", overwrite = True, blocks = True,meta = {'docno':100, 'text': 20480}, stemmer = 'PorterStemmer')
# Remove any index directory left over from a previous run. This runs after the indexer
# object is constructed but before .index() is called.
!rm -Rf /tmp/index
# Index the corpus documents; the returned reference is opened in the retrieval step.
index_ref = iter_indexer.index(data.get_corpus_iter())

print('Done. Index is created')

### Step 4: Create the Retrieval Pipeline

In [None]:
# Open the index that was just built from its reference.
index = pt.IndexFactory.of(index_ref)

# BM25 retriever over that index; verbose=True is passed to BatchRetrieve.
bm25 = pt.BatchRetrieve(index, wmodel="BM25", verbose=True)

#### Step 4.1: Assemble the Pipeline (No Query Expansion Is Applied)

In [None]:
# Pipeline: the plain BM25 retriever; no further stage is chained onto it here.
pipe = bm25

### Step 5: Create the Run

In [None]:
print('Create run')
# Execute the pipeline on the weighted topics; missing values in the resulting frame are replaced by 0.
run = pipe.transform(topics).fillna(0)
print('Done, run was created')

### Step 6: Persist the Run

In [None]:
# Persist the run via the TIRA helper under the given label.
# NOTE(review): the label says "descending", but inverse_linear_weight_function yields weights
# that increase with the term position - confirm that the label matches the intended variant.
persist_and_normalize_run(run, 'bm25-linear-weighing-descending')