# PyTerrier Notebook for Full-Rank Submissions

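This notebook serves as a baseline full-rank submission for [TIRA](https://tira.io)/[TIREx](https://tira.io/tirex) that builds a PyTerrier index and subsequently creates a run with BM25. It then loads three previously submitted TIRA runs (max, mean, and best k-max passage aggregation), compares them with `pt.Experiment`, and writes the results to `results.txt`.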

### Step 1: Ensure Libraries are Imported

In [None]:
import os

# Detect if we are in the TIRA sandbox
# Install the required dependencies if we are not in the sandbox.
if 'TIRA_DATASET_ID' not in os.environ:
    !python -m pip install --upgrade pip
    !pip3 install  python-terrier tira==0.0.88 ir_datasets
    !pip3 install -q python-terrier
    !pip3 install -q --upgrade git+https://github.com/terrierteam/pyterrier_t5.git
else:
    print('We are in the TIRA sandbox.')

from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run

# PyTerrier must be imported after the call to ensure_pyterrier_is_loaded in TIRA.
import pyterrier as pt



  from .autonotebook import tqdm as notebook_tqdm


Ensure the PyTerrier integration is loaded.

In [None]:
# Set up the TIRA/PyTerrier integration via the helper imported from
# tira.third_party_integrations in the setup cell.
ensure_pyterrier_is_loaded()

Due to execution in TIRA, I have patched ir_datasets to always return the single input dataset mounted to the sandbox.
Start PyTerrier with version=5.7, helper_version=0.0.7, no_download=True


PyTerrier 0.10.0 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


### Step 2: Load data, create index

In [None]:
from pathlib import Path

# Dataset handle plus the "title" variant of its topics.
dataset = pt.get_dataset('irds:ir-lab-jena-leipzig-wise-2023/validation-20231104-training')
topics = dataset.get_topics(variant="title")

index_loc = "./index"
# An index is considered present when its data.properties file exists on disk;
# in that case it is opened, otherwise it is built from the corpus iterator.
# NOTE(review): the two branches appear to bind different kinds of object to
# `indexref` (an opened index vs. the value returned by indexer.index) -- confirm
# that later consumers accept both.
if (Path(index_loc) / "data.properties").exists():
    indexref = pt.IndexFactory.of(index_loc)
else:
    indexer = pt.IterDictIndexer(index_loc)
    indexref = indexer.index(dataset.get_corpus_iter())

In [None]:
from tira.rest_api_client import Client
# Client whose `pt.from_retriever_submission` helper is used by the following
# cells to load previously submitted runs.
tira_client = Client()

# TODO change to 'jena-topics-20231026-test' later
# Dataset identifier passed to every from_retriever_submission call below.
dataset_name = 'validation-20231104-training'

In [None]:
# Load the 'mild-duck' submission (labelled "max passage" in the experiment
# below) and apply it to the title topics.
bm25_bo1_pl2_max = tira_client.pt.from_retriever_submission(
    'ir-lab-jena-leipzig-wise-2023/galapagos-tortoise/mild-duck',
    dataset_name,
)
max_run = bm25_bo1_pl2_max.transform(topics)

#### Secondly, rerank with mean passage aggregation.

In [None]:
# Load the 'poky-claim' submission (labelled "mean passage" in the experiment
# below) and apply it to the title topics.
bm25_bo1_pl2_mean = tira_client.pt.from_retriever_submission(
    'ir-lab-jena-leipzig-wise-2023/galapagos-tortoise/poky-claim',
    dataset_name,
)
mean_run = bm25_bo1_pl2_mean.transform(topics)

Unnamed: 0,qid,docno,score,query,query_0,rank
0,q072210025,doc072201202671,26.547201,recipe spring roll,applypipeline:off recip^1.053565089 spring^1.3...,2
1,q072210025,doc072201901565,23.233679,recipe spring roll,applypipeline:off recip^1.053565089 spring^1.3...,7
2,q072210025,doc072203110074,25.881011,recipe spring roll,applypipeline:off recip^1.053565089 spring^1.3...,4
3,q072210025,doc072204307357,26.691692,recipe spring roll,applypipeline:off recip^1.053565089 spring^1.3...,1
4,q072210025,doc072207501000,25.487764,recipe spring roll,applypipeline:off recip^1.053565089 spring^1.3...,6


In [None]:
# Load the 'edible-status' submission (labelled "best kmax passage" in the
# experiment below) and apply it to the title topics.
bm25_bo1_pl2_kmax = tira_client.pt.from_retriever_submission(
    'ir-lab-jena-leipzig-wise-2023/galapagos-tortoise/edible-status',
    dataset_name,
)
kmax_run = bm25_bo1_pl2_kmax.transform(topics)

In [None]:
# Overall comparison of the three passage-aggregation runs on nDCG@5 and nDCG.
experiment = pt.Experiment(
    [
        bm25_bo1_pl2_max,
        bm25_bo1_pl2_mean,
        bm25_bo1_pl2_kmax,
    ],
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_metrics=["ndcg_cut_5", "ndcg"],
    names=[
        "max passage",
        "mean passage",
        "best kmax passage",
    ],
    # perquery=True
)

# Hypothesis 1: "mean passage" compared against "max passage"
# (baseline=0 selects the first system in the list as the baseline).
hypo1 = pt.Experiment(
    [bm25_bo1_pl2_max, bm25_bo1_pl2_mean],
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_metrics=["ndcg_cut_5", "ndcg"],
    names=["max passage", "mean passage"],
    baseline=0,
)

# Hypothesis 2: "max passage" and "mean passage" compared against the k-max run.
# The k-max transformer is bound to `bm25_bo1_pl2_kmax` when it is loaded; no
# `bm25_bo1_pl2_best_kmax` is defined anywhere in this notebook, so that name
# would raise a NameError on a fresh kernel.
hypo2 = pt.Experiment(
    [bm25_bo1_pl2_kmax, bm25_bo1_pl2_max, bm25_bo1_pl2_mean],
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_metrics=["ndcg_cut_5", "ndcg"],
    names=["best kmax passage", "max passage", "mean passage"],
    baseline=0,
)

### Persist experiment and tests.

In [None]:
# Write the experiment table and both hypothesis tables to results.txt.
with open("results.txt", "wt") as file:
    file.write("Experiment:\n\n")
    # Bind the rendered table to a local name; assigning it to an attribute of
    # `experiment` would leave `experiment_string` undefined on the next line.
    experiment_string = experiment.to_string(header=True, index=False)
    file.write(experiment_string + "\n\n")

    file.write("Results for Hypothesis 1:\n\n")
    file.write("Significance test:\n")
    hypo1_string = hypo1.to_string(header=True, index=False)
    file.write(hypo1_string + "\n\n")

    file.write("Results for Hypothesis 2:\n\n")
    # The k-max transformer is bound to `bm25_bo1_pl2_kmax` in this notebook.
    # NOTE(review): it is not visible here whether that object exposes a `k`
    # attribute, so fall back to "unknown" instead of failing -- confirm.
    best_k = getattr(bm25_bo1_pl2_kmax, "k", "unknown")
    file.write("Best k in [2, 4, 6, 8, 10, 12, 14, 16, 18, 20] is " + str(best_k) + "\n\n")
    file.write("Significance test:\n")
    hypo2_string = hypo2.to_string(header=True, index=False)
    file.write(hypo2_string)