## Step 1: Ensure that libraries are imported

In [1]:
!pip3 install tira>=0.0.141 ir-datasets python-terrier==0.10.0


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [2]:
!rm -Rf ~/.tira/extracted_datasets/ir-lab-wise-2024/subsampled-ms-marco-ir-lab-20250105-test
!rm -Rf ~/.tira/.archived

In [3]:
# Load and start PyTerrier through TIRA's integration helper so that the
# notebook also works when it is executed inside TIRA.
# This has to run before `pyterrier` itself is imported.

from tira.third_party_integrations import ensure_pyterrier_is_loaded

ensure_pyterrier_is_loaded()

  from .autonotebook import tqdm as notebook_tqdm
PyTerrier 0.10.0 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [4]:
# PyTerrier must be imported after `ensure_pyterrier_is_loaded` is called.

from pyterrier import started, init

# Only initialise PyTerrier when it is not running yet, so that re-running
# this cell does not try to initialise it a second time.
if not started():
    init()

## Step 2: Load the dataset

In [5]:
from pyterrier import get_dataset

# Identifier of the test collection; the `irds:` prefix selects the
# ir_datasets-backed loader (the cell output shows an `IRDSDataset`).
DATASET_ID = 'irds:ir-lab-wise-2024/subsampled-ms-marco-ir-lab-20250105-test'

dataset = get_dataset(DATASET_ID)
dataset

IRDSDataset('ir-lab-wise-2024/subsampled-ms-marco-ir-lab-20250105-test')

## Step 3: Create the retrieval pipeline with TIRA

In this example, we compare four retrieval approaches that were submitted to TIRA: a plain BM25 baseline and three query-expansion variants built on top of BM25 (BO1, query2doc, and WordNet).
We load the approaches via the TIRA API.

In [6]:
from tira.rest_api_client import Client

# Client for the TIRA REST API; the following cells use `tira_client.pt`
# to load submitted retrieval runs as PyTerrier transformers.
tira_client = Client()

In [7]:
# Load the submitted "BM25 + BO1 Query Expansion" run for the test dataset
# from TIRA as a PyTerrier transformer.
bo1 = tira_client.pt.from_retriever_submission(
    dataset="subsampled-ms-marco-ir-lab-20250105-test",
    approach="ir-lab-wise-2024/ir-wise-24-th25/BM25 + BO1 Query Expansion",
)
bo1

Download: 1.00MiB [00:00, 9.31MiB/s]


Download finished. Extract...
Extraction finished:  /home/codespace/.tira/extracted_runs/ir-lab-wise-2024/subsampled-ms-marco-ir-lab-20250105-test/ir-wise-24-th25


TiraSourceTransformer()

In [8]:
# Load the submitted "BM25 + query2doc" run for the test dataset from TIRA
# as a PyTerrier transformer.
querydoc = tira_client.pt.from_retriever_submission(
    dataset="subsampled-ms-marco-ir-lab-20250105-test",
    approach="ir-lab-wise-2024/ir-wise-24-th25/BM25 + query2doc",
)
querydoc

Download: 1.02MiB [00:00, 9.35MiB/s]


Download finished. Extract...
Extraction finished:  /home/codespace/.tira/extracted_runs/ir-lab-wise-2024/subsampled-ms-marco-ir-lab-20250105-test/ir-wise-24-th25


TiraSourceTransformer()

In [9]:
# Load the submitted "BM25 + Wordnet QueryExpansion" run for the test dataset
# from TIRA as a PyTerrier transformer.
wordnet = tira_client.pt.from_retriever_submission(
    dataset="subsampled-ms-marco-ir-lab-20250105-test",
    approach="ir-lab-wise-2024/ir-wise-24-th25/BM25 + Wordnet QueryExpansion",
)
wordnet

Download: 986kiB [00:00, 8.84MiB/s]

Download finished. Extract...
Extraction finished:  /home/codespace/.tira/extracted_runs/ir-lab-wise-2024/subsampled-ms-marco-ir-lab-20250105-test/ir-wise-24-th25





TiraSourceTransformer()

In [10]:
# Load the plain BM25 run for the test dataset from TIRA as a PyTerrier
# transformer; it is the baseline the other runs are compared against.
bm25 = tira_client.pt.from_retriever_submission(
    dataset="subsampled-ms-marco-ir-lab-20250105-test",
    approach="ir-lab-wise-2024/ir-wise-24-uk-ir-1/BM25",
)
bm25

TiraSourceTransformer()

## Step 4: Measure effectiveness

Now let us measure the nDCG@10 effectiveness of all four systems on the subsampled MS MARCO test dataset (`subsampled-ms-marco-ir-lab-20250105-test`).

In [11]:
from pyterrier.pipelines import Experiment

# Keep every display name next to its system so that the two lists handed to
# `Experiment` cannot drift out of order. The names are used verbatim as
# filter values in later cells, so they must not be changed.
systems_by_name = {
    "BO1 Query Expansion": bo1,
    "BM25 + query2doc": querydoc,
    "Wordnet QueryExpansion": wordnet,
    "BM25": bm25,
}

# Per-query nDCG@10 for every system (`perquery=True` yields one row per
# system, query and measure instead of one aggregated row per system).
experiment = Experiment(
    retr_systems=list(systems_by_name.values()),
    topics=dataset.get_topics("query"),
    qrels=dataset.get_qrels(),
    eval_metrics=["ndcg_cut_10"],
    names=list(systems_by_name),
    perquery=True,
)

# Fixed seed so that the preview shows the same rows on every run.
experiment.sample(n=10, random_state=42)

Download from Zenodo: https://zenodo.org/records/14743268/files/subsampled-ms-marco-ir-lab-20250105-test-truths.zip


Download: 100%|██████████| 50.6k/50.6k [00:00<00:00, 369kiB/s] 


Download finished. Extract...
Extraction finished:  /home/codespace/.tira/extracted_datasets/ir-lab-wise-2024/subsampled-ms-marco-ir-lab-20250105-test/


Unnamed: 0,name,qid,measure,value
44,BO1 Query Expansion,16,ndcg_cut_10,0.322272
78,BM25 + query2doc,13,ndcg_cut_10,1.0
108,Wordnet QueryExpansion,36,ndcg_cut_10,0.220092
20,BO1 Query Expansion,24,ndcg_cut_10,0.135685
3,BO1 Query Expansion,48,ndcg_cut_10,0.472157
160,BM25,19,ndcg_cut_10,0.425208
31,BO1 Query Expansion,21,ndcg_cut_10,0.0
143,BM25,51,ndcg_cut_10,0.930569
149,BM25,45,ndcg_cut_10,1.0
169,BM25,21,ndcg_cut_10,0.0


In [19]:
# Per-query scores of the BM25 baseline and of the BO1 run. The system-name
# column is dropped because each frame holds a single system.
experiment_bm25 = experiment.loc[experiment["name"].eq("BM25")].drop(columns="name")
experiment_bo1 = experiment.loc[experiment["name"].eq("BO1 Query Expansion")].drop(columns="name")

# Align both systems on query and measure so that their scores sit side by side.
experiment_paired = experiment_bm25.merge(
    right=experiment_bo1,
    on=["qid", "measure"],
    suffixes=("_bm25", "_bo1"),
)
experiment_paired.head(10)

Unnamed: 0,qid,measure,value_bm25,value_bo1
0,10,ndcg_cut_10,0.546257,0.135685
1,11,ndcg_cut_10,0.063621,0.063621
2,12,ndcg_cut_10,0.224663,0.0
3,13,ndcg_cut_10,1.0,1.0
4,14,ndcg_cut_10,0.921602,0.864315
5,16,ndcg_cut_10,0.643404,0.322272
6,17,ndcg_cut_10,0.0,0.0
7,18,ndcg_cut_10,0.0,0.0
8,19,ndcg_cut_10,0.425208,0.425208
9,2,ndcg_cut_10,1.0,1.0


In [22]:
# Per-query scores of the BM25 baseline and of the query2doc run. The
# system-name column is dropped because each frame holds a single system.
experiment_bm25 = experiment.loc[experiment["name"].eq("BM25")].drop(columns="name")
experiment_query = experiment.loc[experiment["name"].eq("BM25 + query2doc")].drop(columns="name")

# Align both systems on query and measure so that their scores sit side by side.
experiment_paired = experiment_bm25.merge(
    right=experiment_query,
    on=["qid", "measure"],
    suffixes=("_bm25", "_querydoc"),
)
experiment_paired.head(10)

Unnamed: 0,qid,measure,value_bm25,value_querydoc
0,10,ndcg_cut_10,0.546257,0.249664
1,11,ndcg_cut_10,0.063621,0.168152
2,12,ndcg_cut_10,0.224663,0.0
3,13,ndcg_cut_10,1.0,1.0
4,14,ndcg_cut_10,0.921602,0.142019
5,16,ndcg_cut_10,0.643404,0.188444
6,17,ndcg_cut_10,0.0,0.0
7,18,ndcg_cut_10,0.0,0.0
8,19,ndcg_cut_10,0.425208,0.288382
9,2,ndcg_cut_10,1.0,1.0


In [25]:
# Per-query scores of the BM25 baseline and of the WordNet run. The
# system-name column is dropped because each frame holds a single system.
experiment_bm25 = experiment.loc[experiment["name"].eq("BM25")].drop(columns="name")
experiment_wordnet = experiment.loc[experiment["name"].eq("Wordnet QueryExpansion")].drop(columns="name")

# Align both systems on query and measure so that their scores sit side by side.
experiment_paired = experiment_bm25.merge(
    right=experiment_wordnet,
    on=["qid", "measure"],
    suffixes=("_bm25", "_wordnet"),
)
experiment_paired.head(10)

Unnamed: 0,qid,measure,value_bm25,value_wordnet
0,10,ndcg_cut_10,0.546257,0.546257
1,11,ndcg_cut_10,0.063621,0.063621
2,12,ndcg_cut_10,0.224663,0.224663
3,13,ndcg_cut_10,1.0,1.0
4,14,ndcg_cut_10,0.921602,0.921602
5,16,ndcg_cut_10,0.643404,0.643404
6,17,ndcg_cut_10,0.0,0.0
7,18,ndcg_cut_10,0.0,0.0
8,19,ndcg_cut_10,0.425208,0.425208
9,2,ndcg_cut_10,1.0,1.0


## Step 5: Conduct hypothesis tests

On this _paired_ measurement data, we can now conduct _paired_ t-tests to test for statistical significance of given hypotheses.
Remember that the choice of your test depends (amongst other factors) on how the hypothesis is formulated.

Let us test some hypotheses to get a feeling of what this means:

#### Hypothesis 2.1: Der Einsatz von der BO1 Query Expansion von PyTerrier zusätzlich zur BM25-Methode führt zu signifikant verbesserten NDCG@10-Werten im Vergleich zur BM25-Methode, da durch die Erweiterung der Suchanfragen zusätzliche relevante Dokumente identifiziert werden können.

Significance test: one-sided paired t-test \
Significance level: $\alpha = 0.05$ (or $p < 0.05$)

In [20]:
from scipy.stats import ttest_rel

# Build the BM25-vs-BO1 pairing in this cell under its own name. The shared
# `experiment_paired` variable is overwritten by every pairing cell, so relying
# on it makes the result depend on which cell happened to run last.
paired_bo1 = experiment.loc[
    experiment["name"].eq("BM25"), ["qid", "measure", "value"]
].merge(
    experiment.loc[
        experiment["name"].eq("BO1 Query Expansion"), ["qid", "measure", "value"]
    ],
    on=["qid", "measure"],
    suffixes=("_bm25", "_bo1"),
)

# Two-sided paired t-test: do BO1 and BM25 differ at all (in either direction)?
ttest_rel(
    paired_bo1["value_bo1"],
    paired_bo1["value_bm25"],
    alternative='two-sided',
).pvalue

0.02266637729151283

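This p-value is lower than our significance level, which suggests that there is a statistically significant difference between the two systems. A two-sided test does not tell us the direction of that difference, however, so we follow up with the one-sided test that matches the hypothesis.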

In [21]:
from scipy.stats import ttest_rel

# Build the BM25-vs-BO1 pairing in this cell under its own name. The shared
# `experiment_paired` variable is overwritten by every pairing cell, so relying
# on it makes the result depend on which cell happened to run last.
paired_bo1 = experiment.loc[
    experiment["name"].eq("BM25"), ["qid", "measure", "value"]
].merge(
    experiment.loc[
        experiment["name"].eq("BO1 Query Expansion"), ["qid", "measure", "value"]
    ],
    on=["qid", "measure"],
    suffixes=("_bm25", "_bo1"),
)

# One-sided paired t-test matching the hypothesis: is BO1 better than BM25?
ttest_rel(
    paired_bo1["value_bo1"],
    paired_bo1["value_bm25"],
    alternative='greater',
).pvalue

0.9886668113542436

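This time, the p-value is higher than our significance level $\alpha$.
So we cannot reject the null hypothesis and fail to confirm hypothesis 2.1. Taken together with the significant two-sided result above, this indicates that BO1 query expansion scored lower than plain BM25 on this dataset, not higher.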

#### Hypothesis 2.2: Der Einsatz unserer eigenen Query Expansion Methode, basierend auf LLMs, zusätzlich zur BM25-Methode führt zu signifikant verbesserten NDCG@10-Werten im Vergleich zur BM25-Methode, da durch die Erweiterung der Suchanfragen zusätzliche relevante Dokumente identifiziert werden können.

Significance test: one-sided paired t-test \
Significance level: $\alpha = 0.05$ (or $p < 0.05$)

In [23]:
from scipy.stats import ttest_rel

# Build the BM25-vs-query2doc pairing in this cell under its own name. The
# shared `experiment_paired` variable is overwritten by every pairing cell, so
# relying on it makes the result depend on which cell happened to run last.
paired_querydoc = experiment.loc[
    experiment["name"].eq("BM25"), ["qid", "measure", "value"]
].merge(
    experiment.loc[
        experiment["name"].eq("BM25 + query2doc"), ["qid", "measure", "value"]
    ],
    on=["qid", "measure"],
    suffixes=("_bm25", "_querydoc"),
)

# Two-sided paired t-test: do query2doc and BM25 differ at all (in either direction)?
ttest_rel(
    paired_querydoc["value_querydoc"],
    paired_querydoc["value_bm25"],
    alternative='two-sided',
).pvalue

2.335319848825698e-05

In [24]:
from scipy.stats import ttest_rel

# Build the BM25-vs-query2doc pairing in this cell under its own name. The
# shared `experiment_paired` variable is overwritten by every pairing cell, so
# relying on it makes the result depend on which cell happened to run last.
paired_querydoc = experiment.loc[
    experiment["name"].eq("BM25"), ["qid", "measure", "value"]
].merge(
    experiment.loc[
        experiment["name"].eq("BM25 + query2doc"), ["qid", "measure", "value"]
    ],
    on=["qid", "measure"],
    suffixes=("_bm25", "_querydoc"),
)

# One-sided paired t-test matching the hypothesis: is query2doc better than BM25?
ttest_rel(
    paired_querydoc["value_querydoc"],
    paired_querydoc["value_bm25"],
    alternative='greater',
).pvalue

0.9999883234007559

#### Hypothesis 2.3: Der Einsatz unserer eigenen Query Expansion Methode, basierend auf der lexikalischen Datenbank Wordnet, zusätzlich zur BM25-Methode führt zu signifikant verbesserten NDCG@10-Werten im Vergleich zur BM25-Methode, da durch die Erweiterung der Suchanfragen zusätzliche relevante Dokumente identifiziert werden können.

Significance test: one-sided paired t-test \
Significance level: $\alpha = 0.05$ (or $p < 0.05$)

In [26]:
from scipy.stats import ttest_rel

# Build the BM25-vs-WordNet pairing in this cell under its own name. The shared
# `experiment_paired` variable is overwritten by every pairing cell, so relying
# on it makes the result depend on which cell happened to run last.
paired_wordnet = experiment.loc[
    experiment["name"].eq("BM25"), ["qid", "measure", "value"]
].merge(
    experiment.loc[
        experiment["name"].eq("Wordnet QueryExpansion"), ["qid", "measure", "value"]
    ],
    on=["qid", "measure"],
    suffixes=("_bm25", "_wordnet"),
)

# Two-sided paired t-test: do WordNet expansion and BM25 differ at all (in either direction)?
ttest_rel(
    paired_wordnet["value_wordnet"],
    paired_wordnet["value_bm25"],
    alternative='two-sided',
).pvalue

0.32265765224996135

In [27]:
from scipy.stats import ttest_rel

# Build the BM25-vs-WordNet pairing in this cell under its own name. The shared
# `experiment_paired` variable is overwritten by every pairing cell, so relying
# on it makes the result depend on which cell happened to run last.
paired_wordnet = experiment.loc[
    experiment["name"].eq("BM25"), ["qid", "measure", "value"]
].merge(
    experiment.loc[
        experiment["name"].eq("Wordnet QueryExpansion"), ["qid", "measure", "value"]
    ],
    on=["qid", "measure"],
    suffixes=("_bm25", "_wordnet"),
)

# One-sided paired t-test matching the hypothesis: is WordNet expansion better than BM25?
ttest_rel(
    paired_wordnet["value_wordnet"],
    paired_wordnet["value_bm25"],
    alternative='greater',
).pvalue

0.8386711738750193