# IR Lab Tutorial: Relevant Neighbours in the Corpus Graph

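This notebook counts, for every judged document in the qrels, how many of its corpus-graph neighbours are themselves judged relevant for the same query. The analysis is run on ANTIQUE first; the LongEval section at the end prepares the same analysis for the IR Lab.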

# Import All Libraries

In [2]:
# TIRA helpers: the PyTerrier bootstrap function and the REST API client.
from tira.third_party_integrations import ensure_pyterrier_is_loaded
from tira.rest_api_client import Client
# NOTE(review): this call is placed before `import pyterrier`; presumably the
# order matters — confirm before regrouping the imports.
ensure_pyterrier_is_loaded()
import pandas as pd
import pyterrier as pt
# NOTE(review): tqdm, gzip and json are not used in the cells visible here.
from tqdm import tqdm
import gzip
import json

# Client instance used below via `tira.pt.transform_documents(...)`.
tira = Client()

In [4]:
# Load the ANTIQUE test collection through PyTerrier's "irds:" dataset prefix.
dataset = pt.get_dataset('irds:antique/test')

In [29]:
# Relevance judgments of the loaded dataset; the frame displayed below has the
# columns qid, docno, label and iteration.
qrels = dataset.get_qrels()

qrels

Unnamed: 0,qid,docno,label,iteration
0,1964316,1964316_5,4,U0
1,1964316,1674088_11,1,Q0
2,1964316,1218838_13,2,Q0
3,1964316,1519022_15,2,Q0
4,1964316,3059341_5,2,Q0
...,...,...,...,...
6584,1262692,247023_6,3,Q0
6585,1262692,1499030_5,3,Q0
6586,1262692,2916758_0,3,Q0
6587,1262692,1105845_15,3,Q0


In [32]:
# Nested lookup table: qid -> {docno -> label}.
# The three needed columns are iterated directly instead of using `iterrows()`,
# which builds one Series per row (slow) and does not preserve the column
# dtypes across a row.
qrels_dict = {}

for qid, docno, label in zip(qrels['qid'], qrels['docno'], qrels['label']):
    qrels_dict.setdefault(qid, {})[docno] = label

def count_relevant_neighbours(entry, relevance_level=1):
    """Count the neighbours of ``entry`` that are judged relevant for its query.

    Args:
        entry: Mapping-like row providing ``'qid'`` and an iterable
            ``'neighbors'`` of document ids.
        relevance_level: Minimum label a neighbour needs in ``qrels_dict``
            to be counted. Neighbours without a judgment are treated as 0.

    Returns:
        The number of neighbours whose label for ``entry['qid']`` is at
        least ``relevance_level``.

    Raises:
        KeyError: If ``entry['qid']`` has no judgments in ``qrels_dict`` and
            ``entry['neighbors']`` is not empty.

    NOTE(review): for qrels whose lowest grade is 1 rather than 0, the default
    threshold counts every judged neighbour — confirm the intended threshold.
    """
    # The per-query lookup stays inside the iteration so that an entry with
    # no neighbours never touches `qrels_dict`.
    return sum(
        1
        for doc_id in entry['neighbors']
        if qrels_dict[entry['qid']].get(doc_id, 0) >= relevance_level
    )

In [7]:
# Fetch the corpus-graph output from TIRA; the result is called on a DataFrame
# further down and yields a frame with an added `neighbors` column.
corpus_graph = tira.pt.transform_documents(
    'ir-benchmarks/seanmacavaney/corpus-graph',
    dataset,
)

Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/corpus-graph/2024-03-21-15-00-49.zip
	This is only used for last spot checks before archival to Zenodo.


Download: 100%|██████████| 27.1M/27.1M [00:04<00:00, 6.83MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_runs/ir-benchmarks/antique-test-20230107-training/seanmacavaney


In [31]:
# Documents without any neighbour have no entry in the corpus graph, so they
# are excluded up front (maybe `transform_documents` should allow default values).

missing_neighbours = ('3862765_0', '4372730_24', '601023_0', '2615234_3')

has_graph_entry = ~qrels['docno'].isin(missing_neighbours)
qrels_with_neighbors = corpus_graph(qrels.copy()[has_graph_entry])
qrels_with_neighbors

Unnamed: 0,qid,docno,label,iteration,neighbors
0,1964316,1964316_5,4,U0,"[2245059_0, 1964316_2, 2273802_1, 3786452_1, 2..."
1,1964316,1674088_11,1,Q0,"[230048_7, 1136440_0, 3784253_16, 3786452_1, 2..."
2,1964316,1218838_13,2,Q0,"[3786452_1, 3059341_5, 1248144_1, 1674088_11, ..."
3,1964316,1519022_15,2,Q0,"[3900143_6, 1519022_8, 3972356_5, 92480_12, 19..."
4,1964316,3059341_5,2,Q0,"[3786452_1, 1218838_13, 1248144_1, 1674088_11,..."
...,...,...,...,...,...
6584,1262692,247023_6,3,Q0,"[3196461_3, 2338307_4, 4428688_2, 2910592_6, 1..."
6585,1262692,1499030_5,3,Q0,"[317469_9, 1134300_6, 4475151_0, 3982441_4, 39..."
6586,1262692,2916758_0,3,Q0,"[1033541_0, 1134300_0, 2916758_11, 766628_0, 3..."
6587,1262692,1105845_15,3,Q0,"[276463_4, 2917422_3, 247806_6, 317469_5, 9744..."


In [34]:
# Row-wise count of judged-relevant neighbours, stored as a new column.
relevant_neighbor_counts = qrels_with_neighbors.apply(count_relevant_neighbours, axis=1)
qrels_with_neighbors['relevant_neighbors'] = relevant_neighbor_counts
qrels_with_neighbors

Unnamed: 0,qid,docno,label,iteration,neighbors,relevant_neighbors
0,1964316,1964316_5,4,U0,"[2245059_0, 1964316_2, 2273802_1, 3786452_1, 2...",3
1,1964316,1674088_11,1,Q0,"[230048_7, 1136440_0, 3784253_16, 3786452_1, 2...",7
2,1964316,1218838_13,2,Q0,"[3786452_1, 3059341_5, 1248144_1, 1674088_11, ...",7
3,1964316,1519022_15,2,Q0,"[3900143_6, 1519022_8, 3972356_5, 92480_12, 19...",1
4,1964316,3059341_5,2,Q0,"[3786452_1, 1218838_13, 1248144_1, 1674088_11,...",7
...,...,...,...,...,...,...
6584,1262692,247023_6,3,Q0,"[3196461_3, 2338307_4, 4428688_2, 2910592_6, 1...",2
6585,1262692,1499030_5,3,Q0,"[317469_9, 1134300_6, 4475151_0, 3982441_4, 39...",2
6586,1262692,2916758_0,3,Q0,"[1033541_0, 1134300_0, 2916758_11, 766628_0, 3...",2
6587,1262692,1105845_15,3,Q0,"[276463_4, 2917422_3, 247806_6, 317469_5, 9744...",0


In [35]:
# On average, about 2 relevant documents (mean 1.99) are retrieved via the corpus graph for docs in the qrels.
qrels_with_neighbors['relevant_neighbors'].describe()

count    6585.000000
mean        1.991192
std         2.747900
min         0.000000
25%         0.000000
50%         1.000000
75%         3.000000
max        15.000000
Name: relevant_neighbors, dtype: float64

In [36]:
# On average, 2.26 relevant documents are retrieved via the corpus graph for docs with label > 1 in the qrels.
qrels_with_neighbors.loc[qrels_with_neighbors['label'] > 1, 'relevant_neighbors'].describe()

count    4943.000000
mean        2.257536
std         2.854886
min         0.000000
25%         0.000000
50%         1.000000
75%         3.000000
max        15.000000
Name: relevant_neighbors, dtype: float64

In [39]:
# On average, 2.46 relevant documents are retrieved via the corpus graph for docs with label > 3 in the qrels.
qrels_with_neighbors.loc[qrels_with_neighbors['label'] > 3, 'relevant_neighbors'].describe()

count    1334.000000
mean        2.461769
std         2.910976
min         0.000000
25%         0.000000
50%         1.000000
75%         4.000000
max        15.000000
Name: relevant_neighbors, dtype: float64

# Analysis on LongEval

As preparation for the IR Lab in Padua

In [3]:
# Not yet part of the official main branch of ir_datasets, so the dataset is loaded via TIRA.
# NOTE(review): this rebinds `dataset`, which held the ANTIQUE collection in the cells above.

dataset = pt.get_dataset('irds:ir-benchmarks/longeval-train-20230513-training')

In [4]:
# Relevance judgments of the LongEval dataset loaded above; this rebinds
# `qrels`, replacing the ANTIQUE judgments used in the earlier cells.
qrels = dataset.get_qrels()

qrels

Unnamed: 0,qid,docno,label,iteration
0,q06223196,doc062200112743,0,0
1,q06223196,doc062200205250,0,0
2,q06223196,doc062200101983,0,0
3,q06223196,doc062200204465,1,0
4,q06223196,doc062200115614,0,0
...,...,...,...,...
9651,q062225197,doc062200205276,0,0
9652,q062225197,doc062200107121,1,0
9653,q062225197,doc062200204419,0,0
9654,q062225197,doc062200103774,0,0


In [5]:
# Corpus graph for LongEval; here the dataset is passed as an identifier
# string rather than as the loaded dataset object.
corpus_graph = tira.pt.transform_documents(
    'ir-benchmarks/seanmacavaney/corpus-graph',
    'longeval-train-20230513-training',
)

: 