# IR Lab Tutorial: Document Expansion with DocT5Query

TBD...

# Import All Libraries

In [3]:
from tira.third_party_integrations import ensure_pyterrier_is_loaded
from tira.rest_api_client import Client
ensure_pyterrier_is_loaded()
import pyterrier as pt

tira = Client()

In [4]:
dataset = pt.get_dataset(f'irds:antique/test')

In [29]:
qrels = dataset.get_qrels()

qrels

Unnamed: 0,qid,docno,label,iteration
0,1964316,1964316_5,4,U0
1,1964316,1674088_11,1,Q0
2,1964316,1218838_13,2,Q0
3,1964316,1519022_15,2,Q0
4,1964316,3059341_5,2,Q0
...,...,...,...,...
6584,1262692,247023_6,3,Q0
6585,1262692,1499030_5,3,Q0
6586,1262692,2916758_0,3,Q0
6587,1262692,1105845_15,3,Q0


In [32]:
qrels_dict = {}

for _, i in qrels.iterrows():
    if i['qid'] not in qrels_dict:
        qrels_dict[i['qid']] = {}
    qrels_dict[i['qid']][i['docno']] = i['label']

def count_relevant_neighbours(entry, relevance_level=1):
    ret = 0
    for neighbor in entry['neighbors']:
        if qrels_dict[entry['qid']].get(neighbor, 0) >= relevance_level:
            ret += 1
    return ret

In [7]:
corpus_graph = tira.pt.transform_documents('ir-benchmarks/seanmacavaney/corpus-graph', dataset) 

Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/corpus-graph/2024-03-21-15-00-49.zip
	This is only used for last spot checks before archival to Zenodo.


Download: 100%|██████████| 27.1M/27.1M [00:04<00:00, 6.83MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_runs/ir-benchmarks/antique-test-20230107-training/seanmacavaney


In [31]:
# if there is no neighbour, we have no entry in the corpus graph (maybe should allow default values in transform documents

missing_neighbours = ('3862765_0', '4372730_24', '601023_0', '2615234_3')

qrels_with_neighbors = corpus_graph(qrels.copy()[~qrels['docno'].isin(missing_neighbours)])
qrels_with_neighbors

Unnamed: 0,qid,docno,label,iteration,neighbors
0,1964316,1964316_5,4,U0,"[2245059_0, 1964316_2, 2273802_1, 3786452_1, 2..."
1,1964316,1674088_11,1,Q0,"[230048_7, 1136440_0, 3784253_16, 3786452_1, 2..."
2,1964316,1218838_13,2,Q0,"[3786452_1, 3059341_5, 1248144_1, 1674088_11, ..."
3,1964316,1519022_15,2,Q0,"[3900143_6, 1519022_8, 3972356_5, 92480_12, 19..."
4,1964316,3059341_5,2,Q0,"[3786452_1, 1218838_13, 1248144_1, 1674088_11,..."
...,...,...,...,...,...
6584,1262692,247023_6,3,Q0,"[3196461_3, 2338307_4, 4428688_2, 2910592_6, 1..."
6585,1262692,1499030_5,3,Q0,"[317469_9, 1134300_6, 4475151_0, 3982441_4, 39..."
6586,1262692,2916758_0,3,Q0,"[1033541_0, 1134300_0, 2916758_11, 766628_0, 3..."
6587,1262692,1105845_15,3,Q0,"[276463_4, 2917422_3, 247806_6, 317469_5, 9744..."


In [34]:
qrels_with_neighbors['relevant_neighbors'] = qrels_with_neighbors.apply(count_relevant_neighbours, axis=1)
qrels_with_neighbors

Unnamed: 0,qid,docno,label,iteration,neighbors,relevant_neighbors
0,1964316,1964316_5,4,U0,"[2245059_0, 1964316_2, 2273802_1, 3786452_1, 2...",3
1,1964316,1674088_11,1,Q0,"[230048_7, 1136440_0, 3784253_16, 3786452_1, 2...",7
2,1964316,1218838_13,2,Q0,"[3786452_1, 3059341_5, 1248144_1, 1674088_11, ...",7
3,1964316,1519022_15,2,Q0,"[3900143_6, 1519022_8, 3972356_5, 92480_12, 19...",1
4,1964316,3059341_5,2,Q0,"[3786452_1, 1218838_13, 1248144_1, 1674088_11,...",7
...,...,...,...,...,...,...
6584,1262692,247023_6,3,Q0,"[3196461_3, 2338307_4, 4428688_2, 2910592_6, 1...",2
6585,1262692,1499030_5,3,Q0,"[317469_9, 1134300_6, 4475151_0, 3982441_4, 39...",2
6586,1262692,2916758_0,3,Q0,"[1033541_0, 1134300_0, 2916758_11, 766628_0, 3...",2
6587,1262692,1105845_15,3,Q0,"[276463_4, 2917422_3, 247806_6, 317469_5, 9744...",0


In [35]:
# on average 2 relevant document retrieved via corpus graph for docs in the qrels
qrels_with_neighbors['relevant_neighbors'].describe()

count    6585.000000
mean        1.991192
std         2.747900
min         0.000000
25%         0.000000
50%         1.000000
75%         3.000000
max        15.000000
Name: relevant_neighbors, dtype: float64

In [36]:
# on average 2.25 relevant document retrieved via corpus graph for docs with relevance > 1 in the qrels
qrels_with_neighbors[qrels_with_neighbors['label'] > 1]['relevant_neighbors'].describe()

count    4943.000000
mean        2.257536
std         2.854886
min         0.000000
25%         0.000000
50%         1.000000
75%         3.000000
max        15.000000
Name: relevant_neighbors, dtype: float64

In [39]:
# on average 2.46 relevant document retrieved via corpus graph for docs with relevance > 3 in the qrels
qrels_with_neighbors[qrels_with_neighbors['label'] > 3]['relevant_neighbors'].describe()

count    1334.000000
mean        2.461769
std         2.910976
min         0.000000
25%         0.000000
50%         1.000000
75%         4.000000
max        15.000000
Name: relevant_neighbors, dtype: float64

# Analysis on LongEval

As preparation for the IR Lab in Padua

In [4]:
# not yet part of the official main branch of ir_datasets, so we load from tira

dataset = pt.get_dataset(f'irds:ir-benchmarks/longeval-train-20230513-training')

In [6]:
qrels = dataset.get_qrels()

qrels

Unnamed: 0,qid,docno,label,iteration
0,q06223196,doc062200112743,0,0
1,q06223196,doc062200205250,0,0
2,q06223196,doc062200101983,0,0
3,q06223196,doc062200204465,1,0
4,q06223196,doc062200115614,0,0
...,...,...,...,...
9651,q062225197,doc062200205276,0,0
9652,q062225197,doc062200107121,1,0
9653,q062225197,doc062200204419,0,0
9654,q062225197,doc062200103774,0,0


In [7]:
corpus_graph = tira.pt.transform_documents('ir-benchmarks/seanmacavaney/corpus-graph', 'longeval-train-20230513-training') 

Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/corpus-graph/2024-03-21-12-46-50.zip
	This is only used for last spot checks before archival to Zenodo.


Download: 100%|██████████| 93.2M/93.2M [00:09<00:00, 10.8MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_runs/ir-benchmarks/longeval-train-20230513-training/seanmacavaney


In [8]:
qrels_dict = {}

for _, i in qrels.iterrows():
    if i['qid'] not in qrels_dict:
        qrels_dict[i['qid']] = {}
    qrels_dict[i['qid']][i['docno']] = i['label']

def count_relevant_neighbours(entry, relevance_level=1):
    ret = 0
    for neighbor in entry['neighbors']:
        if qrels_dict[entry['qid']].get(neighbor, 0) >= relevance_level:
            ret += 1
    return ret

In [12]:
# if there is no neighbour, we have no entry in the corpus graph (maybe should allow default values in transform documents
qrels_with_neighbors = corpus_graph(qrels[qrels['label'] > 0].copy())
qrels_with_neighbors

Unnamed: 0,qid,docno,label,iteration,neighbors
3,q06223196,doc062200204465,1,0,"[doc062200401429, doc062200106171, doc06221110..."
10,q06223196,doc062200205493,1,0,"[doc062201708464, doc062200115614, doc06220500..."
20,q062228,doc062200116555,1,0,"[doc062208807613, doc062208706096, doc06221040..."
21,q062228,doc062200116273,2,0,"[doc062200100875, doc062201300294, doc06220380..."
25,q062287,doc062200209981,1,0,"[doc062200116769, doc062215804698, doc06220010..."
...,...,...,...,...,...
9636,q062225194,doc062200201379,2,0,"[doc062208607784, doc062200116561, doc06220830..."
9640,q062225194,doc062200205011,1,0,"[doc062208406169, doc062208603880, doc06220860..."
9641,q062225194,doc062200204433,1,0,"[doc062200112015, doc062200113353, doc06220841..."
9647,q062225197,doc062200207538,1,0,"[doc062202104464, doc062201710012, doc06220200..."


In [13]:
qrels_with_neighbors['relevant_neighbors'] = qrels_with_neighbors.apply(count_relevant_neighbours, axis=1)
qrels_with_neighbors

Unnamed: 0,qid,docno,label,iteration,neighbors,relevant_neighbors
3,q06223196,doc062200204465,1,0,"[doc062200401429, doc062200106171, doc06221110...",0
10,q06223196,doc062200205493,1,0,"[doc062201708464, doc062200115614, doc06220500...",0
20,q062228,doc062200116555,1,0,"[doc062208807613, doc062208706096, doc06221040...",0
21,q062228,doc062200116273,2,0,"[doc062200100875, doc062201300294, doc06220380...",0
25,q062287,doc062200209981,1,0,"[doc062200116769, doc062215804698, doc06220010...",0
...,...,...,...,...,...,...
9636,q062225194,doc062200201379,2,0,"[doc062208607784, doc062200116561, doc06220830...",0
9640,q062225194,doc062200205011,1,0,"[doc062208406169, doc062208603880, doc06220860...",0
9641,q062225194,doc062200204433,1,0,"[doc062200112015, doc062200113353, doc06220841...",1
9647,q062225197,doc062200207538,1,0,"[doc062202104464, doc062201710012, doc06220200...",0


In [20]:

len(qrels_with_neighbors['qid'].unique())

656

In [23]:
# On average 4 relevant documents per query
2626/655

4.009160305343512

In [14]:
# on average 0.6 relevant document retrieved via corpus graph for docs in the qrels
# Reasonable, as there are only 4 relevant documents per query on average, but if you find one, you are likely to find "0.6 more" via the corpus graph, which is especially reasonable as you can aggregate this over multiple top results per query.
qrels_with_neighbors['relevant_neighbors'].describe()

count    2626.000000
mean        0.610815
std         0.941377
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         7.000000
Name: relevant_neighbors, dtype: float64

In [27]:
#Usually, you apply it to some tranformer-based model...
monot5 = tira.pt.from_submission("ir-benchmarks/tira-ir-starter/MonoT5 Base (tira-ir-starter-gygaggle)", "longeval-train-20230513-training")

Download: 1.11MiB [00:00, 7.31MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_runs/ir-benchmarks/longeval-train-20230513-training/tira-ir-starter


In [28]:
monot5(dataset.get_topics('title'))

Unnamed: 0,qid,query,q0,rank,score,system,docno,tira_task,tira_dataset,tira_first_stage_run_id
0,q06223196,car shelter,0,1,-0.003520,castorini/monot5-base-msmarco-10k,doc062201708464,ir-benchmarks,longeval-train-20230513-training,2024-03-18-12-56-01
1,q06223196,car shelter,0,2,-0.005353,castorini/monot5-base-msmarco-10k,doc062200108613,ir-benchmarks,longeval-train-20230513-training,2024-03-18-12-56-01
2,q06223196,car shelter,0,3,-0.006328,castorini/monot5-base-msmarco-10k,doc062200206319,ir-benchmarks,longeval-train-20230513-training,2024-03-18-12-56-01
3,q06223196,car shelter,0,4,-0.006333,castorini/monot5-base-msmarco-10k,doc062200112743,ir-benchmarks,longeval-train-20230513-training,2024-03-18-12-56-01
4,q06223196,car shelter,0,5,-0.006599,castorini/monot5-base-msmarco-10k,doc062201708471,ir-benchmarks,longeval-train-20230513-training,2024-03-18-12-56-01
...,...,...,...,...,...,...,...,...,...,...
66562,q062225197,cheapest car,0,96,-10.073474,castorini/monot5-base-msmarco-10k,doc062202202627,ir-benchmarks,longeval-train-20230513-training,2024-03-18-12-56-01
66563,q062225197,cheapest car,0,97,-10.074920,castorini/monot5-base-msmarco-10k,doc062202005382,ir-benchmarks,longeval-train-20230513-training,2024-03-18-12-56-01
66564,q062225197,cheapest car,0,98,-10.288811,castorini/monot5-base-msmarco-10k,doc062202002893,ir-benchmarks,longeval-train-20230513-training,2024-03-18-12-56-01
66565,q062225197,cheapest car,0,99,-10.347551,castorini/monot5-base-msmarco-10k,doc062214906085,ir-benchmarks,longeval-train-20230513-training,2024-03-18-12-56-01


In [None]:
# ToDo: now integrate the adaptive re-ranking...