# Doc2Query using a T5 model specifically trained for this task

In [1]:
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt
import gzip
import json
from tqdm import tqdm

tira = Client()
ensure_pyterrier_is_loaded()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [2]:
# Create an IRDSDataset object and registers it under the name provided as an argument.
dataset = 'ir-lab-sose-2024/ir-acl-anthology-20240504-training'
pt_dataset = pt.get_dataset(f'irds:{dataset}')

def doc_t5_query(dataset):
    docs = tira.get_run_output('ir-benchmarks/seanmacavaney/DocT5Query', dataset) + '/documents.jsonl.gz'
    with gzip.open(docs, 'rt') as f:
        for l in tqdm(f):
            l = json.loads(l)
            l['text'] = l['querygen']
            l['docno'] = l['doc_id']
            del l['doc_id']
            del l['querygen']
            yield l

# Expand the documents
# Preprocess document_expansions into a dictionary for faster lookup
expansions_dict = {expansion['docno']: expansion['text'] for expansion in doc_t5_query(dataset)}
#expansions_dict

Download: 4.81MiB [00:00, 28.4MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_runs/ir-benchmarks/ir-acl-anthology-20240504-training/seanmacavaney


126958it [00:00, 264458.65it/s]


In [3]:
# Original documents and document expansion for first document
for document in iter(pt_dataset.get_corpus_iter()):
  print(document)
  # Show document expansion for first document
  print(expansions_dict[document['docno']])
  # we only show the first one
  break

Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/ir-lab-sose2024/ir-acl-anthology-20240504-inputs.zip?download=1
	This is only used for last spot checks before archival to Zenodo.


Download: 100%|██████████| 39.4M/39.4M [00:00<00:00, 69.5MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/ir-lab-sose-2024/ir-acl-anthology-20240504-training/


ir-lab-sose-2024/ir-acl-anthology-20240504-training documents:   0%|          | 0/126958 [00:00<?, ?it/s]

{'text': 'A Study on Word Similarity using Context Vector Models\n\n\n There is a need to measure word similarity when processing natural languages, especially when using generalization, classification, or example -based approaches. Usually, measures of similarity between two words are defined according to the distance between their semantic classes in a semantic taxonomy . The taxonomy approaches are more or less semantic -based that do not consider syntactic similarit ies. However, in real applications, both semantic and syntactic similarities are required and weighted differently. Word similarity based on context vectors is a mixture of syntactic and semantic similarit ies. In this paper, we propose using only syntactic related co-occurrences as context vectors and adopt information theoretic models to solve the problems of data sparseness and characteristic precision. The probabilistic distribution of co-occurrence context features is derived by parsing the contextual environment o




In [4]:
# Check if expansions_dict was created correctly
print(f"Expansions dictionary created with {len(expansions_dict)} items.")

# Initialize a list to store the combined documents
expanded_documents = []

# Iterate through the documents and combine them with their expansions
for document in iter(pt_dataset.get_corpus_iter()):
    # Get the document's unique identifier
    docno = document['docno']
    
    # Check if there's an expansion for the current document
    if docno in expansions_dict:
        # Append the expansion text to the document's text
        original_text = document['text']
        expansion_text = expansions_dict[docno]
        combined_text = original_text + expansion_text
        
        # Create a new dictionary with the combined text and add it to the list
        expanded_documents.append({'docno': docno, 'text': combined_text})
        
        # Debug print to confirm concatenation
        # print(f"Document {docno} expanded. Original length: {len(original_text)}, Expansion length: {len(expansion_text)}, New length: {len(combined_text)}")
    else:
        # If there's no expansion, just add the original document
        expanded_documents.append({'docno': docno, 'text': document['text']})
        print(f"No expansion found for document {docno}.")

# Check the number of documents combined
print(f"Total combined documents: {len(expanded_documents)}")

Expansions dictionary created with 126958 items.


ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:02<00:00, 46942.32it/s]

Total combined documents: 126958





In [5]:
# Test if the combination worked
for document in expanded_documents:
  print(document)
  # Show document expansion for first document
  #print(expansions_dict[document['docno']])
  # we only show the first one
  break

{'docno': 'O02-2002', 'text': 'A Study on Word Similarity using Context Vector Models\n\n\n There is a need to measure word similarity when processing natural languages, especially when using generalization, classification, or example -based approaches. Usually, measures of similarity between two words are defined according to the distance between their semantic classes in a semantic taxonomy . The taxonomy approaches are more or less semantic -based that do not consider syntactic similarit ies. However, in real applications, both semantic and syntactic similarities are required and weighted differently. Word similarity based on context vectors is a mixture of syntactic and semantic similarit ies. In this paper, we propose using only syntactic related co-occurrences as context vectors and adopt information theoretic models to solve the problems of data sparseness and characteristic precision. The probabilistic distribution of co-occurrence context features is derived by parsing the con

In [6]:
# Initialize variables to store the longest docno and its length
longest_docno = ''
max_length = 0

# Iterate over the list of documents
for document in expanded_documents:
    # Get the current docno
    docno = document['docno']
    
    # Check if the current docno is longer than the longest found so far
    if len(docno) > max_length:
        longest_docno = docno
        max_length = len(docno)

# Output the longest docno and its length
print(f"The longest docno is: '{longest_docno}' with a length of {max_length}")

The longest docno is: '2007.wwwconf_conference-GeorgakopoulosBNC07.0' with a length of 45


In [7]:
# Create the Index
# Index the documents using PyTerrier
#indexer = pt.IterDictIndexer("./index_Doc2QueryT5")
#indexref = indexer.index(expanded_documents, fields=["text"], meta=["docno"])

# Create the index using PyTerrier
indexer = pt.IterDictIndexer(
    "./indexes/index_Doc2QueryT5",
    overwrite=True,
    fields=["text"],
    meta=["docno"],
    meta_lengths=[max_length],  # Adjust length based on expected docno length
)
# Index the documents
indexref = indexer.index(expanded_documents)

# Retrieve documents using BM25
bm25 = pt.BatchRetrieve(indexref, wmodel="BM25")

# Perform retrieval
#queries_df = pt_dataset.get_topics()
#run = bm25.transform(queries_df)
run = bm25(pt_dataset.get_topics('text'))

# Evaluate the results
qrels_df = pt_dataset.get_qrels()
eval = pt.Evaluate(run, qrels_df, metrics=["map", "ndcg", "ndcg_cut.10", "recip_rank", "recall_100"])
print("Evaluation Metrics:")
print(eval)

Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/ir-lab-sose2024/ir-acl-anthology-20240504-truth.zip?download=1
	This is only used for last spot checks before archival to Zenodo.


Download: 100%|██████████| 29.6k/29.6k [00:00<00:00, 1.49MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/ir-lab-sose-2024/ir-acl-anthology-20240504-training/
Evaluation Metrics:
{'map': 0.22193809243315205, 'ndcg': 0.5157937714634933, 'ndcg_cut.10': 0.28701453202938193, 'recip_rank': 0.484459943836947, 'recall_100': 0.5656737564073434}


In [8]:
# Filter run to include only judged documents
qrels_df = pt_dataset.get_qrels()
judged_docnos = qrels_df['docno'].unique()
filtered_run = run[run['docno'].isin(judged_docnos)]

# Evaluate the results
eval = pt.Evaluate(filtered_run, qrels_df, metrics=["map", "ndcg", "ndcg_cut.10", "recip_rank", "recall_100"])
print("Evaluation Metrics:")
print(eval)

Evaluation Metrics:
{'map': 0.5258490016383709, 'ndcg': 0.7128565585418872, 'ndcg_cut.10': 0.6234282848610385, 'recip_rank': 0.7543521429550843, 'recall_100': 0.8264849391152223}


In [9]:
filtered_run.head(10)

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,125137,1989.ipm_journal-ir0volumeA25A4.2,0,16.939318,retrieval system improving effectiveness
1,1,94858,2004.cikm_conference-2004.47,1,15.905389,retrieval system improving effectiveness
2,1,125817,2005.ipm_journal-ir0volumeA41A5.11,2,15.580315,retrieval system improving effectiveness
3,1,82472,1998.sigirconf_conference-98.15,3,15.10351,retrieval system improving effectiveness
4,1,74730,2008.clef_workshop-2008.10,4,14.658269,retrieval system improving effectiveness
5,1,94415,2008.cikm_conference-2008.183,5,14.268209,retrieval system improving effectiveness
6,1,84876,2016.ntcir_conference-2016.90,6,14.238205,retrieval system improving effectiveness
8,1,82490,1998.sigirconf_conference-98.33,8,14.228081,retrieval system improving effectiveness
13,1,125153,2008.ipm_journal-ir0volumeA44A3.9,13,13.963897,retrieval system improving effectiveness
14,1,101686,2018.ictir_conference-2018.20,14,13.949933,retrieval system improving effectiveness


In [10]:
# Persist the run file for subsequent evaluations
persist_and_normalize_run(run, system_name='Doc2QueryT5-BM25', default_output='../runs')

The run file is normalized outside the TIRA sandbox, I will store it at "../runs".
Done. run file is stored under "../runs/run.txt".


### Compare to BM25 Baseline

In [11]:
# Baseline without Doc2Query
# A (pre-built) PyTerrier index loaded from TIRA
index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)
# Retrieve documents using BM25
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
# Perform retrieval
run = bm25(pt_dataset.get_topics('text'))
# Evaluate the results
eval = pt.Evaluate(run, qrels_df, metrics=["map", "ndcg", "ndcg_cut.10", "recip_rank", "recall_100"])
print("Evaluation Metrics:")
print(eval)

Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/ir-lab-sose2024/2024-05-04-16-05-53.zip
	This is only used for last spot checks before archival to Zenodo.


Download:   3%|▎         | 500k/19.5M [00:00<00:03, 5.12MiB/s]

Download: 100%|██████████| 19.5M/19.5M [00:00<00:00, 43.3MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_runs/ir-lab-sose-2024/ir-acl-anthology-20240504-training/tira-ir-starter
Evaluation Metrics:
{'map': 0.2623109779858802, 'ndcg': 0.5494611680377397, 'ndcg_cut.10': 0.3740414675768205, 'recip_rank': 0.5798765367925459, 'recall_100': 0.6013331716358514}


In [12]:
# Filter run to include only judged documents
qrels_df = pt_dataset.get_qrels()
judged_docnos = qrels_df['docno'].unique()
filtered_run = run[run['docno'].isin(judged_docnos)]

# Evaluate the results
eval = pt.Evaluate(filtered_run, qrels_df, metrics=["map", "ndcg", "ndcg_cut.10", "recip_rank", "recall_100"])
print("Evaluation Metrics:")
print(eval)

Evaluation Metrics:
{'map': 0.5307283855863614, 'ndcg': 0.71563397739462, 'ndcg_cut.10': 0.6443137714663788, 'recip_rank': 0.7607843137254903, 'recall_100': 0.817271215638198}


Somehow the BM25 Baseline is almost always better.