# Doc2Query using a T5 model specifically trained for this task

In [43]:
from tira.rest_api_client import Client
import pyterrier as pt
import gzip
import json
from tqdm import tqdm
from tira.third_party_integrations import ensure_pyterrier_is_loaded

# TIRA client; used below to fetch run outputs and pre-built indexes.
tira = Client()
# Runs before any `pt.*` call in this notebook.
# NOTE(review): presumably starts/initialises PyTerrier if needed — confirm in the TIRA docs.
ensure_pyterrier_is_loaded()

In [44]:
# Identifier of the corpus used throughout this notebook.
dataset = 'ir-lab-sose-2024/ir-acl-anthology-20240504-training'
# Resolve the dataset through PyTerrier's `irds:` prefix; `pt_dataset` supplies
# the corpus iterator, topics and qrels used in the cells below.
pt_dataset = pt.get_dataset(f'irds:{dataset}')

def doc_t5_query(dataset):
    """Yield the DocT5Query expansion records for ``dataset``.

    Reads ``documents.jsonl.gz`` from the TIRA run output of
    ``ir-benchmarks/seanmacavaney/DocT5Query`` and yields one dict per line,
    with the ``querygen`` key renamed to ``text`` and ``doc_id`` renamed to
    ``docno``.
    """
    run_dir = tira.get_run_output('ir-benchmarks/seanmacavaney/DocT5Query', dataset)
    with gzip.open(run_dir + '/documents.jsonl.gz', 'rt') as lines:
        for raw_line in tqdm(lines):
            record = json.loads(raw_line)
            # pop() reads the value and removes the old key in one step.
            record['text'] = record.pop('querygen')
            record['docno'] = record.pop('doc_id')
            yield record

# Build a docno -> generated-queries lookup so that each document's expansion
# can be fetched directly while walking the corpus.
expansions_dict = {
    record['docno']: record['text']
    for record in doc_t5_query(dataset)
}

126958it [00:00, 247462.38it/s]


{'2014.sigirconf_conference-2014.147': 'what is weighted aspect based collaborative filtering\nwhy weighted aspect filter\nwhat is collaborative filtering weighted aspects',
 'D19-1254': 'which domain of comprehension can unsupervised adaptive domain adaptation be based on?\ndefinition of adaptive domain for machine reading\nwhat is the domain adaptation for machine reading?',
 '2011.eamt-1.33': 'what is smt\nwhat do statistical machines do\nwhat is smt',
 '2012.sigirconf_conference-2012.34': 'why is search so important\nwhy is search interruptable\ndefinition of search, interrupted',
 'S07-1097': 'what is the fuzzy borda voting system\nwhen do you use fuzzy borda voting?\nwhen did wsd become available',
 '2020.wwwjournals_journal-ir0volumeA23A5.14': 'who are representative users of social networks?\nwhich feature of the social network represents a subset of the users?\nwhy are social networks representative of users',
 'W04-0216': 'animacy encoding\ndefinition of animacy in english\nw

In [45]:
# Peek at the first corpus document and the queries generated for it.
for document in pt_dataset.get_corpus_iter():
    # The record as delivered by the dataset
    print(document)
    # Its expansion, looked up by docno
    print(expansions_dict[document['docno']])
    # One example is enough
    break

ir-lab-sose-2024/ir-acl-anthology-20240504-training documents:   0%|          | 0/126958 [00:00<?, ?it/s]

{'text': 'A Study on Word Similarity using Context Vector Models\n\n\n There is a need to measure word similarity when processing natural languages, especially when using generalization, classification, or example -based approaches. Usually, measures of similarity between two words are defined according to the distance between their semantic classes in a semantic taxonomy . The taxonomy approaches are more or less semantic -based that do not consider syntactic similarit ies. However, in real applications, both semantic and syntactic similarities are required and weighted differently. Word similarity based on context vectors is a mixture of syntactic and semantic similarit ies. In this paper, we propose using only syntactic related co-occurrences as context vectors and adopt information theoretic models to solve the problems of data sparseness and characteristic precision. The probabilistic distribution of co-occurrence context features is derived by parsing the contextual environment o




In [46]:
# Sanity check: how many expansions were loaded
print(f"Expansions dictionary created with {len(expansions_dict)} items.")

# Combined documents (original text plus generated queries), in corpus order
expanded_documents = []

for document in pt_dataset.get_corpus_iter():
    docno = document['docno']
    original_text = document['text']
    # Single lookup; a missing (or null) expansion falls through to the
    # "no expansion" branch instead of failing on the concatenation.
    expansion_text = expansions_dict.get(docno)

    if expansion_text is None:
        # No expansion available: keep the original text unchanged
        combined_text = original_text
        print(f"No expansion found for document {docno}.")
    else:
        # Join with a newline so the last word of the document and the first
        # word of the expansion remain separate tokens when indexed.
        combined_text = original_text + '\n' + expansion_text

    expanded_documents.append({'docno': docno, 'text': combined_text})

# Check the number of documents combined
print(f"Total combined documents: {len(expanded_documents)}")

Expansions dictionary created with 126958 items.


ir-lab-sose-2024/ir-acl-anthology-20240504-training documents:  37%|███▋      | 47081/126958 [00:01<00:02, 38408.55it/s]

ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:02<00:00, 51228.18it/s]

Total combined documents: 126958





In [48]:
# Spot-check the merge: print the first combined document followed by the
# expansion that should have been appended to it.
for document in expanded_documents[:1]:
    print(document)
    print(expansions_dict[document['docno']])

{'docno': 'O02-2002', 'text': 'A Study on Word Similarity using Context Vector Models\n\n\n There is a need to measure word similarity when processing natural languages, especially when using generalization, classification, or example -based approaches. Usually, measures of similarity between two words are defined according to the distance between their semantic classes in a semantic taxonomy . The taxonomy approaches are more or less semantic -based that do not consider syntactic similarit ies. However, in real applications, both semantic and syntactic similarities are required and weighted differently. Word similarity based on context vectors is a mixture of syntactic and semantic similarit ies. In this paper, we propose using only syntactic related co-occurrences as context vectors and adopt information theoretic models to solve the problems of data sparseness and characteristic precision. The probabilistic distribution of co-occurrence context features is derived by parsing the con

In [55]:
# Find the longest document identifier and its length.
# max() keeps the first of several equally long docnos and falls back to ''
# (length 0) when there are no documents.
longest_docno = max(
    (entry['docno'] for entry in expanded_documents),
    key=len,
    default='',
)
max_length = len(longest_docno)

# Report the result
print(f"The longest docno is: '{longest_docno}' with a length of {max_length}")

The longest docno is: '2007.wwwconf_conference-GeorgakopoulosBNC07.0' with a length of 45


In [60]:
# Build a Terrier index over the expanded documents.
# overwrite=True lets this cell be re-run: with the default (False) PyTerrier
# raises when an index already exists at the target path.
indexer = pt.IterDictIndexer(
    "./indexes/index_Doc2QueryT5",
    fields=["text"],
    meta=["docno"],
    meta_lengths=[max_length],  # sized to the longest docno found above
    overwrite=True,
)
indexref = indexer.index(expanded_documents)

# Retrieve with BM25 using the 'text' variant of the topics
bm25 = pt.BatchRetrieve(indexref, wmodel="BM25")
run = bm25(pt_dataset.get_topics('text'))

# Score the run against the relevance judgements
qrels_df = pt_dataset.get_qrels()
# NOTE(review): `eval` shadows the builtin; the name is kept in case other
# cells read it.
eval = pt.Evaluate(run, qrels_df, metrics=["map", "ndcg"])
print("Evaluation Metrics:")
print(eval)

Evaluation Metrics:
{'map': 0.22193809243315205, 'ndcg': 0.5157937714634933}


In [52]:
# Baseline without Doc2Query
# A (pre-built) PyTerrier index loaded from TIRA
index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)

# Retrieve with BM25 using the 'text' variant of the topics
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
run = bm25(pt_dataset.get_topics('text'))

# Load the qrels here so this cell does not depend on `qrels_df` having been
# left behind by an earlier execution of another cell.
qrels_df = pt_dataset.get_qrels()
eval = pt.Evaluate(run, qrels_df, metrics=["map", "ndcg"])
print("Evaluation Metrics:")
print(eval)

Evaluation Metrics:
{'map': 0.2623109779858802, 'ndcg': 0.5494611680377397}
