# Doc2Query using a T5 model specifically trained for this task

In [2]:
from tira.rest_api_client import Client
import pyterrier as pt
import gzip
import json
from tqdm import tqdm
from tira.third_party_integrations import ensure_pyterrier_is_loaded

tira = Client()
ensure_pyterrier_is_loaded()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [3]:
dataset = 'ir-lab-sose-2024/ir-acl-anthology-20240504-training'
pt_dataset = pt.get_dataset(f'irds:{dataset}')

def doc_t5_query(dataset):
    docs = tira.get_run_output('ir-benchmarks/seanmacavaney/DocT5Query', dataset) + '/documents.jsonl.gz'
    with gzip.open(docs, 'rt') as f:
        for l in tqdm(f):
            l = json.loads(l)
            l['text'] = l['querygen']
            l['docno'] = l['doc_id']
            del l['doc_id']
            del l['querygen']
            yield l

# Expand the documents
expanded_documents = doc_t5_query(dataset)


In [4]:
for document in iter(pt_dataset.get_corpus_iter()):
  print(document)
  # we only show the first one
  break

Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/ir-lab-sose2024/ir-acl-anthology-20240504-inputs.zip?download=1
	This is only used for last spot checks before archival to Zenodo.


Download: 100%|██████████| 39.4M/39.4M [00:03<00:00, 11.6MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/ir-lab-sose-2024/ir-acl-anthology-20240504-training/


ir-lab-sose-2024/ir-acl-anthology-20240504-training documents:   0%|          | 0/126958 [00:00<?, ?it/s]

{'text': 'A Study on Word Similarity using Context Vector Models\n\n\n There is a need to measure word similarity when processing natural languages, especially when using generalization, classification, or example -based approaches. Usually, measures of similarity between two words are defined according to the distance between their semantic classes in a semantic taxonomy . The taxonomy approaches are more or less semantic -based that do not consider syntactic similarit ies. However, in real applications, both semantic and syntactic similarities are required and weighted differently. Word similarity based on context vectors is a mixture of syntactic and semantic similarit ies. In this paper, we propose using only syntactic related co-occurrences as context vectors and adopt information theoretic models to solve the problems of data sparseness and characteristic precision. The probabilistic distribution of co-occurrence context features is derived by parsing the contextual environment o




In [5]:
for document_expansion in expanded_documents:
  print(document_expansion)
  # we only show the first one
  break

Download: 4.81MiB [00:00, 24.6MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_runs/ir-benchmarks/ir-acl-anthology-20240504-training/seanmacavaney


0it [00:00, ?it/s]

{'text': 'what is weighted aspect based collaborative filtering\nwhy weighted aspect filter\nwhat is collaborative filtering weighted aspects', 'docno': '2014.sigirconf_conference-2014.147'}


In [6]:
queries_df = pt_dataset.get_topics()
qrels_df = pt_dataset.get_qrels()

Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/ir-lab-sose2024/ir-acl-anthology-20240504-truth.zip?download=1
	This is only used for last spot checks before archival to Zenodo.


Download: 100%|██████████| 29.6k/29.6k [00:00<00:00, 1.47MiB/s]

Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/ir-lab-sose-2024/ir-acl-anthology-20240504-training/
There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.





In [7]:
# Create the Index
# Index the documents using PyTerrier
indexer = pt.IterDictIndexer("./index_Doc2QueryT5")
indexref = indexer.index(expanded_documents, fields=["text"], meta=["docno"])

# Retrieve documents using BM25
bm25 = pt.BatchRetrieve(indexref, wmodel="BM25")

# Perform retrieval
#run = bm25.transform(queries_df)
run = bm25(pt_dataset.get_topics('text'))

# Evaluate the results
eval = pt.Evaluate(run, qrels_df, metrics=["map", "ndcg"])
print("Evaluation Metrics:")
print(eval)

  indexref = indexer.index(expanded_documents, fields=["text"], meta=["docno"])
20101it [00:12, 4180.10it/s]



126958it [00:27, 4553.59it/s] 


17:42:29.467 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 8 empty documents
Evaluation Metrics:
{'map': 0.07750440760328463, 'ndcg': 0.283299990515167}


In [8]:
# Baseline without Doc2Query
# A (pre-built) PyTerrier index loaded from TIRA
index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)
# Retrieve documents using BM25
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
# Perform retrieval
run = bm25(pt_dataset.get_topics('text'))
# Evaluate the results
eval = pt.Evaluate(run, qrels_df, metrics=["map", "ndcg"])
print("Evaluation Metrics:")
print(eval)

Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/ir-lab-sose2024/2024-05-04-16-05-53.zip
	This is only used for last spot checks before archival to Zenodo.


Download: 100%|██████████| 19.5M/19.5M [00:00<00:00, 32.0MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_runs/ir-lab-sose-2024/ir-acl-anthology-20240504-training/tira-ir-starter
Evaluation Metrics:
{'map': 0.2623109779858802, 'ndcg': 0.5494611680377397}
