# Doc2Query

In [1]:
#from Doc2Query.Doc2Query import Doc2Query
from Doc2Query import Doc2Query
import pyterrier as pt
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
# Make sure PyTerrier is started (via the TIRA helper) before any `pt.*` call below,
# then create a REST client to the TIRA platform for retrieving the pre-indexed data.
ensure_pyterrier_is_loaded()
# NOTE(review): `tira` does not appear to be used by the following cells — confirm it is still needed.
tira = Client()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [3]:
# Resolve the ir_datasets-backed ("irds:") ACL Anthology training dataset through PyTerrier.
pt_dataset = pt.get_dataset(
    'irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training'
)
# Show which dataset wrapper class PyTerrier handed back.
type(pt_dataset)

pyterrier.datasets.IRDSDataset

### Expand the Documents

In [4]:
# Create an instance of Doc2Query.
# The instance is bound to the same name as the imported class (downstream cells
# call methods on `Doc2Query`). A plain `Doc2Query = Doc2Query(...)` is therefore
# not re-runnable: on a second execution the name already refers to the instance,
# so the call would target the instance instead of the class. Recover the class
# first so that this cell yields a fresh instance every time it is executed.
_Doc2QueryClass = Doc2Query if isinstance(Doc2Query, type) else type(Doc2Query)
Doc2Query = _Doc2QueryClass("google/flan-t5-small", temperatur=None, promting_technique=None)

In [5]:
# The text documents from the PyTerrier dataset.
# The result is displayed further down as a DataFrame with `text` and `docno` columns.
# NOTE(review): `Doc2Query` here is the instance created in the previous cell, not the imported class.
documents = Doc2Query.getDocumentsDfFromPtDataset(pt_dataset)

ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:02<00:00, 56293.20it/s]


In [6]:
# Generate queries for the dataset and extend the documents by the queries.
# NOTE(review): presumably this runs the model over every document and is expensive —
# consider caching the result to disk; confirm against the Doc2Query implementation.
expanded_documents = Doc2Query.expandDocumentsByQueries(documents)



In [None]:
# Inspect the full (untruncated) expanded text of the first few documents only;
# converting the whole frame to a dict would dump every document into the cell output.
expanded_documents[['text']].head().to_dict()

In [None]:
# Create the index using PyTerrier
indexer = pt.IterDictIndexer(
    "./indexes/index_Doc2Query-flan-t5-small",
    overwrite=True,
    fields=["text"],
    meta=["docno"]
)
# Index the documents.
# IterDictIndexer consumes an iterable of per-document dicts. Iterating a
# DataFrame directly yields its column labels, so hand over row records instead.
indexref = indexer.index(expanded_documents.to_dict(orient="records"))

# Retrieve documents using BM25
bm25 = pt.BatchRetrieve(indexref, wmodel="BM25")

# Perform retrieval for the 'text' variant of the dataset topics
run = bm25(pt_dataset.get_topics('text'))

# Evaluate the results (named `eval_metrics` so the builtin `eval` is not shadowed)
qrels_df = pt_dataset.get_qrels()
eval_metrics = pt.Evaluate(run, qrels_df, metrics=["map", "ndcg", "ndcg_cut.10", "recip_rank", "recall_100"])
print("Evaluation Metrics:")
print(eval_metrics)

In [None]:
# Filter run to include only judged documents
qrels_df = pt_dataset.get_qrels()
judged_docnos = qrels_df['docno'].unique()
filtered_run = run[run['docno'].isin(judged_docnos)]

# Evaluate the results
eval = pt.Evaluate(filtered_run, qrels_df, metrics=["map", "ndcg", "ndcg_cut.10", "recip_rank", "recall_100"])
print("Evaluation Metrics:")
print(eval)

In [None]:
# Persist the run file for subsequent evaluations
#persist_and_normalize_run(run, system_name='Doc2Query-flan-t5-small-BM25-judged_only', default_output='../runs')

### Get few-shot examples

In [8]:
# Display the documents frame (columns `text` and `docno`) as a reference for the cells below.
documents

Unnamed: 0,text,docno
0,A Study on Word Similarity using Context Vecto...,O02-2002
1,Bootstrapping Large Sense Tagged Corpora,L02-1310
2,"Headerless, Quoteless, but not Hopeless? Using...",R13-1042
3,Aligning Words in {E}nglish-{H}indi Parallel C...,W05-0819
4,Proposal of a very-large-corpus acquisition me...,L02-1309
...,...,...
126953,Self-Spacial Join Selectivity Estimation Using...,1998.tois_journal-ir0volumeA16A2.2
126954,Hyperdocuments as Automata: Verification of Tr...,1998.tois_journal-ir0volumeA16A1.0
126955,Evaluation of an Algorithm for Finding a Match...,1998.tois_journal-ir0volumeA16A1.1
126956,Corpus-Based Stemming Using Cooccurrence of Wo...,1998.tois_journal-ir0volumeA16A1.2


In [33]:
# Fetch the 'text' variant of the topics; the frame has one row per query (`qid`, `query`).
queries = pt_dataset.get_topics(variant='text')
queries

Unnamed: 0,qid,query
0,1,retrieval system improving effectiveness
1,2,machine learning language identification
2,3,social media detect self harm
3,4,stemming for arabic languages
4,5,audio based animal recognition
...,...,...
63,65,information in different language
64,66,abbreviations in queries
65,67,lemmatization algorithms
66,68,filter ad rich documents


In [34]:
# Load the relevance judgments and keep only the rows whose label is exactly 1.
qrels = pt_dataset.get_qrels().loc[lambda judgments: judgments['label'] == 1]

In [57]:
# Tally how many rows of `qrels` refer to each document
docno_counts = qrels['docno'].value_counts()

# Highest tally reached by any document
max_count = docno_counts.max()

# Keep just the documents that reach this highest tally
is_top = docno_counts.eq(max_count)
max_count_docnos = docno_counts.loc[is_top]

# Show the selected documents together with their tally
print(max_count_docnos)

docno
2011.sigirconf_conference-2011.58             3
2000.clef_workshop-2000.0                     3
1985.tois_journal-ir0volumeA3A2.1             3
2019.wwwjournals_journal-ir0volumeA22A3.19    3
2015.fire_workshop-2015w.20                   3
1985.tois_journal-ir0volumeA3A2.5             3
2016.wwwconf_conference-2016.11               3
2004.ipm_journal-ir0volumeA40A6.4             3
2011.tois_journal-ir0volumeA29A2.0            3
2016.sigirconf_conference-2016.230            3
Name: count, dtype: int64


In [61]:
# Pull the qrels rows of one of the documents with the highest tally of judgments.
filtered_qrels = qrels.loc[qrels['docno'].eq('2016.wwwconf_conference-2016.11')]
filtered_qrels

Unnamed: 0,qid,docno,label,iteration
772,21,2016.wwwconf_conference-2016.11,1,0
842,23,2016.wwwconf_conference-2016.11,1,0
1375,36,2016.wwwconf_conference-2016.11,1,0


In [62]:
# Attach the query text to each selected judgment (left join on the query id,
# so every row of `filtered_qrels` is kept)
merged_df = filtered_qrels.merge(right=queries, how='left', on='qid')

# Attach the document text via the document number (again a left join)
final_df = merged_df.merge(right=documents, how='left', on='docno')
final_df

Unnamed: 0,qid,docno,label,iteration,query,text
0,21,2016.wwwconf_conference-2016.11,1,0,recommenders influence on users,When do Recommender Systems Work the Best?: Th...
1,23,2016.wwwconf_conference-2016.11,1,0,consumer product reviews,When do Recommender Systems Work the Best?: Th...
2,36,2016.wwwconf_conference-2016.11,1,0,recommendation systems,When do Recommender Systems Work the Best?: Th...


In [63]:
# Show the complete text of the first merged row (row label 0) in a single lookup.
final_df.loc[0, 'text']

"When do Recommender Systems Work the Best?: The Moderating Effects of Product Attributes and Consumer Reviews on Recommender Performance\n\n\n ABSTRACTWe investigate the moderating effect of product attributes and consumer reviews on the efficacy of a collaborative filtering recommender system on an e-commerce site. We run a randomized field experiment on a top North American retailer's website with 184,375 users split into a recommendertreated group and a control group with 37,215 unique products in the dataset. By augmenting the dataset with Amazon Mechanical Turk tagged product attributes and consumer review data from the website, we study their moderating influence on recommenders in generating conversion.We first confirm that the use of recommenders increases the baseline conversion rate by 5.9%. We find that the recommenders act as substitutes for high average review ratings with the effect of using recommenders increasing the conversion rate as much as about 1.4 additional aver