In [1]:
!pip install python-terrier
import nltk
import pandas as pd

# Ensure you have the necessary NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

import random
from pathlib import Path
import pyterrier as pt
if not pt.started():
    pt.init()



Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jasperbruin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jasperbruin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/jasperbruin/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/jasperbruin

In [2]:
# Load the MS MARCO passage / TREC DL 2020 dataset via PyTerrier's dataset registry.
# The returned object provides the topics, qrels and corpus iterator used in the cells below.
dataset = pt.get_dataset('irds:msmarco-passage/trec-dl-2020')

In [3]:
# Queries of the test collection: one row per query, with `qid` and `query` columns.
topics = dataset.get_topics()
topics.head()

Unnamed: 0,qid,query
0,1030303,who is aziz hashim
1,1037496,who is rep scalise
2,1043135,who killed nicholas ii of russia
3,1045109,who owns barnhart crane
4,1049519,who said no one can make you feel inferior


In [4]:
# Relevance judgments: one row per (qid, docno) pair with a graded `label`.
qrels = dataset.get_qrels()
qrels.head()

Unnamed: 0,qid,docno,label,iteration
0,23849,1020327,2,0
1,23849,1034183,3,0
2,23849,1120730,0,0
3,23849,1139571,1,0
4,23849,1143724,0,0


In [5]:
# Iterable over the passages of the corpus.
corpus_iter = dataset.get_corpus_iter()

# Take an explicit iterator so a single record can be pulled for inspection
corpus_iterator = iter(corpus_iter)

first_doc = next(corpus_iterator)
# Each record is a dict with 'text' and 'docno' keys (see the printed output).
print(first_doc)

msmarco-passage/trec-dl-2020 documents:   0%|          | 0/8841823 [00:00<?, ?it/s]

{'text': 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.', 'docno': '0'}


1. **CRDR Model Proposal**: The paper introduces the CRDR model, which consists of a Query Rewrite module and a Dense Retrieval module. This model aims to overcome query ambiguity by enhancing the query embedding with relevant terms identified during the query rewriting process.

2. **Query Rewriting via Modification**: Instead of creating a new query or simply expanding the existing query with relevant terms, the CRDR model modifies the current query by replacing or inserting tokens based on the context. This process involves:
   - Encoding the query context to understand its meaning.
   - Tagging each token in the context with labels indicating their relevance or potential as insertion points for modification.
   - Modifying the current query by either replacing tokens with relevant ones or inserting relevant terms at appropriate positions.

3. **Dense Retrieval Implementation**: The model employs a dense retrieval approach where both the query and document are encoded into dense representations using a deep neural model (like BERT). The relevance between a query and document is then determined by computing the similarity between their dense embeddings.

### Implementation Steps for an Algorithm

1. **Encode-Tag-Modify Framework for Query Rewriting**:
   - **Encode**: Use a pretrained language model to encode the multi-turn query context into contextualized token representations.
   - **Tag**: Apply a token-level classification (using an MLP with Softmax) to assign each token a label indicating its relevance or role in query modification.
   - **Modify**: Based on the tagging, modify the current query by replacing or inserting relevant terms to generate a self-contained query that accurately represents the user's intent.

2. **Dense Retrieval with Contextualized Query Embedding**:
   - Encode the query and documents into dense embeddings.
   - Enhance the query embedding by integrating embeddings of relevant terms identified during the query rewriting phase. This is aimed at making the query representation more comprehensive and context-aware.
   - Use similarity scoring between the enhanced query embedding and document embeddings to retrieve the most relevant document.

3. **Optimization and Enhancement**:
   - Leverage a teacher-student framework for further refining the query encoder, where a teacher model encodes a manually curated "oracle" query and a student model learns from this to encode the contextualized query more effectively.
   - Enhance the query embedding by dynamically adjusting the influence of relevant term embeddings based on their attention scores, ensuring that all important information from the query context is captured.

### Algorithmic Considerations

- Implementing this methodology requires a deep understanding of natural language processing (NLP) and familiarity with pretrained language models like BERT.
- It involves sophisticated data preprocessing to manage multi-turn conversations, token classification to understand query context, and the application of neural networks for generating dense embeddings.
- Efficient similarity scoring mechanisms (like approximate nearest neighbor search) are crucial for the retrieval phase to ensure scalability and responsiveness of the search system.

# 1. Generate Document Embeddings
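Here's a simplified way to encode documents and generate embeddings. Note that this process is computationally intensive: encoding one passage at a time over the full corpus of 8,841,823 passages ran at roughly 20 documents per second in the run below (an estimated 125 hours), so you will likely need to batch the inputs, use a GPU, or work on a subset, depending on the size of your dataset and the compute resources available.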

In [6]:
from transformers import BertTokenizer, BertModel
import torch

# Checkpoint name shared by the tokenizer and the encoder
model_name = 'bert-base-uncased'
# Both objects are loaded once here; encode_document below reuses them on every call.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Function to encode a single document
def encode_document(doc_text):
    """Encode one document into the BERT pooled-output embedding.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        doc_text: Text to encode; input longer than 512 tokens is truncated.

    Returns:
        The model's ``pooler_output`` as a numpy array
        (presumably shape (1, hidden_size) for a single string; the
        indexing cell stacks these arrays row-wise).
    """
    inputs = tokenizer(doc_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: the result is converted to numpy right away, so skip
    # autograd bookkeeping instead of building a graph that is then discarded.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.pooler_output.detach().cpu().numpy()

# One embedding per passage, appended in corpus order; because the list grows
# incrementally, whatever was encoded so far is kept if the run is interrupted.
doc_embeddings = []

# Walk `corpus_iter` and encode the 'text' field of every record individually
for doc in corpus_iter:
    doc_embeddings.append(encode_document(doc['text']))


msmarco-passage/trec-dl-2020 documents:   0%|          | 1207/8841823 [01:03<124:46:40, 19.68it/s]

KeyboardInterrupt: 

# 2. Create and Populate a FAISS Index
Once you have all document embeddings, create a FAISS index and add these embeddings to it.

In [None]:
import numpy as np
import faiss

# Stack all per-document embeddings row-wise into a single 2-D matrix.
# FAISS works on C-contiguous float32 arrays, so enforce that explicitly
# rather than relying on whatever dtype the encoder happened to return.
doc_embeddings_np = np.ascontiguousarray(np.vstack(doc_embeddings), dtype=np.float32)

# Number of dimensions for the embeddings
d = doc_embeddings_np.shape[1]

# Flat L2 index over the embeddings; rows are addressed by insertion position
index = faiss.IndexFlatL2(d)
index.add(doc_embeddings_np)

# Save the index to disk for later use.
# faiss.write_index expects a plain string filename; passing a pathlib.Path
# is rejected by the binding, so convert explicitly.
faiss.write_index(index, str(Path("my_faiss_index.faiss")))

In [None]:
from pyterrier.transformer import TransformerBase
import torch
from transformers import BertTokenizer, BertModel
import faiss

class CRDRModel:
    """Skeleton of a CRDR-style pipeline: query rewriting followed by dense retrieval.

    The query-rewriting step (``tag_and_modify``) is still a placeholder that
    returns the query unchanged; encoding uses BERT's pooled output and
    retrieval searches a FAISS index loaded from disk.
    """

    def __init__(self, model_name='bert-base-uncased', index_path=None):
        """Load the tokenizer, the encoder and (optionally) a FAISS index.

        Args:
            model_name: Checkpoint name passed to ``from_pretrained``.
            index_path: Location of a FAISS index file (``str`` or
                ``pathlib.Path``). When falsy, no index is loaded and
                ``self.faiss_index`` is ``None``.
        """
        # Initialize tokenizer and model for BERT
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)

        # Initialize FAISS index for dense retrieval.
        # Assumes the embeddings have already been indexed in FAISS.
        # faiss.read_index needs a plain string filename, so a pathlib.Path
        # is converted here instead of failing inside the binding.
        self.faiss_index = faiss.read_index(str(index_path)) if index_path else None

        # Placeholder for other initializations (e.g., query history, embeddings index)

    def encode(self, texts):
        """Encode a list of texts into one pooled embedding per text.

        Args:
            texts: List of strings; each is truncated to 512 tokens.

        Returns:
            The model's ``pooler_output`` tensor for the batch.
        """
        inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        outputs = self.model(**inputs)
        return outputs.pooler_output  # Use pooled output for simplicity

    def tag_and_modify(self, query, query_history):
        """Rewrite ``query`` using ``query_history`` (placeholder).

        The tagging/modification logic is not implemented yet: the query is
        returned unchanged and ``query_history`` is ignored.
        """
        # In practice, use encoded context and query to determine modifications
        return query

    def retrieve_documents(self, query_embedding, k=10):
        """Search the FAISS index for the nearest neighbours of one query.

        Args:
            query_embedding: Tensor holding the query embedding(s); only the
                results for the first row are returned.
            k: Number of neighbours to retrieve (default 10).

        Returns:
            ``(ids, distances)`` for the first query row. The ids are FAISS
            row positions. The second element is whatever the index reports
            as its distance; for an L2 index such as the ``IndexFlatL2``
            built in this notebook, smaller values mean closer matches, so
            these are not similarity scores.
        """
        query_vectors = query_embedding.detach().cpu().numpy()
        distances, neighbour_ids = self.faiss_index.search(query_vectors, k)
        return neighbour_ids[0], distances[0]

    def process_query(self, query, query_history):
        """Rewrite, encode and retrieve for a single query.

        Args:
            query: The current query string.
            query_history: Earlier turns, forwarded to ``tag_and_modify``.

        Returns:
            The ``(ids, distances)`` pair produced by ``retrieve_documents``.
        """
        modified_query = self.tag_and_modify(query, query_history)
        query_embedding = self.encode([modified_query])

        doc_ids, scores = self.retrieve_documents(query_embedding)

        # Placeholder for how to return or process the results
        return doc_ids, scores


# Initialize the CRDR model with the index.
# The path is handed over as a plain string because faiss.read_index does not
# accept a pathlib.Path.
crdr_model = CRDRModel(index_path=str(Path("my_faiss_index.faiss")))

# Example queries as a PyTerrier DataFrame (expected to carry `qid` and `query` columns)
queries = pt.new.queries(["What is PyTerrier?", "Explain deep learning in IR"])

# CRDRModel is a plain class without __call__, so it cannot be applied to the
# frame directly; run every query through process_query and collect the hits.
result_rows = []
for query_row in queries.itertuples(index=False):
    hit_ids, hit_distances = crdr_model.process_query(query_row.query, query_history=[])
    for rank, (hit_id, hit_distance) in enumerate(zip(hit_ids, hit_distances)):
        result_rows.append(
            {
                "qid": query_row.qid,
                "query": query_row.query,
                # Row position inside the FAISS index; it equals the corpus docno only
                # if documents were indexed in docno order — TODO confirm
                "faiss_id": int(hit_id),
                "distance": float(hit_distance),
                "rank": rank,
            }
        )
results = pd.DataFrame(result_rows)

print(results)

In [None]:
# Instantiate the query rewriting transformer
# NOTE(review): `QueryRewriter` is not defined or imported in any cell visible here —
# confirm it is provided elsewhere, otherwise this cell raises NameError on a fresh kernel.
query_rewriter = QueryRewriter()

In [None]:
# Transform the topics with the query rewriter
# presumably returns a frame with the same rows and a rewritten `query` column — the
# comparison and evaluation cells below read `rewritten_topics['query']`; TODO confirm
rewritten_topics = query_rewriter.transform(topics)

In [None]:
# Spot-check the rewriter: print ten query pairs drawn at random (with replacement)
for _ in range(10):
    # Random row label shared by both frames
    pick = random.randrange(len(topics))
    print(f"Original: {topics['query'][pick]}")
    print(f"Rewritten: {rewritten_topics['query'][pick]}\n")

In [None]:
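# One-off Terrier index build, kept commented out so that re-running the notebook
# does not re-index the corpus. Uncomment and run once: the next cell opens the
# index from the same ./index directory and fails if it does not exist yet.
# indexer = pt.IterDictIndexer(str(Path("index").absolute()))
# index_ref = indexer.index(corpus_iter)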

In [None]:
# Open the on-disk Terrier index stored in ./index (resolved to an absolute path)
index_location = str(Path("index").absolute())
index = pt.IndexFactory.of(index_location)

# Two lexical baselines over the same index, differing only in the weighting model
tf_idf, bm25 = (
    pt.BatchRetrieve(index, wmodel=weighting_model)
    for weighting_model in ("TF_IDF", "BM25")
)

In [None]:
from pyterrier.measures import RR, nDCG, MAP

# results_dir = Path("results")
# results_dir.mkdir(exist_ok=True)

# Baseline evaluation of the lexical retrievers on the original queries.
# filter_by_qrels=True drops topics that have no judgments before retrieval, so no
# time is spent running queries that cannot contribute to the metrics.
# NOTE(review): the qrels are graded (labels 0-3 in the head shown earlier); TREC DL
# passage evaluation conventionally counts only label >= 2 as relevant for RR / MAP
# (e.g. `RR(rel=2) @ 10`, `AP(rel=2)`) — confirm the default threshold is intended.
pt.Experiment(
    [tf_idf, bm25],
    dataset.get_topics(),
    dataset.get_qrels(),
    names=["TF-IDF", "BM25"],
    eval_metrics=[RR @ 10, nDCG @ 20, MAP, nDCG @ 10],
    filter_by_qrels=True,
)

In [None]:
# Same retrievers and metrics, evaluated on the rewritten queries.
# filter_by_qrels=True drops topics that have no judgments before retrieval, so no
# time is spent running queries that cannot contribute to the metrics.
pt.Experiment(
    [tf_idf, bm25],
    rewritten_topics,
    dataset.get_qrels(),
    names=["TF-IDF", "BM25"],
    eval_metrics=[RR @ 10, nDCG @ 20, MAP, nDCG @ 10],
    filter_by_qrels=True,
)