In [1]:
import json
from datasets import load_dataset
from huggingface_hub import hf_hub_download

# Initialize lists
queries = []
passages = []
query_ids = []
passage_ids = []

In [2]:
# Load the dataset
dataset = load_dataset('McAuley-Lab/Amazon-C4', split='test')

print(f"Total entries in dataset: {len(dataset)}")
print(dataset[0])  # View the first entry

Total entries in dataset: 21223
{'qid': 0, 'query': "I need filters that effectively trap dust and improve the air quality in my home. It's surprising how much dust they can collect in just a few months.", 'item_id': 'B0C5QYYHTJ', 'user_id': 'AGREO2G3GTRNYOJK4CIQV2DTZLSQ', 'ori_rating': 5, 'ori_review': 'These filters work I could not believe the amount of dust in the old filter when it was replaced after 3 months.  These really trap the dust and make my home a much healthier place.'}


In [3]:
# Download the item metadata file
filepath = hf_hub_download(
    repo_id='McAuley-Lab/Amazon-C4',
    filename='sampled_item_metadata_1M.jsonl',
    repo_type='dataset'
)

# Build a dictionary mapping item_id to metadata
item_dict = {}

with open(filepath, 'r', encoding='utf-8') as file:
    for line in file:
        item = json.loads(line.strip())
        item_id = item.get('item_id')
        metadata = item.get('metadata')
        if item_id and metadata:
            item_dict[item_id] = metadata

print(f"Total items in item_dict: {len(item_dict)}")

Total items in item_dict: 1058417


In [4]:
for data in dataset:
    query_text = data.get('query', '').strip()
    item_id = data.get('item_id', '').strip()
    qid = data.get('qid', None)
    
    # Get the passage text from item_dict using item_id
    passage_text = item_dict.get(item_id, None)
    
    # Check if both query and passage_text are available
    if query_text and passage_text:
        # Append prefixed texts to respective lists
        queries.append('query: ' + query_text)
        passages.append('passage: ' + passage_text)
        query_ids.append(qid)
        passage_ids.append(item_id)
    else:
        # Handle missing data (optional)
        continue

In [5]:
queries[0]

"query: I need filters that effectively trap dust and improve the air quality in my home. It's surprising how much dust they can collect in just a few months."

In [6]:
passages[0]

'passage: Flintar Core 300 True HEPA Replacement Filters, Compatible with LEVOIT Core 300, Core 300S VortexAir Air Purifier, 3-in-1 H13 Grade True HEPA Filter Replacement, Core 300-RF, 2-Pack. Flintar Premium high-efficiency H13 Grade True HEPA Replacement Filter is made in Taiwan and is fully compatible with LEVOIT Core 300 and Core 300S VortexAir Air Purifier. This True HEPA Filtration System includes:   - Fine Pre-Filter: Traps larger particles in the air like dust, hairs, pet fur, lint, and more - H13 Grade True HEPA Filter: Captures 99.97% of harmful airborne particles down to 0.3 microns in size   - High-Efficiency Activated Carbon Filter: Absorbs household odors from pets, cooking, smoke, wildfire, and harmful VOC’s Using Flintar premium high-efficiency air purifier filters and replacing the filters regularly will help optimize air cleaning performance. Replace your HEPA Filter every 6 months for optimal performance. Fully compatible with LEVOIT Cor 300 and Core 300S VortexAir A

# Baseline: TF-IDF

In [7]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Preprocess the data
clean_queries = [q.replace('query: ', '') for q in queries]
clean_passages = [p.replace('passage: ', '') for p in passages]

# Step 2: Vectorize the passages
vectorizer = TfidfVectorizer()
vectorizer.fit(clean_passages)
passage_tfidf = vectorizer.transform(clean_passages)

# Step 3: Vectorize the queries
query_tfidf = vectorizer.transform(clean_queries)

# Step 4: Compute cosine similarity
# Since TF-IDF vectors are L2-normalized by default, the dot product yields cosine similarity
cosine_sim_matrix = query_tfidf.dot(passage_tfidf.T)

# Step 5 and 6: Retrieve top 200 passages and calculate accuracy
accuracy_count = 0
total_queries = len(queries)

for i in range(total_queries):
    # Get similarity scores for query i
    sim_scores = cosine_sim_matrix[i].toarray().flatten()
    # Retrieve indices of top 200 passages
    top_200_indices = np.argsort(-sim_scores)[:200]
    # Check if the correct passage is among the top 200
    if i in top_200_indices:
        accuracy_count += 1

# Step 7: Calculate accuracy
accuracy = accuracy_count / total_queries
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 42.34%


# End of Baseline

In [21]:
# Get unique passages
unique_passages = list(set(passages))
print(f"Total unique passages: {len(unique_passages)}")

# Get unique queries
unique_queries = list(set(queries))
print(f"Total unique queries: {len(unique_queries)}")

Total unique passages: 20367
Total unique queries: 21222


In [25]:
# from collections import defaultdict

# # Create a dictionary to map each unique query to its associated unique passage(s)
# query_to_passages = defaultdict(set)
# for q, p in zip(queries, passages):
#     query_to_passages[q].add(p)

# # Create a list mapping each unique query to its relevant unique passage index
# query_to_unique_p_idx = []
# for q in unique_queries:
#     passages_set = query_to_passages.get(q, set())
#     if len(passages_set) == 1:
#         # If there's only one passage for the query
#         p = next(iter(passages_set))
#         p_idx = unique_passages.index(p)  # Get the index in unique_passages
#         query_to_unique_p_idx.append(p_idx)
#     elif len(passages_set) > 1:
#         # If multiple passages are associated, decide how to handle
#         # For simplicity, choose the first one
#         p = next(iter(passages_set))
#         p_idx = unique_passages.index(p)
#         query_to_unique_p_idx.append(p_idx)
#     else:
#         # If no passage is found for the query
#         query_to_unique_p_idx.append(-1)

# # Check for queries without a corresponding passage
# num_missing = query_to_unique_p_idx.count(-1)
# print(f"Number of queries with no passage: {num_missing}")

# e5-large model pipeline

In [13]:
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import faiss
import numpy as np

def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

# Set the device (adjust 'cuda:1' based on your GPU setup)
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
model = AutoModel.from_pretrained('intfloat/multilingual-e5-large').to(device)

# Assuming 'queries' and 'passages' lists are already defined and populated
# Example:
# queries = ['query: How much protein should a female eat', 'query: 南瓜的家常做法', ...]  # 21,223 entries
# passages = ['passage: As a general guideline...', 'passage: 1.清炒南瓜丝...', ...]  # 21,223 entries

print(f"Number of Queries: {len(queries)}")
print(f"Number of Passages: {len(passages)}")

# Step 1: Encode and Index Passages
batch_size = 64
passage_embeddings = []

for i in tqdm(range(0, len(passages), batch_size), desc="Encoding Passages"):
    batch_passages = passages[i:i+batch_size]
    batch_dict = tokenizer(
        batch_passages,
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )
    batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
    
    with torch.no_grad():
        outputs = model(**batch_dict)
    
    embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    embeddings = F.normalize(embeddings, p=2, dim=1)
    embeddings = embeddings.cpu().numpy()
    passage_embeddings.append(embeddings)

passage_embeddings = np.vstack(passage_embeddings)
print(f"Passage Embeddings Shape: {passage_embeddings.shape}")

embedding_dim = passage_embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(passage_embeddings)
print(f"Number of Passages Indexed: {index.ntotal}")

# Step 2: Encode Queries
query_batch_size = 64
query_embeddings = []

for i in tqdm(range(0, len(queries), query_batch_size), desc="Encoding Queries"):
    batch_queries = queries[i:i+query_batch_size]
    batch_dict = tokenizer(
        batch_queries,
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )
    batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
    
    with torch.no_grad():
        outputs = model(**batch_dict)
    
    embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    embeddings = F.normalize(embeddings, p=2, dim=1)
    embeddings = embeddings.cpu().numpy()
    query_embeddings.append(embeddings)

query_embeddings = np.vstack(query_embeddings)
print(f"Query Embeddings Shape: {query_embeddings.shape}")

Using device: cuda:3
Number of Queries: 21223
Number of Passages: 21223


Encoding Passages: 100%|██████████████████████████████████████████████████████████████| 332/332 [08:20<00:00,  1.51s/it]


Passage Embeddings Shape: (21223, 1024)
Number of Passages Indexed: 21223


Encoding Queries: 100%|███████████████████████████████████████████████████████████████| 332/332 [02:52<00:00,  1.93it/s]


Query Embeddings Shape: (21223, 1024)
Distances Shape: (21223, 5)
Indices Shape: (21223, 5)

Query 1: I need filters that effectively trap dust and improve the air quality in my home. It's surprising how much dust they can collect in just a few months.
Top Passages:
  Rank 1:
    Similarity Score: 0.8458
    Passage: 30x60 (cut-to-fit) Filtrete Hammock Filter by 3M. This 3M Filtrete Filter is a Cut-to-Fit. It is a great choice for people who want to improve their Indoor Air Quality and help reduce allergens and microscopic particles from the air in their homes including nuisance household dust.

  Rank 2:
    Similarity Score: 0.8435
    Passage: Dreo Air Purifiers Macro Pro, True HEPA Filter, Up to 1358ft² Coverage, 20dB Low Noise, PM2.5 Sensor, 6 Modes, 360 Filtration Cleaner Remove 99.985% Dust Smoke Pollen, Black. .

  Rank 3:
    Similarity Score: 0.8429
    Passage: Dayette HEPA Air Purifiers for Home Large Room, CADR 300+m³/h Up to 1290ft² with Air Quality Sensor, H13 True HEPA

In [27]:
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import faiss
import numpy as np
from collections import defaultdict

def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

# Set the device (adjust 'cuda:3' based on your GPU setup)
device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
model = AutoModel.from_pretrained('intfloat/multilingual-e5-large').to(device)

# Assuming 'queries' and 'passages' lists are already defined and populated
# Example:
# queries = ['query: How much protein should a female eat', 'query: 南瓜的家常做法', ...]  # 21,223 entries
# passages = ['passage: As a general guideline...', 'passage: 1.清炒南瓜丝...', ...]  # 21,223 entries

print(f"Number of Queries: {len(queries)}")
print(f"Number of Passages: {len(passages)}")

# Step 1: Get unique passages and queries
unique_passages = list(set(passages))
print(f"Total unique passages: {len(unique_passages)}")  # 20,367

unique_queries = list(set(queries))
print(f"Total unique queries: {len(unique_queries)}")    # 21,222

# Step 2: Create mapping from passage text to unique index
passage_text_to_idx = {text: idx for idx, text in enumerate(unique_passages)}

# Step 3: Create mapping from unique query to unique passage index
query_to_passages = defaultdict(set)
for q, p in zip(queries, passages):
    query_to_passages[q].add(p)

query_to_unique_p_idx = []
for q in unique_queries:
    passages_set = query_to_passages.get(q, set())
    if len(passages_set) == 1:
        p = next(iter(passages_set))
        p_idx = passage_text_to_idx.get(p, -1)
        query_to_unique_p_idx.append(p_idx)
    elif len(passages_set) > 1:
        # Choose the first passage if multiple are associated
        p = next(iter(passages_set))
        p_idx = passage_text_to_idx.get(p, -1)
        query_to_unique_p_idx.append(p_idx)
    else:
        # No passage found
        query_to_unique_p_idx.append(-1)

# Check for queries without a corresponding passage
num_missing = query_to_unique_p_idx.count(-1)
print(f"Number of queries with no passage: {num_missing}")

# Step 4: Encode unique_passages and index with FAISS
batch_size = 64
passage_embeddings = []

for i in tqdm(range(0, len(unique_passages), batch_size), desc="Encoding Passages"):
    batch_passages = unique_passages[i:i+batch_size]
    batch_dict = tokenizer(
        batch_passages,
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )
    batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
    with torch.no_grad():
        outputs = model(**batch_dict)
    embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    embeddings = F.normalize(embeddings, p=2, dim=1)
    embeddings = embeddings.cpu().numpy()
    passage_embeddings.append(embeddings)

passage_embeddings = np.vstack(passage_embeddings)
print(f"Passage Embeddings Shape: {passage_embeddings.shape}")  # (20367, embedding_dim)

embedding_dim = passage_embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(passage_embeddings)
print(f"Number of Passages Indexed: {index.ntotal}")  # 20,367

# Step 5: Encode unique_queries
query_batch_size = 64
query_embeddings = []

for i in tqdm(range(0, len(unique_queries), query_batch_size), desc="Encoding Queries"):
    batch_queries = unique_queries[i:i+query_batch_size]
    batch_dict = tokenizer(
        batch_queries,
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )
    batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
    with torch.no_grad():
        outputs = model(**batch_dict)
    embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    embeddings = F.normalize(embeddings, p=2, dim=1)
    embeddings = embeddings.cpu().numpy()
    query_embeddings.append(embeddings)

query_embeddings = np.vstack(query_embeddings)
print(f"Query Embeddings Shape: {query_embeddings.shape}")  # (21222, embedding_dim)

# Step 6: Perform Retrieval
top_k = 5
distances, indices = index.search(query_embeddings, top_k)
print(f"Distances Shape: {distances.shape}")  # (21222, 5)
print(f"Indices Shape: {indices.shape}")      # (21222, 5)

# Step 7: Evaluate Retrieval Performance
# query_to_unique_p_idx: list of length 21222, each entry is the index of the relevant passage
# in unique_passages (0 to 20366)

hits = 0
for i in range(len(unique_queries)):
    relevant_idx = query_to_unique_p_idx[i]
    if relevant_idx == -1:
        continue  # Skip if no relevant passage
    if relevant_idx in indices[i]:
        hits += 1

recall_at_k = hits / len(unique_queries)
print(f"Recall@{top_k}: {recall_at_k * 100:.2f}%")

# Calculate Mean Reciprocal Rank (MRR)
mrr_total = 0.0
for i in range(len(unique_queries)):
    relevant_idx = query_to_unique_p_idx[i]
    if relevant_idx == -1:
        continue  # Skip if no relevant passage
    try:
        rank = np.where(indices[i] == relevant_idx)[0][0] + 1  # 1-based rank
        mrr_total += 1.0 / rank
    except IndexError:
        pass  # Relevant passage not in top_k

mrr = mrr_total / len(unique_queries)
print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")

# Step 8: Display Sample Retrievals
passage_idx_to_text = {idx: text for idx, text in enumerate(unique_passages)}

num_samples = 5  # Adjust as needed
for i in range(num_samples):
    query = unique_queries[i][7:]  # Remove 'query: ' prefix
    print(f"\nQuery {i+1}: {query}")
    print("Top Passages:")
    for rank in range(top_k):
        passage_idx = indices[i][rank]
        similarity_score = distances[i][rank]
        passage = unique_passages[passage_idx][9:]  # Remove 'passage: ' prefix
        print(f"  Rank {rank+1}:")
        print(f"    Similarity Score: {similarity_score:.4f}")
        print(f"    Passage: {passage}\n")

Using device: cuda:2
Number of Queries: 21223
Number of Passages: 21223
Total unique passages: 20367
Total unique queries: 21222
Number of queries with no passage: 0


Encoding Passages: 100%|██████████████████████████████████████████████████████████████| 319/319 [10:20<00:00,  1.95s/it]


Passage Embeddings Shape: (20367, 1024)
Number of Passages Indexed: 20367


Encoding Queries: 100%|███████████████████████████████████████████████████████████████| 332/332 [06:28<00:00,  1.17s/it]


Query Embeddings Shape: (21222, 1024)
Distances Shape: (21222, 5)
Indices Shape: (21222, 5)
Recall@5: 34.68%
Mean Reciprocal Rank (MRR): 0.2371

Query 1: I'm looking for supplements that can help with my dog's anxiety, especially during car rides. I want something that will not make her sedated but instead keep her calm and relaxed. It would be great if the supplements can make the entire experience stress-free for both my dog and myself.
Top Passages:
  Rank 1:
    Similarity Score: 0.8574
    Passage: NaturVet Quiet Moments Calming Aid Cat Supplement Plus Melatonin – Helps Reduce Stress in Cats – for Pet Storm Anxiety, Motion Sickness, Grooming, Separation, Travel – 50 Ct. Soft Chews. .

  Rank 2:
    Similarity Score: 0.8553
    Passage: Hemp Calming Chews for Dogs with Anxiety and Stress - Dog Calming Treats - Dog Anxiety Relief - Storms, Barking, Separation - Valerian - Hemp Oil - Calming Treats for Dogs - Made in USA. .

  Rank 3:
    Similarity Score: 0.8552
    Passage: PREMIUM

In [None]:
# Step 3: Perform Retrieval
top_k = 5
distances, indices = index.search(query_embeddings, top_k)
print(f"Distances Shape: {distances.shape}")
print(f"Indices Shape: {indices.shape}")

# Step 4: Display Sample Retrievals
passage_idx_to_text = {idx: text for idx, text in enumerate(passages)}

num_samples = 5  # Adjust as needed
for i in range(num_samples):
    query = queries[i][7:]  # Remove 'query: ' prefix
    print(f"\nQuery {i+1}: {query}")
    print("Top Passages:")
    for rank in range(top_k):
        passage_idx = indices[i][rank]
        similarity_score = distances[i][rank]
        passage = passages[passage_idx][9:]  # Remove 'passage: ' prefix
        print(f"  Rank {rank+1}:")
        print(f"    Similarity Score: {similarity_score:.4f}")
        print(f"    Passage: {passage}\n")

# (Optional) Step 5: Evaluate Retrieval Performance
# Assuming each query i is relevant to passage i
relevant_indices = np.arange(len(queries))  # Modify based on actual ground truth

hits = 0
for i in range(len(queries)):
    if relevant_indices[i] in indices[i]:
        hits += 1

recall_at_k = hits / len(queries)
print(f"Recall@{top_k}: {recall_at_k * 100:.2f}%")

# Calculate Mean Reciprocal Rank (MRR)
mrr_total = 0.0
for i in range(len(queries)):
    try:
        rank = np.where(indices[i] == relevant_indices[i])[0][0] + 1  # 1-based rank
        mrr_total += 1.0 / rank
    except IndexError:
        pass  # Relevant passage not in top_k

mrr = mrr_total / len(queries)
print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")

In [44]:
# import torch
# import torch.nn.functional as F
# from torch import Tensor
# from transformers import AutoTokenizer, AutoModel

# def average_pool(last_hidden_states: Tensor,
#                  attention_mask: Tensor) -> Tensor:
#     last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
#     return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

# # Set the device (GPU if available, else CPU)
# device = torch.device('cuda:1')
# print(f'Using device: {device}')

# # Each input text should start with "query: " or "passage: ", even for non-English texts.
# # For tasks other than retrieval, you can simply use the "query: " prefix.
# input_texts = input_t

# # Load tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
# model = AutoModel.from_pretrained('intfloat/multilingual-e5-large')

# # Move the model to the GPU (if available)
# model.to(device)

# # Tokenize the input texts
# batch_dict = tokenizer(
#     input_texts,
#     max_length=512,
#     padding=True,
#     truncation=True,
#     return_tensors='pt'
# )

# # Move the input tensors to the GPU (if available)
# batch_dict = {k: v.to(device) for k, v in batch_dict.items()}

# # Disable gradient calculation for inference
# with torch.no_grad():
#     outputs = model(**batch_dict)

# # Compute embeddings using average pooling
# embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# # Normalize embeddings
# embeddings = F.normalize(embeddings, p=2, dim=1)

# # Compute similarity scores
# scores = (embeddings[:2] @ embeddings[2:].T) * 100
# print(scores.tolist())

In [45]:
# import torch
# import torch.nn.functional as F
# from torch import Tensor
# from transformers import AutoTokenizer, AutoModel
# from tqdm import tqdm
# import faiss
# import numpy as np

# def average_pool(last_hidden_states: Tensor,
#                  attention_mask: Tensor) -> Tensor:
#     last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
#     return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

# # Set the device (adjust 'cuda:3' based on your setup)
# device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
# print(f'Using device: {device}')

# # Load tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
# model = AutoModel.from_pretrained('intfloat/multilingual-e5-large').to(device)

# # Assuming 'queries' and 'passages' lists are already defined and populated
# # queries = ['query: ...', 'query: ...', ..., 'query: ...']  # 21,223 entries
# # passages = ['passage: ...', 'passage: ...', ..., 'passage: ...']  # 21,223 entries

# print(f"Number of Queries: {len(queries)}")
# print(f"Number of Passages: {len(passages)}")

# # Step 1: Encode and Index Passages
# batch_size = 64
# passage_embeddings = []

# for i in tqdm(range(0, len(passages), batch_size), desc="Encoding Passages"):
#     batch_passages = passages[i:i+batch_size]
#     batch_dict = tokenizer(
#         batch_passages,
#         max_length=512,
#         padding=True,
#         truncation=True,
#         return_tensors='pt'
#     )
#     batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
    
#     with torch.no_grad():
#         outputs = model(**batch_dict)
    
#     embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
#     embeddings = F.normalize(embeddings, p=2, dim=1)
#     embeddings = embeddings.cpu().numpy()
#     passage_embeddings.append(embeddings)

# passage_embeddings = np.vstack(passage_embeddings)
# print(f"Passage Embeddings Shape: {passage_embeddings.shape}")

# embedding_dim = passage_embeddings.shape[1]
# index = faiss.IndexFlatIP(embedding_dim)
# index.add(passage_embeddings)
# print(f"Number of Passages Indexed: {index.ntotal}")

# # Step 2: Encode Queries
# query_batch_size = 64
# query_embeddings = []

# for i in tqdm(range(0, len(queries), query_batch_size), desc="Encoding Queries"):
#     batch_queries = queries[i:i+query_batch_size]
#     batch_dict = tokenizer(
#         batch_queries,
#         max_length=512,
#         padding=True,
#         truncation=True,
#         return_tensors='pt'
#     )
#     batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
    
#     with torch.no_grad():
#         outputs = model(**batch_dict)
    
#     embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
#     embeddings = F.normalize(embeddings, p=2, dim=1)
#     embeddings = embeddings.cpu().numpy()
#     query_embeddings.append(embeddings)

# query_embeddings = np.vstack(query_embeddings)
# print(f"Query Embeddings Shape: {query_embeddings.shape}")

In [46]:
# # Step 3: Perform Retrieval
# top_k = 10
# distances, indices = index.search(query_embeddings, top_k)

# print(f"Distances Shape: {distances.shape}")
# print(f"Indices Shape: {indices.shape}")

# # Step 4: Display Sample Retrievals
# num_samples = 1  # Adjust as needed
# for i in range(num_samples):
#     query = queries[i][7:]  # Remove 'query: ' prefix
#     print(f"\nQuery {i+1}: {query}")
#     print("Top Passages:")
#     for rank in range(top_k):
#         passage_idx = indices[i][rank]
#         similarity_score = distances[i][rank]
#         passage = passages[passage_idx][9:]  # Remove 'passage: ' prefix
#         print(f"  Rank {rank+1}:")
#         print(f"    Similarity Score: {similarity_score:.4f}")
#         print(f"    Passage: {passage}\n")

# # (Optional) Step 5: Evaluate Retrieval Performance
# # Assuming each query i is relevant to passage i
# relevant_indices = np.arange(len(queries))  # Adjust based on actual ground truth

# hits = 0
# for i in range(len(queries)):
#     if relevant_indices[i] in indices[i]:
#         hits += 1

# recall_at_k = hits / len(queries)
# print(f"Recall@{top_k}: {recall_at_k * 100:.2f}%")