In [53]:
from beir import util
from beir.datasets.data_loader import GenericDataLoader
from torch.utils.data import Dataset
import torch

# Download and unzip the dataset
dataset_name = "quora"
model_name = "facebook/contriever"
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset_name}.zip"
data_path = util.download_and_unzip(url, "datasets")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the dataset
corpus_dev, queries_dev, qrels_dev = GenericDataLoader(data_folder=data_path).load(split="dev")
corpus_test, queries_test, qrels_test = GenericDataLoader(data_folder=data_path).load(split="test")

  0%|          | 0/522931 [00:00<?, ?it/s]

  0%|          | 0/522931 [00:00<?, ?it/s]

In [54]:
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to(device)
model.eval()

tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [55]:
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

In [56]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # Tokenize and convert to tensors
    inputs = [tokenizer(text, return_tensors='pt', truncation=True, padding=False) for text in batch]
    input_ids = [item['input_ids'].squeeze(0) for item in inputs]
    attention_masks = [item['attention_mask'].squeeze(0) for item in inputs]

    # Pad sequences
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_masks_padded = pad_sequence(attention_masks, batch_first=True, padding_value=0)

    return input_ids_padded, attention_masks_padded

In [57]:
from torch.utils.data import DataLoader

# Assuming corpus_dev is a dictionary with document IDs as keys and text as values
texts = [corpus_dev[doc_id]['text'] for doc_id in corpus_dev.keys()]
dataset = TextDataset(texts)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

doc_embeddings = []

for input_ids, attention_masks in tqdm(dataloader):
    input_ids = input_ids.to(device)
    attention_masks = attention_masks.to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_masks)
    embeddings = outputs.last_hidden_state[:, 0, :]
    doc_embeddings.append(embeddings)

# Concatenate all embeddings
doc_embeddings = torch.cat(doc_embeddings, dim=0).cpu()
doc_embeddings = np.vstack(doc_embeddings).astype("float32")

# Build FAISS index
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)


  0%|          | 0/16342 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 22.00 MiB. GPU 0 has a total capacity of 23.61 GiB of which 35.44 MiB is free. Including non-PyTorch memory, this process has 22.27 GiB memory in use. Of the allocated memory 20.89 GiB is allocated by PyTorch, and 1.18 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [44]:
query_embeddings = []
query_ids = list(queries_test.keys())

for query_id in tqdm(query_ids):
    query_text = queries_test[query_id]
    inputs = tokenizer(query_text, return_tensors='pt', truncation=True, padding=True).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    query_embedding = embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    query_embeddings.append(query_embedding)

# Convert to a NumPy array
query_embeddings = np.vstack(query_embeddings).astype("float32")

  0%|          | 0/10000 [00:00<?, ?it/s]

In [45]:
# Search the FAISS index
print('Searching through index...')
D, I = index.search(query_embeddings, k=10)  # Retrieve top 10 documents

# Map retrieved indices to document IDs
retrieved_results = {}
doc_ids = list(corpus_test.keys())

print('Storing retrieved results...')
for i, query_id in tqdm(enumerate(query_ids)):
    retrieved_results[query_id] = {doc_ids[idx]: float(D[i][j]) for j, idx in enumerate(I[i])}

Searching through index...
Storing retrieved results...


0it [00:00, ?it/s]

In [46]:
from beir.retrieval.evaluation import EvaluateRetrieval

# Evaluate using BEIR’s built-in function
evaluator = EvaluateRetrieval()
ndcg, _map, recall, precision = evaluator.evaluate(qrels_test, retrieved_results, [10, 100])

# Print results
print(f"NDCG@10: {ndcg['NDCG@10']:.4f}")
print(f"MAP@10: {_map['MAP@10']:.4f}")
print(f"Recall@10: {recall['Recall@10']:.4f}")
print(f"Precision@10: {precision['P@10']:.4f}")
print('\n')
print(f"NDCG@100: {ndcg['NDCG@100']:.4f}")
print(f"MAP@100: {_map['MAP@100']:.4f}")
print(f"Recall@100: {recall['Recall@100']:.4f}")
print(f"Precision@100: {precision['P@100']:.4f}")

NDCG@10: 0.0726
MAP@10: 0.0332
Recall@10: 0.1977
Precision@10: 0.0260


NDCG@100: 0.0722
MAP@100: 0.0332
Recall@100: 0.1977
Precision@100: 0.0026


In [47]:
# Save results to a file
with open(f"results/{model_name}+{dataset_name}.txt", "w") as f:
    f.write("Evaluation Results:\n")
    f.write(f"NDCG@10: {ndcg['NDCG@10']:.4f}\n")
    f.write(f"MAP@10: {_map['MAP@10']:.4f}\n")
    f.write(f"Recall@10: {recall['Recall@10']:.4f}\n")
    f.write(f"Precision@10: {precision['P@10']:.4f}\n")
    f.write("\n")
    f.write(f"NDCG@100: {ndcg['NDCG@100']:.4f}\n")
    f.write(f"MAP@100: {_map['MAP@100']:.4f}\n")
    f.write(f"Recall@100: {recall['Recall@100']:.4f}\n")
    f.write(f"Precision@100: {precision['P@100']:.4f}\n")