In [1]:
from beir import util
from beir.datasets.data_loader import GenericDataLoader

from transformers import AutoTokenizer, AutoModel
import torch


import faiss
import vertexai
import numpy as np
import pandas as pd
import pytrec_eval

  from tqdm.autonotebook import tqdm


In [2]:
# Download smallish NFCorpus dataset of questions and document text
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/nfcorpus.zip"
data_path = util.download_and_unzip(url, "datasets")

# Corpus of text chunks, text queries and "gold" set of query to relevant documents dict
corpus, queries, qrels = GenericDataLoader("datasets/nfcorpus").load(split="test")

100%|██████████| 3633/3633 [00:00<00:00, 148165.24it/s]


In [6]:
# Specify the model name from Hugging Face Model Hub
model_name = "sentence-transformers/all-mpnet-base-v2"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = model.to(device)
torch.mps.empty_cache()

# Function to generate embeddings
def generate_embeddings(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        model_output = model(input_ids=input_ids, attention_mask=attention_mask)
    
    # Perform pooling to get sentence embeddings
    def mean_pooling(model_output, attention_mask):
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    return mean_pooling(model_output, attention_mask).cpu()


doc_ids, docs = zip(*( (doc_id, doc['text']) for doc_id, doc in corpus.items() ))
q_ids, questions = zip(*( (q_id, q) for q_id, q in queries.items() ))


In [7]:
# Embed the documents and queries jointly using different models
doc_embeddings = generate_embeddings(docs)
index = faiss.IndexFlatL2(doc_embeddings.shape[1])
index.add(doc_embeddings)

# Example lookup example query to find relevant doc - note using 'RETRIEVAL_QUERY'
example_embed = generate_embeddings(['Is Caffeinated Tea Really Dehydrating?'])
s, q = index.search(example_embed, 1)
print(f"Score: {s[0][0]:.2f}, Text: {docs[q[0][0]]}")

# Embed all queries to evaluate quality compared to "gold" answers
query_embeddings = generate_embeddings(questions)
q_scores, q_doc_ids = index.search(query_embeddings, 10)

# Create a dict of query to document scores dict for pytrec evaluation
search_qrels = {q_ids[i]: {doc_ids[_id]: -1 * s.item() for _id, s in zip(q_doc_ids[i], q_scores[i])} for i in range(len(q_ids))}
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'ndcg_cut.10', 'P.1', 'recall_10'})
eval_results = evaluator.evaluate(search_qrels)
df = pd.DataFrame.from_dict(eval_results, orient='index')
print(df.mean())

RuntimeError: MPS backend out of memory (MPS allocated: 17.33 GB, other allocations: 10.38 MB, max allowed: 18.13 GB). Tried to allocate 5.32 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).