In [None]:

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np
import os
import pickle

In [None]:

def load_and_prepare_data(max_docs=1000):
    """Load passages and their ids from the rag-mini-bioasq text corpus.

    Args:
        max_docs: Maximum number of rows to take from the start of the
            ``test`` split. ``None`` keeps every row. Defaults to 1000,
            the value that was previously hard-coded.

    Returns:
        A ``(texts, ids)`` tuple holding the ``passage`` and ``id`` columns
        of the selected rows, in dataset order.

    Raises:
        ValueError: If ``max_docs`` is negative (a negative slice bound would
            silently drop rows from the end instead of limiting the count).
    """
    if max_docs is not None and max_docs < 0:
        raise ValueError(f"max_docs must be non-negative or None, got {max_docs}")

    dataset = load_dataset("enelpol/rag-mini-bioasq", "text-corpus")

    # Slice rows first, then pick the columns: only the rows we keep are
    # materialized, instead of decoding both full columns and discarding
    # everything past the cut-off.
    rows = dataset['test'][:max_docs]
    return rows['passage'], rows['id']

In [None]:

def load_model_and_tokenizer(model_name="ncbi/MedCPT-Query-Encoder", device=None):
    """Load a Hugging Face encoder with its tokenizer and move the model to a device.

    Args:
        model_name: Hub id (or local path) of the checkpoint to load. Defaults
            to the MedCPT query encoder, which was previously hard-coded.
        device: Target device as a ``torch.device`` or device string. When
            ``None``, CUDA is used if available, otherwise the CPU.

    Returns:
        A ``(model, tokenizer, device)`` tuple; ``device`` is always a
        ``torch.device``.
    """
    # NOTE(review): this notebook embeds corpus passages with the returned
    # model. MedCPT also publishes "ncbi/MedCPT-Article-Encoder" for encoding
    # documents; confirm whether the query encoder is really intended for the
    # indexing side before relying on retrieval quality.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device(device)

    model = model.to(device)
    # Inference only: make sure dropout and similar layers are disabled.
    model.eval()
    return model, tokenizer, device

In [None]:

def compute_embeddings(texts, model, tokenizer, device, batch_size=32):
    """Embed texts in batches using the first-token ([CLS]) hidden state.

    Args:
        texts: Sequence of strings to embed. A single string is treated as
            one document.
        model: Callable that accepts the tokenizer output as keyword
            arguments and returns an object with ``last_hidden_state``.
        tokenizer: Callable that turns a list of strings into a mapping of
            tensors (called with ``padding``, ``truncation``, ``max_length``
            and ``return_tensors="pt"``).
        device: Device the input tensors are moved to before the forward pass.
        batch_size: Number of texts per forward pass. Must be >= 1.

    Returns:
        A float32 ``numpy.ndarray`` with one row per input text.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Normalize the input so slicing always yields a plain list of strings.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)
    if not texts:
        raise ValueError("texts must contain at least one document to embed")

    batches = []
    # Gradients are never needed here; disable them once for the whole loop.
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]

            inputs = tokenizer(batch_texts, padding=True, truncation=True,
                               max_length=512, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}

            outputs = model(**inputs)

            # Position 0 of the sequence is used as the sentence embedding.
            cls_states = outputs.last_hidden_state[:, 0, :]
            # Cast to float32 before leaving torch: numpy cannot represent
            # bfloat16, and the FAISS index built downstream needs float32.
            batches.append(cls_states.float().cpu().numpy())

    return np.vstack(batches)

In [None]:

def create_and_save_faiss_index(embeddings, texts, ids, save_dir="faiss_index"):
    """Build a flat L2 FAISS index over the embeddings and persist it with its documents.

    Two files are written into ``save_dir`` (created if missing):
    ``docs.index`` (the FAISS index) and ``texts.pkl`` (a pickled dict with
    keys ``"texts"`` and ``"ids"``). Row ``i`` of the index corresponds to
    ``texts[i]`` / ``ids[i]``.

    Args:
        embeddings: 2-D array-like of shape ``(n_docs, dim)``.
        texts: Sequence of ``n_docs`` document texts.
        ids: Sequence of ``n_docs`` document identifiers.
        save_dir: Output directory for both files.

    Returns:
        The populated FAISS index.

    Raises:
        ValueError: If ``embeddings`` is not 2-D, or if the number of texts,
            ids and embedding rows differ.
    """
    # FAISS expects a C-contiguous float32 matrix; coerce once up front.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array, got {vectors.ndim} dimension(s)"
        )

    # Store plain lists so the pickle does not depend on the container type
    # the caller happened to pass in.
    texts = list(texts)
    ids = list(ids)
    if not (len(texts) == len(ids) == vectors.shape[0]):
        # A mismatch would silently break the index-position -> document map.
        raise ValueError(
            "embeddings, texts and ids must have the same length, got "
            f"{vectors.shape[0]}, {len(texts)} and {len(ids)}"
        )

    # NOTE(review): a flat L2 index is kept as in the original; confirm that
    # L2 distance matches the similarity the encoder was trained with.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)

    os.makedirs(save_dir, exist_ok=True)
    faiss.write_index(index, os.path.join(save_dir, "docs.index"))

    # NOTE: pickle files must only be loaded from trusted locations.
    with open(os.path.join(save_dir, "texts.pkl"), "wb") as f:
        pickle.dump({"texts": texts, "ids": ids}, f)

    return index

In [None]:

def main():
    """Run the indexing pipeline end to end, printing progress after each stage."""
    # Stage 1: fetch the passages and their identifiers.
    texts, doc_ids = load_and_prepare_data()
    print(f"Loaded {len(texts)} documents")

    # Stage 2: load the encoder and place it on the selected device.
    encoder, tok, device = load_model_and_tokenizer()
    print(f"Model loaded and moved to {device}")

    # Stage 3: turn every passage into a dense vector.
    embeddings = compute_embeddings(texts, encoder, tok, device)
    print(f"Computed embeddings with shape: {embeddings.shape}")

    # Stage 4: build the FAISS index and write it to disk with the passages.
    index = create_and_save_faiss_index(embeddings, texts, doc_ids)
    print(f"Created and saved FAISS index with {index.ntotal} vectors")

In [34]:
# Run the pipeline: load passages, embed them, then build and save the FAISS index.
main()

Loaded 1000 documents
Model loaded and moved to cuda
Computed embeddings with shape: (1000, 768)
Created and saved FAISS index with 1000 vectors
