In [1]:
import os
import torch
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import faiss
import numpy as np
from tqdm import tqdm
from typing import List

from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings.base import Embeddings


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_pubmedqa_data():
    """Load the ``pqa_artificial`` configuration of ``qiaojin/PubMedQA``.

    Prints the names of the splits that were loaded, then reads three
    columns from the ``train`` split.

    Returns:
        A ``(contexts, questions, ids)`` tuple holding the ``context``,
        ``question`` and ``pubid`` columns of the ``train`` split, in that
        order.
    """
    # Swap the configuration name here to work with a different subset.
    pubmedqa = load_dataset('qiaojin/PubMedQA', 'pqa_artificial')
    print(pubmedqa.keys())

    train_split = pubmedqa['train']
    return train_split['context'], train_split['question'], train_split['pubid']

In [3]:
class CustomHuggingFaceEmbeddings(Embeddings):  # Inherit from Embeddings base class
    """LangChain ``Embeddings`` adapter around a Hugging Face encoder.

    A text is embedded as the L2-normalised mean of the encoder's last hidden
    states, averaged over its real (non-padding) tokens only.

    Args:
        model: Encoder whose forward output exposes ``last_hidden_state``.
        tokenizer: Tokenizer matching ``model``. It is called with padding and
            truncation enabled and must return an ``attention_mask``.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of texts.

        Args:
            texts: Strings to embed. Inputs are truncated to 512 tokens.

        Returns:
            One unit-length embedding (list of floats) per input text, in
            input order. An empty input yields an empty list.
        """
        if not texts:
            return []

        # Follow the model's device instead of assuming CUDA is present.
        device = next(self.model.parameters()).device
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        ).to(device)

        with torch.no_grad():
            outputs = self.model(**inputs)
            hidden = outputs.last_hidden_state
            # Zero out padded positions before averaging; otherwise a text's
            # embedding depends on how much padding its batch required, and
            # embed_query() would disagree with the indexed vectors.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            summed = (hidden * mask).sum(dim=1)
            token_counts = mask.sum(dim=1).clamp(min=1e-9)
            embeddings = summed / token_counts
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        return embeddings.cpu().numpy().tolist()  # Convert to list format

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string with the same pipeline as documents."""
        return self.embed_documents([text])[0]

In [4]:
def compute_embeddings_vectorDB(input_texts, model_name="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract", batch_size=32):
    """Chunk, embed and index the given records in a LangChain FAISS store.

    Args:
        input_texts: Iterable of records, each with a ``'contexts'`` entry
            holding a list of strings; the strings are joined with spaces to
            form one document per record.
        model_name: Hugging Face model id used for both the tokenizer and the
            encoder.
        batch_size: Number of chunks embedded per forward pass.

    Returns:
        A ``FAISS`` vector store holding one entry per text chunk.

    Raises:
        ValueError: If ``batch_size`` is not positive or the input produces
            no chunks to embed.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Place the model on CUDA when it is available, otherwise on CPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    embeddings = CustomHuggingFaceEmbeddings(model, tokenizer)

    # Create one Document per input record.
    print("Processing documents...")
    documents = [
        Document(page_content=' '.join(text['contexts']))
        for text in tqdm(input_texts)
    ]

    # Split documents into chunks (sizes are measured in characters via len).
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=50,
        length_function=len
    )
    splits = text_splitter.split_documents(documents)
    if not splits:
        raise ValueError("input_texts produced no text chunks to embed")

    # Compute embeddings in batches.
    print("Computing embeddings...")
    texts = [doc.page_content for doc in splits]
    all_embeddings = []
    for start in tqdm(range(0, len(texts), batch_size)):
        all_embeddings.extend(embeddings.embed_documents(texts[start:start + batch_size]))

    # Let LangChain assemble the index, docstore and id mapping from the
    # precomputed vectors. Handing the FAISS constructor a plain dict as the
    # docstore yields a store whose searches fail, because lookups go through
    # the docstore's search() method.
    vectorstore = FAISS.from_embeddings(
        # Materialised as a list so the pairs can be iterated more than once.
        text_embeddings=list(zip(texts, all_embeddings)),
        embedding=embeddings,
        metadatas=[doc.metadata for doc in splits],
    )

    return vectorstore

In [5]:

def save_vectorstore(vectorstore, path="./pubmedqa_vectorstore"):
    """Persist a vector store by calling its ``save_local`` method.

    Args:
        vectorstore: Object exposing ``save_local(path)``.
        path: Destination handed to ``save_local``; defaults to
            ``./pubmedqa_vectorstore``.
    """
    destination = path
    vectorstore.save_local(destination)

In [6]:
# Load the PubMedQA columns; the questions (second return value) are discarded.
contexts, _, ids = load_pubmedqa_data()

dict_keys(['train'])


In [7]:
# Chunk, embed and index every context with the default model.
# Long-running: the recorded output below shows roughly 9.5 minutes for the embedding loop.
vectorstore = compute_embeddings_vectorDB(contexts)

Processing documents...


100%|██████████| 211269/211269 [00:00<00:00, 214230.00it/s]


Computing embeddings...


100%|██████████| 22191/22191 [09:36<00:00, 38.49it/s]


In [8]:
# Write the vector store built above to ./pubmedqa_vectorstore.
save_vectorstore(vectorstore, path="./pubmedqa_vectorstore")