In [1]:
#fastembed
# Import necessary modules
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
import os
import threading

# Define paths (both relative to the current working directory)
# data_dir: folder that create_vector_db scans for '*.pdf' files
# faiss_db: folder the FAISS index is saved to and later loaded from
data_dir = 'data/'
faiss_db = 'vectorstore/db_faiss'

# Function to create a vector database
def create_vector_db():
    """Build a FAISS vector store from the PDFs in ``data_dir`` and save it.

    Loads the files matching ``*.pdf`` in ``data_dir`` with ``PyPDFLoader``,
    splits the loaded documents into chunks of up to 1000 characters with a
    100-character overlap, embeds the chunks with ``FastEmbedEmbeddings``
    (constructed with its defaults), indexes them with FAISS and writes the
    index to ``faiss_db``.

    Reads the module-level ``data_dir`` and ``faiss_db`` paths. Returns
    ``None``; progress is reported through ``print``.
    """
    # Create a DirectoryLoader instance to load PDF documents
    loader = DirectoryLoader(data_dir,
                            glob='*.pdf',
                            loader_cls=PyPDFLoader,
                            use_multithreading=True)

    # Load documents from the directory
    document = loader.load()
    print('.....document_loaded.....')

    # Initialize a text splitter to divide documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
                                                  chunk_overlap=100)
    print('.....document_splitter.....')

    # Split documents into smaller text chunks
    texts = text_splitter.split_documents(document)
    print('.....document_splitted.....')

    # Initialize FastEmbed embeddings (default model)
    embeddings = FastEmbedEmbeddings()

    # Create a vector store using FAISS from the text chunks and embeddings
    db = FAISS.from_documents(texts, embeddings)
    print('.....document_loaded_at_db.....')

    # Ensure the output directory exists before saving. exist_ok=True keeps
    # re-runs from failing when the directory is already there.
    os.makedirs(faiss_db, exist_ok=True)

    # Save the vector store locally
    db.save_local(faiss_db)

if __name__ == "__main__":
    # Run create_vector_db on a separate thread and block until it finishes.
    # NOTE(review): Thread.join() does not re-raise exceptions raised by the
    # target, so a failure inside create_vector_db will not fail this cell —
    # confirm the thread is actually needed.
    document_thread = threading.Thread(target=create_vector_db)
    document_thread.start()
    document_thread.join()


.....document_loaded.....
.....document_splitter.....
.....document_splitted.....


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 16.97it/s]


.....document_loaded_at_db.....


In [2]:
# Build the embedding model with the same default FastEmbedEmbeddings()
# configuration that create_vector_db uses when building the index
embeddings = FastEmbedEmbeddings()

# Load the FAISS index from the faiss_db folder
# NOTE(review): FAISS.load_local presumably deserializes pickled data from
# disk, so only load indexes you created yourself; newer LangChain releases
# may require an explicit opt-in flag here — confirm against the installed version.
new_db = FAISS.load_local(faiss_db,embeddings)

# Run a similarity search for a sample query and print the returned documents
query="what is medical?"
docs = new_db.similarity_search(query)
print(docs)

Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 14095.87it/s]


[Document(page_content='gene or genes.\nGene —A building block of inheritance, made up of\na compound called DNA (deoxyribonucleic acid)and containing the instructions for the productionof a particular protein. Each gene is found on a spe-cific location on a chromosome.\nMagnetic resonance imaging (MRI) —A test whichuses an external magnetic field instead of x rays to\nvisualize different tissues of the body.\nMyelin sheath —The cover that surrounds many\nnerve cells and helps to increase the speed bywhich information travels along the nerve.\nNeurofibromatosis type 2 (NF2) —A hereditary\ncondition associated with an increased risk of bilat-eral acoustic neuromas, other nerve cell tumors andcataracts.\nProtein —A substance produced by a gene that is\ninvolved in creating the traits of the human bodysuch as hair and eye color or is involved in control-ling the basic functions of the human body.\nSchwannoma —A tumor derived from the cells of\nthe myelin sheath that surrounds many nerve c