In [2]:
import os
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from chromadb import PersistentClient

# MACROS
# Hugging Face checkpoint used for every embedding computed in this notebook.
MODEL_NAME = 'intfloat/e5-large'

# Single shared embedding function: it is handed to Chroma both when indexing
# and when querying, so stored vectors and query vectors come from one model.
# The model is pinned to the CPU and asked to normalise its output vectors.
# NOTE(review): the e5 model card describes "query: " / "passage: " input
# prefixes; none are added anywhere in this notebook -- confirm whether
# retrieval quality depends on them.
EMB_FUNC = HuggingFaceEmbeddings(
    model_name=MODEL_NAME,
    model_kwargs=dict(device='cpu'),
    encode_kwargs=dict(normalize_embeddings=True),
)
# Directory in which Chroma persists every collection built by this notebook.
persistent_dir = "./embeddings"

# Create the directory up front. On a fresh checkout it does not exist yet, in
# which case os.access() reports False and the os.chmod() below would raise
# FileNotFoundError instead of "fixing" anything.
os.makedirs(persistent_dir, exist_ok=True)

# If the directory exists but is not writable by this process, try to open up
# its permissions so Chroma can write its files there.
if not os.access(persistent_dir, os.W_OK):
    print("Directory is not writable, fixing...")
    os.chmod(persistent_dir, 0o755)

  EMB_FUNC = HuggingFaceEmbeddings(
2025-11-15 13:51:07.107841: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-15 13:51:07.146025: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-11-15 13:51:07.146069: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-11-15 13:51:07.146105: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-11-15 13:51:07.156336: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To 

In [3]:
def embed_docs(docs_path, clt_name="default"):
    """Load a text file, split it into chunks and store them in a Chroma collection.

    Args:
        docs_path: Path of the text file handed to ``TextLoader``.
        clt_name: Name of the target Chroma collection. The same value is also
            written into every chunk's metadata under the ``"default"`` key.

    Returns:
        The ``Chroma`` vector store created from the chunks, persisted under
        ``persistent_dir``.
    """
    documents = TextLoader(docs_path).load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    chunks = text_splitter.split_documents(documents)

    for chunk in chunks:
        # The metadata key really is the literal string "default" (not the
        # collection name); it is kept as-is so newly stored chunks match the
        # metadata of chunks that were persisted earlier.
        chunk.metadata["default"] = clt_name

    # Give every chunk a stable id derived from the path and its position.
    # Without explicit ids each run of this cell stores the same chunks again
    # under fresh ids, so the persisted collection fills up with duplicates;
    # with stable ids, re-embedding the same path overwrites its chunks.
    # Caveat: if the file later yields fewer chunks, the old tail ids remain.
    chunk_ids = [f"{docs_path}#chunk-{index}" for index in range(len(chunks))]

    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=EMB_FUNC,
        ids=chunk_ids,
        collection_name=clt_name,
        persist_directory=persistent_dir,
    )
    return vectorstore


    

# Index the two plain-text sources, each into its own collection.
embed_docs(docs_path="cabin.txt", clt_name="paper-1")
embed_docs(docs_path="simple_text.txt", clt_name="paper-2")


<langchain_community.vectorstores.chroma.Chroma at 0x72ec5d90e020>

In [5]:
def embed_pdf(pdf_path, clt_name="paper-1"):
    """Load a PDF, split it into chunks and store them in a Chroma collection.

    Args:
        pdf_path: Path of the PDF file handed to ``PyPDFLoader``.
        clt_name: Name of the target Chroma collection. The same value is also
            written into every chunk's metadata under the ``"default"`` key.

    Returns:
        The ``Chroma`` vector store created from the chunks, persisted under
        ``persistent_dir`` (returned for parity with ``embed_docs``; earlier
        versions of this function built the store but returned nothing).
    """
    documents = PyPDFLoader(pdf_path).load()

    # Split into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    chunks = text_splitter.split_documents(documents)

    for chunk in chunks:
        # The metadata key really is the literal string "default" (not the
        # collection name); it is kept as-is so newly stored chunks match the
        # metadata of chunks that were persisted earlier.
        chunk.metadata["default"] = clt_name

    # Give every chunk a stable id derived from the path and its position.
    # Without explicit ids each run of this cell stores the same chunks again
    # under fresh ids, so the persisted collection fills up with duplicates;
    # with stable ids, re-embedding the same path overwrites its chunks.
    # Caveat: chunks of a *different* PDF already stored in the same
    # collection are not removed by this.
    chunk_ids = [f"{pdf_path}#chunk-{index}" for index in range(len(chunks))]

    # Create vectorstore
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=EMB_FUNC,
        ids=chunk_ids,
        collection_name=clt_name,
        persist_directory=persistent_dir,
    )
    return vectorstore

# Index the research-paper PDF into the "paper-3" collection.
embed_pdf(
    pdf_path="./research_papers/Pranavasri et al. - 2024 - Exploratory Study of oneM2M-Based Interoperability Architectures for IoT A Smart City Perspective.pdf",
    clt_name="paper-3",
)


In [None]:
# Question text used by the query cells that follow, until a later cell
# rebinds `query` to a different string.
query = "Summarise paper 1"


def query_collection(query, clt_name=None, k=5):
    """Run a similarity search against one persisted Chroma collection.

    Args:
        query: Text to search for; it is embedded with ``EMB_FUNC``.
        clt_name: Name of the collection to search. Required in practice: the
            ``None`` default is kept only for signature compatibility and is
            rejected below, because it is not a usable collection name.
        k: Maximum number of documents to return (defaults to 5, the value
            that used to be hard-coded).

    Returns:
        The list of documents returned by ``Chroma.similarity_search``.

    Raises:
        ValueError: If ``clt_name`` is missing/empty or ``k`` is smaller than 1.
    """
    # Fail early with a clear message instead of handing an unusable name or
    # result count to the vector store.
    if not clt_name:
        raise ValueError(
            "clt_name is required: pass the name of the collection to search, e.g. 'paper-1'"
        )
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    # NOTE(review): opening a collection by name presumably creates an empty
    # one when the name is unknown, so a typo would yield [] rather than an
    # error -- confirm against the Chroma wrapper in use.
    vectorstore = Chroma(
        collection_name=clt_name,
        embedding_function=EMB_FUNC,
        persist_directory=persistent_dir,
    )

    return vectorstore.similarity_search(query, k=k)

# Search the "paper-1" collection with the question defined above and dump
# the returned documents.
docs = query_collection(query=query, clt_name="paper-1")
print(docs)


In [None]:
# Same question, this time against the "paper-2" collection. Relies on
# `query` still holding the value assigned in an earlier cell.
docs = query_collection(query=query, clt_name="paper-2")
print(docs)

[Document(metadata={'source': 'simple_text.txt', 'default': 'paper-2'}, page_content='"I am inclined to think--" said I.\n\n     "I should do so," Sherlock Holmes remarked impatiently.\n\n     I believe that I am one of the most long-suffering of mortals; but\n     I\'ll admit that I was annoyed at the sardonic interruption.\n\n     "Really, Holmes," said I severely, "you are a little trying at\n     times."\n\n     He was too much absorbed with his own thoughts to give any immediate\n     answer to my remonstrance. He leaned upon his hand, with his untasted\n     breakfast before him, and he stared at the slip of paper which he had\n     just drawn from its envelope. Then he took the envelope itself, held\n     it up to the light, and very carefully studied both the exterior and\n     the flap.'), Document(metadata={'source': 'simple_text.txt', 'default': 'paper-2'}, page_content='"It is Porlock\'s writing," said he thoughtfully. "I can hardly doubt\n     that it is Porlock\'s writing

In [None]:
# Rebind the shared question and search the "paper-3" collection.
# NOTE(review): the saved output of this cell lists chunks whose `source` is a
# different PDF than the one passed to embed_pdf in this notebook, so the
# persisted collection appears to contain data from an earlier run -- confirm.
query = "summarise the paper's introduction"
docs = query_collection(query=query, clt_name="paper-3")
print(docs)


[Document(metadata={'source': './research_papers/Assessing the effects of data drift on the performance of machine learning models used in clinical sepsis prediction.pdf', 'author': 'Keyvan Rahmani', 'creationdate': '2023-03-20T13:19:47+00:00', 'title': 'Assessing the effects of data drift on the performance of machine learning models used in clinical sepsis prediction', 'crossmarkdomainexclusive': 'true', 'keywords': 'Data drift,Sepsis,Machine learning,Clinical decision support', 'crossmarkdomains[2]': 'sciencedirect.com', 'crossmarkmajorversiondate': '2010-04-23', 'elsevierwebpdfspecifications': '7.0', 'moddate': '2023-03-22T11:15:09+00:00', 'doi': '10.1016/j.ijmedinf.2022.104930', 'creator': 'Elsevier', 'default': 'paper-3', 'page': 2, 'total_pages': 13, 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'subject': 'International Journal of Medical Informatics, 173 (2023) 104930. doi:10.1016/j.ijmedinf.2022.104930', 'page_label': '3', 'crossmarkdomains[1]': 'elsevier.com', 'robots': '

In [None]:
# Alternative phrasing of the introduction question, again against "paper-3".
query2 = "what is in the introduction"
docs = query_collection(query=query2, clt_name="paper-3")
print(docs)


[Document(metadata={'keywords': 'Data drift,Sepsis,Machine learning,Clinical decision support', 'elsevierwebpdfspecifications': '7.0', 'title': 'Assessing the effects of data drift on the performance of machine learning models used in clinical sepsis prediction', 'page_label': '3', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'subject': 'International Journal of Medical Informatics, 173 (2023) 104930. doi:10.1016/j.ijmedinf.2022.104930', 'crossmarkdomains[1]': 'elsevier.com', 'crossmarkdomains[2]': 'sciencedirect.com', 'creationdate': '2023-03-20T13:19:47+00:00', 'crossmarkdomainexclusive': 'true', 'total_pages': 13, 'default': 'paper-3', 'author': 'Keyvan Rahmani', 'crossmarkmajorversiondate': '2010-04-23', 'creationdate--text': '22nd March 2023', 'creator': 'Elsevier', 'doi': '10.1016/j.ijmedinf.2022.104930', 'robots': 'noindex', 'moddate': '2023-03-22T11:15:09+00:00', 'source': './research_papers/Assessing the effects of data drift on the performance of machine learning models u