In [1]:
from typing import List

from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.embeddings import Embeddings
from langchain_chroma import Chroma

# 1. Document preparation

In [2]:
def prepare_documents(
    docs_folder: str = "./pdf_docs",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
) -> List[Document]:
    """
    Load the PDF files found under a directory and split their text into chunks.

    Args:
        docs_folder (str, optional): Directory that contains the PDF files; files
            are matched with the "**/*.pdf" glob pattern. Defaults to "./pdf_docs".
        chunk_size (int, optional): Target maximum chunk length, measured with
            ``len`` (i.e. in characters). Defaults to 1000.
        chunk_overlap (int, optional): Number of characters that consecutive
            chunks may share. Defaults to 200.

    Returns:
        List[Document]: The chunks produced by splitting the loaded documents.
    """
    import warnings

    # silent_errors=True asks the loader not to raise on PDFs it cannot read,
    # so a bad file does not abort the whole load.
    loader = PyPDFDirectoryLoader(
        path=docs_folder,
        glob="**/*.pdf",
        silent_errors=True,
    )

    documents = loader.load()

    # Because loading errors are silenced, an empty result would otherwise go
    # unnoticed until a later step fails; surface it here without changing the
    # return value.
    if not documents:
        warnings.warn(
            f"No PDF documents were loaded from {docs_folder!r}.",
            stacklevel=2,
        )

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Ask the splitter to record where each chunk starts in its source text.
        add_start_index=True,
    )

    return text_splitter.split_documents(documents)

In [3]:
# Load and chunk the PDFs using the function's default folder ("./pdf_docs").
chunks = prepare_documents()

# 2. Vector store creation

In [4]:
# Embedding model used to vectorize the chunks; the model is selected by its
# Hugging Face identifier.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [5]:
def create_chroma_vector_store(
    documents: List[Document],
    embedding_model: Embeddings,
    persist_dir: str = "./chroma_db",
) -> Chroma:
    """
    Build a Chroma vector store from a list of documents.

    Args:
        documents (List[Document]): Documents to embed and add to the store.
        embedding_model (Embeddings): Embedding model handed to Chroma for vectorization.
        persist_dir (str, optional): Directory passed to Chroma as its persistence
            location. Defaults to "./chroma_db".

    Returns:
        Chroma: The vector store returned by ``Chroma.from_documents``.
    """
    return Chroma.from_documents(
        persist_directory=persist_dir,
        embedding=embedding_model,
        documents=documents,
    )

In [6]:
# Build the vector store from the chunks, using the default persist directory
# ("./chroma_db").
# NOTE(review): re-running this cell against an already populated directory
# presumably adds the same chunks again — confirm, and clear the directory
# between runs if so.
vector_store = create_chroma_vector_store(chunks, embedding_model)