In [8]:
import warnings
from typing import List

from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [9]:
def prepare_documents(
    docs_folder: str = "./pdf_docs",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
) -> List[Document]:
    """
    Load the PDF files found under a directory and split their text into chunks.

    Args:
        docs_folder (str, optional): Directory that is searched for PDF files
            using the glob pattern "**/*.pdf". Defaults to "./pdf_docs".
        chunk_size (int, optional): Maximum length of a chunk, measured with
            ``len`` (i.e. in characters). Defaults to 1000.
        chunk_overlap (int, optional): Number of characters consecutive chunks
            are allowed to share. Defaults to 200.

    Returns:
        List[Document]: The chunks produced by the text splitter. The list is
        empty when no document could be loaded from ``docs_folder``.

    Warns:
        UserWarning: If the loader returned no documents, so that a mistyped
            or empty folder does not go unnoticed.
    """

    # silent_errors=True asks the loader not to raise on individual PDFs that
    # fail to load, so one bad file does not abort the whole run.
    loader = PyPDFDirectoryLoader(
        path=docs_folder,
        glob="**/*.pdf",
        silent_errors=True
    )

    documents = loader.load()

    if not documents:
        # With per-file errors silenced, an empty result is the only visible
        # symptom of a wrong path or unreadable files; surface it explicitly.
        warnings.warn(
            f"No PDF documents were loaded from {docs_folder!r}.",
            stacklevel=2,
        )

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Ask the splitter to record where each chunk starts in its source text.
        add_start_index=True,
    )

    return text_splitter.split_documents(documents)

In [10]:
# Build the chunk list from the default PDF folder (spelled out for visibility).
chunks = prepare_documents(docs_folder="./pdf_docs")