In [40]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.response_synthesizers import CompactAndRefine
from llama_index.core.postprocessor.llm_rerank import LLMRerank
from llama_index.core.workflow import (
    Context,
    Workflow,
    StartEvent,
    StopEvent,
    step,
)

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding


In [41]:
from pydantic import BaseModel
from llama_index.core import Document

class DocumentMetadata(BaseModel):
    """Pydantic schema for the per-chunk metadata: source file name and page number."""

    # Name of the source file (a str).
    filename: str
    # Page number within that file (an int).
    page: int


In [77]:
import os
import pdfplumber
from llama_index.core.node_parser import SentenceSplitter
from pydantic import BaseModel
from llama_index.core import Document

# NOTE(review): this repeats the DocumentMetadata definition from the earlier
# cell (In [41]) field for field; running this cell rebinds the name.
class DocumentMetadata(BaseModel):
    """Pydantic schema for the metadata attached to every Document built below."""

    # Name of the PDF file; process_pdfs_in_directory passes the bare file name.
    filename: str
    # Page number; process_pdfs_in_directory passes a 1-based value (page_num + 1).
    page: int

def process_pdfs_in_directory(directory, chunk_size=512, chunk_overlap=30):
    """Extract the text of every PDF in ``directory`` as chunk-level Documents.

    Each page that yields text is split with a ``SentenceSplitter`` and every
    resulting chunk becomes one ``Document``. The source ``filename`` and the
    1-based ``page`` number are attached as metadata, and both keys are listed
    in ``excluded_llm_metadata_keys`` and ``excluded_embed_metadata_keys``.

    Args:
        directory: Directory to scan. Only its direct entries are considered.
        chunk_size: Forwarded to ``SentenceSplitter(chunk_size=...)``.
        chunk_overlap: Forwarded to ``SentenceSplitter(chunk_overlap=...)``.

    Returns:
        A list of ``Document`` objects ordered by file name, then page, then
        chunk position. Pages without extractable text contribute nothing.
    """
    splitter = SentenceSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    excluded_keys = ("filename", "page")

    documents = []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # resulting document order reproducible across runs and machines.
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        # Accept ".PDF" as well as ".pdf", and skip directories that merely
        # carry a PDF-looking name.
        if not filename.lower().endswith(".pdf") or not os.path.isfile(filepath):
            continue

        with pdfplumber.open(filepath) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                text = page.extract_text()
                if not text:
                    # Nothing extractable on this page (e.g. image-only).
                    continue

                metadata = DocumentMetadata(filename=filename, page=page_number)
                # pydantic v2 deprecates .dict() in favour of .model_dump();
                # fall back to .dict() so pydantic v1 installs keep working.
                if hasattr(metadata, "model_dump"):
                    metadata_dict = metadata.model_dump()
                else:
                    metadata_dict = metadata.dict()

                documents.extend(
                    Document(
                        text=chunk,
                        # Fresh dict/lists per Document so no two chunks are
                        # handed the same mutable objects.
                        metadata=dict(metadata_dict),
                        excluded_llm_metadata_keys=list(excluded_keys),
                        excluded_embed_metadata_keys=list(excluded_keys),
                    )
                    for chunk in splitter.split_text(text)
                )

    return documents

# Usage example:
# The path is relative, so it resolves against the kernel's working directory.
directory_path = "../data/"
all_documents = process_pdfs_in_directory(directory_path)

# The count below is the number of chunk-level Document objects (one per
# splitter chunk per page), not the number of PDF files.
print(f"Processed {len(all_documents)} documents from PDFs.")

Processed 194 documents from PDFs.


In [57]:
class PdfIngestionWorkflow(Workflow):
    """Workflow with a single step that indexes the contents of a directory."""

    @step(pass_context=True)
    async def ingest(self, ctx: Context, ev: StartEvent) -> StopEvent | None:
        """Build a vector index from the directory named by the start event.

        Reads ``dirname`` from ``ev``. When it is missing or empty the step
        returns ``None``. Otherwise the directory is loaded with
        ``SimpleDirectoryReader``, indexed with ``VectorStoreIndex`` using the
        ``text-embedding-3-small`` OpenAI embedding model, the index is stored
        under ``ctx.data["index"]``, and a ``StopEvent`` carrying a summary
        string is returned.
        """
        source_dir = ev.get("dirname")
        if source_dir:
            loaded_docs = SimpleDirectoryReader(source_dir).load_data()
            embedder = OpenAIEmbedding(model_name="text-embedding-3-small")
            vector_index = VectorStoreIndex.from_documents(
                documents=loaded_docs,
                embed_model=embedder,
            )
            # Stored on the context; the StopEvent itself carries only a summary.
            ctx.data["index"] = vector_index
            return StopEvent(result=f"Indexed {len(loaded_docs)} documents.")

        # No directory supplied: this step emits no event.
        return None