### Load PDF files

In [1]:
from langchain_community.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader,
    UnstructuredPDFLoader,
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
### Method 1: PyPDFLoader
print("PyPDFLoader")
try:
    pypdf_loader = PyPDFLoader("data/pdf_files/sample_pdf.pdf")
    documents = pypdf_loader.load()
    print(documents)
    print("No: of documents loaded = ",len(documents))
    print("First document: ",documents[0].page_content)
except Exception as e:
    print(f"Error: {e}")

PyPDFLoader
[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2026-01-08T21:36:38-06:00', 'author': 'Johnson, Neil', 'moddate': '2026-01-08T21:36:38-06:00', 'source': 'data/pdf_files/sample_pdf.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='Aditya Vemparala Venkata Sesha \n +1 (945) 251-3495| adityavemparalausa@gmail.com| Aditya V.V.S | LinkedIn | My Projects \nEDUCATION \nThe University of Texas at Dallas - Master of Science, Information Technology and Management       May 2025 \nAwards: Nash Leadership, ITM All Rounder, Beta Gamma Sigma Honoree (Top 10% of graduate business students)   GPA 3.888 \nVellore Institute of Technology, Vellore - Bachelor of Technology, Electronics and Communication Engineering     June 2020 \n \nPROFESSIONAL EXPERIENCE \nRadiant Digital                                                               Vienna, VA, USA \nProduct Owner Intern                 

In [6]:
# Method 2: PyMuPDFLoader (fast and accurate)
print("PyMuPDFLoader")
try:
    pymupdf_loader = PyMuPDFLoader("data/pdf_files/sample_pdf.pdf")
    documents = pymupdf_loader.load()
    print(documents)
    print("No: of documents loaded = ",len(documents))
    print("First document: ",documents[0].page_content)
except Exception as e:
    print(f"Error: {e}")

PyMuPDFLoader
[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2026-01-08T21:36:38-06:00', 'source': 'data/pdf_files/sample_pdf.pdf', 'file_path': 'data/pdf_files/sample_pdf.pdf', 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'Johnson, Neil', 'subject': '', 'keywords': '', 'moddate': '2026-01-08T21:36:38-06:00', 'trapped': '', 'modDate': "D:20260108213638-06'00'", 'creationDate': "D:20260108213638-06'00'", 'page': 0}, page_content='Aditya Vemparala Venkata Sesha \n +1 (945) 251-3495| adityavemparalausa@gmail.com| Aditya V.V.S | LinkedIn | My Projects \nEDUCATION \nThe University of Texas at Dallas - Master of Science, Information Technology and Management  \n \n \n  \n May 2025 \nAwards: Nash Leadership, ITM All Rounder, Beta Gamma Sigma Honoree (Top 10% of graduate business students)  \n \nGPA 3.888 \nVellore Institute of Technology, Vellore - Bachelor of Technology, Electronics and Co

In [9]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter, TokenTextSplitter
from langchain_core.documents import Document
from typing import List

class smartPDFProcessor:
    def __init__(self, chunk_size=1000, chunk_overlap=100):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            separators=[" "]
        )
    def process_pdf(self,pdf_path:str) -> list[Document]:
        # Load the PDF file
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()

        # Process each page
        processed_chunks = []
        for pageno,page in enumerate(pages):
            print(f"Processing page {pageno+1}\nPage metadata: {page.metadata}")
            ## Clean text
            cleaned_text = self.clean_text(page.page_content) #clean_text() is a method to clean the text
            if len(cleaned_text) < 10:
                continue
            # Create chunks with enhanced metadata
            chunks = self.text_splitter.create_documents(
                texts=[cleaned_text],
                metadatas=[{
                    **page.metadata,
                    "page":pageno + 1,
                    "totalpages":len(pages),
                    "chunk_method":"smart_pdf_processor",
                    "char_count":len(cleaned_text)
                }]
            )
            processed_chunks.extend(chunks)
        return processed_chunks
    
    def clean_text(self,text:str) -> str:
        # Remove extra whitespace
        text = " ".join(text.split())

        # Fix common PDF extraction issues
        text = text.replace("ﬁ","fi")
        text = text.replace("ﬂ","fl")
        return text





In [10]:
processor = smartPDFProcessor()

In [11]:
processor

<__main__.smartPDFProcessor at 0x25bcb80a300>

In [20]:
## Process a PDF file if available

try:
    smart_chunks = processor.process_pdf("data/pdf_files/sample_pdf.pdf")
    print(f"Processed {len(smart_chunks)} chunks")
    if smart_chunks:
        for chunkno,chunk in enumerate(smart_chunks):
            print(f"Chunk {chunkno+1}:\n{chunk.page_content}")
        for i in range(len(smart_chunks)):
            print(f"Chunk {i+1} metadata:")
            for key,value in smart_chunks[i].metadata.items():
                print(f"{key}: {value}")
except Exception as e:
    print(f"Processing error: {e}")
    

Processing page 1
Page metadata: {'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2026-01-08T21:36:38-06:00', 'author': 'Johnson, Neil', 'moddate': '2026-01-08T21:36:38-06:00', 'source': 'data/pdf_files/sample_pdf.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}
Processed 7 chunks
Chunk 1:
Aditya Vemparala Venkata Sesha +1 (945) 251-3495| adityavemparalausa@gmail.com| Aditya V.V.S | LinkedIn | My Projects EDUCATION The University of Texas at Dallas - Master of Science, Information Technology and Management May 2025 Awards: Nash Leadership, ITM All Rounder, Beta Gamma Sigma Honoree (Top 10% of graduate business students) GPA 3.888 Vellore Institute of Technology, Vellore - Bachelor of Technology, Electronics and Communication Engineering June 2020 PROFESSIONAL EXPERIENCE Radiant Digital Vienna, VA, USA Product Owner Intern August 2025 – Present • Spearheaded the development and migration of a Social Management App in