# Load PDF Files

In [5]:
from langchain_community.document_loaders import (PyPDFLoader)

try:
    # Load the sample PDF; the result is indexed per page below.
    pdf_loader = PyPDFLoader("data/pdf_files/dummy-pdf.pdf")
    pdf_doc = pdf_loader.load()

    print(f"No of pages in file {len(pdf_doc)}")
    if pdf_doc:
        # Inspect only the first entry so the cell output stays short.
        print(f"Doc content: {pdf_doc[0].page_content}")
        print(f"File meta data is {pdf_doc[0].metadata}")
    else:
        # Without this guard pdf_doc[0] raises IndexError, which the handler
        # below would report as an unhelpful "list index out of range".
        print("No pages were extracted from the file")
except Exception as e:
    # Demo cell: report the failure instead of aborting the notebook run.
    print(f"Error in loading file with PyPDFLoader {e}")


No of pages in file 1
Doc content: Dummy PDF download
File meta data is {'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'Acrobat PDFMaker 8.1 for Word', 'creationdate': '2009-08-25T14:35:18+01:00', 'author': 'NewUser', 'moddate': '2009-08-25T14:35:21+01:00', 'company': 'SDL International', 'sourcemodified': 'D:20090825133443', 'title': 'Dummy PDF download', 'source': 'data/pdf_files/dummy-pdf.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}


In [7]:
from langchain_community.document_loaders import PyMuPDFLoader

try:
    # Same sample file as the PyPDFLoader cell, parsed with PyMuPDFLoader.
    path = "data/pdf_files/dummy-pdf.pdf"
    pdfMu_loader = PyMuPDFLoader(path)

    docs = pdfMu_loader.load()

    print(f"Length of doc : {len(docs)}")
    if docs:
        # Show at most the first 100 characters of the first entry.
        print(f"Doc Content : {docs[0].page_content[:100]}")
        print(f"Doc Meta : {docs[0].metadata}")
    else:
        # Without this guard docs[0] raises IndexError, which the handler
        # below would report as an unhelpful "list index out of range".
        print(f"No pages were extracted from {path}")
except Exception as e:
    # Demo cell: report the failure instead of aborting the notebook run.
    print(f"Error is {e}")


Length of doc : 1
Doc Content : Dummy PDF download
Doc Meta : {'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'Acrobat PDFMaker 8.1 for Word', 'creationdate': '2009-08-25T14:35:18+01:00', 'source': 'data/pdf_files/dummy-pdf.pdf', 'file_path': 'data/pdf_files/dummy-pdf.pdf', 'total_pages': 1, 'format': 'PDF 1.4', 'title': 'Dummy PDF download', 'author': 'NewUser', 'subject': '', 'keywords': '', 'moddate': '2009-08-25T14:35:21+01:00', 'trapped': '', 'modDate': "D:20090825143521+01'00'", 'creationDate': "D:20090825143518+01'00'", 'page': 0}


#### PyPDFLoader and PyMuPDFLoader Comparison

PyPDFLoader:

- Simple and reliable
- Good for most PDFs
- Preserves page numbers
- Provides only basic text extraction
- Use when: working with standard text PDFs

PyMuPDFLoader:

- Fast Processing
- Good text extractor
- Image extraction support
- Use when: speed is important

### Handling PDF Challenges

#### PDFs are difficult to parse because

- Text is stored in complex ways, not as simple plain text
- Can have formatting issues
- May contain scanned images (requiring OCR)
- Often have extraction artifacts

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
from typing import List
from langchain_core.documents import Document
class SmartPDFProcessor:
    """Load a PDF, clean each page's text, and split it into chunks with enriched metadata.

    Args:
        chunk_size: Chunk size passed to the text splitter.
        chunk_overlap: Chunk overlap passed to the text splitter.
    """

    # Some PDFs store letter pairs/triples as a single ligature code point
    # (e.g. "office" is extracted as "o\ufb03ce", "flag" as "\ufb02ag").
    # Map every Latin ligature in U+FB00..U+FB06 back to plain letters.
    _LIGATURES = str.maketrans({
        "\ufb00": "ff",
        "\ufb01": "fi",
        "\ufb02": "fl",
        "\ufb03": "ffi",
        "\ufb04": "ffl",
        "\ufb05": "st",  # long s + t
        "\ufb06": "st",
    })

    def __init__(self, chunk_size=1000, chunk_overlap=100):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # _clean_pdf collapses all whitespace to single spaces, so " " is
            # the only natural boundary left. "" is the character-level
            # fallback so a single token longer than chunk_size is still split
            # instead of being emitted as an oversized chunk.
            separators=[" ", ""],
        )

    def process_pdf(self, pdf_path) -> "List[Document]":
        """Load ``pdf_path`` and return its cleaned text as chunked Documents.

        Each chunk carries the loader's page metadata plus:
        ``page`` (1-based index of the page in the loaded result),
        ``total_pages``, ``chunk_method`` and ``char_count`` (length of the
        cleaned page text the chunk came from).

        Args:
            pdf_path: Path of the PDF file, passed to PyPDFLoader.

        Returns:
            The chunks of all pages, in page order. Pages whose cleaned text
            is empty contribute no chunks.
        """
        pdf_docs = PyPDFLoader(pdf_path).load()
        total_pages = len(pdf_docs)

        pdf_chunks: List[Document] = []

        for page_num, page in enumerate(pdf_docs, start=1):
            cleaned_doc = self._clean_pdf(page.page_content)

            # Nothing to chunk on a page with no text (e.g. blank or image-only).
            if not cleaned_doc:
                continue

            chunks = self.text_splitter.create_documents(
                texts=[cleaned_doc],
                metadatas=[{
                    **page.metadata,
                    # Keys below intentionally override same-named loader keys.
                    "page": page_num,
                    "total_pages": total_pages,
                    "chunk_method": "smart_pdf_chunk",
                    "char_count": len(cleaned_doc),
                }],
            )

            pdf_chunks.extend(chunks)

        return pdf_chunks

    def _clean_pdf(self, text):
        """Normalize extracted page text.

        Collapses every run of whitespace (including newlines and tabs) to a
        single space, strips leading/trailing whitespace, and expands ligature
        code points to their plain-letter spelling.

        Args:
            text: Raw text of one page.

        Returns:
            The cleaned text; an empty string if the page had only whitespace.
        """
        collapsed = " ".join(text.split())
        return collapsed.translate(self._LIGATURES)


In [46]:
# Build the processor with its default settings (chunk_size=1000, chunk_overlap=100).
processor = SmartPDFProcessor()

In [51]:
try:
    # Chunk the sample PDF and print every chunk with its metadata.
    smart_chunks = processor.process_pdf("data/pdf_files/dummy-pdf.pdf")
    # Note: the value printed here is the number of chunks, not a size in characters.
    print(f"chunk size is : {len(smart_chunks)}")
    for chunk in smart_chunks:
        print(f"Chunks : {chunk.page_content} \nMeta {chunk.metadata}")
except Exception as e:
    # Demo cell: report the failure instead of aborting the notebook run.
    print(f"Error: {e}")

chunk size is : 1
Chunks : Dummy PDF download 
Meta {'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'Acrobat PDFMaker 8.1 for Word', 'creationdate': '2009-08-25T14:35:18+01:00', 'author': 'NewUser', 'moddate': '2009-08-25T14:35:21+01:00', 'company': 'SDL International', 'sourcemodified': 'D:20090825133443', 'title': 'Dummy PDF download', 'source': 'data/pdf_files/dummy-pdf.pdf', 'total_pages': 1, 'page': 1, 'page_label': '1', 'chunk_method': 'smart_pdf_chunk', 'char_count': 18}
