### Extract Text from a PDF

In [4]:
from PyPDF2 import PdfReader

# Open the sample document and take its first page (zero-based index 0).
reader = PdfReader("PDFs/example.pdf")
page = reader.pages[0]

# Extract the page text once, then print it as plain text.
page_text = page.extract_text()
print(page_text)

 S a m p l e   D o c u m e n t   f o r   P D F   P l a c e h o l d e r   C o n t e n t
 T h i s   i s   a   p a r a g r a p h .   I t   e x i s t s   h e r e   t o   a c t   a s   p l a c e h o l d e r   c o n t e n t .
 R a n d o m   P a r a g r a p h
 H e r e   i s   a n o t h e r   r a n d o m   p a r a g r a p h   t h a t   s e r v e s   a s   f i l l e r   t e x t .
 B u l l e t   L i s t   E x a m p l e :
 "   T h i s   i s   t h e   f i r s t   i t e m   i n   a   l i s t .
 "   T h i s   i s   t h e   s e c o n d   i t e m   i n   t h e   l i s t .
 "   T h i s   i s   t h e   t h i r d   i t e m   i n   t h e   l i s t .
 N u m b e r e d   L i s t   E x a m p l e :
 1 .   F i r s t   s t e p   o f   t h e   p r o c e s s .
 2 .   S e c o n d   s t e p   o f   t h e   p r o c e s s .
 3 .   F i n a l   s t e p   o f   t h e   p r o c e s s .
 S a m p l e   T a b l e
ID
Name
Age
Country
01
John Doe
29
USA
02
Jane Smith
34
Canada
03
Alex Kim
41
South Korea
04
Maria Silva
25
Brazi



#### Ignoring Headers and Footers

The following example reads the text of page 4 of this PDF document, but ignores the header (y ≥ 720) and the footer (y ≤ 50); only text with 50 < y < 720 is kept.

In [12]:
from PyPDF2 import PdfReader

# Page 4 of the document is at zero-based index 3.
pdf_path = "PDFs/GeoBase_NHNC1_Data_Model_UML_EN.pdf"
reader = PdfReader(pdf_path)
page = reader.pages[3]

# Text fragments accepted by visitor_body, in the order they were received.
parts = []

# Fragments are kept only when their y value lies strictly between these bounds.
FOOTER_Y_LIMIT = 50
HEADER_Y_LIMIT = 720


def visitor_body(text, cm, tm, fontDict, fontSize):
    """Append ``text`` to ``parts`` when it lies between the footer and the header.

    The vertical position is read from ``tm[5]``. The fragment is kept only if
    that value is strictly greater than 50 and strictly less than 720.
    ``cm``, ``fontDict`` and ``fontSize`` are accepted but not used.
    """
    vertical_pos = tm[5]
    if FOOTER_Y_LIMIT < vertical_pos < HEADER_Y_LIMIT:
        parts.append(text)


# extract_text is called with visitor_body as its visitor_text callback;
# the string it returns is not used here, only what ended up in `parts`.
page.extract_text(visitor_text=visitor_body)

# Join the kept fragments with no separator and show the result.
text_body = "".join(parts)
print(text_body)

TABLE OF CONTENTS  
 
1 OVERVIEW  ................................ ................................ ................................ ................................ ............  6 
2 LRS ................................ ................................ ................................ ................................ ........................  6 
2.1 LRS  MODEL  ................................ ................................ ................................ ................................ ...... 7 
3 MODEL  ................................ ................................ ................................ ................................ ..................  8 
3.1 LRS  MODEL  ................................ ................................ ................................ ................................ ...... 9 
3.1.1 Logical view  ................................ ................................ ................................ ...............................  9 
3.1.2 Hydro net

In [18]:
from PyPDF2 import PdfReader
import svgwrite

# Third page of the document (zero-based index 2).
pdf_path = "PDFs/GeoBase_NHNC1_Data_Model_UML_EN.pdf"
reader = PdfReader(pdf_path)
page = reader.pages[2]

# Drawing object that the visitor callbacks add elements to.
svg_path = "Images/GeoBase_test.svg"
dwg = svgwrite.Drawing(svg_path, profile="tiny")


def visitor_svg_rect(op, args, cm, tm):
    """Add a rectangle to ``dwg`` whenever the operator is ``b"re"``.

    Any other operator is ignored. The first four operands are converted with
    ``as_numeric()`` and used as x, y, width and height of a rectangle drawn
    with a red stroke and a fill opacity of 0.05. ``cm`` and ``tm`` are unused.
    """
    if op != b"re":
        return
    origin_x, origin_y, width, height = (
        args[idx].as_numeric() for idx in range(4)
    )
    outline = dwg.rect(
        (origin_x, origin_y), (width, height), stroke="red", fill_opacity=0.05
    )
    dwg.add(outline)


def visitor_svg_text(text, cm, tm, fontDict, fontSize):
    """Add ``text`` to ``dwg`` in blue, inserted at ``(tm[4], tm[5])``.

    ``cm``, ``fontDict`` and ``fontSize`` are accepted but not used.
    """
    anchor = (tm[4], tm[5])
    label = dwg.text(text, insert=anchor, fill="blue")
    dwg.add(label)


# Run the extraction with both callbacks attached; the returned string is not
# needed here, only the elements the callbacks add to the drawing.
svg_visitors = {
    "visitor_operand_before": visitor_svg_rect,
    "visitor_text": visitor_svg_text,
}
page.extract_text(**svg_visitors)

# Persist the drawing.
dwg.save()

#### Extracting Metadata

In [21]:
# Read the document through an explicitly opened binary file handle;
# everything that touches the reader stays inside the `with` block.
with open("PDFs/example.pdf", "rb") as pdf_file:
    pdf_reader = PdfReader(pdf_file)

    # Page count
    num_pages = len(pdf_reader.pages)
    print(f"The PDF has {num_pages} pages.")

    # Document metadata as exposed by the reader
    metadata = pdf_reader.metadata
    print(metadata)

The PDF has 2 pages.
{'/Author': '(anonymous)', '/CreationDate': "D:20250816135601+00'00'", '/Creator': '(unspecified)', '/Keywords': '', '/ModDate': "D:20250816135601+00'00'", '/Producer': 'ReportLab PDF Library - www.reportlab.com', '/Subject': '(unspecified)', '/Title': '(anonymous)', '/Trapped': '/False'}


#### Extracting Images

In [36]:
from PyPDF2 import PdfReader

reader = PdfReader("PDFs/DA 2009 - Week 1.pdf")
page = reader.pages[2]

# Look at the first image found on the third page.
first_image = page.images[0]
first_image.name  # evaluated but not shown: the cell only echoes its last expression
first_image.data

b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00`\x00`\x00\x00\xff\xdb\x00C\x00\x08\x06\x06\x07\x06\x05\x08\x07\x07\x07\t\t\x08\n\x0c\x14\r\x0c\x0b\x0b\x0c\x19\x12\x13\x0f\x14\x1d\x1a\x1f\x1e\x1d\x1a\x1c\x1c $.\' ",#\x1c\x1c(7),01444\x1f\'9=82<.342\xff\xdb\x00C\x01\t\t\t\x0c\x0b\x0c\x18\r\r\x182!\x1c!22222222222222222222222222222222222222222222222222\xff\xc0\x00\x11\x08\x01\xb5\x01\xb5\x03\x01"\x00\x02\x11\x01\x03\x11\x01\xff\xc4\x00\x1f\x00\x00\x01\x05\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\xff\xc4\x00\xb5\x10\x00\x02\x01\x03\x03\x02\x04\x03\x05\x05\x04\x04\x00\x00\x01}\x01\x02\x03\x00\x04\x11\x05\x12!1A\x06\x13Qa\x07"q\x142\x81\x91\xa1\x08#B\xb1\xc1\x15R\xd1\xf0$3br\x82\t\n\x16\x17\x18\x19\x1a%&\'()*456789:CDEFGHIJSTUVWXYZcdefghijstuvwxyz\x83\x84\x85\x86\x87\x88\x89\x8a\x92\x93\x94\x95\x96\x97\x98\x99\x9a\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xd2\xd3\xd4\xd

In [None]:
from pathlib import Path

from PyPDF2 import PdfReader

reader = PdfReader("PDFs/DA 2009 - Week 1.pdf")

page = reader.pages[2]

# Create the target folder up front: opening a file for writing does not
# create missing parent directories, so without this the cell fails with
# FileNotFoundError on a fresh checkout. Re-running is harmless (exist_ok).
output_dir = Path("Outputs")
output_dir.mkdir(parents=True, exist_ok=True)

# Number of images written so far.
count = 0
for image_file_object in page.images:
    # Each image is stored under the name PyPDF2 reports for it.
    save_path = output_dir / image_file_object.name
    save_path.write_bytes(image_file_object.data)
    count += 1
    print(f"Saved the file {image_file_object.name} in the Outputs folder")

Saved the file Image60.jpg in the Outputs folder


#### Encrypting & Decrypting Files

Encrypting

In [None]:
from PyPDF2 import PdfReader, PdfWriter

reader = PdfReader("PDFs/example.pdf")
writer = PdfWriter()

# Copy every page of the source document into the writer
for page in reader.pages:
    writer.add_page(page)

# Protect the copy with a password
# NOTE(review): the password is a literal in the notebook; acceptable for a
# demo, but do not reuse this pattern for real documents.
password = "my-secret-password"
writer.encrypt(password)

# Write the encrypted copy to disk
with open("encrypted-pdf.pdf", "wb") as encrypted_file:
    writer.write(encrypted_file)

Decrypting

In [None]:
from PyPDF2 import PdfReader, PdfWriter

reader = PdfReader("encrypted-pdf.pdf")
writer = PdfWriter()

# Unlock the document only when the reader reports it as encrypted
password = "my-secret-password"
if reader.is_encrypted:
    reader.decrypt(password)

# Copy every page into the writer
for page in reader.pages:
    writer.add_page(page)

# Write the copy to disk; the writer is never asked to encrypt it
with open("decrypted-pdf.pdf", "wb") as decrypted_file:
    writer.write(decrypted_file)

#### Merging PDFs

In [None]:
from PyPDF2 import PdfWriter

# Input documents, appended in the order listed.
source_pdfs = ["file1.pdf", "file2.pdf", "file3.pdf"]

merger = PdfWriter()
for pdf in source_pdfs:
    merger.append(pdf)

# Write the combined document, then release the writer.
merger.write("merged-pdf.pdf")
merger.close()