In this project we learn how to extract and parse PDF content using Python

### 1-Import tables

In [1]:
import tabula

In [5]:
tables = tabula.read_pdf("Sample_PDF_file.pdf", pages="all")
df = tables[0]
df

Unnamed: 0,EmployeeID,FirstName,LastName,Age,Gender
0,1001,Jim,Halpert,30.0,Male
1,1002,Pam,Beasley,30.0,Female
2,1003,Dwight,Schrute,29.0,Male
3,1004,Angela,Martin,31.0,Female
4,1005,Toby,Flenderson,32.0,Male
5,1006,Michael,Scott,35.0,Male
6,1007,Meredith,Palmer,32.0,Female
7,1008,Stanley,Hudson,38.0,Male
8,1009,Kevin,Malone,31.0,Male
9,1010,Tony,Stark,40.0,Male


### 2-Import text and regular expressions

In [8]:
# use pdfminer.six library
import re
from pdfminer.high_level import extract_pages, extract_text

In [9]:
for page_layout in extract_pages("Sample_PDF_file.pdf"):
    for element in page_layout:
        print(element)


<LTTextBoxHorizontal(0) 261.460,757.540,336.455,768.580 'Sample PDF File \n'>
<LTTextBoxHorizontal(1) 56.664,734.980,286.055,746.020 'This is an ordinary PDF File with some information. \n'>
<LTTextBoxHorizontal(2) 56.664,712.390,291.575,723.430 'Here are five names: John, Tom, Leila, Robert, Lucas \n'>
<LTTextBoxHorizontal(3) 56.664,690.070,274.995,701.110 'Here are six numbers: 6, 200, 360, 230, 420, 100 \n'>
<LTTextBoxHorizontal(4) 56.664,667.510,176.805,678.550 'Here is a table full of data: \n'>
<LTTextBoxHorizontal(5) 62.424,463.700,118.705,655.750 'EmployeeID \n1001 \n1002 \n1003 \n1004 \n1005 \n1006 \n1007 \n1008 \n1009 \n1010 \n1011 \n1012 \n1013 \n'>
<LTTextBoxHorizontal(6) 158.710,463.700,207.285,655.750 'FirstName \nJim \nPam \nDwight \nAngela \nToby \nMichael \nMeredith \nStanley \nKevin \nTony \nRyan \nHolly \nDarryl \n'>
<LTTextBoxHorizontal(7) 254.980,463.700,307.175,655.750 'LastName \nHalpert \nBeasley \nSchrute \nMartin \nFlenderson \nScott \nPalmer \nHudson \nMalone

In [10]:
# Full text

text = extract_text("Sample_PDF_file.pdf")
print(text)

Sample PDF File 

This is an ordinary PDF File with some information. 

Here are five names: John, Tom, Leila, Robert, Lucas 

Here are six numbers: 6, 200, 360, 230, 420, 100 

Here is a table full of data: 

EmployeeID 
1001 
1002 
1003 
1004 
1005 
1006 
1007 
1008 
1009 
1010 
1011 
1012 
1013 

FirstName 
Jim 
Pam 
Dwight 
Angela 
Toby 
Michael 
Meredith 
Stanley 
Kevin 
Tony 
Ryan 
Holly 
Darryl 

LastName 
Halpert 
Beasley 
Schrute 
Martin 
Flenderson 
Scott 
Palmer 
Hudson 
Malone 
Stark 
Howard 
Flax 
Philbin 

Age 
30 
30 
29 
31 
32 
35 
32 
38 
31 
40 
26 
31 
NULL 

Gender 
Male 
Female 
Male 
Female 
Male 
Male 
Female 
Male 
Male 
Male 
Male 
Female 
Male 

 
 



In [11]:
# Regular Expressions

pattern = re.compile(r"[a-zA-Z]+,{1}\s{1}") # read L/Ucase words followed by 1 comma and 1 space
matches = pattern.findall(text)
print(matches)

['John, ', 'Tom, ', 'Leila, ', 'Robert, ']


In [13]:
names = [n[:-2] for n in matches]
print(names)

['John', 'Tom', 'Leila', 'Robert']


### 3-Import images

In [17]:
import fitz # PyMuPDF
import PIL.Image # pillow
import io

In [22]:
pdf = fitz.open("Sample_PDF_file.pdf")
counter = 1 # it extracts 1 image
for i in range(len(pdf)):
    page = pdf[i]
    images = page.get_images()
    for image in images:
        base_img = pdf.extract_image(image[0])
        image_data = base_img["image"]
        img = PIL.Image.open(io.BytesIO(image_data))
        extension = base_img["ext"]
        img.save(open(f"image{counter}.{extension}", "wb"))
        counter += 1 # it extrats if we have more images