In [6]:
# A basic example of reading data from a .pdf file with Python,
# using pdf2image to convert it to images, and then using the
# openCV and tesseract libraries to extract the text
# The source data was downloaded from:
# https://files.stlouisfed.org/files/htdocs/publications/page1-econ/2020/12/01/unemployment-insurance-a-tried-and-true-safety-net_SE.pdf

In [22]:
# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB
# !apt install poppler-utils
# !pip install pdf2image
# !apt-get install tesseract-ocr
# !pip install pytesseract
# !pip install opencv-python


# from google.colab import files

Reading package lists... Done
Building dependency tree       
Reading state information... Done
poppler-utils is already the newest version (0.62.0-2ubuntu2.12).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
tesseract-ocr is already the newest version (4.00~git2288-10f4998a-2).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.


In [23]:
# the built-in `operating system` or `os` Python library will let us create
# a new folder in which to store our converted images and output text
import os

# we'll import the `convert_from_path` "chapter" of the `pdf2image` library
from pdf2image import convert_from_path

# the built-in `glob` library offers a handy way to loop through all the files
# of a certain type in a folder, without needing to specify their individual
# file names
import glob
# `cv2` is the actual library name for `openCV`
import cv2

# and of course, we need our Python library for interfacing with the tesseract
# OCR process
import pytesseract

In [24]:
# we'll use the pdf name to name our generated images and text files, too
pdf_name = "SafetyNet"

# our source pdf is just in the same folder as our Python script
pdf_source_file = pdf_name + ".pdf"

# the folder for our converted images and output text shares the pdf's name.
# note that `os.mkdir()` does not return anything, so we store the folder
# name itself in `target_folder` rather than the result of creating it; that
# way the variable is always defined, even when this cell is run a second time
target_folder = pdf_name

# create the folder if it does not already exist; `exist_ok=True` means this
# is safe to re-run and will not raise an error when the folder is already there
os.makedirs(target_folder, exist_ok=True)

In [25]:
# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB
# # Import PyDrive and associated libraries.
# # This only needs to be done once per notebook.
# # Documentation found here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2
# from pydrive.auth import GoogleAuth
# from pydrive.drive import GoogleDrive
# from google.colab import auth
# from oauth2client.client import GoogleCredentials

# # Authenticate and create the PyDrive client.
# # This only needs to be done once per notebook.
# auth.authenticate_user()
# gauth = GoogleAuth()
# gauth.credentials = GoogleCredentials.get_application_default()
# drive = GoogleDrive(gauth)

In [26]:
# # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB
# # Link to data file stored in Drive: https://drive.google.com/file/d/1OndYA6Eb-qoyvbvc8-k4fdiIFdrsJv5F/view?usp=sharing
# file_id = '1OndYA6Eb-qoyvbvc8-k4fdiIFdrsJv5F' # notice where this string comes from in link above

# imported_file = drive.CreateFile({'id': file_id}) # creating an accessible copy of the shared data file
# print(imported_file['title'])  # it should print the title of desired file
# imported_file.GetContentFile(imported_file['title']) # refer to it in this notebook by the same name as it has in Drive

SafetyNet.pdf


In [27]:
# convert every page of the PDF into an image and keep the results in `pages`.
# we give `convert_from_path()` the path to the source file and the desired
# dots per inch (DPI) resolution of the output images. a lower DPI converts
# much faster, but the poorer quality images may yield significantly less
# accurate OCR results. 300 DPI is a standard "print" quality
pages = convert_from_path(pdf_source_file, dpi=300)

In [28]:
# walk through the converted pages one at a time. `enumerate()` hands us a
# running count (starting from 0) alongside each page, and we use that count
# to label the image file for the page
for page_num, page in enumerate(pages):

    # build the path of the new image inside the folder named after the pdf:
    # `.format()` drops the page number into the file name (converting it to
    # text for us), and `os.path.join()` glues the folder and file name together
    filename = os.path.join(pdf_name, "p{}.png".format(page_num))

    # write the page out to disk as a PNG image
    page.save(filename, 'PNG')

In [29]:
# next, go through the images in the folder and extract the text from each one
# note that '*.png' means "any file ending in .png"
# the `glob()` function creates a list of all the filenames in the specified
# folder, which in this case is the same as `pdf_name` - the folder where our
# images are stored
for img_file in glob.glob(os.path.join(pdf_name, '*.png')):

    # we need the image's file name without the folder in front of it or the
    # `.png` on the end (e.g. "p1" from "SafetyNet/p1.png").
    # `os.path.basename()` drops the folder part and `os.path.splitext()`
    # splits off the extension. unlike replacing "/" and splitting on ".",
    # this also works on Windows (where the folder separator is "\") and when
    # the folder name itself contains a period
    text_filename = os.path.splitext(os.path.basename(img_file))[0]

    # use the `cv2` library to interpret our image
    img = cv2.imread(img_file)

    # `cv2.imread()` gives back `None` instead of raising an error when it
    # cannot read a file, so stop here with a clear message if that happens
    if img is None:
        raise IOError("could not read image file: " + img_file)

    # create a new variable to hold the results of using pytesseract's
    # `image_to_string()` function, which will do just that
    converted_text = pytesseract.image_to_string(img)

    # now! create a new, writable file, also in our target folder, that
    # has the same name as the image, but is a `.txt` file.
    # we only do this once the OCR has succeeded, so a failed page does not
    # leave an empty text file behind. the `with` statement closes the file
    # for us, even if writing fails, and `encoding="utf-8"` makes sure any
    # non-English characters in the extracted text can be saved
    output_path = os.path.join(pdf_name, text_filename + ".txt")
    with open(output_path, "w", encoding="utf-8") as output_file:

        # write our extracted text to our output file
        output_file.write(converted_text)

    # # UNCOMMENT BELOW TO USE WITH GOOGLE COLAB
    # files.download(os.path.join(pdf_name,text_filename+".txt"))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>