In [None]:
# Sanity check: print the conda installation details so we can confirm the
# notebook is running in the expected environment before importing anything.
!conda info

In [27]:
#import libraries
import os, io
import re
import time
from io import StringIO
from os import listdir
from os.path import isfile, join
# import pdf2image
# from pdf2image import convert_from_path
# from pdf2image import convert_from_bytes

# from PyPDF2 import PdfFileReader

# from IPython.display import display, Image
import pytesseract
# import tesserocr
# import pdftotext

# import pikepdf
# from pikepdf import Pdf, Page
import pdfminer
# import pdfplumber

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams #, LTTextBox, LTTextLine
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser

from PIL import Image

# Checking whether the files are scanned images or true pdfs

In [23]:
def pdf_to_text(file):
    """Extract the text layer of a PDF with pdfminer, print it and return it.

    Args:
        file: Path of the PDF file to read.

    Returns:
        str: Everything pdfminer wrote to the output buffer while processing
        the pages of the document (also printed to stdout).

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    output_string = StringIO()
    # Read the file that was actually requested; a hard-coded sample path used
    # to be opened here, so the `file` argument was silently ignored.
    with open(file, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Each processed page appends its text to output_string via the device.
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

    extracted = output_string.getvalue()
    print(extracted)
    # Returned as well so callers can post-process the text instead of only
    # seeing it on stdout.
    return extracted

In [30]:
# Keep only the entries of the input folder that are regular files;
# anything for which isfile() is false (e.g. sub-directories) is dropped.
onlyfiles = list(
    filter(lambda name: isfile(join('../input/', name)), listdir('../input/'))
)

onlyfiles

['dof.html',
 '.~lock.dof.doc#',
 'dof.docx',
 'dof.pdf',
 'India1.pdf',
 '.gitkeep',
 'India_image.pdf',
 'ElSalvador_watermark.pdf']

In [8]:

def is_image(file):
    """Open a PDF and verify that it allows text extraction.

    Despite its name, this function does not return a boolean: it returns the
    parser when the document is extractable and raises otherwise.

    Args:
        file: Path of the PDF file to check.

    Returns:
        PDFParser: The parser created for the opened file. The file handle is
        intentionally left open on success because the returned parser was
        built on top of it; the caller owns it from then on.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    fp = open(file, 'rb')
    try:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
    except Exception:
        # Nothing is handed back to the caller on failure, so release the
        # file handle here instead of leaking it.
        fp.close()
        raise
    return parser


In [11]:
# Run the extractability check on one sample PDF; on success this prints the
# parser object returned by is_image (it is not a True/False answer).
print(is_image('../input/India1.pdf'))

<PDFParser: <_io.BufferedReader name='../input/India1.pdf'>, bufpos=17>


# Converting pdf to image files and improving quality

In [None]:
def get_image1(file_path, dpi=500):
  """Get images out of the pdf at file_path, one per page.

  Args:
    file_path: Path of the PDF file to convert.
    dpi: Rendering resolution handed to convert_from_path. Defaults to 500,
      the value that used to be hard-coded here.

  Returns:
    Whatever convert_from_path returns for the file (PIL images of each of
    the pdf's pages).
  """
  # NOTE(review): convert_from_path comes from pdf2image, whose import is
  # commented out in the imports cell of this notebook - it has to be
  # re-enabled for this function to run on a fresh kernel.
  return convert_from_path(file_path, dpi)

# Performance tips according to pdf2image: 
# Using an output folder is significantly faster if you are using an SSD. Otherwise i/o usually becomes the bottleneck.
# Using multiple threads can give you some gains but avoid more than 4 as this will cause i/o bottleneck (even on my NVMe SSD!).

In [None]:
# Convert the first PDF to images and display its first page for a visual check.
# NOTE(review): `filepaths` is not defined in any cell visible here - confirm
# it is assigned before this cell runs.
pages = get_image1(filepaths[0])
display(pages[0])

What can we do here to improve image quality? It already seems pretty good!

# Evaluating extraction time from each method and saving text to disk

In [None]:
def export_ocr(text, file, extract, out=out_path):
  """Export ocr output text to a .txt file inside the out directory.

  The output file is named `<base name of file without extension>_<extract>.txt`.

  Args:
    text: The extracted text to write.
    file: Path of the source document; only its base name is used.
    extract: Label of the extraction method, appended to the file name.
    out: Destination directory. The default is the value `out_path` had when
      this function was defined (later changes to `out_path` are not seen).
  """
  # Create the destination folder if needed so the write below cannot fail
  # merely because the directory is missing. An empty `out` means the current
  # directory, which needs no creation.
  if out:
    os.makedirs(out, exist_ok=True)
  filename = f'{os.path.splitext(os.path.basename(file))[0]}_{extract}.txt'
  with open(os.path.join(out, filename), 'w') as the_file:
    the_file.write(text)

def wrap_pagenum(page_text, page_num):
  """ Enclose page_text in a <p> element whose n attribute holds page_num
  """
  opening_tag = f"<p n={page_num}>"
  closing_tag = "</p>"
  # join() only accepts strings, just like the plain concatenation it replaces
  return "".join((opening_tag, page_text, closing_tag))

In [None]:
# pytesseract extraction
# OCR every page of every PDF with lang="spa" and time the whole run
# (the PDF-to-image conversion is included in the measurement).
# perf_counter() is used rather than time(): it is a monotonic, high-resolution
# clock meant for measuring elapsed time, so system clock adjustments during
# the run cannot distort the result.
start_time = time.perf_counter()
for file in filepaths:
  pages = get_image1(file)
  text = ""
  for pageNum, imgBlob in enumerate(pages):  # pageNum starts at 0
    page_text = pytesseract.image_to_string(imgBlob, lang="spa")
    text += wrap_pagenum(page_text, pageNum)
  export_ocr(text, file, "pytesseract")  # write extracted text to disk
print("--- %s seconds ---" % (time.perf_counter() - start_time))

In [None]:
# tesserocr extraction
# NOTE(review): `tesserocr` is only imported in a commented-out line of the
# imports cell - re-enable it before running this cell on a fresh kernel.
# perf_counter() is used rather than time(): it is a monotonic, high-resolution
# clock meant for measuring elapsed time, so system clock adjustments during
# the run cannot distort the result.
start_time = time.perf_counter()
for file in filepaths:
  pages = get_image1(file)
  text = ""
  for pageNum, imgBlob in enumerate(pages):  # pageNum starts at 0
    page_text = tesserocr.image_to_text(imgBlob, lang="spa")
    text += wrap_pagenum(page_text, pageNum)
  export_ocr(text, file, "tesserocr")  # write extracted text to disk
print("--- %s seconds ---" % (time.perf_counter() - start_time))

In [None]:
# tesserocr extraction using the PyTessBaseAPI
# NOTE(review): `tesserocr` is only imported in a commented-out line of the
# imports cell - re-enable it before running this cell on a fresh kernel.
# perf_counter() is used rather than time(): it is a monotonic, high-resolution
# clock meant for measuring elapsed time.
start_time = time.perf_counter()
# The API object is created once and reused for every page of every file
# instead of being re-initialised per file; SetImage() swaps in the next page.
with tesserocr.PyTessBaseAPI(lang="spa") as api:
  for file in filepaths:
    pages = get_image1(file)
    text = ""
    for pageNum, imgBlob in enumerate(pages):  # pageNum starts at 0
      api.SetImage(imgBlob)
      page_text = api.GetUTF8Text()
      text += wrap_pagenum(page_text, pageNum)
    export_ocr(text, file, "tesserocr_pytess")  # write extracted text to disk
print("--- %s seconds ---" % (time.perf_counter() - start_time))

It seems that the pytesseract package provides the fastest extraction, and, looking at the extracted text, there doesn't seem to be any difference in the output across the tested methods.

In [None]:
# comparison between text extracted by the different methods
# os.listdir returns entries in arbitrary order, so sort them to get the same
# listing on every run.
sorted(os.listdir(out_path))

In [None]:
# TODO: perform a more programmatic comparison between extracted texts

# Let's look at the extracted text

In [None]:
# Load one exported text file for inspection (presumably one written by the
# pytesseract cell, judging by the file-name suffix).
# The file handle has its own name so it does not rebind `text`, which the
# extraction cells use for the OCR string.
with open(os.path.join(out_path, 'Decreto_ejecutivo_57_pytesseract.txt')) as text_file:
  extracted_text = text_file.read()
extracted_text

In [None]:
# Replace \x0c (page break) by \n, then replace each of the following with a
# single space:
#   - 1 or more occurrences of \n if preceded by one occurrence of \n, OR
#   - 1 or more occurrences of \s (whitespace) if preceded by one occurrence of \n, OR
#   - one occurrence of \n if it isn't followed by \n
# The pattern is a raw string so that \s reaches the regex engine untouched;
# in a normal string literal \s is an invalid escape sequence and triggers a
# warning. The regex engine itself interprets \n as a newline, so the matches
# are unchanged.
print(re.sub(r"(?<=\n)\n+|(?<=\n)\s+|\n(?!\n)", " ", extracted_text.replace("\x0c", "\n")))