In [3]:
import os

import fitz

# for OCR using PyTesseract
import cv2                              # pre-processing images
import pytesseract                      # extracting text from images
import numpy as np
import matplotlib.pyplot as plt         # displaying output images

from PIL import Image

# PDF that is rendered to per-page images (convert_to_img) and then OCR'd.
SCANNED_FILE = "../data/motor-insurance-pds-current.pdf"
# Directory that receives the per-page PNG files written by convert_to_img.
OUT_PATH = "../.tmp"

In [67]:
def convert_to_img(src_file, out_path, zoom_x = 2.0, zoom_y = 2.0):
    """Render every page of a document to a PNG file.

    Parameters
    ----------
    src_file : str
        Path of the document to open with ``fitz.open``.
    out_path : str
        Directory that receives the PNG files. It is created (including
        parents) when it does not exist yet.
    zoom_x, zoom_y : float
        Horizontal / vertical scale factors used to build the ``fitz.Matrix``
        that is passed to ``page.get_pixmap``.

    Returns
    -------
    list of str
        Paths of the written PNG files in page-iteration order, each of the
        form ``{out_path}/{doc_name}_page-{page.number}.png``.
    """
    mat = fitz.Matrix(zoom_x, zoom_y)

    # File name without its directory, cut at the first dot.  The rule is kept
    # unchanged so the generated file names stay stable for code that parses
    # them; note that "a.b.pdf" therefore yields the stem "a".
    doc_name = os.path.basename(src_file).split(".")[0]

    gen_files = []
    doc = fitz.open(src_file)
    try:
        # Create the target directory only once the source opened successfully.
        if out_path:
            os.makedirs(out_path, exist_ok=True)

        print(f"Generated pages of {doc_name}: ")
        for page in doc:
            pix = page.get_pixmap(matrix=mat)
            png = f'{out_path}/{doc_name}_page-{page.number}.png'
            print(png)
            gen_files.append(png)
            pix.save(png)
    finally:
        # Release the document handle even if rendering or saving fails.
        doc.close()

    return gen_files

# Render each page of SCANNED_FILE to a PNG under OUT_PATH; the returned list of
# PNG paths is what the OCR loop further down iterates over.
gen_files = convert_to_img(src_file=SCANNED_FILE, out_path=OUT_PATH)

Generated pages of motor-insurance-pds-current: 
../.tmp/motor-insurance-pds-current_page-0.png
../.tmp/motor-insurance-pds-current_page-1.png
../.tmp/motor-insurance-pds-current_page-2.png
../.tmp/motor-insurance-pds-current_page-3.png
../.tmp/motor-insurance-pds-current_page-4.png
../.tmp/motor-insurance-pds-current_page-5.png
../.tmp/motor-insurance-pds-current_page-6.png
../.tmp/motor-insurance-pds-current_page-7.png
../.tmp/motor-insurance-pds-current_page-8.png
../.tmp/motor-insurance-pds-current_page-9.png
../.tmp/motor-insurance-pds-current_page-10.png
../.tmp/motor-insurance-pds-current_page-11.png
../.tmp/motor-insurance-pds-current_page-12.png
../.tmp/motor-insurance-pds-current_page-13.png
../.tmp/motor-insurance-pds-current_page-14.png
../.tmp/motor-insurance-pds-current_page-15.png
../.tmp/motor-insurance-pds-current_page-16.png
../.tmp/motor-insurance-pds-current_page-17.png
../.tmp/motor-insurance-pds-current_page-18.png
../.tmp/motor-insurance-pds-current_page-19.png
.

In [73]:
import re
def clean_text(text):
    """Normalise raw OCR output into plain ASCII text.

    The stages run in this order:

    1. every run of non-ASCII characters becomes one space;
    2. every run of characters other than letters, digits, newline,
       round/square brackets, hyphen and space becomes one space
       (carriage returns are removed here as well);
    3. runs of spaces collapse to a single space;
    4. a lone line break (not adjacent to another one) becomes a space, so
       wrapped lines are joined;
    5. each remaining run of newlines becomes ``" \\n"``.

    Returns the cleaned string.
    """
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', text)
    allowed_only = re.sub(r'[^A-Za-z0-9\n\(\)\[\]\- ]+', ' ', ascii_only)
    single_spaced = re.sub(' +', ' ', allowed_only)
    unwrapped = re.sub('(?<![\r\n])(\r?\n|\r)(?![\r\n])', ' ', single_spaced)
    return re.sub('\n+', ' \n', unwrapped)

def extract_text_page(img_file, text_base_path, tess_lang='lat', tess_args='--oem 3 --psm 1'):
    """OCR one page image block by block and write the cleaned text to disk.

    The image is converted to grayscale, binarised (inverted Otsu threshold)
    and dilated with a 22x22 rectangular kernel so that neighbouring glyphs
    merge into blobs.  The bounding rectangle of each outer blob contour is
    cropped from the original image and passed to
    ``pytesseract.image_to_string``.

    Parameters
    ----------
    img_file : str
        Path of the page image to read with ``cv2.imread``.
    text_base_path : str
        Directory for the output file; created if it does not exist.
    tess_lang : str
        Value forwarded as ``lang`` to pytesseract.
    tess_args : str
        Value forwarded as ``config`` to pytesseract.

    Returns
    -------
    tuple of (str, str)
        ``(cleaned_text, extracted_text)``: the output of ``clean_text`` and
        the raw concatenated OCR text it was derived from.

    Raises
    ------
    FileNotFoundError
        If ``cv2.imread`` cannot load ``img_file``.
    """
    # Strip only the final extension so dotted names such as
    # "report.v2_page-0.png" keep their full stem and do not collide.
    page_name = os.path.splitext(os.path.basename(img_file))[0]

    original_image = cv2.imread(img_file)
    if original_image is None:
        # imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image file: {img_file}")

    gray_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2GRAY)
    _, threshold_image = cv2.threshold(gray_image, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)

    # Dilate so that characters of one text block fuse into a single blob.
    rectangular_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (22, 22))
    dilated_image = cv2.dilate(threshold_image, rectangular_kernel, iterations=3)

    # Only outer contours: a nested contour lies inside its parent's bounding
    # rectangle, so cropping it as well would OCR the same text a second time.
    contours, _ = cv2.findContours(dilated_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    block_texts = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        # Crop the text block from the unprocessed image for OCR.
        cropped = original_image[y:y + h, x:x + w]
        block_texts.append(pytesseract.image_to_string(cropped, lang=tess_lang, config=tess_args))

    # Every block is preceded by " \n", matching the historical raw format.
    extracted_text = "".join(" \n" + block for block in block_texts)
    cleaned_text = clean_text(extracted_text)

    if text_base_path:
        os.makedirs(text_base_path, exist_ok=True)
    # os.path.join avoids a doubled separator when text_base_path ends in "/".
    fname_out = os.path.join(text_base_path, f"{page_name}.txt")
    with open(fname_out, "w", encoding="utf-8") as f:
        f.write(cleaned_text)
    print(f"Written to {fname_out}")

    return cleaned_text, extracted_text

In [74]:
# Directory that receives one text file per page image.
text_base_path = "../data/out/"
# OCR every rendered page.  tess_lang="eng" overrides the function's default
# of 'lat'.  The (cleaned, raw) tuple returned by extract_text_page is not
# kept here; only the text files it writes are used.
for file in gen_files:
    print(f"Extracting {file}")
    extract_text_page(file, text_base_path, tess_lang="eng")



Extracting ../.tmp/motor-insurance-pds-current_page-0.png
Written to ../data/out//motor-insurance-pds-current_page-0.txt
Extracting ../.tmp/motor-insurance-pds-current_page-1.png
Written to ../data/out//motor-insurance-pds-current_page-1.txt
Extracting ../.tmp/motor-insurance-pds-current_page-2.png
Written to ../data/out//motor-insurance-pds-current_page-2.txt
Extracting ../.tmp/motor-insurance-pds-current_page-3.png
Written to ../data/out//motor-insurance-pds-current_page-3.txt
Extracting ../.tmp/motor-insurance-pds-current_page-4.png
Written to ../data/out//motor-insurance-pds-current_page-4.txt
Extracting ../.tmp/motor-insurance-pds-current_page-5.png
Written to ../data/out//motor-insurance-pds-current_page-5.txt
Extracting ../.tmp/motor-insurance-pds-current_page-6.png
Written to ../data/out//motor-insurance-pds-current_page-6.txt
Extracting ../.tmp/motor-insurance-pds-current_page-7.png
Written to ../data/out//motor-insurance-pds-current_page-7.txt
Extracting ../.tmp/motor-insuran