The previous team used PIL and PyTesseract to parse the images. I decided to use PIL and PyTesseract as well, rather than ImageMagick, because they seem more straightforward than ImageMagick, whose API does not appear to have been updated for a while. I am also trying out OpenCV, as it offers more functionality and customization options for image preprocessing.

In [3]:
# Import libraries
import numpy as np
import pandas as pd
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
from io import StringIO    
from skimage import io
from skimage import transform as tf
from skimage.feature import canny
import matplotlib.pyplot as plt
from IPython.display import display
import spacy
import re
import os
import cv2

# Point pytesseract at the Tesseract executable.
# NOTE(review): this is a machine-specific Windows install path; adjust it when running elsewhere.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Check tesseract version
## Tesseract 5 was used because it is faster and better than tesseract 4
print("Tesseract 5: " + str(pytesseract.get_tesseract_version()))

# Check cv2 version
print("OpenCV: " + str(cv2.__version__))

# Test image path
# The folder and file names carry a leading "/" because the cells below build the
# full path by plain string concatenation: test_directory_path + folder_name + file_name.
test_directory_path = "./test_directory_files"
folder_name = "/1847.33f67330-5daf-0134-9838-00505686a51c"
file_name = "/183.57504747.d4315750-2b8e-0136-1360-47c4533390a8.jpeg"

Tesseract 5: 5.0.0-alpha.20191030
OpenCV: 4.1.1


I considered the concept of Histogram Equalization (https://docs.opencv.org/master/d5/daf/tutorial_py_histogram_equalization.html). However, the results were pretty disastrous. Enhancing the contrast through histogram equalization affected Tesseract's ability to parse through the image and recognize the text. 

In [47]:
# cv2 method as suggested by:
# https://www.freecodecamp.org/news/getting-started-with-tesseract-part-ii-f7f9a0899b3f/

# Reading in image
image_path = test_directory_path + folder_name + file_name
img = cv2.imread(image_path)
# cv2.imread reports an unreadable or missing file by returning None instead of
# raising, so fail here with a clear message rather than inside cv2.resize.
if img is None:
    raise FileNotFoundError("Could not read image: " + image_path)

# Rescale the image
img = cv2.resize(img, None, fx=1.5, fy=1.5, interpolation=cv2.INTER_CUBIC)
# Convert to gray
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Apply dilation and erosion to remove some noise
# NOTE(review): with a 1x1 kernel, dilation and erosion look like no-ops --
# confirm whether a larger kernel was intended.
kernel = np.ones((1, 1), np.uint8)
img = cv2.dilate(img, kernel, iterations=1)
img = cv2.erode(img, kernel, iterations=1)

# Run tesseract
result_opencv_string = pytesseract.image_to_string(img)
## Get bounding box estimates, lines, confidences, and page numbers
## Useful statistics to have but too annoying to print
####print(pytesseract.image_to_boxes(img))
####print(pytesseract.image_to_data(img))
## Save to file
directory = test_directory_path + folder_name
# splitext strips the extension whatever its length (".jpeg", ".jpg", ".png", ...)
output_path = directory + os.path.splitext(file_name)[0] + "_opencv2" + ".txt"
# Explicit UTF-8: OCR output can contain characters outside the platform default encoding
with open(output_path, "w", encoding="utf-8") as f:
    f.write(result_opencv_string)
print("Text file saved")


Text file saved


In [None]:
 """The following chunk was the code used to attempt histogram equalization to improve contrast of the image.
     Histogram equalization was applied after the standard grayscale, dilation, erosion, and scaling transformations.
     However, it did not improve text recognition; in fact, the results were worse."""

# #Look at the histogram
# hist,bins = np.histogram(img.flatten(),256,[0,256])
# cdf = hist.cumsum()
# cdf_normalized = cdf * float(hist.max()) / cdf.max()
# plt.plot(cdf_normalized, color = 'b')
# plt.hist(img.flatten(),256,[0,256], color = 'r')
# plt.xlim([0,256])
# plt.legend(('cdf','histogram'), loc = 'upper left')
# plt.show()

# #Run equalization
# equ = cv2.equalizeHist(img)
# res = np.hstack((img,equ)) #stacking images side-by-side
# plt.imshow(res, cmap = 'gray'), plt.axis("off")
# plt.show()

# #Histogram for equalized
# hist1, bins1 = np.histogram(equ.flatten(),256,[0,256])
# cdf1 = hist1.cumsum()
# cdf1_normalized = cdf1 * float(hist1.max()) / cdf1.max()
# plt.plot(cdf1_normalized, color = 'b')
# plt.hist(equ.flatten(),256,[0,256], color = 'r')
# plt.xlim([0,256])
# plt.legend(('cdf1','histogram'), loc = 'upper left')
# plt.show()

# # Run tesseract
# result_opencv_string = pytesseract.image_to_string(equ)
# # Save to file
# directory = os.path.join(test_directory_path + folder_name)
# with open(directory + file_name[:-5] + "_opencv2" + "equ" + ".txt", "w") as f: 
#     f.write(result_opencv_string)
#     print("Text file saved")

# #What about CLAHE
# # create a CLAHE object

# clahe = cv2.createCLAHE()
# cl1 = clahe.apply(img)
# res2 = np.hstack((img,cl1)) #stacking images side-by-side
# plt.imshow(res2, cmap = 'gray'), plt.axis("off")
# plt.show()

# #Histogram for localized contrast change
# hist2, bins2 = np.histogram(cl1.flatten(),256,[0,256])
# cdf2 = hist2.cumsum()
# cdf2_normalized = cdf2 * float(hist2.max()) / cdf2.max()
# plt.plot(cdf2_normalized, color = 'b')
# plt.hist(cl1.flatten(),256,[0,256], color = 'r')
# plt.xlim([0,256])
# plt.legend(('cdf2','histogram'), loc = 'upper left')
# plt.show()

# # Run tesseract
# result_opencv_string = pytesseract.image_to_string(cl1)

# # Save to file
# directory = os.path.join(test_directory_path + folder_name)
# with open(directory + file_name[:-5] + "_opencv2" + "cl1" + ".txt", "w") as f: 
#     f.write(result_opencv_string)
#     print("Text file saved")

This is the preprocessing done by CUSP sans bounding box detection.

In [48]:
# Preprocess function from previous CUSP team 
# Added in the pivot to return an hOCR object instead

def preprocessImage(test_directory_path):
    """Threshold the test image to black and white with PIL, OCR it, and save the results.

    The image is read from ``test_directory_path + folder_name + file_name``,
    where ``folder_name`` and ``file_name`` are notebook-level globals. Two
    files are written next to the image, named after the image without its
    extension: ``<stem>.hocr`` (hOCR output) and ``<stem>_bw_.txt`` (plain text).

    Args:
        test_directory_path: Root directory that contains the image folders.

    Returns:
        None. Progress messages are printed as each file is saved.
    """
    thresh = 150

    def binarize(pixel):
        # Map every grey level to pure white or pure black around the threshold
        return 255 if pixel > thresh else 0

    # Open inside a context manager so the source file handle is released;
    # convert() produces a new image that stays valid after the file is closed.
    with Image.open(test_directory_path + folder_name + file_name) as image:
        # Uses PIL's Image to convert to greyscale, then to 1-bit black and white
        bw_image = image.convert('L').point(binarize, mode='1')
    print(type(bw_image))

    # NOTE(review): no .enhance(factor) is ever called on these enhancers, so the
    # chain appears to pass the thresholded image through unchanged -- confirm
    # whether real sharpness/contrast/brightness adjustment was intended.
    enhanceImage = ImageEnhance.Sharpness(bw_image)
    contrast = ImageEnhance.Contrast(enhanceImage.image)
    brightness = ImageEnhance.Brightness(contrast.image)
    enhancedImage = brightness.image

    # Recognize text with tesseract for python
    result = pytesseract.image_to_pdf_or_hocr(enhancedImage, lang="eng", extension="hocr")
    result_string = pytesseract.image_to_string(enhancedImage)

    # splitext strips the extension whatever its length (".jpeg", ".jpg", ".png", ...)
    output_base = test_directory_path + folder_name + os.path.splitext(file_name)[0]

    # save hOCR to file
    ## wb NOT w for hOCR (pytesseract returns bytes)
    with open(output_base + ".hocr", "wb") as f:
        f.write(result)
    print("hOCR saved")

    # Write this function's own OCR text (not the OpenCV cell's global result).
    # Explicit UTF-8: OCR output can contain characters outside the platform default encoding.
    with open(output_base + "_bw_" + ".txt", "w", encoding="utf-8") as f:
        f.write(result_string)
    print("Text file saved")

In [10]:
"""This method will not work because PIL cannot work with an np array that is returned by cv2"""
def preprocessImage_cv2(test_directory_path, folder_name, file_name):
    """Preprocess an image with OpenCV, pass it through PIL, OCR it, and save the results.

    The image at ``test_directory_path + folder_name + file_name`` is rescaled,
    converted to greyscale, and dilated/eroded with OpenCV. Because PIL's
    ImageEnhance cannot take the NumPy array that cv2 returns, the array is
    converted to a PIL image with ``Image.fromarray`` before the PIL steps.
    Two files are written next to the image, named after the image without its
    extension: ``<stem>_opencv2.hocr`` and ``<stem>_opencv2.txt``.

    Args:
        test_directory_path: Root directory that contains the image folders.
        folder_name: Folder part of the path, including its leading "/".
        file_name: File part of the path, including its leading "/".

    Returns:
        None. Progress messages and the Tesseract data table are printed.

    Raises:
        FileNotFoundError: If OpenCV cannot read the image.
    """
    # this path set-up is not ideal but it can be tweaked easily later
    image_path = test_directory_path + folder_name + file_name
    image = cv2.imread(image_path)
    # cv2.imread reports an unreadable or missing file by returning None
    if image is None:
        raise FileNotFoundError("Could not read image: " + image_path)

    # Rescale the image
    image = cv2.resize(image, None, fx=1.5, fy=1.5, interpolation=cv2.INTER_CUBIC)
    # Convert to gray
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Apply dilation and erosion to remove some noise
    # NOTE(review): with a 1x1 kernel, dilation and erosion look like no-ops --
    # confirm whether a larger kernel was intended.
    kernel = np.ones((1, 1), np.uint8)
    image = cv2.dilate(image, kernel, iterations=1)
    image = cv2.erode(image, kernel, iterations=1)

    print(type(image))

    # Using PIL to further enhance image: wrap the greyscale array in a PIL image first
    pil_image = Image.fromarray(image)
    # NOTE(review): no .enhance(factor) is ever called on these enhancers, so the
    # chain appears to pass the image through unchanged -- confirm whether real
    # sharpness/contrast/brightness adjustment was intended.
    enhanceImage = ImageEnhance.Sharpness(pil_image)
    contrast = ImageEnhance.Contrast(enhanceImage.image)
    brightness = ImageEnhance.Brightness(contrast.image)
    enhancedImage = brightness.image

    # Recognize text with tesseract for python
    # first returns hocr
    # second returns string
    result_opencv_hocr = pytesseract.image_to_pdf_or_hocr(enhancedImage, lang="eng", extension="hocr")
    result_opencv_string = pytesseract.image_to_string(enhancedImage)

    # Batch processing with a single file containing the list of multiple image file paths
    ###print(pytesseract.image_to_string('images.txt'))

    # Get bounding box estimates, lines, confidences, and page numbers
    print(pytesseract.image_to_data(image))

    # splitext strips the extension whatever its length (".jpeg", ".jpg", ".png", ...)
    output_base = test_directory_path + folder_name + os.path.splitext(file_name)[0] + "_opencv2"

    # save hOCR to file
    ## wb NOT w for hOCR (pytesseract returns bytes)
    with open(output_base + ".hocr", "wb") as f:
        f.write(result_opencv_hocr)
    print("hOCR saved")

    # Explicit UTF-8: OCR output can contain characters outside the platform default encoding
    with open(output_base + ".txt", "w", encoding="utf-8") as f:
        f.write(result_opencv_string)
    print("Text file saved")

In [49]:
# Run the PIL-based pipeline on the test image; it saves the hOCR and text output files.
preprocessImage(test_directory_path)

<class 'PIL.Image.Image'>
hOCR saved
Text file saved


I have gone ahead and compared CUSP's PIL-based method with the OpenCV 4.1.1 method. There is no real difference between the two beyond the occasional character and formatting difference. A more substantial test will require running the hOCR column detection script to see which approach's hOCR output is more likely to yield accurate results.