<a href="https://colab.research.google.com/github/sourcecode369/100-days-of-ml-code/blob/master/OCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install imaging libraries; Pillow provides the `PIL` package imported in the imports cell.
# NOTE(review): `wand` is not imported anywhere in the visible notebook — confirm it is still needed.
!pip install pillow
!pip install wand

Collecting wand
[?25l  Downloading https://files.pythonhosted.org/packages/2e/06/5845baa21b3190248d571400cdeeb2de84f3c8757b2046c5ee1eadf59d7a/Wand-0.5.9-py2.py3-none-any.whl (129kB)
[K     |████████████████████████████████| 133kB 7.0MB/s 
[?25hInstalling collected packages: wand
Successfully installed wand-0.5.9


In [0]:
# Install the Tesseract OCR system package and the Python packages used by the
# rest of the notebook. Only opencv-python and XlsxWriter are version-pinned.
# NOTE(review): `!` shell escapes normally report failure through their exit
# status rather than by raising, so the `except` branch is likely never reached
# and `{ex}` would not name a package — confirm before relying on it.
try:
  !sudo apt install tesseract-ocr
  !pip install pytesseract
  !pip install numpy 
  !pip install openpyxl
  !pip install pandas 
  !pip install pdf2image
  !pip install opencv-python==4.0.0.21
  !pip install XlsxWriter==1.1.2
except Exception as ex:
  print(f'Cannot install {ex} dependency.')

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 25 not upgraded.
Need to get 4,795 kB of archives.
After this operation, 15.8 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-eng all 4.00~git24-0e00fe6-1.2 [1,588 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-osd all 4.00~git24-0e00fe6-1.2 [2,989 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr amd64 4.00~git2288-10f4998a-2 [218 kB]
Fetched 4,795 kB in 0s (28.5 MB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5

In [0]:
class Table:
    """Bounding box of a detected table plus the grid joints found inside it.

    Attributes:
        x, y: Coordinates of the bounding box origin.
        w, h: Width and height of the bounding box.
        joints: ``None`` until ``set_joints`` is called; afterwards a list of
            rows, each row holding the joints that share one y-coordinate.
    """

    def __init__(self, x, y, w, h):
        self.x = x
        self.y = y
        self.w = w
        self.h = h
        self.joints = None

    def __str__(self):
        """Return the bounding box as ``(x: .., y: .., w: .., h: ..)``."""
        # Each label is paired with the attribute of the same name.
        return "(x: %d, y: %d, w: %d, h: %d)" % (self.x, self.y, self.w, self.h)

    def set_joints(self, joints):
        """Group ``joints`` into rows of equal y-coordinate and store them.

        Args:
            joints: Sequence of ``(x, y)`` points, assumed to be sorted in
                ascending order (by y, then by x). May be empty, in which
                case no rows are stored.

        Raises:
            ValueError: If joints have already been set on this table.
        """
        if self.joints is not None:
            raise ValueError("Invalid setting of table joints array.")

        self.joints = []
        # len() is used instead of truthiness so array-like inputs work too.
        if len(joints) == 0:
            return

        row_y = joints[0][1]
        row = []
        for joint in joints:
            # A joint with a new y-coordinate closes the current row.
            if joint[1] != row_y:
                self.joints.append(row)
                row_y = joint[1]
                row = []
            row.append(joint)
        self.joints.append(row)

    def print_joints(self):
        """Print the stored joints, one row per line."""
        if self.joints is None:
            print("Joint coordinates not found.")
            return

        print("[")
        for row in self.joints:
            print("\t" + str(row))
        print("]")

    def get_table_entries(self):
        """Compute the cell bounds between every pair of adjacent joint rows.

        Returns:
            A list with one element per pair of adjacent rows, each element
            being a list of ``[x, y, w, h]`` cell bounds. Returns ``None``
            (after printing a message) when no joints have been set.
        """
        if self.joints is None:
            print("Joint coordinates not found.")
            return None

        return [
            self.get_entry_bounds_in_row(upper, lower)
            for upper, lower in zip(self.joints, self.joints[1:])
        ]

    def get_entry_bounds_in_row(self, joints_A, joints_B):
        """Compute the cell bounds delimited by two rows of joints.

        Args:
            joints_A: One row of joints (all sharing a y-coordinate).
            joints_B: The adjacent row of joints.

        Returns:
            A list of ``[x, y, w, h]`` entries, one per pair of neighbouring
            joints in the shorter of the two rows.
        """
        # The rows may not have the same number of points, so the row with
        # fewer points defines the horizontal bounds.
        if len(joints_A) <= len(joints_B):
            defining_bounds = joints_A
            helper_bounds = joints_B
        else:
            defining_bounds = joints_B
            helper_bounds = joints_A

        # Every joint in helper_bounds has the same y-coordinate.
        helper_y = helper_bounds[0][1]

        row_entries = []
        for left, right in zip(defining_bounds, defining_bounds[1:]):
            x = left[0]
            y = left[1]
            w = right[0] - x
            h = helper_y - y

            # A negative height means the defining row is the lower one:
            # make the height positive and start from the row above.
            if h < 0:
                h = -h
                y = y - h

            row_entries.append([x, y, w, h])

        return row_entries

In [0]:
from __future__ import absolute_import, print_function, division, unicode_literals
from builtins import range, input

In [0]:
try:
  import cv2 as cv
  import pytesseract as tess
  from PIL import Image
  import subprocess as s
  import os
  from pdf2image import convert_from_path
  import sys
  import xlsxwriter
  import warnings
  warnings.simplefilter("ignore")
  import gc
  gc.enable()
  import numpy as np
  np.set_printoptions(precision=2)
except Exception as ex:
  raise ex

In [0]:
def isolate_lines(src, structuring_element):
    """Erode and then dilate ``src`` in place with ``structuring_element``.

    Both operations write their result back into ``src``; nothing is
    returned.
    """
    anchor = (-1, -1)
    for morph_op in (cv.erode, cv.dilate):
        morph_op(src, structuring_element, src, anchor)
 
# Contours whose area is below this value are rejected by verify_table.
MIN_TABLE_AREA = 50 
# Passed to cv.approxPolyDP as its epsilon argument in verify_table.
EPSILON = 3

In [0]:
def verify_table(contour, intersections):
    """Decide whether ``contour`` outlines a table.

    Returns ``(rect, joints)``: the bounding rectangle of the simplified
    contour and the contours found inside that rectangle of
    ``intersections``. Returns ``(None, None)`` when the contour area is
    below ``MIN_TABLE_AREA`` or fewer than five joint contours are found.
    """
    if cv.contourArea(contour) < MIN_TABLE_AREA:
        return (None, None)

    simplified = cv.approxPolyDP(contour, EPSILON, True)
    rect = cv.boundingRect(simplified)
    left, top, width, height = rect

    # Look for joints only inside the candidate's bounding rectangle.
    region = intersections[top:top + height, left:left + width]
    (joints, _) = cv.findContours(region, cv.RETR_CCOMP, cv.CHAIN_APPROX_SIMPLE)

    if len(joints) < 5:
        return (None, None)
    return rect, joints

In [0]:
def mkdir(path):
    """Create the directory ``path`` (and any missing parents) if needed.

    ``exist_ok=True`` makes the call safe to repeat and avoids the race
    between checking for the directory and creating it.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

In [0]:
def showImg(name, matrix, durationMillis=0):
  """Display ``matrix`` via ``cv.imshow`` under ``name``, then call ``cv.waitKey``.

  ``durationMillis`` is forwarded unchanged to ``cv.waitKey``.
  NOTE(review): not called anywhere in the visible notebook, and GUI windows
  may be unavailable on hosted runtimes — confirm before using.
  """
  cv.imshow(name, matrix)
  cv.waitKey(durationMillis)

In [0]:
'''
Using the text cleaner script to clean the image
'''
# upload text cleaner and pdf or jpg file
# The recorded output of this cell shows two uploads: `textcleaner` and `Citi_JIRASniping.jpg`.
from google.colab import files;
files.upload()
# Move the uploaded image into data/, which is where the `path` variable points.
# NOTE(review): `mkdir data` reports an error if the directory already exists (e.g. on re-run) — consider `mkdir -p`.
!mkdir data
!mv 'Citi_JIRASniping.jpg' data
!ls -GFlash --color

Saving textcleaner to textcleaner
Saving Citi_JIRASniping.jpg to Citi_JIRASniping.jpg
total 44K
4.0K drwxr-xr-x 1 root 4.0K Mar 26 11:39 [0m[01;34m.[0m/
4.0K drwxr-xr-x 1 root 4.0K Mar 26 11:09 [01;34m..[0m/
4.0K drwxr-xr-x 1 root 4.0K Mar 24 16:59 [01;34m.config[0m/
4.0K drwxr-xr-x 2 root 4.0K Mar 26 11:39 [01;34mdata[0m/
4.0K drwxr-xr-x 1 root 4.0K Mar 18 16:23 [01;34msample_data[0m/
 24K -rw-r--r-- 1 root  24K Mar 26 11:39 textcleaner


In [0]:
# Input document to process; a later cell accepts only a ".pdf" or ".jpg" suffix.
path = 'data/Citi_JIRASniping.jpg'

In [0]:
def run_textcleaner(filename, img_id):
    """Invoke the external text-cleaner script on ``filename``.

    The script receives ``filename`` and ``bin/cleaned/cleaned<img_id>.jpg``
    as its last two arguments; that second path is returned. The script's
    exit status is not inspected.
    """
    mkdir("bin/cleaned/")
    cleaned_file = "bin/cleaned/cleaned" + str(img_id) + ".jpg"

    command = ["./textcleaner.sh", "-g", "-e", "none"]
    command += ["-f", str(10), "-o", str(20), "-t", str(30)]
    command += ["-u", "-s", str(1), "-T", "-p", str(20)]
    command += [filename, cleaned_file]
    s.call(command)

    return cleaned_file

In [0]:
def run_tesseract(filename, img_id, psm, oem):
    """OCR the image at ``filename`` with Tesseract and return the text.

    Args:
        filename: Path of the image to read.
        img_id: Unused; kept so existing callers keep working.
        psm: Value placed after ``--psm`` in the Tesseract configuration.
        oem: Value placed after ``--oem`` in the Tesseract configuration.

    Returns:
        The recognised text. If the first pass yields only whitespace, the
        result of a second pass with a digits-only character whitelist is
        returned instead.
    """
    mkdir("bin/extracted/")
    language = 'eng'
    configuration = "--psm " + str(psm) + " --oem " + str(oem)

    # Open the image in a context manager so its file handle is released even
    # if OCR raises; this function is called once per table cell.
    with Image.open(filename) as image:
        text = tess.image_to_string(image, lang=language, config=configuration)
        if len(text.strip()) == 0:
            # Nothing recognised: retry, restricting output to digits.
            configuration += " -c tessedit_char_whitelist=0123456789"
            text = tess.image_to_string(image, lang=language, config=configuration)
    return text

In [0]:
# Only inputs whose name ends in ".pdf" or ".jpg" are accepted; stop otherwise.
if not path.endswith((".pdf", ".jpg")):
    print("Must use a pdf or a jpg image to run the program.")
    sys.exit(1)

In [0]:
# For a PDF, keep only the first element returned by convert_from_path;
# any other input is opened directly as an image.
ext_img = convert_from_path(path)[0] if path.endswith(".pdf") else Image.open(path)

In [0]:
# Write the input as a PNG, then read that file back with OpenCV.
ext_img.save("data/target.png", "PNG")
# NOTE(review): cv.imread returns None rather than raising when a file cannot be read — confirm `image` is valid before use.
image = cv.imread("data/target.png")

In [0]:
# Compared against the number of array dimensions: a 3-D array is treated as
# a BGR colour image and converted to a single channel.
NUM_CHANNELS = 3
if len(image.shape) == NUM_CHANNELS:
    grayscale = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
else:
    # Without this branch `grayscale` would stay undefined and the
    # thresholding cell would fail with a NameError.
    grayscale = image

In [0]:
# Arguments forwarded to cv.adaptiveThreshold below.
MAX_THRESHOLD_VALUE = 255
BLOCK_SIZE = 15
THRESHOLD_CONSTANT = 0

# Threshold the bitwise-inverted grayscale image (`~grayscale`) using the mean-based adaptive method.
filtered = cv.adaptiveThreshold(~grayscale, MAX_THRESHOLD_VALUE, cv.ADAPTIVE_THRESH_MEAN_C, cv.THRESH_BINARY, BLOCK_SIZE, THRESHOLD_CONSTANT)

In [0]:
# Divisor applied to the image width and height to size the line-detection kernels.
SCALE = 15

In [0]:
# Work on separate copies because isolate_lines modifies its input in place.
horizontal, vertical = filtered.copy(), filtered.copy()

# Kernel lengths are 1/SCALE of the image width and height respectively.
horizontal_size = horizontal.shape[1] // SCALE
vertical_size = vertical.shape[0] // SCALE

# A one-pixel-high kernel for horizontal lines, one-pixel-wide for vertical.
horizontal_structure = cv.getStructuringElement(cv.MORPH_RECT, (horizontal_size, 1))
vertical_structure = cv.getStructuringElement(cv.MORPH_RECT, (1, vertical_size))

isolate_lines(horizontal, horizontal_structure)
isolate_lines(vertical, vertical_structure)

In [0]:
# Combine both line images. A bitwise OR keeps the mask at the input pixel
# values, whereas adding two 8-bit images wraps around (255 + 255 -> 254)
# wherever lines cross. It also mirrors the cv.bitwise_and used for the
# intersections.
mask = cv.bitwise_or(horizontal, vertical)
(contours, _) = cv.findContours(mask, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)

In [0]:
# Pixels set in both line images are where horizontal and vertical lines cross.
intersections = cv.bitwise_and(horizontal, vertical)

tables = []
for contour in contours:
    (rect, table_joints) = verify_table(contour, intersections)
    # verify_table returns (None, None) for rejected contours.
    if rect is None or table_joints is None:
        continue

    table = Table(rect[0], rect[1], rect[2], rect[3])

    # Represent each joint contour by its first point. Iterating directly
    # avoids reusing the outer loop's index variable.
    joint_coords = np.asarray([joint[0][0] for joint in table_joints])

    # np.lexsort treats the last key as primary: order by y, then by x,
    # which is the ordering Table.set_joints expects.
    sorted_indices = np.lexsort((joint_coords[:, 0], joint_coords[:, 1]))
    joint_coords = joint_coords[sorted_indices]

    table.set_joints(joint_coords)

    tables.append(table)

In [0]:
# Output locations and OCR settings.
out = "bin/"
table_name = "table.jpg"
psm = 6   # passed to run_tesseract as the --psm value
oem = 3   # passed to run_tesseract as the --oem value
mult = 3  # upscaling factor applied to each table crop before OCR

mkdir(out)
mkdir("bin/table/")

mkdir("excel/")
workbook = xlsxwriter.Workbook('excel/tables.xlsx')

# try/finally ensures the workbook is closed even if a cell fails part-way.
try:
    for table in tables:
        # One worksheet per detected table.
        worksheet = workbook.add_worksheet()

        table_entries = table.get_table_entries()

        # Crop the table from the page and enlarge it by `mult`.
        table_roi = image[table.y:table.y + table.h, table.x:table.x + table.w]
        table_roi = cv.resize(table_roi, (table.w * mult, table.h * mult))

        cv.imwrite(out + table_name, table_roi)
        num_img = 0
        for i, row in enumerate(table_entries):
            for j, entry in enumerate(row):
                cell_x, cell_y, cell_w, cell_h = entry
                # Entry bounds are in table coordinates, so scale them by
                # `mult` to index into the enlarged crop.
                entry_roi = table_roi[cell_y * mult:(cell_y + cell_h) * mult,
                                      cell_x * mult:(cell_x + cell_w) * mult]
                fname = out + "table/cell" + str(num_img) + ".jpg"
                cv.imwrite(fname, entry_roi)
                text = run_tesseract(fname, num_img, psm, oem)
                num_img += 1
                # Worksheet row/column mirror the table row/column.
                worksheet.write(i, j, text)
finally:
    workbook.close()