<a href="https://colab.research.google.com/github/yazeedMohi/OCR-Table-Recognition/blob/main/OCR_Table_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Prerequisites**

In [None]:
!sudo apt install tesseract-ocr
!pip install pytesseract
!pip install xlsxwriter

# **Run Here**

In [276]:
# Entry point of the notebook. RecImg, RecPDF and the shared `conf` object are
# defined in the cells further down, so those cells must be executed first.
# Uncomment to process the images embedded in a PDF instead of a single image:
#RecPDF("/content/drive/My Drive/mech 0001.pdf")
# Recognize the table in one scanned image; the result is written to excel_path.
RecImg(path = "/content/drive/My Drive/Scan_0009.jpg", excel_path = "/content/test.xlsx")

PASS
Excel file successfully created, saved at  /content/test.xlsx


# **Configuration**


**excel_engine:** the engine to use when writing the excel file

**ocr:** whether or not to use OCR and recognize the image

**kernel_factor:** the image width is divided by this value to get the length of the line-detection kernel (a larger value gives a shorter kernel)

**morph_kernel:** define the size of the morphing kernel

**kernel_factor_clean:** kernel factor used for cleaning the image

**morph_kernel_clean:** morph kernel size for cleaning the image

**adaptive_filter:** define whether to use adaptive filtering for thresholding or not

**show_progress:** define whether to show images of the different stages or not

**min_width, max_width:** acceptable range for box width

**min_height, max_height:** acceptable range for box height

**threshold:** whether or not to apply thresholding before recognition

In [272]:
class config:
  """Settings shared by every stage of the table-recognition pipeline.

  Each setting is a plain instance attribute. ``config()`` yields the defaults
  listed below; any of them can be replaced by passing it as a keyword
  argument, e.g. ``config(show_progress=True)``.

  Raises TypeError when a keyword does not name an existing setting.
  """
  def __init__(self, **overrides):
    # Engine used when writing the Excel file.
    self.excel_engine="xlsxwriter"
    # Whether to run OCR on the detected cells.
    self.ocr=True
    # Image width is divided by this value to get the line-detection kernel length.
    self.kernel_factor=200
    # Side length of the square morphing kernel.
    self.morph_kernel=3
    # Same two settings, used when removing the table lines before OCR.
    self.kernel_factor_clean=120
    self.morph_kernel_clean=1
    # Use adaptive thresholding instead of Otsu thresholding.
    self.adaptive_filter = False
    # Show the intermediate images of each stage. (The original assigned this
    # attribute twice, True then False; only the effective value is kept.)
    self.show_progress = False
    # Acceptable (exclusive) range for the width and height of a cell box.
    self.min_width = 15
    self.max_width = 1000
    self.min_height=18
    self.max_height = 50
    # Default location of the generated Excel file.
    self.excel_path = "/content/test.xlsx"
    # Whether to threshold the table image before cleaning/recognition.
    self.threshold = False

    for name, value in overrides.items():
      # Reject typos instead of silently creating a setting nobody reads.
      if not hasattr(self, name):
        raise TypeError("config() got an unexpected setting '" + str(name) + "'")
      setattr(self, name, value)
conf = config()

# **RecPDF**
Extracts each JPEG image embedded in a PDF file and processes the images one by one with RecImg.

**path:** path to the PDF file

In [144]:
import sys

def RecPDF(path=""):
  """Extract the JPEG images embedded in a PDF file and run RecImg on each.

  The file is scanned as raw bytes. For every ``stream`` ... ``endstream``
  pair that contains a JPEG start marker (FF D8), the bytes from that marker
  through the last JPEG end marker (FF D9) of the same stream are written to
  ``img_<n>.jpg`` in the current working directory and handed to RecImg.

  While the images are processed, the module-level ``conf`` is switched to the
  PDF-specific settings (ocr off, kernel_factor 260, morph_kernel 4,
  min_width 10, min_height 10). The previous values are restored afterwards,
  even if RecImg raises.

  path: path to the PDF file.
  """
  with open(path, "rb") as file:
      pdf = file.read()

  # RecImg takes only (path, excel_path) and reads its tuning from `conf`;
  # passing these as keyword arguments raised TypeError, so they are applied
  # to `conf` for the duration of this call instead.
  pdf_settings = {
      "ocr": False,
      "kernel_factor": 260,
      "morph_kernel": 4,
      "min_width": 10,
      "min_height": 10,
  }
  saved_settings = {name: getattr(conf, name) for name in pdf_settings}
  for name, value in pdf_settings.items():
      setattr(conf, name, value)

  end_token = b"endstream"
  img_counter = 0
  pointer = 0
  try:
      while True:
          pointer = pdf.find(b"stream", pointer)
          if pointer < 0:
              break

          limit = pdf.find(end_token, pointer)
          if limit < 0:
              break

          # Both markers must lie inside the current stream; searching past
          # `limit` could pair this stream with a JPEG from a later one and
          # produce an empty or garbage slice.
          start = pdf.find(b"\xff\xd8", pointer, limit)
          # The last end marker is used so that an embedded thumbnail's end
          # marker does not truncate the image.
          end = pdf.rfind(b"\xff\xd9", start, limit) if start >= 0 else -1

          # Continue after this stream regardless of whether it held a JPEG.
          pointer = limit + len(end_token)
          if start < 0 or end < 0:
              continue

          img = pdf[start:end + 2]

          img_counter = img_counter + 1
          img_path = "img_" + str(img_counter) + ".jpg"

          with open(img_path, "wb") as jpgfile:
              jpgfile.write(img)

          RecImg(img_path)
  finally:
      for name, value in saved_settings.items():
          setattr(conf, name, value)

# **RecImg**
Recognizes a table in image form, converts it to an Excel file.

**path:** path to the input image

**excel_path:** path where the resulting Excel file is saved

In [222]:
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv

try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract

def RecImg(path="/content/sample.PNG", excel_path = "/content/test.xlsx"):
  """Recognize the table in an image and, when OCR is enabled, save it to Excel.

  Pipeline: read the image as grayscale, binarize it, crop the table with
  Cut_Table, remove the ruling lines with Clean_Image, detect and group the
  cell boxes (Find_Contours, Arrange_Boxes), then, only if ``conf.ocr`` is
  true, run OCR_Boxes and write the result with Save_Excel.

  path: path to the input image.
  excel_path: path of the Excel file to create.

  Raises OSError when the image cannot be read.
  """
  def binarize_inverted(gray):
    # Threshold with the configured method, then invert the result.
    if(conf.adaptive_filter):
      binary = cv2.adaptiveThreshold(gray,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,11,2)
    else:
      _, binary = cv2.threshold(gray,128,255,cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    return 255-binary

  # Flag 0 loads the file as a single-channel grayscale image.
  img = cv2.imread(path,0)
  if img is None:
    # cv2.imread signals a missing or undecodable file by returning None;
    # fail here with a clear message instead of deep inside OpenCV.
    raise OSError("Could not read image: " + str(path))

  if(conf.show_progress):
    print("ORIGINAL IMAGE")
    plt.imshow(img, cmap="gray")
    plt.show()

  img_bin = binarize_inverted(img)

  table = Cut_Table(img_bin, orig_img = img)
  table_bin = binarize_inverted(table)

  # The cleaned image must be produced before Arrange_Boxes is called.
  clean_table = Clean_Image(table, table_bin)

  conts = Find_Contours(table_bin)
  boxes = Arrange_Boxes(table, contours =conts)

  if(conf.ocr):
    results = OCR_Boxes(clean_table, boxes)
    Save_Excel(results, excel_path)

# **Utility Functions**

## **Cut Table**

In [220]:
def Cut_Table(img, orig_img = None):
  """Find the table outline in a binary image and return it cropped and deskewed.

  img: binary image passed to Find_Lines to extract the ruling lines.
  orig_img: image the crop is taken from; falls back to ``img`` when omitted.

  Returns the perspective-corrected crop of the table region.
  Raises ValueError when no four-cornered outline is found among the ten
  largest contours of the line mask (the largest one is never accepted).
  """
  if orig_img is None:
    # The default used to reach cv2.warpPerspective as None and fail there.
    orig_img = img

  img_vh = Find_Lines(img, conf.kernel_factor, conf.morph_kernel)

  def find_largest_cont(in_img):
    # Return the first 4-point polygon among the largest contours, or None.
    cnts, _ = cv2.findContours(in_img.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    cnts = sorted(cnts, key = cv2.contourArea, reverse = True)[:10]
    for i, c in enumerate(cnts):
      # approximate the contour
      peri = cv2.arcLength(c, True)
      approx = cv2.approxPolyDP(c, 0.015 * peri, True)

      # The largest contour (i == 0) is skipped -- presumably it is the outer
      # page region rather than the table; TODO confirm.
      if len(approx) == 4 and i>0:
        return approx
      print("PASS")
    return None

  largest_cont = find_largest_cont(img_vh)
  if largest_cont is None:
    # Previously this surfaced as AttributeError on None.reshape.
    raise ValueError("Cut_Table: no four-cornered table outline was found in the image")

  def crop_minAreaRect(img, screenCnt):
    # Warp the quadrilateral `screenCnt` of `img` onto an upright rectangle.
    pts = screenCnt.reshape(4, 2)
    rect = np.zeros((4, 2), dtype = "float32")

    # Smallest x+y is the top-left corner, largest x+y the bottom-right.
    s = pts.sum(axis = 1)
    rect[0] = pts[np.argmin(s)]
    rect[2] = pts[np.argmax(s)]

    # Smallest y-x is the top-right corner, largest y-x the bottom-left.
    diff = np.diff(pts, axis = 1)
    rect[1] = pts[np.argmin(diff)]
    rect[3] = pts[np.argmax(diff)]

    (tl, tr, br, bl) = rect
    widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
    widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))

    heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
    heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))

    # The output size is the longer of each pair of opposite edges.
    maxWidth = max(int(widthA), int(widthB))
    maxHeight = max(int(heightA), int(heightB))

    dst = np.array([
      [0, 0],
      [maxWidth - 1, 0],
      [maxWidth - 1, maxHeight - 1],
      [0, maxHeight - 1]], dtype = "float32")

    M = cv2.getPerspectiveTransform(rect, dst)
    return cv2.warpPerspective(img, M, (maxWidth, maxHeight))

  table_img = crop_minAreaRect(orig_img, largest_cont)
  if(conf.show_progress):
    print("CROPPED TABLE")
    plt.imshow(table_img,cmap='gray')
    plt.show()
  return table_img

## **Find Contours**

In [147]:
def Find_Contours(img):
  """Return the contours of the line mask that Find_Lines builds from ``img``.

  The mask is built with ``conf.kernel_factor`` and ``conf.morph_kernel``;
  the contour hierarchy reported by OpenCV is discarded.
  """
  line_mask = Find_Lines(img, conf.kernel_factor, conf.morph_kernel)
  found_contours, _ = cv2.findContours(line_mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
  return found_contours

## **Find Lines**

In [268]:
def Find_Lines(img_bin, kernel_factor = 300, morph_kernel=1):
  """Build a binary mask of the horizontal and vertical lines of ``img_bin``.

  img_bin: binary image to search for lines.
  kernel_factor: the image width is divided by this value to get the length
    of the line-detection kernels (a larger factor gives a shorter kernel).
  morph_kernel: side length of the square kernel used for the final erosion.

  Returns the thresholded mask combining both line directions.
  """
  # Kernel length is the image width divided by kernel_factor. It is clamped to
  # 1 because a width below kernel_factor would give a zero-sized kernel.
  kernel_len = max(1, np.array(img_bin).shape[1]//kernel_factor)
  # Vertical kernel (1 wide, kernel_len tall) to detect vertical lines.
  ver_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_len))
  # Horizontal kernel (kernel_len wide, 1 tall) to detect horizontal lines.
  hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_len, 1))
  # Square kernel of morph_kernel x morph_kernel for the final erosion.
  kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (morph_kernel, morph_kernel))

  # Erode then dilate with the vertical kernel to keep only vertical lines.
  image_1 = cv2.erode(img_bin, ver_kernel, iterations=3)
  vertical_lines = cv2.dilate(image_1, ver_kernel, iterations=3)

  # Same with the horizontal kernel to keep only horizontal lines.
  image_2 = cv2.erode(img_bin, hor_kernel, iterations=3)
  horizontal_lines = cv2.dilate(image_2, hor_kernel, iterations=3)

  # Combine both line images with equal weight.
  img_vh = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
  # Invert, erode, then threshold the combined image.
  img_vh = cv2.erode(~img_vh, kernel, iterations=2)
  if(conf.adaptive_filter):
    img_vh = cv2.adaptiveThreshold(img_vh,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,11,2)
  else:
    _, img_vh = cv2.threshold(img_vh,128,255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

  return img_vh

## **Clean Image**

In [266]:
def Clean_Image(img, img_bin):
  """Combine the table image with its line mask to prepare it for OCR.

  img: table image; Otsu-thresholded first when ``conf.threshold`` is set.
  img_bin: binary table image from which the line mask is computed, using
    ``conf.kernel_factor_clean`` and ``conf.morph_kernel_clean``.

  Returns NOT(image XOR line mask).
  """
  if(conf.threshold):
    # Threshold the input image. The original read `my_img` here before it
    # was ever assigned, which raised UnboundLocalError.
    _, my_img = cv2.threshold(img,128,255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
  else:
    my_img = img

  img_vh = Find_Lines(img_bin, kernel_factor = conf.kernel_factor_clean, morph_kernel = conf.morph_kernel_clean)

  bitxor = cv2.bitwise_xor(my_img,img_vh)
  bitnot = cv2.bitwise_not(bitxor)

  if(conf.show_progress):
    print("CLEAN IMAGE")
    plt.imshow(bitnot,cmap='gray')
    plt.show()

  return bitnot

## **Arrange Boxes**

In [218]:
def Arrange_Boxes(img, contours = None):
  """Filter contour bounding boxes by size and group them into table rows.

  img: table image; only used (on a copy) to preview the accepted boxes when
    ``conf.show_progress`` is set.
  contours: contours as returned by Find_Contours.

  Returns a list of rows, each a list of ``[x, y, w, h]`` boxes. Boxes are
  visited from top to bottom; a box starts a new row when its y exceeds the
  previous box's y by more than half the mean contour height. Returns an
  empty list when there are no contours or no box passes the size filter.
  """

  def sort_contours(cnts, method="left-to-right"):
      # initialize the reverse flag and sort index
      reverse = False
      i = 0
      # handle if we need to sort in reverse
      if method == "right-to-left" or method == "bottom-to-top":
          reverse = True
      # handle if we are sorting against the y-coordinate rather than
      # the x-coordinate of the bounding box
      if method == "top-to-bottom" or method == "bottom-to-top":
          i = 1
      # pair every contour with its bounding box and sort the pairs
      boundingBoxes = [cv2.boundingRect(c) for c in cnts]
      (cnts, boundingBoxes) = zip(*sorted(zip(cnts, boundingBoxes),
      key=lambda b:b[1][i], reverse=reverse))
      # return the list of sorted contours and bounding boxes
      return (cnts, boundingBoxes)

  # zip(*...) cannot be unpacked for an empty input, so bail out early.
  if contours is None or len(contours) == 0:
      return []

  # Sort all the contours from top to bottom.
  contours, boundingBoxes = sort_contours(contours, method="top-to-bottom")

  # Mean height over all detected contours, including those filtered out below.
  mean = np.mean([bounds[3] for bounds in boundingBoxes])

  # The preview is drawn on a copy so the caller's image is left untouched,
  # and it exists even when no box is accepted.
  preview = img.copy() if conf.show_progress else None

  # Keep the boxes whose size lies strictly inside the configured range.
  box = []
  for (x, y, w, h) in boundingBoxes:
      if (w<conf.max_width and h<conf.max_height and w>conf.min_width and h>conf.min_height):
          if preview is not None:
              cv2.rectangle(preview,(x,y),(x+w,y+h),(0,255,0),2)
          box.append([x,y,w,h])

  if(conf.show_progress):
    print("EXTRACTED BOXES")
    plt.imshow(preview,cmap='GnBu_r')
    plt.show()

  # Group the boxes into rows.
  # NOTE(review): boxes inside a row keep the order produced by the y-sort;
  # they are not re-ordered by x -- confirm that this is intended.
  row=[]
  column=[]
  previous=None
  for current in box:
      if previous is not None and current[1] > previous[1]+mean/2:
          row.append(column)
          column=[]
      column.append(current)
      previous=current
  # Flush the row being built. The original only did this when the last box
  # continued the current row, so a final one-box row (or a single box overall)
  # was silently dropped.
  if column:
      row.append(column)

  if(conf.show_progress):
    print("Table Size: ", len(row),"X",len(row[0]) if row else 0)
  return row

## **OCR Boxes**

In [275]:
from tqdm import tqdm

def OCR_Boxes(image,  boxes):
  """Run Tesseract on every cell box and return the recognized text per row.

  image: image the cells are cut from (RecImg passes the cleaned table).
  boxes: rows of ``[x, y, w, h]`` boxes as returned by Arrange_Boxes.

  Returns a list of rows of strings, parallel to ``boxes``. A cell for which
  every attempt yields no text is stored as a single space.
  """
  # The structuring element never changes, so build it once.
  kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
  # Configurations tried in order until one yields text. Tesseract 4+ only
  # accepts the double-dash form of the page-segmentation flag; the legacy
  # '-psm' spelling is rejected as an unknown argument.
  ocr_configs = ['', '--psm 3', '--psm 7']

  result = []
  for row_boxes in boxes:
      row_text = []
      for left, top, w, h in row_boxes:
          # NumPy indexing is [rows, columns], i.e. [y range, x range].
          finalimg = image[top:top+h, left:left+w]
          # Pad with white, upscale 5x, then dilate/erode before recognition.
          border = cv2.copyMakeBorder(finalimg,2,2,2,2, cv2.BORDER_CONSTANT,value=[255,255])
          resizing = cv2.resize(border, None, fx=5, fy=5, interpolation=cv2.INTER_CUBIC)
          dilation = cv2.dilate(resizing, kernel,iterations=1)
          erosion = cv2.erode(dilation, kernel,iterations=2)

          out = ""
          for ocr_config in ocr_configs:
              # Strip surrounding whitespace before testing for emptiness. The
              # original discarded the result of strip(), so blank output was
              # never detected and the fallbacks were skipped.
              out = pytesseract.image_to_string(erosion, config=ocr_config).strip()
              if out:
                  break
          if not out:
              out = " "

          if(conf.show_progress):
              plt.imshow(erosion,cmap='gray')
              plt.show()
              print(top,left,w,h," || ",out)

          row_text.append(out)
      result.append(row_text)
  return result

## **Save Excel**

In [166]:
def Save_Excel(result, save_path):
  """Write the recognized table to an Excel file with left-aligned cells.

  result: rows of cell strings as returned by OCR_Boxes.
  save_path: path of the Excel file to create; it is written with the engine
    named by ``conf.excel_engine``.
  """
  dataframe = pd.DataFrame(result)
  # The CSS property for horizontal alignment is "text-align"; the previous
  # "align" is not a CSS property, so the alignment was never applied.
  data = dataframe.style.set_properties(**{"text-align": "left"})
  data.to_excel(save_path, engine=conf.excel_engine)
  print("Excel file successfully created, saved at ", save_path)

In [None]:
        
  if(show_progress):          
    print(len(column))
    print(len(row))

  #calculating maximum number of cells
  max_countcol = 0
  for i in range(len(row)):
      countcol = len(row[i])
      if countcol > max_countcol:
          max_countcol = countcol

  countcol = max_countcol

  #Retrieving the center of each column
  center = [int(row[i][j][0]+row[i][j][2]/2) for j in range(len(row[i])) if row[0]]

  center=np.array(center)
  center.sort()
  #if(show_progress):
  #  print(center)

  #Regarding the distance to the columns center, the boxes are arranged in respective order

  finalboxes = []
  #my_max = 300000000
  #how_many = 1
  return
  if(ocr):
    #from every single image-based cell/box the strings are extracted via pytesseract and stored in a list
    outer=[]
    for i in range(len(finalboxes)):
        for j in range(len(finalboxes[i])):
            inner=''
            if(len(finalboxes[i][j])==0):
                outer.append(' ')
            else:
                for k in range(len(finalboxes[i][j])):
                    y,x,w,h = finalboxes[i][j][k][0],finalboxes[i][j][k][1], finalboxes[i][j][k][2],finalboxes[i][j][k][3]
                    finalimg = bitnot[x:x+h, y:y+w]
                    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 1))
                    border = cv2.copyMakeBorder(finalimg,2,2,2,2, cv2.BORDER_CONSTANT,value=[255,255])
                    resizing = cv2.resize(border, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
                    dilation = cv2.dilate(resizing, kernel,iterations=1)
                    erosion = cv2.erode(dilation, kernel,iterations=2)
                    
                    #out = pytesseract.image_to_string(erosion)
                    #if(len(out)==0):
                    out = pytesseract.image_to_string(erosion, config='--psm 3')
                    if(show_progress):
                      print(x,y,w,h,out)
                    inner = inner +" "+ out
                outer.append(inner)

    #Creating a dataframe of the generated OCR list
    arr = np.array(outer)
    dataframe = pd.DataFrame(arr.reshape(len(row), countcol))
    print(dataframe)
    data = dataframe.style.set_properties(align="left")
    #Converting it in a excel-file
    data.to_excel("/content/test.xlsx", engine=excel_engine)

    def Clean_Image(img, img_bin):
      """Experimental variant of Clean_Image that masks the lines out with AND.

      img: table image; Otsu-thresholded first when ``conf.threshold`` is set.
      img_bin: binary table image from which the line mask is computed.

      Returns NOT(NOT(image) XOR (image AND NOT(line mask))).
      """
      # NOTE(review): this duplicates the name of the Clean_Image defined in the
      # "Clean Image" cell and sits in a leftover scratch cell -- confirm
      # whether the cell should be kept at all.
      if(conf.threshold):
        # Threshold the input image; `my_img` was previously read before
        # assignment here, which raised UnboundLocalError.
        _, my_img = cv2.threshold(img,128,255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
      else:
        my_img = img

      img_vh = Find_Lines(img_bin, kernel_factor = conf.kernel_factor_clean, morph_kernel = conf.morph_kernel_clean)

      bitand = cv2.bitwise_and(my_img, cv2.bitwise_not(img_vh))
      bitxor = cv2.bitwise_xor(cv2.bitwise_not(my_img), bitand)
      bitnot = cv2.bitwise_not(bitxor)

      if(conf.show_progress):
        print("AND IMAGE")
        plt.imshow(bitand,cmap='gray')
        plt.show()
        print("MY IMAGE")
        plt.imshow(cv2.bitwise_not(my_img),cmap='gray')
        plt.show()
        print("XOR IMAGE")
        plt.imshow(bitxor,cmap='gray')
        plt.show()
        print("CLEAN IMAGE")
        plt.imshow(bitnot,cmap='gray')
        plt.show()

      return bitnot