<a href="https://colab.research.google.com/github/yazeedMohi/OCR-Table-Recognition/blob/main/OCR_Table_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Prerequisites**

In [14]:
!sudo apt install tesseract-ocr
!sudo apt-get install tesseract-ocr-ara
!pip install pytesseract
!pip install xlsxwriter
!pip install unidecode
!pip install pymupdf

Reading package lists... Done
Building dependency tree       
Reading state information... Done
tesseract-ocr is already the newest version (4.00~git2288-10f4998a-2).
0 upgraded, 0 newly installed, 0 to remove and 14 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
tesseract-ocr-ara is already the newest version (4.00~git24-0e00fe6-1.2).
0 upgraded, 0 newly installed, 0 to remove and 14 not upgraded.
Collecting pymupdf
[?25l  Downloading https://files.pythonhosted.org/packages/81/75/442a1bcc89569453969f34a53000e8a80e3da13367e3e29b81b5137ef388/PyMuPDF-1.18.3-cp36-cp36m-manylinux2010_x86_64.whl (6.3MB)
[K     |████████████████████████████████| 6.3MB 3.9MB/s 
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.18.3


# **Run Here**

In [None]:
from unidecode import unidecode
import re
#RecPDF("/content/drive/My Drive/mech 0001.pdf")
RecImg(path = "/content/drive/My Drive/Scan_0021.jpg", excel_path = "/content/testB.xlsx")

# **Configuration**


**excel_engine:** the engine to use when writing the excel file

**ocr:** whether or not to use OCR and recognize the image

**kernel_factor:** define how large is the detection edge kernel compared to the image width

**morph_kernel:** define the size of the morphing kernel

**kernel_factor_clean:** kernel factor used for cleaning the image

**morph_kernel_clean:** morph kernel size for cleaning the image

**adaptive_filter:** define whether to use adaptive filtering for thresholding or not

**show_progress:** define whether to show images of the different stages or not

**min_width, max_width:** acceptable range for box width

**min_height, max_height:** acceptable range for box height

**threshold:** whether or not to apply thresholding before recognition

In [2]:
class config:
  def __init__(self):
    self.excel_engine="xlsxwriter"
    self.ocr=True 
    self.kernel_factor=200 
    self.morph_kernel=3
    self.kernel_factor_clean=120
    self.morph_kernel_clean=1
    self.adaptive_filter = False 
    self.show_progress = True
    self.min_width = 20
    self.max_width = 1000 
    self.min_height=18
    self.max_height = 250
    self.passes = 1
    self.threshold = False
    self.ara_col = 3
    self.ara_row = 1
    self.last_rows = 5
    self.rot_start = 4
    self.rot_last = 1
    self.header_arabic = True
conf = config()

# **GEN IMGS**

In [18]:

import os
import sys
import fitz
import cv2
import numpy as np
import json

import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf
import matplotlib.font_manager as mfm


dir_path = '/content/drive/MyDrive/FermaSorted'

departmentsList = {
    'a':'Agricultural', 'o':'Chemical', 'e':'Civil', 'u':'Electrical', 
    'h':'Mechanical', 't':'Surveying', 'n':'Mining', 's':'Petroleum' 
}

grades = {'1st': 1, '2nd': 2, '3rd': 3, '4th': 4, '5th': 5}

def put_pg_into_sortedscan(yr, gr, dept, pgno):
    if not (yr in sortedscan):
        sortedscan[yr] = {}
        
    if not (gr in sortedscan[yr]):
        sortedscan[yr][gr] = {}
        
    if not (dept in sortedscan[yr][gr]):
        sortedscan[yr][gr][dept] = [pgno];
    else:
        if not(pgno in sortedscan[yr][gr][dept]):
            sortedscan[yr][gr][dept].append(pgno)

def pix2np(pix):
    im = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
    im = np.ascontiguousarray(im[..., [2, 1, 0]])  # rgb to bgr
    return im

def border_around_subplots(sub1):
    autoAxis = sub1.axis()
    rec = plt.Rectangle(
        (autoAxis[0]-0.7,
        autoAxis[2]-0.2),
        (autoAxis[1]-autoAxis[0])+1,
        (autoAxis[3]-autoAxis[2])+0.4, fill=False, lw=2)
    rec = sub1.add_patch(rec)
    rec.set_clip_on(False)
    plt.tight_layout()

def FindImagesInPDF(path, yr, gr):
    doc = fitz.open(path)
    
    year_str = str(yr);
    gr_str = str(gr)
    dept_data = sortedscan[year_str][gr_str];
    
    for dept in dept_data:
        if(not (dept in departmentsList.values()) ):
            continue
        else:
            for pgno in dept_data[dept]:
                page = doc.loadPage(pgno)
                imgs_in_page = doc.getPageImageList(pgno)
                if(len(imgs_in_page) == 1):
                    xref = imgs_in_page[0]
                    pix = fitz.Pixmap(doc, xref[0])
                else:
                    pix = page.getPixmap(fitz.Matrix(1.0, 0, 0, 1.0, 0, 0))
                
                img = pix2np(pix)
                
                h, w, _ = img.shape
                if(h > w):
                    img = cv2.rotate(img, cv2.ROTATE_90_COUNTERCLOCKWISE)
                h, w, _ = img.shape
                
                imgx = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
                
                cut_image = RecImg(img, None, None)
                
                fig, axs = plt.subplots(2, 1)
                fig.set_size_inches(8, 10)
                
                axs[0].imshow(imgx, cmap='gray')
                axs[0].set_title('Original: {}/{}-{}-{}-Page No {}'.format(yr, yr+1, gr, dept, pgno))
                axs[0].set_xticks([]); axs[0].set_yticks([])
                border_around_subplots(axs[0])
                
                if not (cut_image is None):
                    axs[1].imshow(cut_image, cmap='gray')
                    axs[1].set_title('Cut Table')
                    axs[1].set_xticks([]); axs[1].set_yticks([])
                    border_around_subplots(axs[1])
                
                pdf_debug.savefig()
                plt.close()
                
                # print("W: ", w, "H: ", h, "W2: ", cut_image.shape[0], "H2: ", cut_image.shape[1])
                
                # merged = np.concatenate((imgx, cut_image), axis=1)
                # merged_s = cv2.resize(merged, (wn, hn), interpolation=cv2.INTER_CUBIC)
                
                # cv2.waitKey(0)
    return

if (os.stat('sortedscans.json').st_size == 0):
    exit()
    sortedscan = {}
else:
    with open('sortedscans.json', "rb") as f:
        sortedscan = json.load(f)

# print(json.dumps(sortedscan, indent=2))
# exit()
with matplotlib.backends.backend_pdf.PdfPages("output_debug.pdf") as pdf_debug:
    for entry in os.scandir(dir_path):
        if(entry.is_file() and (entry.path.endswith(".pdf"))):
            pass
        else:
            print("Found non pdf file ... Exiting")
            exit()
            
        currentFile = entry.path
        sortedscan["lastFile"] = currentFile
        
        print(entry.path)
        
        xst = entry.name.split('_')
        if len(xst) == 3:
            pass
        elif len(xst) == 2:
            yr = int(xst[0])
            gr_k = xst[1].split('.')[0];
            gr = grades[gr_k]
            
            # if(gr < 4): continue
            # if(yr > 2013): break
            
            if( (str(yr) in sortedscan) and (str(gr) in sortedscan[str(yr)])):
                print("Year: ", "{}/{}".format(yr, yr+1), " Grade: ", gr_k);
                FindImagesInPDF(entry.path, yr, gr)
            else:
                print("NOT IN DICTIONARY Year: ", "{}/{}".format(yr, yr+1), " Grade: ", gr_k);
        else:
            pass
        

/content/drive/MyDrive/FermaSorted/2013_2nd.pdf
Year:  2013/2014  Grade:  2nd
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
/content/drive/MyDrive/FermaSorted/2013_3rd.pdf
Year:  2013/2014  Grade:  3rd
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
/content/drive/MyDrive/FermaSorted/2013_4th.pdf
Year:  2013/2014  Grade:  4th
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
/content/drive/MyDrive/FermaSorted/2013_1st.pdf
Year:  2013/2014  Grade:  1st
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PAS

mupdf: expected object number


PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
/content/drive/MyDrive/FermaSorted/2015_2nd.pdf
Year:  2015/2016  Grade:  2nd


mupdf: expected object number


PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
/content/drive/MyDrive/FermaSorted/2015_1st.pdf
NOT IN DICTIONARY Year:  2015/2016  Grade:  1st
/content/drive/MyDrive/FermaSorted/2015_3rd.pdf
Year:  2015/2016  Grade:  3rd


mupdf: expected object number


PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
/content/drive/MyDrive/FermaSorted/2015_Sub_3rd.pdf
/content/drive/MyDrive/FermaSorted/2015_Sub_4th.pdf
/content/drive/MyDrive/FermaSorted/2015_Sub_2nd.pdf
/content/drive/MyDrive/FermaSorted/2015_Sub_1st.pdf
/content/drive/MyDrive/FermaSorted/2014_Sub_3rd.pdf
/content/drive/MyDrive/FermaSorted/2014_Sub_4th.pdf
/content/drive/MyDrive/FermaSorted/2014_Sub_2nd.pdf
/content/drive/MyDrive/FermaSorted/2014_Sub_1st.pdf
/content/drive/MyDrive/FermaSorted/2014_2nd.pdf
Year:  2014/2015  Grade:  2nd
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
PASS
P

In [13]:
from unidecode import unidecode
import re
import sys
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf
import matplotlib.font_manager as mfm
import csv
import math
from tqdm import tqdm
#from bidi.algorithm import get_display
#import arabic_reshaper
try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract

fontname = mfm.findfont('Tahoma', fontext='ttf') 
"""reshaper = arabic_reshaper.ArabicReshaper(
    arabic_reshaper.config_for_true_type_font(
        fontname,
        arabic_reshaper.ENABLE_NO_LIGATURES
    )
)
"""
class config:
  def __init__(self):
    self.excel_engine="xlsxwriter"
    self.ocr=True 
    self.kernel_factor=200 
    self.morph_kernel=3
    self.kernel_factor_clean=120
    self.morph_kernel_clean=1
    self.adaptive_filter = False 
    self.show_progress = True
    self.min_width = 20
    self.max_width = 1000 
    self.min_height=18
    self.max_height = 250
    self.threshold = False
    self.ara_col = 3
    self.ara_row = 1
    self.last_rows = 5
    self.rot_start = 4
    self.rot_last = 1
    self.header_arabic = True
conf = config()

dpidef = 200

def RecImg(imgs, excel_path, dbg_path=None):
    if(len(imgs.shape) == 3):
        height, width, channels = imgs.shape
    else:
        height, width = imgs.shape

    if(height > width):
        imgs = cv2.rotate(imgs, cv2.ROTATE_90_COUNTERCLOCKWISE)

    img = cv2.cvtColor(imgs.copy(), cv2.COLOR_RGB2GRAY)

    try:

        if(conf.adaptive_filter):
            img_bin = cv2.adaptiveThreshold(img,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,11,2)
        else:
            thresh,img_bin = cv2.threshold(img,128,255,cv2.THRESH_BINARY | cv2.THRESH_OTSU) 
        img_bin = 255-img_bin

        table = Cut_Table(img_bin, orig_img = img)
        
        return table

        if(conf.adaptive_filter):
            table_bin = cv2.adaptiveThreshold(table,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,11,2)
        else:
            thresh,table_bin = cv2.threshold(table,128,255,cv2.THRESH_BINARY | cv2.THRESH_OTSU) 
        table_bin = 255-table_bin

        clean_table = Clean_Image(table, table_bin)

        conts = Find_Contours(table_bin)
        boxes = Arrange_Boxes(table, contours =conts)
        print("Dumping Boxes")
        if(conf.ocr):
            # results = OCR_Boxes(clean_table, boxes)
            DumpBoxes(clean_table, boxes)
            # Save_Excel(results, excel_path)
    except:
        print('An Error has occured')
        raise
        
def Cut_Table(img, orig_img = None):
    img_vh = Find_Lines(img, conf.kernel_factor, conf.morph_kernel)
    
    def find_largest_cont(in_img):
        cnts, _ = cv2.findContours(in_img.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
        cnts = sorted(cnts, key = cv2.contourArea, reverse = True)[:10]
        table = None
        i = 0
        for c in cnts:
            # approximate the contour
            peri = cv2.arcLength(c, True)
            approx = cv2.approxPolyDP(c, 0.015 * peri, True)
            
            if len(approx) == 4 and i>0:
                table = approx
                break
            else: 
                print("PASS")
                i +=1
        
        return table
        
    def crop_minAreaRect(img, screenCnt):
        if (screenCnt is None): return None
        pts = screenCnt.reshape(4, 2)
        rect = np.zeros((4, 2), dtype = "float32")

        s = pts.sum(axis = 1)
        rect[0] = pts[np.argmin(s)]
        rect[2] = pts[np.argmax(s)]

        diff = np.diff(pts, axis = 1)
        rect[1] = pts[np.argmin(diff)]
        rect[3] = pts[np.argmax(diff)]
        # multiply the rectangle by the original ratio
        #rect *= ratio

        (tl, tr, br, bl) = rect
        widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
        widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))

        heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
        heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))

        maxWidth = max(int(widthA), int(widthB))
        maxHeight = max(int(heightA), int(heightB))

        dst = np.array([
          [0, 0],
          [maxWidth - 1, 0],
          [maxWidth - 1, maxHeight - 1],
          [0, maxHeight - 1]], dtype = "float32")

        M = cv2.getPerspectiveTransform(rect, dst)
        warp = cv2.warpPerspective(img, M, (maxWidth, maxHeight))
        return warp
  
    largest_cont = find_largest_cont(img_vh)
    table_img = crop_minAreaRect(orig_img, largest_cont)
    
    return table_img

def Find_Contours(img):
  img_vh = Find_Lines(img, conf.kernel_factor, conf.morph_kernel)
  contours, hierarchy = cv2.findContours(img_vh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
  return contours

def Find_Lines(img_bin, kernel_factor = 300, morph_kernel=1):
  # countcol(width) of kernel as 100th of total width
  kernel_len = np.array(img_bin).shape[1]//kernel_factor
  # Defining a vertical kernel to detect all vertical lines of image 
  ver_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_len))
  # Defining a horizontal kernel to detect all horizontal lines of image
  hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_len, 1))
  # A kernel of 2x2
  kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (morph_kernel, morph_kernel))

  #Use vertical kernel to detect and save the vertical lines in a jpg
  image_1 = cv2.erode(img_bin, ver_kernel, iterations=3)
  vertical_lines = cv2.dilate(image_1, ver_kernel, iterations=3)

  #Use horizontal kernel to detect and save the horizontal lines in a jpg
  image_2 = cv2.erode(img_bin, hor_kernel, iterations=3)
  horizontal_lines = cv2.dilate(image_2, hor_kernel, iterations=3)

  # Combine horizontal and vertical lines in a new third image, with both having same weight.
  img_vh = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
  #Eroding and thesholding the image
  img_vh = cv2.erode(~img_vh, kernel, iterations=2)
  if(conf.adaptive_filter):
    img_vh = cv2.adaptiveThreshold(img_vh,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,11,2)
  else:
    thresh, img_vh = cv2.threshold(img_vh,128,255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
  
  return img_vh

def Clean_Image(img, img_bin):
  if(conf.threshold):
    thresh, my_img = cv2.threshold(img,128,255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)   
  else:
    my_img = img
  
  img_vh = Find_Lines(img_bin, kernel_factor = conf.kernel_factor_clean, morph_kernel = conf.morph_kernel_clean)

  bitxor = cv2.bitwise_xor(my_img,img_vh)
  bitnot = cv2.bitwise_not(bitxor)
  
  return bitnot

def Arrange_Boxes(img, contours = None):

  def sort_contours(cnts, method="left-to-right"):
      # initialize the reverse flag and sort index
      reverse = False
      i = 0
      # handle if we need to sort in reverse
      if method == "right-to-left" or method == "bottom-to-top":
          reverse = True
      # handle if we are sorting against the y-coordinate rather than
      # the x-coordinate of the bounding box
      if method == "top-to-bottom" or method == "bottom-to-top":
          i = 1
      # construct the list of bounding boxes and sort them from top to
      # bottom
      boundingBoxes = [cv2.boundingRect(c) for c in cnts]
      (cnts, boundingBoxes) = zip(*sorted(zip(cnts, boundingBoxes),
      key=lambda b:b[1][i], reverse=reverse))
      # return the list of sorted contours and bounding boxes
      return (cnts, boundingBoxes)

  # Sort all the contours by top to bottom.
  contours, boundingBoxes = sort_contours(contours, method="top-to-bottom")

  #Creating a list of heights for all detected boxes
  heights = [boundingBoxes[i][3] for i in range(len(boundingBoxes))]

  #Get mean of heights
  mean = np.mean(heights)

  #Create list box to store all boxes in  
  box = []
  # Get position (x,y), width and height for every contour and show the contour on image
  for c in contours:
      x, y, w, h = cv2.boundingRect(c)
      if (w<conf.max_width and h<conf.max_height and w>conf.min_width and h>conf.min_height):
          image = cv2.rectangle(img,(x,y),(x+w,y+h),(0,255,0),2)
          box.append([x,y,w,h])
          
  #Creating two lists to define row and column in which cell is located
  row=[]
  column=[]
  j=0

  #Sorting the boxes to their respective row and column
  for i in range(len(box)):    
          
      if(i==0):
          column.append(box[i])
          previous=box[i]    
      
      else:
          if(box[i][1]<=previous[1]+mean/2):
              column.append(box[i])
              previous=box[i]            
              
              if(i==len(box)-1):
                  column = sorted(column, key=lambda b:b[0], reverse=True)
                  row.append(column)        
              
          else:
              column = sorted(column, key=lambda b:b[0], reverse=True)
              row.append(column)
              column=[]
              previous = box[i]
              column.append(box[i])

  row[len(row)-1] = sorted(row[len(row)-1], key=lambda b:b[0], reverse=True)
  if(conf.show_progress):        
    print("Table Size: ", len(row),"X",len(row[0]))
  
  Nrows = len(row)
  Ncols = max([len(r) for r in row])
  print("Rows: ", Nrows, " Cols: ", Ncols)
  # Check if there are misaligned columns
  col_centers = np.zeros((Nrows, Ncols))
  for i in range(len(row)):
    for j in range(len(row[i])):
      col_centers[i, j] = (row[i][j][0] + row[i][j][2])/2 
  
  # print(col_centers.transpose())
  better_boxes = [[None for j in range(Ncols)] for i in range(Nrows)]

  header_widths = [r[2] for r in row[0]]
  head_cent_limit = [[r[0] + 0.3*r[2], r[0] + 0.7*r[2]] for r in row[0]]
  # print(head_cent_limit)
  for i in range(Nrows):
    r = row[i]
    for b in r:
      p = (b[0] + b[2]/2)
      for j in range(Ncols):
        if(p > head_cent_limit[j][0] and p < head_cent_limit[j][1]):
          better_boxes[i][j] = b
    
  return better_boxes

def DumpBoxes(image, boxes):
    for i in range(len(boxes)):
        for j in range(len(boxes[i])):
            if(boxes[i][j] is None):
                continue

            y,x,w,h = boxes[i][j][0],boxes[i][j][1], boxes[i][j][2],boxes[i][j][3]
            finalimg = image[x:x+h, y:y+w]
            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
            border = cv2.copyMakeBorder(finalimg,10,1,1,1, cv2.BORDER_CONSTANT,value=[255,255])
            resizing = cv2.resize(border, None, fx=5, fy=5, interpolation=cv2.INTER_CUBIC)
            # dilation = cv2.dilate(border, kernel,iterations=2)
            # erosion = cv2.erode(dilation, kernel,iterations=1)
            thresh, erosion = cv2.threshold(resizing,128,255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
            if(i ==0 and j>=conf.rot_start and j<(len(boxes[i])-conf.rot_last)):
                erosion = cv2.rotate(erosion, cv2.ROTATE_90_CLOCKWISE)

            # print("SmallImgs/img_{}_{}.png".format(i, j))
            cv2.imwrite("SmallImgs/img_{}_{}.png".format(i, j),erosion)

def OCR_Boxes(image,  boxes):
    global arfont 
    global enfont

    def clean_string(inp):
        symbs = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+-*|."
        outp = ""
        for c in inp:
            if(symbs.find(c) != -1): outp += c
        return outp

    def clean_string_ara(inp):
        symbs = "دجحخهعغفقثصضطكمنتالبيسشظزوةىلارؤءئذإلإًَُأألأآلآْ "
        outp = ""
        for c in inp:
            if(symbs.find(c) != -1): outp += c
        return outp

    def rotate_box(im, scale = 1.0):
        """angle=270
        (h, w) = im.shape[:2]

        center = (w / 2, h / 2)

        M = cv2.getRotationMatrix2D(center, angle, scale)
        rotated = cv2.warpAffine(im, M, (w, h))"""

        cv.Transpose(img,timg)
        cv.Flip(timg,timg,flipMode=1)

        return rotated

    maxResult = 30
    kx = 0
    mix = 0
    result = [[None for j in range(len(boxes[0]))] for i in range(len(boxes))]
    for i in range(len(boxes)):
        Ncols = 6
        Nrows = math.ceil(len(boxes[i]) / Ncols)
        
        for j in range(len(boxes[i])):
            if(boxes[i][j] is None):
                continue

            y,x,w,h = boxes[i][j][0],boxes[i][j][1], boxes[i][j][2],boxes[i][j][3]
            finalimg = image[x:x+h, y:y+w]
            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
            border = cv2.copyMakeBorder(finalimg,2,2,2,2, cv2.BORDER_CONSTANT,value=[255,255])
            resizing = cv2.resize(border, None, fx=5, fy=5, interpolation=cv2.INTER_CUBIC)
            dilation = cv2.dilate(resizing, kernel,iterations=1)
            erosion = cv2.erode(dilation, kernel,iterations=2)
            thresh, erosion = cv2.threshold(erosion,128,255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
            if(i ==0 and j>=conf.rot_start and j<(len(boxes[i])-conf.rot_last)):
                erosion = cv2.rotate(erosion, cv2.ROTATE_90_CLOCKWISE)
            #if(kx> maxResult):
            #  continue
            kx+=1
            arabic_name = i >= conf.ara_row and j == conf.ara_col
            arabic_header = i==0 and conf.header_arabic and not (j > conf.ara_col and j < (len(boxes[i])-conf.last_rows))
            is_arabic = arabic_name or arabic_header
            if(is_arabic):
              erosion = cv2.resize(erosion, None, fx=0.25, fy=0.25, interpolation=cv2.INTER_CUBIC)
              out = pytesseract.image_to_string(erosion, lang = "ara+eng")
              out = clean_string_ara(out)
              #print(out)
              if(len(out)<1):
                  #print("RUNNEEER!")
                  erosion = cv2.resize(erosion, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
                  new_erosion = cv2.dilate(erosion, kernel,iterations=1)
                  out = pytesseract.image_to_string(new_erosion, lang = "ara+eng")
                  out = clean_string_ara(out)
                  #plotting = plt.imshow(new_erosion,cmap='gray')
                  ## plt.show()
                  #print(x,y,w,h," || ",out)
                  #mix += 1
                  #if(mix==2):
                  #   x = lo
            else:
              out = pytesseract.image_to_string(erosion)
              out = clean_string(out)
              if(len(out)<1):
                  out = pytesseract.image_to_string(erosion, config=("-c tessedit"
                        "_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789|."
                        " --psm 7"
                        "-l osd"
                        " "))
              out = clean_string(out)
              if(len(out)<1):
                  out = pytesseract.image_to_string(erosion, config=("-c tessedit"
                        "_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789|."
                        " --psm 10"
                        "-l osd"
                        " "))
              out = clean_string(out)
              if(len(out)<1):
                  out = pytesseract.image_to_string(erosion, config=("-c tessedit"
                        "_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789|."
                        " --psm 3"
                        "-l osd"
                        " "))
              out = clean_string(out)

              out = fix_string(out,i,j,len(boxes[i])-conf.last_rows)

            result[i][j] = out

    return result

def fix_string(inp, i, j, tot):
      if(i >= conf.ara_row and j > conf.ara_col and j < tot):
          if(inp == "g"):
              inp = "C"
          elif(inp == "f-"):
              inp = "F"
          elif(inp == "f"):
              inp = "F"
          elif(inp == "fF"):
              inp = "F"
          elif(inp == "G"):
              inp = "C"
          elif(inp == "H+"):
              inp = "B+"
          elif(inp == "Bt"):
              inp = "B+"
          elif(inp == "At"):
              inp = "A+"
          elif(inp == "i."):
              inp = "F"
          elif(inp == "t"):
              inp = "F"
          elif(inp == "7"):
              inp = "F"
          elif(inp == "vy"):
              inp = "C"
          elif(inp == "h"):
              inp = "F"
          elif(inp == "a"):
              inp = "F*"
          elif(inp == "a"):###
              inp = "F"
          elif(inp == "i"):
              inp = "F"
          elif(inp == "I"):
              inp = "F"
          elif(inp == "4"):
              inp = "A"
          elif(inp == "HAE"):
              inp = "F"
          elif(inp == "o"):
              inp = "F"
          elif(inp == "oB"):
              inp = "B"
          elif(inp == "Fe"):
              inp = "F*"
          elif(inp == "p*"):
              inp = "F*"
              
          symbs = "ABCDF+-*"
          outp = ""
          for c in inp:
              if(symbs.find(c) != -1): outp += c
          return outp
      return inp

def Save_Excel(result, save_path):
  def FormatString(s):
    if isinstance(s, str):
      try:
        s.encode('unicode_escape').decode('utf-8')
        return s
      except:
        return unidecode(s)
    else:
      return ""
  
  dataframe = pd.DataFrame(result)
  dataframe = dataframe.applymap(FormatString)#lambda x:  if isinstance(x, str) else x)
  #data = dataframe.style.set_properties(align="left")
  dataframe.to_excel(save_path, engine=conf.excel_engine)
  print("Excel file successfully created, saved at", save_path)

findfont: Font family ['Tahoma'] not found. Falling back to DejaVu Sans.


# **RecPDF**
Recognizes each image in a PDF file, processes them one by one.

**path:** path to the PDF file

In [None]:
import sys

def RecPDF(path=""):
  with open(path, "rb") as file:
      pdf = file.read()

  img_counter = 0
  pointer = 0
  while True:
      pointer = pdf.find(b"stream", pointer)
      if pointer < 0:
          break

      x = pdf.find(b"\xff\xd8", pointer)
      if x < 0:
          pointer = pointer + 1
          continue
      else:
          extension = "jpg"

      limit = pdf.find(b"endstream", pointer)
      if limit < 0:
          break

      y = pdf.find(b"\xff\xd9", pointer, limit) + 2

      pointer = limit + 9
      if y < 2:
          continue        
      
      img = pdf[x:y]

      img_counter = img_counter + 1

      img_path = "img_" + str(img_counter) + "." + extension

      with open(img_path, "wb") as jpgfile:
          jpgfile.write(img)

      RecImg(img_path, ocr=False, kernel_factor=260, morph_kernel=4, min_width =10, min_height=10)

#**RecImg**
Recognizes a table in image form, converts it to an Excel file.

**path:** path to the input image

In [None]:
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv

try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract

def RecImg(path="/content/sample.PNG", excel_path = "/content/test.xlsx"):
  #read your file
  file= path
  img = cv2.imread(file,0)

  if(conf.show_progress):
    print("ORIGINAL IMAGE")
    plotting = plt.imshow(img, cmap="gray")
    plt.show()

  if(conf.adaptive_filter):
    img_bin = cv2.adaptiveThreshold(img,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,11,2)
  else:
    thresh,img_bin = cv2.threshold(img,128,255,cv2.THRESH_BINARY | cv2.THRESH_OTSU) 
  img_bin = 255-img_bin
  
  table = Cut_Table(img_bin, orig_img = img)
  
  if(conf.adaptive_filter):
    table_bin = cv2.adaptiveThreshold(table,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,11,2)
  else:
    thresh,table_bin = cv2.threshold(table,128,255,cv2.THRESH_BINARY | cv2.THRESH_OTSU) 
  table_bin = 255-table_bin

  clean_table = Clean_Image(table, table_bin)

  conts = Find_Contours(table_bin)
  boxes = Arrange_Boxes(table, contours =conts)
  
  if(conf.ocr):
    results = OCR_Boxes(clean_table, boxes)
    Save_Excel(results, excel_path)

# **Utility Functions**

## **Cut Table**

In [1]:
def Cut_Table(img, orig_img = None):
  img_vh = Find_Lines(img, conf.kernel_factor, conf.morph_kernel)
  def find_largest_cont(in_img):
    cnts, _ = cv2.findContours(in_img.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    cnts = sorted(cnts, key = cv2.contourArea, reverse = True)[:10]
    table = None
    i = 0
    for c in cnts:
      # approximate the contour
      peri = cv2.arcLength(c, True)
      approx = cv2.approxPolyDP(c, 0.015 * peri, True)

      if len(approx) == 4 and i>conf.passes-1:
        table = approx
        break
      else: print("PASS")
      i +=1
    return table
  
  largest_cont = find_largest_cont(img_vh)

  def crop_minAreaRect(img, screenCnt):
    pts = screenCnt.reshape(4, 2)
    rect = np.zeros((4, 2), dtype = "float32")

    s = pts.sum(axis = 1)
    rect[0] = pts[np.argmin(s)]
    rect[2] = pts[np.argmax(s)]

    diff = np.diff(pts, axis = 1)
    rect[1] = pts[np.argmin(diff)]
    rect[3] = pts[np.argmax(diff)]
    # multiply the rectangle by the original ratio
    #rect *= ratio

    (tl, tr, br, bl) = rect
    widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
    widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))

    heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
    heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))

    maxWidth = max(int(widthA), int(widthB))
    maxHeight = max(int(heightA), int(heightB))

    dst = np.array([
      [0, 0],
      [maxWidth - 1, 0],
      [maxWidth - 1, maxHeight - 1],
      [0, maxHeight - 1]], dtype = "float32")

    M = cv2.getPerspectiveTransform(rect, dst)
    warp = cv2.warpPerspective(img, M, (maxWidth, maxHeight))
    return warp
  table_img = crop_minAreaRect(orig_img, largest_cont)
  if(conf.show_progress):
    print("CROPPED TABLE")
    plotting = plt.imshow(table_img,cmap='gray')
    plt.show()
  return table_img

## **Find Contours**

In [None]:
def Find_Contours(img):
  img_vh = Find_Lines(img, conf.kernel_factor, conf.morph_kernel)
  contours, hierarchy = cv2.findContours(img_vh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
  return contours

## **Find Lines**

In [None]:
def Find_Lines(img_bin, kernel_factor = 300, morph_kernel=1):
  # countcol(width) of kernel as 100th of total width
  kernel_len = np.array(img_bin).shape[1]//kernel_factor
  # Defining a vertical kernel to detect all vertical lines of image 
  ver_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_len))
  # Defining a horizontal kernel to detect all horizontal lines of image
  hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_len, 1))
  # A kernel of 2x2
  kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (morph_kernel, morph_kernel))

  #Use vertical kernel to detect and save the vertical lines in a jpg
  image_1 = cv2.erode(img_bin, ver_kernel, iterations=3)
  vertical_lines = cv2.dilate(image_1, ver_kernel, iterations=3)

  #Use horizontal kernel to detect and save the horizontal lines in a jpg
  image_2 = cv2.erode(img_bin, hor_kernel, iterations=3)
  horizontal_lines = cv2.dilate(image_2, hor_kernel, iterations=3)

  # Combine horizontal and vertical lines in a new third image, with both having same weight.
  img_vh = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
  #Eroding and thesholding the image
  img_vh = cv2.erode(~img_vh, kernel, iterations=2)
  if(conf.adaptive_filter):
    img_vh = cv2.adaptiveThreshold(img_vh,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,11,2)
  else:
    thresh, img_vh = cv2.threshold(img_vh,128,255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
  
  return img_vh

## **Clean Image**

In [None]:
def Clean_Image(img, img_bin):
  if(conf.threshold):
    thresh, my_img = cv2.threshold(img,128,255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)   
  else:
    my_img = img
  
  img_vh = Find_Lines(img_bin, kernel_factor = conf.kernel_factor_clean, morph_kernel = conf.morph_kernel_clean)

  bitxor = cv2.bitwise_xor(my_img,img_vh)
  bitnot = cv2.bitwise_not(bitxor)
  
  if(conf.show_progress):
    print("CLEAN IMAGE")
    plotting = plt.imshow(bitnot,cmap='gray')
    plt.show()

  return bitnot

## **Arrange Boxes**

In [None]:
def Arrange_Boxes(img, contours = None):

  def sort_contours(cnts, method="left-to-right"):
      # initialize the reverse flag and sort index
      reverse = False
      i = 0
      # handle if we need to sort in reverse
      if method == "right-to-left" or method == "bottom-to-top":
          reverse = True
      # handle if we are sorting against the y-coordinate rather than
      # the x-coordinate of the bounding box
      if method == "top-to-bottom" or method == "bottom-to-top":
          i = 1
      # construct the list of bounding boxes and sort them from top to
      # bottom
      boundingBoxes = [cv2.boundingRect(c) for c in cnts]
      (cnts, boundingBoxes) = zip(*sorted(zip(cnts, boundingBoxes),
      key=lambda b:b[1][i], reverse=reverse))
      # return the list of sorted contours and bounding boxes
      return (cnts, boundingBoxes)

  # Sort all the contours by top to bottom.
  contours, boundingBoxes = sort_contours(contours, method="top-to-bottom")

  #Creating a list of heights for all detected boxes
  heights = [boundingBoxes[i][3] for i in range(len(boundingBoxes))]

  #Get mean of heights
  mean = np.mean(heights)

  #Create list box to store all boxes in  
  box = []
  # Get position (x,y), width and height for every contour and show the contour on image
  for c in contours:
      x, y, w, h = cv2.boundingRect(c)
      if (w<conf.max_width and h<conf.max_height and w>conf.min_width and h>conf.min_height):
          image = cv2.rectangle(img,(x,y),(x+w,y+h),(0,255,0),2)
          box.append([x,y,w,h])
          

  if(conf.show_progress):      
    print("EXTRACTED BOXES")  
    plotting = plt.imshow(image,cmap='GnBu_r')
    plt.show()
  #Creating two lists to define row and column in which cell is located
  row=[]
  column=[]
  j=0

  #Sorting the boxes to their respective row and column
  for i in range(len(box)):    
          
      if(i==0):
          column.append(box[i])
          previous=box[i]    
      
      else:
          if(box[i][1]<=previous[1]+mean/2):
              column.append(box[i])
              previous=box[i]            
              
              if(i==len(box)-1):
                  column = sorted(column, key=lambda b:b[0], reverse=True)
                  row.append(column)        
              
          else:
              column = sorted(column, key=lambda b:b[0], reverse=True)
              row.append(column)
              column=[]
              previous = box[i]
              column.append(box[i])
  
  row[len(row)-1] = sorted(row[len(row)-1], key=lambda b:b[0], reverse=True)
  if(conf.show_progress):        
    print("Table Size: ", len(row),"X",len(row[0]))
  return row

## **OCR Boxes**

In [None]:
from tqdm import tqdm

def OCR_Boxes(image,  boxes):
  def clean_string(inp):
      symbs = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+-*|."
      outp = ""
      for c in inp:
          if(symbs.find(c) != -1): outp += c
      return outp

  def clean_string_ara(inp):
      symbs = "دجحخهعغفقثصضطكمنتالبيسشظزوةىلارؤءئذإلإًَُأألأآلآْ "
      outp = ""
      for c in inp:
          if(symbs.find(c) != -1): outp += c
      return outp

  def rotate_box(im, scale = 1.0):
    """angle=270
    (h, w) = im.shape[:2]

    center = (w / 2, h / 2)

    M = cv2.getRotationMatrix2D(center, angle, scale)
    rotated = cv2.warpAffine(im, M, (w, h))"""

    cv.Transpose(img,timg)
    cv.Flip(timg,timg,flipMode=1)

    return rotated

  maxResult = 30
  kx = 0
  mix = 0
  result = []
  for i in range(len(boxes)):
      result.append([])
      for j in range(len(boxes[i])):
          y,x,w,h = boxes[i][j][0],boxes[i][j][1], boxes[i][j][2],boxes[i][j][3]
          finalimg = image[x:x+h, y:y+w]
          kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
          border = cv2.copyMakeBorder(finalimg,2,2,2,2, cv2.BORDER_CONSTANT,value=[255,255])
          resizing = cv2.resize(border, None, fx=5, fy=5, interpolation=cv2.INTER_CUBIC)
          dilation = cv2.dilate(resizing, kernel,iterations=1)
          erosion = cv2.erode(dilation, kernel,iterations=2)
          thresh, erosion = cv2.threshold(erosion,128,255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
          if(i ==0 and j>=conf.rot_start and j<(len(boxes[i])-conf.rot_last)):
              erosion = cv2.rotate(erosion, cv2.ROTATE_90_CLOCKWISE)
          #if(kx> maxResult):
          #  continue
          kx+=1
          arabic_name = i >= conf.ara_row and j == conf.ara_col
          arabic_header = i==0 and conf.header_arabic and not (j > conf.ara_col and j < (len(boxes[i])-conf.last_rows))
          is_arabic = arabic_name or arabic_header
          if(is_arabic):
              erosion = cv2.resize(erosion, None, fx=0.25, fy=0.25, interpolation=cv2.INTER_CUBIC)
              out = pytesseract.image_to_string(erosion, lang = "ara")
              out = clean_string_ara(out)
              #print(out)
              if(len(out)<1):
                  #print("RUNNEEER!")
                  erosion = cv2.resize(erosion, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
                  new_erosion = cv2.dilate(erosion, kernel,iterations=1)
                  out = pytesseract.image_to_string(new_erosion, lang = "ara")
                  out = clean_string_ara(out)
                  #plotting = plt.imshow(new_erosion,cmap='gray')
                  #plt.show()
                  #print(x,y,w,h," || ",out)
                  #mix += 1
                  #if(mix==2):
                  #   x = lo
          else:
              out = pytesseract.image_to_string(erosion)
              out = clean_string(out)
              if(len(out)<1):
                  out = pytesseract.image_to_string(erosion, config=("-c tessedit"
                        "_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789|."
                        " --psm 7"
                        "-l osd"
                        " "))
              out = clean_string(out)
              if(len(out)<1):
                  out = pytesseract.image_to_string(erosion, config=("-c tessedit"
                        "_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789|."
                        " --psm 10"
                        "-l osd"
                        " "))
              out = clean_string(out)
              if(len(out)<1):
                  out = pytesseract.image_to_string(erosion, config=("-c tessedit"
                        "_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789|."
                        " --psm 3"
                        "-l osd"
                        " "))
              out = clean_string(out)

              out = fix_string(out,i,j,len(boxes[i])-conf.last_rows)

          if(conf.show_progress):
              plotting = plt.imshow(erosion,cmap='gray')
              plt.show()
              print(x,y,w,h," || ",out)
          
          result[i].append(out)
  return result

In [None]:
def fix_string(inp, i, j, tot):
      if(i >= conf.ara_row and j > conf.ara_col and j < tot):
          if(inp == "g"):
              inp = "C"
          elif(inp == "f-"):
              inp = "F"
          elif(inp == "f"):
              inp = "F"
          elif(inp == "fF"):
              inp = "F"
          elif(inp == "G"):
              inp = "C"
          elif(inp == "H+"):
              inp = "B+"
          elif(inp == "Bt"):
              inp = "B+"
          elif(inp == "At"):
              inp = "A+"
          elif(inp == "i."):
              inp = "F"
          elif(inp == "t"):
              inp = "F"
          elif(inp == "7"):
              inp = "F"
          elif(inp == "vy"):
              inp = "C"
          elif(inp == "h"):
              inp = "F"
          elif(inp == "a"):
              inp = "F*"
          elif(inp == "a"):###
              inp = "F"
          elif(inp == "i"):
              inp = "F"
          elif(inp == "I"):
              inp = "F"
          elif(inp == "4"):
              inp = "A"
          elif(inp == "HAE"):
              inp = "F"
          elif(inp == "o"):
              inp = "F"
          elif(inp == "oB"):
              inp = "B"
          elif(inp == "Fe"):
              inp = "F*"
          elif(inp == "p*"):
              inp = "F*"
              
          symbs = "ABCDF+-*"
          outp = ""
          for c in inp:
              if(symbs.find(c) != -1): outp += c
          return outp
      return inp

## **Save Excel**

In [None]:
def Save_Excel(result, save_path):
  def FormatString(s):
    if isinstance(s, str):
      try:
        s.encode('unicode_escape').decode('utf-8')
        return s
      except:
        return unidecode(s)
    else:
      return ""
  
  dataframe = pd.DataFrame(result)
  dataframe = dataframe.applymap(FormatString)#lambda x:  if isinstance(x, str) else x)
  #data = dataframe.style.set_properties(align="left")
  dataframe.to_excel(save_path, engine=conf.excel_engine)
  print("Excel file successfully created, saved at", save_path)

In [None]:
        
  if(show_progress):          
    print(len(column))
    print(len(row))

  #calculating maximum number of cells
  max_countcol = 0
  for i in range(len(row)):
      countcol = len(row[i])
      if countcol > max_countcol:
          max_countcol = countcol

  countcol = max_countcol

  #Retrieving the center of each column
  center = [int(row[i][j][0]+row[i][j][2]/2) for j in range(len(row[i])) if row[0]]

  center=np.array(center)
  center.sort()
  #if(show_progress):
  #  print(center)

  #Regarding the distance to the columns center, the boxes are arranged in respective order

  finalboxes = []
  #my_max = 300000000
  #how_many = 1
  return
  if(ocr):
    #from every single image-based cell/box the strings are extracted via pytesseract and stored in a list
    outer=[]
    for i in range(len(finalboxes)):
        for j in range(len(finalboxes[i])):
            inner=''
            if(len(finalboxes[i][j])==0):
                outer.append(' ')
            else:
                for k in range(len(finalboxes[i][j])):
                    y,x,w,h = finalboxes[i][j][k][0],finalboxes[i][j][k][1], finalboxes[i][j][k][2],finalboxes[i][j][k][3]
                    finalimg = bitnot[x:x+h, y:y+w]
                    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 1))
                    border = cv2.copyMakeBorder(finalimg,2,2,2,2, cv2.BORDER_CONSTANT,value=[255,255])
                    resizing = cv2.resize(border, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
                    dilation = cv2.dilate(resizing, kernel,iterations=1)
                    erosion = cv2.erode(dilation, kernel,iterations=2)
                    
                    #out = pytesseract.image_to_string(erosion)
                    #if(len(out)==0):
                    out = pytesseract.image_to_string(erosion, config='--psm 3')
                    if(show_progress):
                      print(x,y,w,h,out)
                    inner = inner +" "+ out
                outer.append(inner)

    #Creating a dataframe of the generated OCR list
    arr = np.array(outer)
    dataframe = pd.DataFrame(arr.reshape(len(row), countcol))
    print(dataframe)
    data = dataframe.style.set_properties(align="left")
    #Converting it in a excel-file
    data.to_excel("/content/test.xlsx", engine=excel_engine)

    def Clean_Image(img, img_bin):
  if(conf.threshold):
    thresh, my_img = cv2.threshold(my_img,128,255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)   
  else:
    my_img = img
  
  img_vh = Find_Lines(img_bin, kernel_factor = conf.kernel_factor_clean, morph_kernel = conf.morph_kernel_clean)

  bitand = cv2.bitwise_and(my_img, cv2.bitwise_not(img_vh))
  bitxor = cv2.bitwise_xor(cv2.bitwise_not(my_img), bitand)
  bitnot = cv2.bitwise_not(bitxor)
  
  if(conf.show_progress):
    print("AND IMAGE")
    plotting = plt.imshow(bitand,cmap='gray')
    plt.show()
    print("MY IMAGE")
    plotting = plt.imshow(cv2.bitwise_not(my_img),cmap='gray')
    plt.show()
    print("XOR IMAGE")
    plotting = plt.imshow(bitxor,cmap='gray')
    plt.show()
    print("CLEAN IMAGE")
    plotting = plt.imshow(bitnot,cmap='gray')
    plt.show()

  return bitnot

IndentationError: ignored

In [None]:
ui = "5431"

if(ui.find("7") != -1):
  print("dfd")