In [1]:
import json
import os

import cv2
import numpy as np
import pytesseract

# Location of the Tesseract executable used by pytesseract.
# Set the TESSERACT_CMD environment variable to override it on another machine;
# when the variable is unset the original local install path is used.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    r'C:\Users\fabia\AppData\Local\Programs\Tesseract-OCR\tesseract.exe',
)

### Functions for image preprocessing

In [169]:
# get grayscale image
def get_grayscale(image):
    """Return a single-channel grayscale version of `image`.

    Parameters
    ----------
    image : array with an ``ndim`` attribute (e.g. a NumPy image array)
        A 2-D array is treated as already grayscale; anything else is
        converted with ``cv2.COLOR_BGR2GRAY``.

    Returns
    -------
    A new array; the input is never returned by reference.
    """
    # A 2-D input has no channel axis, and cvtColor(BGR2GRAY) would reject it.
    # Returning a copy makes the function safe to call twice on the same data.
    if image.ndim == 2:
        return image.copy()
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# noise removal
def remove_noise(image):
    """Return `image` smoothed by a median blur with aperture size 5."""
    aperture = 5
    denoised = cv2.medianBlur(image, aperture)
    return denoised
 
#thresholding
def thresholding(image):
    """Binarize `image` using the THRESH_BINARY + THRESH_OTSU mode.

    Calls cv2.threshold with thresh=0 and maxval=255 and returns only the
    thresholded image (the second element of cv2.threshold's result).
    """
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, binary = cv2.threshold(image, 0, 255, mode)
    return binary

#dilation
def dilate(image):
    """Dilate `image` once with a 5x5 all-ones uint8 kernel."""
    structuring_element = np.ones(shape=(5, 5), dtype=np.uint8)
    dilated = cv2.dilate(image, structuring_element, iterations=1)
    return dilated
    
#erosion
def erode(image):
    """Erode `image` with a single pass of a 5x5 all-ones uint8 kernel."""
    kernel_shape = (5, 5)
    ones_kernel = np.ones(kernel_shape, dtype=np.uint8)
    return cv2.erode(image, ones_kernel, iterations=1)

#opening - erosion followed by dilation
def opening(image):
    """Apply cv2.MORPH_OPEN to `image` with a 5x5 all-ones uint8 kernel."""
    square = np.ones((5, 5), dtype=np.uint8)
    operation = cv2.MORPH_OPEN
    opened = cv2.morphologyEx(image, operation, square)
    return opened

#canny edge detection
def canny(image):
    """Run cv2.Canny on `image` with thresholds 100 and 200 and return the result."""
    lower_threshold, upper_threshold = 100, 200
    edges = cv2.Canny(image, lower_threshold, upper_threshold)
    return edges

#skew correction
def deskew(image):
    """Rotate `image` about its centre to undo the skew of its non-zero pixels.

    The skew is estimated from the minimum-area rectangle enclosing every
    pixel whose value is > 0, and the image is rotated by the opposite angle.

    Parameters
    ----------
    image : NumPy array
        Image whose non-zero pixels are the foreground used for the estimate.

    Returns
    -------
    A new array with the same width and height as `image`. If `image` has no
    pixel > 0 there is nothing to measure and an unrotated copy is returned.
    """
    # (row, col) positions of all foreground pixels. minAreaRect only accepts
    # 32-bit integer or float points, while np.where yields the platform
    # default integer (64-bit on many systems), so cast explicitly.
    coords = np.column_stack(np.where(image > 0)).astype(np.float32)
    if coords.size == 0:
        return image.copy()

    raw_angle = cv2.minAreaRect(coords)[-1]

    # The reported angle is only meaningful modulo 90 degrees, and OpenCV
    # releases differ in the range they report ([-90, 0) in older versions,
    # (0, 90] in newer ones). Folding into [0, 90) first yields the same
    # correction, always within (-45, 45], for either convention.
    folded = raw_angle % 90
    if folded < 45:
        angle = -folded
    else:
        angle = 90 - folded

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

#template matching
def match_template(image, template):
    """Return cv2.matchTemplate's result for `template` in `image` using TM_CCOEFF_NORMED."""
    method = cv2.TM_CCOEFF_NORMED
    scores = cv2.matchTemplate(image, template, method)
    return scores

#resizing image
def resize(image, scale=1.5):
    """Scale `image` by `scale` in both dimensions using linear interpolation.

    The target width and height are the original ones multiplied by `scale`
    and truncated with int().
    """
    src_height, src_width = image.shape[:2]
    target_size = (int(src_width * scale), int(src_height * scale))
    return cv2.resize(image, target_size, interpolation=cv2.INTER_LINEAR)

# Labeling Process

In [185]:
receipt_path = '../receipts/receipts-fabian/receipt68.jpg'

# cv2.imread does not raise for a missing or unreadable file, it returns None.
# Fail here with the offending path instead of inside resize() further down.
img = cv2.imread(receipt_path)
if img is None:
    raise FileNotFoundError(f'could not read image: {receipt_path}')

resized = resize(img)
gray = get_grayscale(resized)

# TODO: are there more useful image preprocessing methods?

# The OCR cell below reads `img`, so point it at the preprocessed image.
img = gray

In [187]:
# Tesseract options: engine mode 3 plus a blacklist of symbols it must not emit.
tesseract_config = '--oem 3 -c tessedit_char_blacklist="!@%^&*_+=<>?/{}|\\~`£"'

# Run OCR on the preprocessed image with the English language model.
image_text = pytesseract.image_to_string(
    img,
    lang='eng',
    config=tesseract_config,
)
image_text

'RISTORANTE\nBELLAGIO (CO)\nTELEFONO 031951888\n\nEY aerate Coenen eyes Sto oe\n\nTAV. 9 COPERTI 4\nSALA UNICA\n147072019 16:23\n\n4 COPERTO 12.00\n2 ACQUA NAT 8.00\n1 SALMERINO PIASTRA 18.00\n1 TRENETTE AL PESTO 13.00\n1 SPAGHETTI POM. BASILICO 10.00\n1 FETTUCCINE RAGU’ 15.00\n1—-PATATINE FRITTE 6.00\n\nTOKE §»— «82.0\n\nERE Tee oi sss\n\nPrego ritirare i]\ndocumento fiscale\nalla cassa\n'

In [188]:
# Print the OCR string so its newlines are rendered as separate lines
# (the previous cell shows the repr with literal \n escapes).
print(image_text)

RISTORANTE
BELLAGIO (CO)
TELEFONO 031951888

EY aerate Coenen eyes Sto oe

TAV. 9 COPERTI 4
SALA UNICA
147072019 16:23

4 COPERTO 12.00
2 ACQUA NAT 8.00
1 SALMERINO PIASTRA 18.00
1 TRENETTE AL PESTO 13.00
1 SPAGHETTI POM. BASILICO 10.00
1 FETTUCCINE RAGU’ 15.00
1—-PATATINE FRITTE 6.00

TOKE §»— «82.0

ERE Tee oi sss

Prego ritirare i]
documento fiscale
alla cassa



In [160]:
def find_entity_spans(text, labels, tag="RECEIPT_ITEM"):
    """Locate each label in `text` and return its character span.

    Parameters
    ----------
    text : str
        The string to search (the OCR output).
    labels : iterable of str
        Substrings to locate, in the order their spans should be returned.
    tag : str
        Entity label stored as the third element of every span.

    Returns
    -------
    list of [start, end, tag] lists, one per label, `end` being exclusive.

    Raises
    ------
    ValueError
        If a label is empty, or has no occurrence in `text` that is not
        already covered by an earlier span. The message names the label.
    """
    spans = []
    for label in labels:
        if not label:
            raise ValueError('empty entity label')
        start = text.find(label)
        # A repeated line item must map to its next occurrence rather than to
        # the first one again, so skip matches overlapping an earlier span.
        while start != -1 and any(
            start < taken_end and taken_start < start + len(label)
            for taken_start, taken_end, _ in spans
        ):
            start = text.find(label, start + 1)
        if start == -1:
            raise ValueError(
                f'Entity "{label}" not found in text '
                '(or every occurrence is already labeled)'
            )
        spans.append([start, start + len(label), tag])
    return spans


# Receipt line items to label. Each entry must be copied verbatim from the
# current `image_text`, OCR mistakes included, otherwise the lookup fails.
entities_1 = [
    'G] Karma Cab Sauv 12.00',
    'Gl Dougass Green SB 11.00',
    'PKI I Green Curry Scallops 27.95',
    'PKI Piri Piri Chicken 7315'
]

entities = find_entity_spans(image_text, entities_1)
for entity, (starting_index, end_index, _tag) in zip(entities_1, entities):
    print(f'Entity: "{entity}" starts at: {starting_index}, ends at: {end_index}')
    


Entity: "G] Karma Cab Sauv 12.00" starts at: 78, ends at: 101
Entity: "Gl Dougass Green SB 11.00" starts at: 102, ends at: 127
Entity: "PKI I Green Curry Scallops 27.95" starts at: 128, ends at: 160
Entity: "PKI Piri Piri Chicken 7315" starts at: 161, ends at: 187


In [161]:
# Append the current receipt (OCR text + entity spans) to the JSON training file.
# TODO: use a different file when starting a new labeling session
# (the path is defined once here and used for both reading and writing)
training_data_path = '../spacy-ner/training/training_data_fabian_1.json'

# Load the samples stored so far; start a fresh list if the file does not exist yet.
try:
    with open(training_data_path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)
except FileNotFoundError:
    print('file not found')
    data = []

sample = {
  "text": image_text,
  "entities": entities,
}

# Re-running this cell must not store the same labeled receipt twice.
if sample in data:
    print('training data already present, nothing added')
else:
    data.append(sample)
    # Serialize before opening the file for writing, so a serialization error
    # cannot leave a truncated file behind.
    serialized = json.dumps(data)
    with open(training_data_path, 'w', encoding='utf-8') as outfile:
        outfile.write(serialized)
    print('added training data')

added training data
