In [2]:
import json
import os

import cv2
import numpy as np
import pytesseract

# Location of the Tesseract executable used by pytesseract. Set the
# TESSERACT_CMD environment variable to run the notebook on another machine;
# the developer-machine path below is only the fallback.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    r'C:\Users\fabia\AppData\Local\Programs\Tesseract-OCR\tesseract.exe',
)

### Functions for image preprocessing

In [3]:
# get grayscale image
def get_grayscale(image):
    """Return a single-channel grayscale version of ``image``.

    Parameters:
        image: NumPy image array. A colour image is expected in BGR channel
            order (the conversion code used is ``COLOR_BGR2GRAY``); a 2-D
            array is treated as already grayscale.

    Returns:
        A new array holding the grayscale image. The input is never returned
        by reference, so callers may modify the result freely.
    """
    # A 2-D array has no channel axis, so it is already grayscale. The
    # BGR->GRAY conversion rejects such input, hence the short-circuit.
    if image.ndim == 2:
        return image.copy()
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# noise removal
def remove_noise(image):
    """Return ``image`` smoothed by OpenCV's median blur with aperture size 5."""
    aperture_size = 5
    blurred = cv2.medianBlur(image, aperture_size)
    return blurred
 
# thresholding
def thresholding(image):
    """Binarize ``image`` with OpenCV's binary threshold combined with the Otsu flag.

    ``cv2.threshold`` returns a ``(threshold_value, image)`` pair; only the
    thresholded image is handed back to the caller.
    """
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _chosen_level, binary = cv2.threshold(image, 0, 255, mode)
    return binary

# dilation
def dilate(image, kernel_size=5, iterations=1):
    """Dilate ``image`` with a rectangular all-ones structuring element.

    Parameters:
        image: Image array passed straight to ``cv2.dilate``.
        kernel_size: Side length of a square kernel, or a ``(rows, cols)``
            pair for a non-square one. Defaults to 5 (the original 5x5 kernel).
        iterations: Number of times the dilation is applied. Defaults to 1.

    Returns:
        The dilated image as returned by ``cv2.dilate``.
    """
    if np.isscalar(kernel_size):
        shape = (kernel_size, kernel_size)
    else:
        shape = tuple(kernel_size)
    kernel = np.ones(shape, np.uint8)
    return cv2.dilate(image, kernel, iterations=iterations)
    
# erosion
def erode(image, kernel_size=5, iterations=1):
    """Erode ``image`` with a rectangular all-ones structuring element.

    Parameters:
        image: Image array passed straight to ``cv2.erode``.
        kernel_size: Side length of a square kernel, or a ``(rows, cols)``
            pair for a non-square one. Defaults to 5 (the original 5x5 kernel).
        iterations: Number of times the erosion is applied. Defaults to 1.

    Returns:
        The eroded image as returned by ``cv2.erode``.
    """
    if np.isscalar(kernel_size):
        shape = (kernel_size, kernel_size)
    else:
        shape = tuple(kernel_size)
    kernel = np.ones(shape, np.uint8)
    return cv2.erode(image, kernel, iterations=iterations)

# opening - erosion followed by dilation
def opening(image, kernel_size=5):
    """Apply a morphological opening (``cv2.MORPH_OPEN``) to ``image``.

    Parameters:
        image: Image array passed straight to ``cv2.morphologyEx``.
        kernel_size: Side length of a square all-ones kernel, or a
            ``(rows, cols)`` pair for a non-square one. Defaults to 5
            (the original 5x5 kernel).

    Returns:
        The opened image as returned by ``cv2.morphologyEx``.
    """
    if np.isscalar(kernel_size):
        shape = (kernel_size, kernel_size)
    else:
        shape = tuple(kernel_size)
    kernel = np.ones(shape, np.uint8)
    return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)

# canny edge detection
def canny(image):
    """Run OpenCV's Canny edge detector on ``image`` with thresholds 100 and 200."""
    first_threshold, second_threshold = 100, 200
    edges = cv2.Canny(image, first_threshold, second_threshold)
    return edges

# skew correction
def deskew(image):
    """Rotate ``image`` about its centre to undo the skew of its non-zero pixels.

    The skew is estimated from the minimum-area rectangle around every pixel
    whose value is greater than zero.

    Parameters:
        image: NumPy image array; pixels ``> 0`` are treated as foreground.

    Returns:
        The rotated image (same width and height as the input). When the image
        has no foreground pixels there is nothing to estimate a skew from, and
        an unrotated copy is returned.
    """
    # np.where yields (row, col) index pairs for every foreground pixel.
    coords = np.column_stack(np.where(image > 0))
    if coords.size == 0:
        # No points to fit a rectangle to; skip the OpenCV calls entirely.
        return image.copy()

    angle = cv2.minAreaRect(coords)[-1]
    # The reported angle is folded into a correction within [-45, 45].
    # Older OpenCV builds report angles in [-90, 0); newer ones (reportedly
    # 4.5.1 onwards) describe the same rectangle with an angle in (0, 90],
    # i.e. shifted by +90. The middle branch covers the newer range; without
    # it an angle such as 80 would turn into an 80 degree rotation.
    if angle < -45:
        angle = -(90 + angle)
    elif angle >= 45:
        angle = 90 - angle
    else:
        angle = -angle

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    # BORDER_REPLICATE fills the corners exposed by the rotation with edge pixels.
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

# template matching
def match_template(image, template):
    """Return OpenCV's ``TM_CCOEFF_NORMED`` match result of ``template`` against ``image``."""
    method = cv2.TM_CCOEFF_NORMED
    response = cv2.matchTemplate(image, template, method)
    return response

# resizing image
def resize(image, scale=1.5):
    """Scale ``image`` by ``scale`` in both dimensions using linear interpolation.

    Parameters:
        image: NumPy image array; its first two axes are height and width.
        scale: Positive multiplier applied to width and height. Defaults to 1.5.

    Returns:
        The resized image as returned by ``cv2.resize``.

    Raises:
        ValueError: If ``scale`` is not a positive number.
    """
    # `not scale > 0` also rejects NaN, which `scale <= 0` would let through.
    if not scale > 0:
        raise ValueError(f'scale must be positive, got {scale!r}')
    height, width = image.shape[:2]
    # int() truncates, so a small scale could yield a 0-pixel side, which is
    # not a valid target size; keep at least one pixel per dimension.
    new_width = max(1, int(width * scale))
    new_height = max(1, int(height * scale))
    resized_image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
    return resized_image

# Labeling Process

In [377]:
receipt_path = '../receipts/receipts-fabian/receipt202.jpg'
img = cv2.imread(receipt_path)
# cv2.imread reports a missing or unreadable file by returning None instead of
# raising, so fail here with the path rather than later inside resize().
if img is None:
    raise FileNotFoundError(f'could not read image: {receipt_path}')

resized = resize(img)
gray = get_grayscale(resized)

# TODO: are there more useful image preprocessing methods?

# The OCR call reads `img`, so point it at the preprocessed (grayscale) image.
img = gray

In [378]:
# Tesseract options: engine mode 3 plus a blacklist of characters that should
# not appear in the recognised text.
# NOTE(review): lang is 'eng' although the receipt output shown below is German
# and comes back without umlauts — confirm whether a German model is intended.
ocr_config = '--oem 3 -c tessedit_char_blacklist="!@%^&*_+=<>?/{}|\\~`£"'
image_text = pytesseract.image_to_string(img, lang='eng', config=ocr_config)
image_text

'Bohmer straBe 11a\n4190 Bad Leonfel den\nTel: 0721380063\n\nThr Einkauf am eel\nEUR\nSPAR CLEMENT INEN 1KG 0,99 A Se ee\naktionsersparnis 1,0 seg siane wie ine” Se eee\nBERGBARON 3,99 A eae ea a ee\nAkti onsersparni s 4,30 ee Seman ee . mer\nMengenvortei I -1,00 e- oe ee ATS\nBRESSO KNOBL 1,99 A Se aE “Sees\nMengenvor tei I -1,00 RT:\nSUMME 6,96 i: as\nIhre Ersparnis heute: 7,30 EUR 3 feats tr\nZAHLUNG BAR 10,00 ; ae es\nRUCKGELD 3,04 ret\nexcl. MWST . Incl.\n\n10 ,00 6,33\n\njer 216202 Kass\n\nwu\n\n96420003816\n\nKass a 002 Bon 7324 Pos.4\n\nherhei tsverordnung\n\nrkassensic\ntYn5qzR3ta7s8\n\nksv.spar.a\n\nVielen Dank fur\nThren Einkauf bei\nSPAR\n\nRegistrie\nhttps:17°\n\n'

In [379]:
# Print the OCR result so the newline escapes are rendered as real line breaks.
print(image_text)

Bohmer straBe 11a
4190 Bad Leonfel den
Tel: 0721380063

Thr Einkauf am eel
EUR
SPAR CLEMENT INEN 1KG 0,99 A Se ee
aktionsersparnis 1,0 seg siane wie ine” Se eee
BERGBARON 3,99 A eae ea a ee
Akti onsersparni s 4,30 ee Seman ee . mer
Mengenvortei I -1,00 e- oe ee ATS
BRESSO KNOBL 1,99 A Se aE “Sees
Mengenvor tei I -1,00 RT:
SUMME 6,96 i: as
Ihre Ersparnis heute: 7,30 EUR 3 feats tr
ZAHLUNG BAR 10,00 ; ae es
RUCKGELD 3,04 ret
excl. MWST . Incl.

10 ,00 6,33

jer 216202 Kass

wu

96420003816

Kass a 002 Bon 7324 Pos.4

herhei tsverordnung

rkassensic
tYn5qzR3ta7s8

ksv.spar.a

Vielen Dank fur
Thren Einkauf bei
SPAR

Registrie
https:17°




In [373]:
# Exact substrings of `image_text` that should be labelled as receipt items.
# NOTE(review): none of these strings occur in the receipt202 OCR text shown
# above — they look left over from an earlier receipt; update before re-running.
entities_1 = [
    'Geae0i e Stoviglie 1,00',
    'Hacedonia 10.00',
    'Insalata Negombo 10.00',
    'Bufala 8.00',
    'Pomodor ini 4.00',
    'Nat 12 It 1,00'
]


def find_entity_spans(text, entity_texts, label="RECEIPT_ITEM"):
    """Locate every entity string in ``text`` and return its character span.

    Parameters:
        text: The full text the entities were copied from.
        entity_texts: Iterable of exact substrings to locate.
        label: Label stored as the third element of each span.

    Returns:
        A list of ``[start, end, label]`` lists, one per entity and in the
        same order as ``entity_texts``. Spans never overlap: when a string
        occurs more than once (or inside an earlier match), each entry is
        given the next occurrence that is still free.

    Raises:
        ValueError: If any entity is empty or has no free occurrence in
            ``text``. All offending strings are listed in the message.
    """
    spans = []
    missing = []
    for entity in entity_texts:
        # An empty string "matches" everywhere; treat it as not found.
        start = text.find(entity) if entity else -1
        # Walk forward past occurrences already claimed by an earlier entity.
        while start != -1 and any(
            start < taken_end and taken_start < start + len(entity)
            for taken_start, taken_end, _ in spans
        ):
            start = text.find(entity, start + 1)
        if start == -1:
            missing.append(entity)
        else:
            spans.append([start, start + len(entity), label])
    if missing:
        raise ValueError(f'entities not found in OCR text: {missing!r}')
    return spans


# Reset first so a failed lookup cannot leave spans from a previous receipt behind.
entities = []
entities = find_entity_spans(image_text, entities_1)
for entity, (starting_index, end_index, _) in zip(entities_1, entities):
    print(f'Entity: "{entity}" starts at: {starting_index}, ends at: {end_index}')
    


Entity: "Geae0i e Stoviglie 1,00" starts at: 130, ends at: 153
Entity: "Hacedonia 10.00" starts at: 163, ends at: 178
Entity: "Insalata Negombo 10.00" starts at: 179, ends at: 201
Entity: "Bufala 8.00" starts at: 202, ends at: 213
Entity: "Pomodor ini 4.00" starts at: 214, ends at: 230
Entity: "Nat 12 It 1,00" starts at: 231, ends at: 245


In [374]:
# Append the labelled OCR text to the JSON training file.
# TODO: use a different file when starting a new labeling session
# (the path is defined once here and used for both reading and writing).
TRAINING_DATA_PATH = '../spacy-ner/training/training_data_fabian_2.json'

try:
    with open(TRAINING_DATA_PATH, 'r', encoding='utf-8') as file:
        data = json.load(file)
except FileNotFoundError:
    # First run of a labeling session: start with an empty data set.
    print('file not found')
    data = []

# One training record: the full OCR text plus its [start, end, label] spans.
x = {
  "text": image_text,
  "entities": entities,
}

# Re-running this cell must not store the same record twice.
if x in data:
    print('training data already present, skipping')
else:
    data.append(x)

    # convert into JSON:
    y = json.dumps(data)

    with open(TRAINING_DATA_PATH, "w", encoding='utf-8') as outfile:
        outfile.write(y)

    print('added training data')

added training data
