In [1]:
import json
import os
import shutil

import cv2
import numpy as np
import pytesseract

# Tell pytesseract which Tesseract executable to run. The Homebrew location
# this notebook was written against is kept when it exists; otherwise fall back
# to whatever `tesseract` is on PATH so the notebook also runs on other setups.
# If neither is found the original path is used, which fails exactly as before.
_homebrew_tesseract = r'/opt/homebrew/bin/tesseract'
if os.path.isfile(_homebrew_tesseract):
    pytesseract.pytesseract.tesseract_cmd = _homebrew_tesseract
else:
    pytesseract.pytesseract.tesseract_cmd = shutil.which('tesseract') or _homebrew_tesseract

### Functions for image preprocessing

In [2]:
# get grayscale image
def get_grayscale(image):
    """Convert a BGR image to single-channel grayscale.

    Parameters
    ----------
    image : numpy.ndarray
        Image array. A 2-D array is taken to be grayscale already.

    Returns
    -------
    numpy.ndarray
        The result of ``cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)``, or ``image``
        itself (not a copy) when it is already 2-D.
    """
    # cvtColor(BGR2GRAY) rejects single-channel input, so calling this helper
    # twice in a pipeline used to raise; a 2-D image is passed through instead.
    if image.ndim == 2:
        return image
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# noise removal
def remove_noise(image, ksize=5):
    """Smooth ``image`` with a median filter.

    Parameters
    ----------
    image : numpy.ndarray
        Image to filter.
    ksize : int, optional
        Aperture size passed to ``cv2.medianBlur``; must be odd and greater
        than 1. Defaults to 5, the value that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        The filtered image.

    Raises
    ------
    ValueError
        If ``ksize`` is even or smaller than 3.
    """
    # Checked up front so a bad value gives a readable error instead of an
    # OpenCV assertion failure.
    if ksize < 3 or ksize % 2 == 0:
        raise ValueError(f"ksize must be an odd integer >= 3, got {ksize!r}")
    return cv2.medianBlur(image, ksize)
 
#thresholding
def thresholding(image):
    """Binarise ``image`` using binary thresholding combined with Otsu's method.

    The threshold argument is passed as 0 and the maximum value as 255; only
    the thresholded image (second element of ``cv2.threshold``'s result) is
    returned.
    """
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, binarized = cv2.threshold(image, 0, 255, mode)
    return binarized

#dilation
def dilate(image, kernel_size=5, iterations=1):
    """Dilate ``image`` with a square all-ones structuring element.

    Parameters
    ----------
    image : numpy.ndarray
        Image to dilate.
    kernel_size : int, optional
        Side length of the square ``uint8`` kernel. Defaults to 5, the value
        that used to be hard-coded.
    iterations : int, optional
        Number of times the dilation is applied. Defaults to 1.

    Returns
    -------
    numpy.ndarray
        The dilated image.
    """
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    return cv2.dilate(image, kernel, iterations=iterations)
    
#erosion
def erode(image, kernel_size=5, iterations=1):
    """Erode ``image`` with a square all-ones structuring element.

    Parameters
    ----------
    image : numpy.ndarray
        Image to erode.
    kernel_size : int, optional
        Side length of the square ``uint8`` kernel. Defaults to 5, the value
        that used to be hard-coded.
    iterations : int, optional
        Number of times the erosion is applied. Defaults to 1.

    Returns
    -------
    numpy.ndarray
        The eroded image.
    """
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    return cv2.erode(image, kernel, iterations=iterations)

#opening - erosion followed by dilation
def opening(image, kernel_size=5):
    """Apply a morphological opening (erosion followed by dilation).

    Parameters
    ----------
    image : numpy.ndarray
        Image to process.
    kernel_size : int, optional
        Side length of the square all-ones ``uint8`` kernel. Defaults to 5,
        the value that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        The opened image.
    """
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)

#canny edge detection
def canny(image):
    """Run Canny edge detection on ``image`` with thresholds 100 and 200."""
    lower_threshold, upper_threshold = 100, 200
    edges = cv2.Canny(image, lower_threshold, upper_threshold)
    return edges

#skew correction
def deskew(image):
    """Rotate ``image`` about its centre to straighten its non-zero content.

    The skew is estimated from the minimum-area rectangle around every pixel
    whose value is greater than zero, so the content to straighten must be the
    non-zero pixels.
    NOTE(review): assumes a 2-D (single-channel) image, since the coordinates
    are passed to ``cv2.minAreaRect`` as 2-D points - confirm with callers.

    Parameters
    ----------
    image : numpy.ndarray
        Image to straighten.

    Returns
    -------
    numpy.ndarray
        The rotated image, same width and height as the input. If the image
        has no non-zero pixels an unrotated copy is returned.
    """
    # minAreaRect takes 32-bit point sets; cast explicitly rather than relying
    # on the binding to convert the integer indices from np.where.
    coords = np.column_stack(np.where(image > 0)).astype(np.float32)
    if coords.shape[0] == 0:
        # No foreground to fit a rectangle to: there is no skew to measure.
        return image.copy()

    angle = cv2.minAreaRect(coords)[-1]
    # The rectangle angle is only defined modulo 90 degrees, and OpenCV has
    # reported it both in a -90..0 range (older releases) and a 0..90 range
    # (newer releases). Folding it into (-45, 45] gives the same correction
    # the original `angle < -45` branch produced for the old range and also
    # handles the new one, where that branch never fired and nearly straight
    # text was rotated by about 90 degrees.
    angle = 45 - ((45 + angle) % 90)

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

#template matching
def match_template(image, template):
    """Return ``cv2.matchTemplate``'s result for ``template`` against ``image``.

    The comparison method is ``cv2.TM_CCOEFF_NORMED``.
    """
    method = cv2.TM_CCOEFF_NORMED
    scores = cv2.matchTemplate(image, template, method)
    return scores

#resizing image
def resize(image, scale=1.5, interpolation=None):
    """Scale ``image`` by ``scale`` in both dimensions.

    Parameters
    ----------
    image : numpy.ndarray
        Image whose first two axes are (height, width).
    scale : float, optional
        Multiplier applied to width and height. Must be positive.
        Defaults to 1.5.
    interpolation : int, optional
        OpenCV interpolation flag. ``None`` (the default) selects
        ``cv2.INTER_LINEAR``, which is what this helper always used.

    Returns
    -------
    numpy.ndarray
        The resized image.

    Raises
    ------
    ValueError
        If ``scale`` is zero or negative.
    """
    if scale <= 0:
        raise ValueError(f"scale must be positive, got {scale!r}")
    if interpolation is None:
        interpolation = cv2.INTER_LINEAR

    height, width = image.shape[:2]
    # int() truncates, so a small scale could ask for a 0-pixel side, which
    # OpenCV rejects; keep every side at least one pixel.
    new_width = max(1, int(width * scale))
    new_height = max(1, int(height * scale))
    return cv2.resize(image, (new_width, new_height), interpolation=interpolation)

# Labeling Process

In [4]:
# Load the receipt photo. cv2.imread returns None instead of raising when the
# file is missing or cannot be decoded, so fail here with a clear message
# rather than later inside resize().
receipt_path = './receipt.jpg'
img = cv2.imread(receipt_path)
if img is None:
    raise OSError(f'cv2.imread could not load {receipt_path!r} (missing or unreadable image file)')

# Preprocessing: scale with resize()'s default factor, then convert to grayscale.
resized = resize(img)
gray = get_grayscale(resized)

# TODO: are there more useful image preprocessing methods?

# From here on `img` refers to the preprocessed (resized, grayscale) image.
img = gray

In [5]:
# Run Tesseract on the preprocessed image with the English model, passing
# `--oem 3` and a blacklist of symbol characters via the config string.
# NOTE(review): the receipt text looks German and umlauts come out garbled in
# the output (e.g. "L6ffelbiscuit") - consider whether 'eng' is the right model.
ocr_config = '--oem 3 -c tessedit_char_blacklist="!@%^&*_+=<>?/{}|\\~`£"'
image_text = pytesseract.image_to_string(img, lang='eng', config=ocr_config)
image_text

'MIGROS\n\nGENOSSENSCHAFT MIGROS LUZERN\nM Schlossberg Luzern\n\nArtikelbezeichnung Menge Preis Gespart Total\n\n0. 0.\na bs Ee 11.\n18. 18.\n\n2.\n\nKassentragtasche\nKinder Mix Kalender\nMonini Classico 11\nCrocchini Rosmarino\nMClass L6ffelbiscuit\nZweifel Chips Nature\nServietten 33x33cm\nvalflora Vollrahm UHT\nValflora Vollrahm UHT\nEier Frei landhaltung\nvalflora Vollmilch 11\nOatly Haferdr.Barista\nAgnesi Spaghetti N. 3\nMClass Penne\n\nMClass Skyr Heidelbeer\nYOU Skyr Mango-Passion\nGalbani Mascarpone\nThomy Tomatenptiree\nZwiebeln\n\nBananen\n\nCORPRPRPNA PRP RPRPRPRPRENRPRERE\nNP WORPRPENWHEWNWhUPH\nNOWODRPNNOWRWNWARUNNDN\n\nPRP RP RP RP RP RPRPRPRPRPENRPRPRENN F\n\nTotal CHF\n\nMastercard\n\nBUCHUNG Mastercard\nXXXXXXXXXXXX2 532\n\n30.11.2024 14:08\n#31454544#00295699633131]NA#\n\nTotal EFT CHF:\n\nCumulus-Nummer 2099.603.400.063\nPunktestand per 09.11.2024 1136.45\nErhaltene Punkte\n\nCHE-105.802.421 MWST\n\nSatz Total MwST\n2.60 78.60 1.99\n\n010100731402523011240081000009

In [6]:
# Print the OCR result so its newlines render as line breaks, unlike the
# escaped repr displayed by the previous cell.
print(image_text)

MIGROS

GENOSSENSCHAFT MIGROS LUZERN
M Schlossberg Luzern

Artikelbezeichnung Menge Preis Gespart Total

0. 0.
a bs Ee 11.
18. 18.

2.

Kassentragtasche
Kinder Mix Kalender
Monini Classico 11
Crocchini Rosmarino
MClass L6ffelbiscuit
Zweifel Chips Nature
Servietten 33x33cm
valflora Vollrahm UHT
Valflora Vollrahm UHT
Eier Frei landhaltung
valflora Vollmilch 11
Oatly Haferdr.Barista
Agnesi Spaghetti N. 3
MClass Penne

MClass Skyr Heidelbeer
YOU Skyr Mango-Passion
Galbani Mascarpone
Thomy Tomatenptiree
Zwiebeln

Bananen

CORPRPRPNA PRP RPRPRPRPRENRPRERE
NP WORPRPENWHEWNWhUPH
NOWODRPNNOWRWNWARUNNDN

PRP RP RP RP RP RPRPRPRPRPENRPRPRENN F

Total CHF

Mastercard

BUCHUNG Mastercard
XXXXXXXXXXXX2 532

30.11.2024 14:08
#31454544#00295699633131]NA#

Total EFT CHF:

Cumulus-Nummer 2099.603.400.063
Punktestand per 09.11.2024 1136.45
Erhaltene Punkte

CHE-105.802.421 MWST

Satz Total MwST
2.60 78.60 1.99

010100731402523011240081000009510

Filiale Bed. KNr Bon Datum Zeit
0073140 0537999 252 0081 30

In [373]:
# Snippets to label as receipt items; each must occur verbatim in `image_text`.
# NOTE(review): none of these strings appear in the OCR text printed by the
# cells above - the list looks left over from a different receipt; confirm.
entities_1 = [
    'Geae0i e Stoviglie 1,00',
    'Hacedonia 10.00',
    'Insalata Negombo 10.00',
    'Bufala 8.00',
    'Pomodor ini 4.00',
    'Nat 12 It 1,00'
]


def find_entity_spans(text, snippets, label="RECEIPT_ITEM"):
    """Locate each snippet in ``text`` and return ``[start, end, label]`` spans.

    Parameters
    ----------
    text : str
        Text to search.
    snippets : iterable of str
        Exact substrings to locate, one span per snippet.
    label : str, optional
        Label stored as the third element of every span.

    Returns
    -------
    list of [int, int, str]
        One span per snippet, in the order the snippets were given.

    Raises
    ------
    ValueError
        If any snippet is empty or has no unclaimed occurrence left in
        ``text``; the message lists every such snippet.
    """
    spans = []
    missing = []
    for snippet in snippets:
        start = text.find(snippet) if snippet else -1
        # str.index always returns the first hit, so a repeated line used to
        # get the same span twice. Move on to the next occurrence while the
        # candidate overlaps a span that has already been assigned.
        while start != -1 and any(
            start < used_end and used_start < start + len(snippet)
            for used_start, used_end, _ in spans
        ):
            start = text.find(snippet, start + 1)
        if start == -1:
            missing.append(snippet)
        else:
            spans.append([start, start + len(snippet), label])
    if missing:
        # Same exception type str.index raised, but naming every offender.
        raise ValueError(f'entities not found in text: {missing!r}')
    return spans


entities = find_entity_spans(image_text, entities_1)
for entity, (starting_index, end_index, _) in zip(entities_1, entities):
    print(f'Entity: "{entity}" starts at: {starting_index}, ends at: {end_index}')
    


Entity: "Geae0i e Stoviglie 1,00" starts at: 130, ends at: 153
Entity: "Hacedonia 10.00" starts at: 163, ends at: 178
Entity: "Insalata Negombo 10.00" starts at: 179, ends at: 201
Entity: "Bufala 8.00" starts at: 202, ends at: 213
Entity: "Pomodor ini 4.00" starts at: 214, ends at: 230
Entity: "Nat 12 It 1,00" starts at: 231, ends at: 245


In [374]:
# Append this receipt's text and entity spans to the JSON training file.
# TODO: use a different file when starting a new labeling session
# (only this one path needs to change; it is used for both reading and writing).
training_data_path = '../spacy-ner/training/training_data_fabian_2.json'

# Load the records collected so far; start a new list if the file does not exist yet.
try:
    with open(training_data_path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)
except FileNotFoundError:
    print('file not found')
    data = []

if not isinstance(data, list):
    raise TypeError(f'expected a JSON list in {training_data_path}, found {type(data).__name__}')

record = {
    "text": image_text,
    "entities": entities,
}

# Re-running this cell used to append the same receipt again; only write when
# the record is not already stored.
if record in data:
    print('training data already present, nothing added')
else:
    data.append(record)
    with open(training_data_path, 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile)
    print('added training data')

added training data
