# Import Packages

In [1]:
import json
import shutil

import cv2
import numpy as np
import pytesseract

# Locate the tesseract binary. The Homebrew location used by this project is
# tried first; if it is not an executable file on this machine, fall back to
# whatever `tesseract` is found on PATH. If neither exists, keep the Homebrew
# path so the failure surfaces when OCR is first invoked.
_HOMEBREW_TESSERACT = r'/opt/homebrew/bin/tesseract'
pytesseract.pytesseract.tesseract_cmd = (
    shutil.which(_HOMEBREW_TESSERACT)
    or shutil.which('tesseract')
    or _HOMEBREW_TESSERACT
)

# Data Labeling

In [2]:
# Location of the receipt to label; change `image_name` for each labeling run.
path_to_images = '../data/images/receipts-switzerland/'
image_name = 'swiss_receipt_1.jpeg'

# cv2.imread does not raise on a missing or unreadable file; it returns None,
# which would only surface later as an AttributeError on `image.shape`.
# Fail here with a clear message instead.
img = cv2.imread(path_to_images + image_name)
if img is None:
    raise FileNotFoundError(f'Could not read image: {path_to_images + image_name}')

## Image text extraction

In order to label the image correctly for our training with spaCy, we first have to convert the image to text using a modern OCR engine such as Tesseract.

### Image Preprocessing

To improve the text extraction, we apply some image preprocessing: the image is upscaled and then converted to grayscale.

In [3]:
def resize(image, scale=1.5):
    """Rescale ``image`` by ``scale`` along both width and height.

    Args:
        image: Image array; only the first two entries of ``image.shape``
            (height, width) are read.
        scale: Multiplier applied to both dimensions. The target dimensions
            are truncated to integers.

    Returns:
        The result of ``cv2.resize`` with linear interpolation.
    """
    src_height, src_width = image.shape[:2]
    # cv2.resize expects the target size as (width, height).
    target_size = (int(src_width * scale), int(src_height * scale))
    return cv2.resize(image, target_size, interpolation=cv2.INTER_LINEAR)

# Image Resizing
# Upscale using the default factor of `resize` (1.5).
resized = resize(img)

# Grayscale Image
# NOTE(review): this rebinds `img` from the loaded image to its grayscale
# version, so re-running this cell without re-running the load cell feeds the
# grayscale image back in; presumably cv2.cvtColor(..., COLOR_BGR2GRAY) then
# fails. Confirm, and re-run the load cell first when iterating.
img = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)

### Text Extraction

In [4]:
# Tesseract options: `--oem 3` sets the OCR engine mode and the `-c` variable
# blacklists the listed symbols from the recognised text.
tesseract_config = '--oem 3 -c tessedit_char_blacklist="!@%^&*_+=<>?/{}|\\~`£"'
image_text = pytesseract.image_to_string(img, lang='eng', config=tesseract_config)

print(image_text)

(2fv) Die Gastronomiegruppe

Hochschule Luzern
Informatik 8 Wirtschaft
C‘o ZFV-Unternehmungen
Fluelastrasse 51
8047 Zurich

CHF
Pizza Neapolitana 10.50
Ramseier Apfelschor] 3.00

Total CHF I, 00

Z MASTERCARD 13,50

MwSt

MwSt Prozent Netto Brutto
(1) 8.18 12,49 13.50 1.0]

Profit Center: InHouse
29.11.2024 11:36:49 #:2450 0p:568071 C56
8002 $:568

ES bediente Sie - 9€1f Check

Kopie # ]

CHE-105 827. 102 MWST
Vielen Dank und auf Wiedersehen




## Entity Extraction

In [5]:
# Manually transcribed line items for this receipt.
# Every string must be copied verbatim from the OCR output (including OCR
# mistakes such as the trailing "]"), because the labeling loop locates them
# with an exact substring search.
#   item:     the full receipt line
#   quantity: quantity text inside `item`, or None if the line has none
#   name:     product name text inside `item`
#   price:    price text of the line
items = [
    {
        'item': 'Pizza Neapolitana 10.50',
        'quantity': None,
        'name': 'Pizza Neapolitana',
        'price': '10.50'
    },
    {
        'item': 'Ramseier Apfelschor] 3.00',
        'quantity': None,
        'name': 'Ramseier Apfelschor]',
        'price': '3.00'
    }
]

# Build character spans [start, end, label] for every labeled line item and
# its parts. Offsets index into `image_text`; `end` is exclusive.
entities = []
# Each item is searched for after the end of the previous one, so that a line
# occurring twice on the receipt maps to successive occurrences instead of
# always to the first one (assumes `items` follows receipt order).
search_from = 0
for entity in items:
    # item
    item = entity['item']
    item_starting_index = image_text.find(item, search_from)
    if item_starting_index == -1:
        # Not found past the previous item (e.g. items listed out of order):
        # fall back to the first match. Raises ValueError if the text is absent.
        item_starting_index = image_text.index(item)
    item_end_index = item_starting_index + len(item)
    search_from = item_end_index
    entities.append([item_starting_index, item_end_index, "RECEIPT_ITEM"])
    print(f'Item: "{item}" starts at: {item_starting_index}, ends at: {item_end_index}')

    # quantity
    if entity['quantity'] is not None:
        item_quantity = entity['quantity']
        starting_index = item_starting_index + item.index(item_quantity)
        end_index = starting_index + len(item_quantity)
        entities.append([starting_index, end_index, "RECEIPT_ITEM_QUANTITY"])
        print(f'\tQuantity: "{item_quantity}" starts at: {starting_index}, ends at: {end_index}')

    # name
    if entity['name'] is not None:
        item_name = entity['name']
        starting_index = item_starting_index + item.index(item_name)
        end_index = starting_index + len(item_name)
        entities.append([starting_index, end_index, "RECEIPT_ITEM_NAME"])
        print(f'\tName: "{item_name}" starts at: {starting_index}, ends at: {end_index}')

    # price
    if entity['price'] is not None:
        item_price = entity['price']
        # Resolve the price inside its own line rather than in the whole text;
        # otherwise an identical amount earlier on the receipt would be labeled.
        # rindex picks the last occurrence within the line.
        # TODO confirm the price is always the trailing amount of the line.
        starting_index = item_starting_index + item.rindex(item_price)
        end_index = starting_index + len(item_price)
        entities.append([starting_index, end_index, "RECEIPT_ITEM_PRICE"])
        print(f'\tPrice: "{item_price}" starts at: {starting_index}, ends at: {end_index}')
    


Entity: "Pizza Neapolitana 10.50" starts at: 128, ends at: 151
	Name: "Pizza Neapolitana" starts at: 128, ends at: 145
	Price: "10.50" starts at: 146, ends at: 151
Entity: "Ramseier Apfelschor] 3.00" starts at: 152, ends at: 177
	Name: "Ramseier Apfelschor]" starts at: 152, ends at: 172
	Price: "3.00" starts at: 173, ends at: 177


Check the output of the entity extraction and make sure everything is correct. The indexes can be wrong when the same text occurs more than once, because `str.index()` only returns the first match.

In [6]:
# Show the collected [start, end, label] spans for manual review.
entities

[[128, 151, 'RECEIPT_ITEM'],
 [128, 145, 'RECEIPT_ITEM_NAME'],
 [146, 151, 'RECEIPT_ITEM_PRICE'],
 [152, 177, 'RECEIPT_ITEM'],
 [152, 172, 'RECEIPT_ITEM_NAME'],
 [173, 177, 'RECEIPT_ITEM_PRICE']]

### (OPTIONAL) Fix if necessary

To fix the indexes, copy the list from above and overwrite the `entities` list with the correct indices.

In [7]:
# Uncomment and paste the corrected spans here, e.g.:
# entities = [[start, end, "LABEL"], ...]

## Save Image Reference, Extracted Text & Entities to Dataset

Load the already existing training data. Use a separate file for different types of receipts or new labeling entities.

In [9]:
# Start from the existing dataset when there is one, otherwise from an empty list.
try:
    # TODO: use different file when starting new labeling session and update bottom write execution
    with open('../data/datasets/training_v2_test.json', 'r') as file:
        data = json.load(file)
except FileNotFoundError:
    print('file not found')
    data = []

In [10]:
# Bundle the image reference, the OCR text and the labeled spans into one dataset record.
new_data_entry = dict(
    image_name=image_name,
    text=image_text,
    entities=entities,
)

new_data_entry

{'image_name': 'swiss_receipt_1.jpeg',
 'text': '(2fv) Die Gastronomiegruppe\n\nHochschule Luzern\nInformatik 8 Wirtschaft\nC‘o ZFV-Unternehmungen\nFluelastrasse 51\n8047 Zurich\n\nCHF\nPizza Neapolitana 10.50\nRamseier Apfelschor] 3.00\n\nTotal CHF I, 00\n\nZ MASTERCARD 13,50\n\nMwSt\n\nMwSt Prozent Netto Brutto\n(1) 8.18 12,49 13.50 1.0]\n\nProfit Center: InHouse\n29.11.2024 11:36:49 #:2450 0p:568071 C56\n8002 $:568\n\nES bediente Sie - 9€1f Check\n\nKopie # ]\n\nCHE-105 827. 102 MWST\nVielen Dank und auf Wiedersehen\n\n',
 'entities': [[128, 151, 'RECEIPT_ITEM'],
  [128, 145, 'RECEIPT_ITEM_NAME'],
  [146, 151, 'RECEIPT_ITEM_PRICE'],
  [152, 177, 'RECEIPT_ITEM'],
  [152, 172, 'RECEIPT_ITEM_NAME'],
  [173, 177, 'RECEIPT_ITEM_PRICE']]}

In [11]:
# Append only if this exact record is not stored yet, so re-running the cell
# (or re-labeling an unchanged receipt) does not duplicate training samples.
entry_added = new_data_entry not in data
if entry_added:
    data.append(new_data_entry)

# convert into JSON:
json_data = json.dumps(data)

# TODO: use different file when starting new labeling session; keep this path
# in sync with the file the dataset is loaded from.
# Plain "w" is sufficient: the file is only written, never read back here.
with open("../data/datasets/training_v2_test.json", "w") as outfile:
    outfile.write(json_data)

print('added training data' if entry_added else 'training data already present, nothing added')

added training data
