## Imports

In [4]:
import os
import json
import re

# from sklearn.model_selection import train_test_split

In [5]:
# Absolute location of the receipts JSON file, resolved relative to the
# current working directory.
DATA_PATH = os.path.abspath(
    os.path.join("..", "data", "datasets", "ner-swiss-receipts-test.json")
)

In [6]:
# Read the UTF-8 encoded JSON file at DATA_PATH and parse it in one go.
with open(DATA_PATH, mode="r", encoding="utf-8") as fh:
    dataset = json.loads(fh.read())

# Display the parsed content as the cell result.
dataset

[{'image_name': 'swiss_receipt_121.jpeg',
  'text': 'Bäckerei Konditorei\nCafe Moos Fabienne GmbH\nAmlehrıstrasse 48\n\n6005 sriens\n\nTelefon: 041 310 23 id\ninfeübeck-moos.ch\n\nWu, beck-mMOOoS.ch\n\nRechnung\n\njerumzett Beleg Nr.\nz5.,05,2022 OT: 27 S3097\nKu Fr, ’kg kg Fr,\nSuttergip feli\nStk, 1,40 7,60\nLausengipfeli\nStk, 1,40 170\nTotal CHF 1 4 „Ol\n€ 13,59\n\nDC ErT Ir er 2 Er PerFe re\n\n',
  'entities': [[211, 240, 'RECEIPT_ITEM'],
   [211, 225, 'RECEIPT_ITEM_NAME'],
   [236, 240, 'RECEIPT_ITEM_PRICE'],
   [241, 268, 'RECEIPT_ITEM'],
   [241, 254, 'RECEIPT_ITEM_NAME'],
   [265, 268, 'RECEIPT_ITEM_PRICE']]},
 {'image_name': 'swiss_receipt_4.jpeg',
  'text': 'BERSHKA\n\nCHE-100.6492.311 TVvA\nITX Retail Suisse SARL\n\nMall of Switzerland\nEbik-Square Strasse ]\n6030 Ebikon\nTel: 091 440 33 72\n\n14.12.2024 15:06 14408\n12275 063131 01 Trans: 272236\nTAP MwST MGE PREIS SUMME  T\n\nSHB 8.10 1 47.90 47.90 V\n10001827 0743453880504 PULLOVER\n\nNetto gesamt 44.31\nMUST 8.10\n\nGES

In [7]:
# Convert every dataset entry into a (text, {"entities": [...]}) pair,
# dropping the spans labelled 'RECEIPT_ITEM' and keeping all other labels.
data = [
    (
        entry["text"],
        {
            "entities": [
                span for span in entry["entities"] if span[2] != 'RECEIPT_ITEM'
            ]
        },
    )
    for entry in dataset
]

data

[('Bäckerei Konditorei\nCafe Moos Fabienne GmbH\nAmlehrıstrasse 48\n\n6005 sriens\n\nTelefon: 041 310 23 id\ninfeübeck-moos.ch\n\nWu, beck-mMOOoS.ch\n\nRechnung\n\njerumzett Beleg Nr.\nz5.,05,2022 OT: 27 S3097\nKu Fr, ’kg kg Fr,\nSuttergip feli\nStk, 1,40 7,60\nLausengipfeli\nStk, 1,40 170\nTotal CHF 1 4 „Ol\n€ 13,59\n\nDC ErT Ir er 2 Er PerFe re\n\n',
  {'entities': [[211, 225, 'RECEIPT_ITEM_NAME'],
    [236, 240, 'RECEIPT_ITEM_PRICE'],
    [241, 254, 'RECEIPT_ITEM_NAME'],
    [265, 268, 'RECEIPT_ITEM_PRICE']]}),
 ('BERSHKA\n\nCHE-100.6492.311 TVvA\nITX Retail Suisse SARL\n\nMall of Switzerland\nEbik-Square Strasse ]\n6030 Ebikon\nTel: 091 440 33 72\n\n14.12.2024 15:06 14408\n12275 063131 01 Trans: 272236\nTAP MwST MGE PREIS SUMME  T\n\nSHB 8.10 1 47.90 47.90 V\n10001827 0743453880504 PULLOVER\n\nNetto gesamt 44.31\nMUST 8.10\n\nGESAMT\n\n(1 : Fr.0.9385)\nb1.04EUR\n\nOnline EURDCARD Fr.47.90\n\n',
  {'entities': [[217, 218, 'RECEIPT_ITEM_QUANTITY'],
    [259, 267, 'RECEIPT_ITEM_NAME']

In [60]:
# train, test = train_test_split(data, test_size=0.1)

## Regex Baseline Model

In [8]:
# Building blocks of the regex baseline; each one contributes one capture group.
# Quantity: digits, optionally followed by "x"/"X", then separating whitespace.
quantity_pattern = r'(\d+\s*[xX]?)\s+'
# Item name: a run of word characters, whitespace and hyphens, then whitespace.
item_name_pattern = r'([\w\s\-]+)\s+'
# Price: digits, a dot or comma, and exactly two further digits.
price_pattern = r'([\d]+[\.\,]\d{2})'

# Full line pattern: quantity, item name and price, in that order.
item_pattern = "".join((quantity_pattern, item_name_pattern, price_pattern))

In [9]:
def extract_item_data(text, pattern=None):
    """
    Extracts receipt item fields from a text with a regular expression.

    The text is split on "\n" and every line is scanned independently, so a
    match never spans more than one line. The captured groups of all matches
    are returned as one flat list, in order of appearance.

    Args:
        text (str): Text to scan.
        pattern (str, optional): Regular expression applied to every line.
            Defaults to the module-level ``item_pattern``.

    Returns:
        list of str: Captured groups of every match, flattened. A group that
        did not take part in a match yields an empty string. If the pattern
        has no capture groups, the whole match is returned instead.
    """
    if pattern is None:
        # Looked up at call time so that the default follows the notebook's
        # current pattern definition.
        pattern = item_pattern
    compiled = re.compile(pattern)

    flattened_items = []
    for line in text.split("\n"):
        for match in compiled.finditer(line):
            # default="" mirrors what re.findall reports for unmatched groups.
            groups = match.groups(default="")
            flattened_items.extend(groups if groups else (match.group(0),))
    return flattened_items

## Evaluation

In [10]:
def print_confusion_matrix(matrix, labels):
    """
    Prints a confusion matrix as a formatted string.

    The output consists of a "Predicted Values" caption, a line with the
    column labels and one line per matrix row, prefixed with the row label.
    All labels and cells are right-aligned to the same width (length of the
    longest label plus two).

    Args:
        matrix (list of list of int): Confusion matrix as a 2D list.
        labels (list of str): Labels for the matrix.
    """
    # default=0 keeps an empty label list from raising ValueError.
    max_label_length = max((len(label) for label in labels), default=0)
    padding = max_label_length + 2

    # Print header
    # Every data row starts with the row label followed by one separator
    # space, so the header is indented by padding + 1 to line up the columns.
    header = " " * (padding + 1) + " ".join(f"{label:>{padding}}" for label in labels)
    print(f'{"":<15} Predicted Values')
    print(header)

    # Print each row
    for label, row in zip(labels, matrix):
        row_str = " ".join(f"{cell:>{padding}}" for cell in row)
        print(f"{label:>{padding}} {row_str}")

In [11]:
# Entity strings collected over all receipts, by outcome.
true_positive = []
false_positive = []
false_negative = []

for text, annotations in data:
    # Process the text
    predicted_entities = extract_item_data(text)

    # Prepare expected values (y): the text covered by each annotated span
    expected_entities = []
    for ent in annotations.get("entities") or []:
        if ent[2] != 'RECEIPT_ITEM':
            expected_entities.append(text[ent[0]:ent[1]])
    print(expected_entities)

    # Compare recognized entities with the expected ones. A dedicated loop
    # variable is used so the receipt text is not overwritten. Every expected
    # string can be matched only once, hence the removal after a hit.
    for predicted in predicted_entities:
        if predicted in expected_entities:
            true_positive.append(predicted)
            expected_entities.remove(predicted)
        else:
            false_positive.append(predicted)

    # Whatever was expected but never predicted counts as a miss.
    false_negative.extend(expected_entities)

# Confusion matrix: rows are actual values, columns are predicted values.
# No true negatives are collected by the loop above, so that cell is 0.
labels = ["Entity", "No Entity"]
confusion_matrix = [
    [len(true_positive), len(false_negative)],
    [len(false_positive), 0],
]

print('Confusion Matrix:')
print_confusion_matrix(confusion_matrix, labels)

print('\nPerformance Measures:')
n_true_positive = len(true_positive)
n_predicted = n_true_positive + len(false_positive)
n_expected = n_true_positive + len(false_negative)
# Guard every ratio so an empty dataset or a model without any hit reports 0.0
# instead of raising ZeroDivisionError.
precision = n_true_positive / n_predicted if n_predicted else 0.0
recall = n_true_positive / n_expected if n_expected else 0.0
f1_score = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
print('Precision: ' + str(precision))
print('Recall: ' + str(recall))
print('F1 Score: ' + str(f1_score))

# True positive  = predicted entity that matches an annotated entity
# False positive = predicted entity that is not an annotated entity
# False negative = annotated entity that was not predicted

['Suttergip feli', '7,60', 'Lausengipfeli', '170']
['1', 'PULLOVER', '47.90']
['1', 'Big Mac', '6.90', '1', 'Filet-0-Fish', '5,90', '1', 'Medium Pommes Frites', '4.50', '1', 'Coca-Cola 0.4L', '3,50']
['2', '75c1 Anselmi', '98.00', '2', '75 Terra de Alter Re', '110.00', '3', '500] Cola', '19.50', '5', 'Fl Cola', '24.50', '3', '5dl Henniez mit KS', '19.50', '2', 'It Henniez mit KS', '19.60', '1', '3d] Henniez mit KS', '4.90', '2', 'Kaffee Cröme', '9.80', '7', 'Espresso', '34.30', '1', 'Kaffe Zwetschge', '8.50', '1', 'Menu 3', '27.50', 'ı', 'Nüsslisalat Mimosa', '14.50', '2', 'Tagliatel ührısiina', '47.00', '1', 'Gnocchi della casa', '26.50', '1', 'Schweins-cordon-bleu', '38.00', '1', 'Chicken Nuggets', '11.50', '2', 'Pizza Margherita', '35.00', '1', 'Pizza Prosciutto', '21.50', '1', 'Pizza Fiorentina', '23.50', '1', 'Pizza Calzone', '23.50', '1', 'Pizza Mascarpone', '27.50', '1', 'Pizza Möcke', '27.50', '1', '172 Pizza Fantasia', '23.50', '1', 'Kugel Glace', '3.60']
['1 x', '3321 Coca Ca