## Imports

In [24]:
import os

import spacy
from spacy import displacy

import random
import json
import numpy as np

from sklearn.model_selection import train_test_split
from spacy.training.example import Example

In [2]:
DATA_PATH = os.path.abspath("../../data/datasets/ner-swiss-receipts.json")  # Path to your training data file
OUTPUT_DIR = os.path.abspath("../models/receipt-item-details-ner-v01")  # TODO: make sure to use a new version
ITERATIONS = 30  # Number of training iterations
BATCH_SIZE = 8  # Batch size for training
DROPOUT = 0.5  # Dropout rate

## Preprocessing

### Load Dataset

In [3]:
with open(DATA_PATH, "r", encoding="utf-8") as file:
    dataset = json.load(file)

### Reformat for Spacy & Remove unwanted Entities

In [4]:
data = []
for entry in dataset:
    text = entry["text"]
    entities = [entity for entity in entry["entities"] if entity[2] != 'RECEIPT_ITEM']
    data.append((text, {"entities": entities}))

### Split Data into Train & Test

In [5]:
train, test = train_test_split(data, test_size=0.1)

In [6]:
print("Sample Training Data:")
sample_text, sample_annotaiton = train[0]
print("-----------------------------------------")
print(sample_text)
print("-----------------------------------------")
for ent in sample_annotaiton.get("entities"):
    print(f'Entity-Type: {ent[2]}, Entity text: {sample_text[ent[0]:ent[1]]}')

Sample Training Data:
-----------------------------------------
jez MIGROL

MIGROL Service Roos AG
Migro] Service
Beat Roos
Zeughausstrasse 38
b210 Sursee
Tel. 041 921 72 07

RECHNUNG 73995#
ERSTES TERSTETELSETELETSSETLETELTFETTERTET ET RE
Premiumpf lege -, 23.006

Unterboden prle ege: 3 ‚00 C

an

pr .. er .
re ie en a EN rn nn re

SUMME CHF 26. "00
Rar So .00

Wechselgeld -24.00

Netto Brutto
1.70 24.14 1. 26.00 C

CHE-391.003.927 MWST (6)
MIGROL Service Roos AG
Zeughausstrasse 36

6210 Sursee

u a hr tn men nn ee en a A an A Se m

CUMULUS zsgeozıasan

u me am an mn a u ar ea Hr 0 mn U An Tu u A En A

Besten Dank und
gute Fahrt

Es bediente Sie Dominik Jakob


-----------------------------------------
Entity-Type: RECEIPT_ITEM_NAME, Entity text: Premiumpf lege
Entity-Type: RECEIPT_ITEM_PRICE, Entity text: 23.00
Entity-Type: RECEIPT_ITEM_NAME, Entity text: Unterboden prle ege
Entity-Type: RECEIPT_ITEM_PRICE, Entity text: 3 ‚00


## Model Definition

Load an emtpy spacy model for specific language

In [7]:
# For a blank model, or replace with spacy.load("en_core_web_sm")
nlp = spacy.blank("de")

### Add Named-Entity Recognition Component

In [8]:
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe("ner")

### Add Labels to Model

In [9]:
for _, annotations in train:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

### Spacy Examples

In [10]:
examples = []

for text, annotations in train:
    try:
        examples.append(Example.from_dict(nlp.make_doc(text), annotations))
    except Exception as e:
        print('!!ERROR!!' + str(e))
        continue

!!ERROR!![E103] Trying to set conflicting doc.ents: '(286, 287, 'RECEIPT_ITEM_QUANTITY')' and '(286, 290, 'RECEIPT_ITEM_PRICE')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.
!!ERROR!![E103] Trying to set conflicting doc.ents: '(254, 255, 'RECEIPT_ITEM_QUANTITY')' and '(254, 258, 'RECEIPT_ITEM_PRICE')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.



MIGROL Service Roos AG
Migro] Service
..." with entities "[[176, 190, 'RECEIPT_ITEM_NAME'], [194, 199, 'RECE...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.

Magazine zum Globus AG
Pilatusstrasse 4, 6..." with entities "[[177, 193, 'RECEIPT_ITEM_NAME'], [195, 200, 'RECE...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
‘ £y , L"“H

[Ail f Ur ”"3h’ﬁf ,
“ 1 ‚ 44..." with entities "[[284, 285, 'RECEIPT_ITEM_QUANTITY'], [286, 307, '...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.

Poststrasse 2 9494 Schaan

Rechnun..." with entities "[[156, 158, 'RECEIPT_ITEM_QUANTITY'], [159, 175, '...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities

### Training

In [11]:
# Disable other pipes during training for efficiency
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
        
    # Training loop
    print("Starting training...")
    for epoch in range(ITERATIONS):
        random.shuffle(examples)
        losses = {}
        for batch in spacy.util.minibatch(examples, size=BATCH_SIZE):
            nlp.update(batch, drop=DROPOUT, losses=losses)
        print(f"Epoch {epoch + 1}/{ITERATIONS}, Loss: {losses['ner']:.4f}")

Starting training...
Epoch 1/30, Loss: 7066.9717
Epoch 2/30, Loss: 2531.5986
Epoch 3/30, Loss: 1827.1016
Epoch 4/30, Loss: 1712.7734
Epoch 5/30, Loss: 1611.0862
Epoch 6/30, Loss: 1543.3706
Epoch 7/30, Loss: 1487.2064
Epoch 8/30, Loss: 1456.6031
Epoch 9/30, Loss: 1403.0322
Epoch 10/30, Loss: 1106.0182
Epoch 11/30, Loss: 1002.4337
Epoch 12/30, Loss: 943.2146
Epoch 13/30, Loss: 897.4747
Epoch 14/30, Loss: 882.0396
Epoch 15/30, Loss: 824.8223
Epoch 16/30, Loss: 799.8843
Epoch 17/30, Loss: 744.7394
Epoch 18/30, Loss: 701.8192
Epoch 19/30, Loss: 662.6771
Epoch 20/30, Loss: 667.6395
Epoch 21/30, Loss: 601.7375
Epoch 22/30, Loss: 585.2615
Epoch 23/30, Loss: 516.0106
Epoch 24/30, Loss: 500.4705
Epoch 25/30, Loss: 457.7902
Epoch 26/30, Loss: 449.3677
Epoch 27/30, Loss: 430.9229
Epoch 28/30, Loss: 403.3849
Epoch 29/30, Loss: 383.3532
Epoch 30/30, Loss: 366.7635


### Save Model

In [12]:
# Save the trained model
print(f"Saving model to {OUTPUT_DIR}...")
nlp.to_disk(OUTPUT_DIR)

Saving model to /Users/timon/git/dspro1-receipt-ocr/named-entity-recognition/models/receipt-item-details-ner-v01...


## Evaluation

In [13]:
def print_confusion_matrix(matrix, labels):
    """
    Prints a confusion matrix as a formatted string.

    Args:
        matrix (list of list of int): Confusion matrix as a 2D list.
        labels (list of str): Labels for the matrix.
    """
    max_label_length = max(len(label) for label in labels)
    padding = max_label_length + 2

    # Print header
    header = " " * padding + " ".join(f"{label:>{padding}}" for label in labels)
    print(f'{"":<15} Predicted Values')
    print(header)

    # Print each row
    for label, row in zip(labels, matrix):
        row_str = " ".join(f"{cell:>{padding}}" for cell in row)
        print(f"{label:>{padding}} {row_str}")

In [15]:
true_positive = []
false_positive = []
false_negative = []

index = 0

for text, annotations in test:
    # Process the text
    doc = nlp(text)
    
    # Prepare expected values (y)
    expected_entities = []
    for ent in annotations.get("entities"):    
        if ent[2] != 'RECEIPT_ITEM':
            expected_entities.append(text[ent[0]:ent[1]])

    # Print recognized entities
    for ent in doc.ents:
        if ent.text in expected_entities:
            true_positive.append(ent.text)
            expected_entities.remove(ent.text)
        else:
            false_positive.append(ent.text)
            
    false_negative.extend(expected_entities)
    
# Example usage
labels = ["Entity", "No Entity"]
confusion_matrix = [
    [len(true_positive), len(false_negative)],
    [len(false_positive), '0'],
]

print('Confusion Matrix:')
print_confusion_matrix(confusion_matrix, labels)

print('\nPerformance Measures:')
precision = len(true_positive) / (len(true_positive) + len(false_positive))
recall = len(true_positive) / (len(true_positive) + len(false_negative))
f1_score = 2 * precision * recall / (precision + recall)
print('Precision: ' + str(precision))
print('Recall: ' + str(recall))
print('F1 Score: ' + str(f1_score))

# True-Positive = correct idetified entities
# false_positive = actual entity that was not identified
# False-negative = identified entity that is actually not a entity

Confusion Matrix:
                Predicted Values
                Entity   No Entity
     Entity          86          36
  No Entity          35           0

Performance Measures:
Precision: 0.7107438016528925
Recall: 0.7049180327868853
F1 Score: 0.7078189300411523


In [29]:
doc = nlp(test[2][0])

displacy.serve(doc, style="ent", port=5001)


Using the 'ent' visualizer
Serving on http://0.0.0.0:5001 ...



127.0.0.1 - - [14/Jan/2025 19:33:17] "GET / HTTP/1.1" 200 4297
127.0.0.1 - - [14/Jan/2025 19:33:17] "GET /favicon.ico HTTP/1.1" 200 4297


Shutting down server on port 5001.
