## Imports

In [2]:
import os

import spacy
import random
import json
import numpy as np

from sklearn.model_selection import train_test_split
from spacy.training.example import Example

dataset_path = os.path.abspath('../../data/datasets/ner-swiss-receipts.json')


In [12]:
DATA_PATH = os.path.abspath("../../data/datasets/ner-swiss-receipts.json")  # Path to your training data file
OUTPUT_DIR = os.path.abspath("../models/receipts-ner-v02")  # TODO: make sure to use a new version
ITERATIONS = 30  # Number of training iterations
BATCH_SIZE = 8  # Batch size for training
DROPOUT = 0.5  # Dropout rate

## Preprocessing

### Load Dataset

In [3]:
with open(dataset_path, "r", encoding="utf-8") as file:
    dataset = json.load(file)

### Reformat for Spacy & Remove unwanted Entities

In [4]:
data = []
for entry in dataset:
    text = entry["text"]
    entities = [entity for entity in entry["entities"] if entity[2] == 'RECEIPT_ITEM']
    data.append((text, {"entities": entities}))

### Split Data into Train & Test

In [5]:
train, test = train_test_split(data, test_size=0.1)

In [7]:
print("Sample Training Data:")
sample_text, sample_annotaiton = train[0]
print("-----------------------------------------")
print(sample_text)
print("-----------------------------------------")
for ent in sample_annotaiton.get("entities"):
    print(f'Entity-Type: {ent[2]}, Entity text: {sample_text[ent[0]:ent[1]]}')

Sample Training Data:
-----------------------------------------
MANOR”

www.manor.ch
Emmen Center
6032 Emmen
041 269 46 99

[] [e]
EE

' Es bediente Sie:

S, Elshaz1ly 09.07.2021 11:91
B Strohhut 20.95 A
2003001006833
N Total Artikel: 1 20.95
N American Express 20.95
. Total Zahlungsmittel 20.

AMEX KXKKKKOKKROOOOGHKXH42 118354 31503551
Herzlicher: Dank für Ihren Einkauf

CHE-116.267.650 MuSt

Code betrag Satz MUST
A 20.95 7.70 1.50
Manoı AG, Base) Total 1.50

» 206 251 #20508 122579 «


-----------------------------------------
Entity-Type: RECEIPT_ITEM, Entity text: B Strohhut 20.95 A


## Model Definition

Load an emtpy spacy model for specific language

In [8]:
# For a blank model, or replace with spacy.load("en_core_web_sm")
nlp = spacy.blank("de")

### Add Named-Entity Recognition Component

In [9]:
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe("ner")

### Add Labels to Model

In [10]:
for _, annotations in data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

### Spacy Examples

In [11]:
examples = [
    Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in data
]


Migrol Service
Überriauerst..." with entities "[[255, 282, 'RECEIPT_ITEM'], [283, 306, 'RECEIPT_I...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
FREUDE
AM
GESCHMACK

Macchi AG Bäckerei
Überna..." with entities "[[213, 234, 'RECEIPT_ITEM'], [235, 267, 'RECEIPT_I...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


### Training

In [13]:
# Disable other pipes during training for efficiency
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
        
    # Training loop
    print("Starting training...")
    for epoch in range(ITERATIONS):
        random.shuffle(examples)
        losses = {}
        for batch in spacy.util.minibatch(examples, size=BATCH_SIZE):
            nlp.update(batch, drop=DROPOUT, losses=losses)
        print(f"Epoch {epoch + 1}/{ITERATIONS}, Loss: {losses['ner']:.4f}")

Starting training...
Epoch 1/30, Loss: 5948.5698
Epoch 2/30, Loss: 1468.1434
Epoch 3/30, Loss: 1504.7682
Epoch 4/30, Loss: 850.7117
Epoch 5/30, Loss: 856.3832
Epoch 6/30, Loss: 839.6247
Epoch 7/30, Loss: 781.5732
Epoch 8/30, Loss: 633.9720
Epoch 9/30, Loss: 519.0314
Epoch 10/30, Loss: 429.3792
Epoch 11/30, Loss: 350.1024
Epoch 12/30, Loss: 309.6041
Epoch 13/30, Loss: 269.8421
Epoch 14/30, Loss: 241.0048
Epoch 15/30, Loss: 236.3765
Epoch 16/30, Loss: 224.4710
Epoch 17/30, Loss: 155.5443
Epoch 18/30, Loss: 164.9194
Epoch 19/30, Loss: 176.4708
Epoch 20/30, Loss: 147.3633
Epoch 21/30, Loss: 129.1356
Epoch 22/30, Loss: 134.7334
Epoch 23/30, Loss: 118.7207
Epoch 24/30, Loss: 119.9999
Epoch 25/30, Loss: 111.2022
Epoch 26/30, Loss: 112.1037
Epoch 27/30, Loss: 104.7212
Epoch 28/30, Loss: 85.9288
Epoch 29/30, Loss: 94.0414
Epoch 30/30, Loss: 74.5331


### Save Model

In [14]:
# Save the trained model
print(f"Saving model to {OUTPUT_DIR}...")
nlp.to_disk(OUTPUT_DIR)
print("Training complete!")

Saving model to /Users/timon/git/dspro1-receipt-ocr/named-entity-recognition/models/receipts-ner-v02...
Training complete!
