In [13]:
import spacy
import json

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from spacy import displacy
from spacy.tokens import Span, DocBin

from spacy.displacy.render import DEFAULT_LABEL_COLORS

print(f'spaCy version: {spacy.__version__}')

# Load the trained model
nlp = spacy.load('./output/model-best')

spaCy version: 3.8.3


In [48]:
doc_bin = DocBin().from_disk("./training/test.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))

In [49]:
# Example input text
text = docs[2]

# Process the text with the trained model
doc = nlp(text)

In [50]:
print(text)

Moosbacnstrasse I
8910 Affoltern a.A.
Tel 044 585 91 22

BELEG

#CSO Rechnungsnummer : 65 POSEJ Order:
KASSE 31- 1670472022 21:34:51

ANZ ARTIKEL TOTAL
1 Big Mac 6.90
1 Filet-0-Fish 5,90
1 Medium Pommes Frites 4.50
1 Keine Sauce
1 Coca-Cola 0.4L 3,50

INNEN TOTAL 20.80
Mastercard 20.80

Sst.Nr. CHE -449.055.085 MWST
SAT, BRUTTO MWST
sick, MwSt  1.70R 20.00 1.49
 KUNDENBELEG #

BUCHUNG

MC payPass
ktlos

ON
KÄKKKKKKKXKRXIOT

16.04.2022 21:34: 10
Irm-Id; 101200994




In [51]:
# Access the predicted spans
print("Predicted Spans:")
for span in doc.spans["sc"]:  # Use the spans_key defined in your config
    print(f"{span.label_}: {span.text}")

Predicted Spans:
RECEIPT_ITEM_QUANTITY: 1
RECEIPT_ITEM_PRICE: 6.90
RECEIPT_ITEM_QUANTITY: 1
RECEIPT_ITEM_NAME: Filet-0-Fish
RECEIPT_ITEM_PRICE: 5,90
RECEIPT_ITEM_QUANTITY: 1
RECEIPT_ITEM_PRICE: 4.50
RECEIPT_ITEM_QUANTITY: 1
RECEIPT_ITEM_PRICE: 3,50
RECEIPT_ITEM_NAME: Big Mac
RECEIPT_ITEM_NAME: Coca-Cola 0.4L
RECEIPT_ITEM: 1 Filet-0-Fish 5,90
RECEIPT_ITEM_NAME: Medium Pommes Frites
RECEIPT_ITEM: 1 Big Mac 6.90
RECEIPT_ITEM: 1 Coca-Cola 0.4L 3,50
RECEIPT_ITEM: 1 Medium Pommes Frites 4.50


In [52]:
class Redisplacy(object):
    """Wrapper around displacy.serve and displacy.render for style=span to abbreviate labels and
    set colors to labels."""
    def __init__(self, labels):
        """
        :param labels: list or dictionary with all used labels for spans.
            If a dict, it contains a mapping from the actually used labels of doc objects
            to another label, typically an abbreviation.
        """
        self.labels = labels

    def _make_color_dict(self):
        """Create a dictionary from labels to colors"""
        lst = []
        colors = {}
        if type(self.labels) == dict:
            lst = list(self.labels.values())
        elif type(self.labels) == list:
            lst = self.labels
        # We take the colors from DEFAULT_LABEL_COLORS in spacy/displacy/render.py
        color_list = list(DEFAULT_LABEL_COLORS.values())
        for i, l in enumerate(lst):
            # wrap if current label list is longer than DEFAULT_LABEL_COLORS
            c = color_list[i % len(color_list)]
            colors[l] = c
        return colors

    def _translate_docs(self, docs, **kw):
        """Replace existing labels if we have a translation/abbreviation"""
        if type(docs) != list:
            docs = [docs]
        out = []
        if 'options' in kw.keys() and 'spans_key' in kw['options']:
            spans_key = kw['options']['spans_key']
        else:
            spans_key = 'sc'
        if type(self.labels) == dict:
            for doc in docs:
                new_spans = []
                for s in doc.spans[spans_key]:
                    if s.label_ in self.labels.keys():
                        s.label_ = self.labels[s.label_]
                    new_spans.append(s)
                doc.spans[spans_key] = new_spans
                out.append(doc)
        else:
            out = docs
        return out

    def render(self, *args, **kw):
        args, kw = self._serve_or_render(*args, **kw)
        return displacy.render(*args, **kw)

    def serve(self, *args, **kw):
        args, kw = self._serve_or_render(*args, **kw)
        return displacy.serve(*args, **kw)

    def _serve_or_render(self, *args, **kw):
        if 'options' in kw.keys():
            options = kw['options']
        else:
            options = {}
        docs = args[0]
        if type(docs) != list:
            docs = [docs]
        # Translate the labels of the document(s)
        docs = self._translate_docs(docs, **kw)
        args = (docs,) + args[1:]
        options['colors'] = self._make_color_dict()
        kw['options'] = options
        kw['style'] = 'span'
        return args, kw

In [41]:
labels = set()
for s in doc.spans["sc"]:
    labels.add(s.label_)

colour_dict = {}
colours = list(DEFAULT_LABEL_COLORS.values())
for i, t in enumerate(labels):
    colour_dict[t] = colours[i]

In [53]:
translations = {
  'RECEIPT_ITEM': 'Item',
  'RECEIPT_ITEM_QUANTITY': 'Quantity',
  'RECEIPT_ITEM_NAME': 'Name',
  'RECEIPT_ITEM_PRICE': 'Price',
}

In [54]:
options = {'spans_key': 'sc'}

redisplay = Redisplacy(labels=translations)
redisplay.serve(doc, port=5001, style='span', options=options)




Using the 'span' visualizer
Serving on http://0.0.0.0:5001 ...



127.0.0.1 - - [15/Jan/2025 14:17:09] "GET / HTTP/1.1" 200 13274
127.0.0.1 - - [15/Jan/2025 14:17:09] "GET /favicon.ico HTTP/1.1" 200 13274


Shutting down server on port 5001.
