In [13]:
import random

import spacy
from spacy.training.example import Example


nlp = spacy.blank("en")

## Prepare Training Data

First, we have to know the exact position of the entities in a string

In [8]:
text_1 = '''
GREEN FIELD
5305 E PACIFIC COAST HWY
Long Beach, CA 90804
(562) 597-0906

Server: Francis         Station: 3
----------------------------------
Order #: 69923          Dine In
Table: B11              Guests: 2
----------------------------------
1 Coffee                3.00
2 Lunch                 45.90
1 Coke                  3.00

SUB TOTAL:              51.90
Tax 1:                  4.60

TOTAL:                  $56.58

    5/26/2016 12:53:10 PM

        THANK YOU!
'''

entities_1 = [
    '1 Coffee                3.00',
    '2 Lunch                 45.90',
    '1 Coke                  3.00'
]

for entity in entities_1:
    starting_index = text_1.index(entity)
    print(f'Entity: "{entity}" starts at: {starting_index}, ends at: {starting_index + len(entity)}')

Entity: "1 Coffee                3.00" starts at: 246, ends at: 274
Entity: "2 Lunch                 45.90" starts at: 275, ends at: 304
Entity: "1 Coke                  3.00" starts at: 305, ends at: 333


In [9]:
data1 = '''
GREEN FIELD
5305 E PACIFIC COAST HWY
Long Beach, CA 90804
(562) 597-0906

Server: Francis         Station: 3
----------------------------------
Order #: 69923          Dine In
Table: B11              Guests: 2
----------------------------------
1 Coffee                3.00
2 Lunch                 45.90
1 Coke                  3.00

SUB TOTAL:              51.90
Tax 1:                  4.60

TOTAL:                  $56.58

    5/26/2016 12:53:10 PM

        THANK YOU!
'''

training_data = [
    (
        data1,
        [
            (246, 274, "RECEIPT_ITEM"),
            (275, 304, "RECEIPT_ITEM"),
            (305, 333, "RECEIPT_ITEM"),
        ]
    ),
]

In [10]:
ner = None
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

Add labels to the NER

In [11]:
for _, annotations in training_data:
    for annotation in annotations:
        ner.add_label(annotation[2])

## Train the model
We need an optimizer and a few training parameters.

### Prepare the training data

In [15]:
examples = []
for text, annotations in training_data:
    examples.append(Example.from_dict(nlp.make_doc(text), annotations))

TypeError: Argument 'example_dict' has incorrect type (expected dict, got list)

### Disable other pipeline components for faster training

In [14]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for epoch in range(30):
        random.shuffle(examples)
        losses = {}
        for batch in spacy.util.minibatch(examples, size=8):
            nlp.update(batch, drop=0.5, losses=losses)
        print(f'Epoch {epoch}, Loss: {losses}')

NameError: name 'examples' is not defined