In [None]:
try:
    already_initialized
except NameError:
    !python -m pip install --upgrade pip
    !pip install spacy==2.3.5
    !pip install spacy-lookups-data
    !python -m spacy download de_core_news_sm
    already_initialized = True

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import spacy
import en_core_web_sm
from spacy.lang.de import German
from spacy.matcher import PhraseMatcher
from spacy.gold import GoldParse
from spacy.scorer import Scorer

import random
import pprint
import string
import re
from datetime import datetime
from pathlib import Path

In [None]:
# Location of the ingredient table, relative to the current working directory.
csv_file = 'data/data.csv'
ingredients_df = pd.read_csv(csv_file)

row_count = len(ingredients_df)
print(f"Total number of rows: {row_count}")

# Preview the first ten rows; as the cell's last expression it is rendered as a table.
ingredients_df.head(10)

In [None]:
# Shared pretty-printer (4-space indent for nested structures) used by pp().
pprinter = pprint.PrettyPrinter(indent=4)

In [None]:
def pp(title, obj, newline=True):
    """Print a heading and then a pretty-printed object.

    Args:
        title: Heading text; a colon is appended automatically.
        obj: Any object; rendered with the module-level ``pprinter``.
        newline: When true (default), emit an empty line after the object.
    """
    print(f'{title}:')
    pprinter.pprint(obj)
    if not newline:
        return
    print()

def split_words(series):
    """Collect the distinct words that occur in an iterable of strings.

    ASCII punctuation (``string.punctuation``) is treated as a separator, so
    ``'Zwiebel(n)'`` yields ``'Zwiebel'`` and ``'n'``. Words are case-sensitive.

    Args:
        series: Iterable of values. Non-string items (e.g. the float NaN that
            pandas uses for missing cells) are skipped instead of raising
            ``TypeError`` inside the regex call.

    Returns:
        list[str]: Unique words in sorted order, so the result is the same on
        every run (plain set iteration order is not).
    """
    # Compile once instead of rebuilding the character class for every item.
    punctuation = re.compile('[' + re.escape(string.punctuation) + ']')

    unique = set()
    for item in series:
        if not isinstance(item, str):
            continue
        # str.split() already collapses runs of whitespace.
        unique.update(punctuation.sub(' ', item).split())
    return sorted(unique)
    
def generate_patterns(model, series, label):
    """Build entity-pattern dicts and matcher docs for every value in ``series``.

    String values are lower-cased; any other value is formatted with two
    decimals (``f'{value:.2f}'``).
    NOTE(review): this turns 1.0 into '1.00' and NaN into 'nan' - confirm that
    this is how such values are written in the ingredient texts.

    Args:
        model: Object whose ``make_doc(text)`` turns a pattern text into a doc.
        series: Iterable of pattern values.
        label: Entity label stored with every pattern.

    Returns:
        tuple: ``(entity_patterns, matcher_patterns)`` where the first is a
        list of ``{'label': ..., 'pattern': ...}`` dicts and the second the
        list of ``model.make_doc(pattern)`` results, both in input order.
    """
    pattern_texts = [
        value.lower() if isinstance(value, str) else f'{value:.2f}'
        for value in series
    ]
    entity_patterns = [{'label': label, 'pattern': text} for text in pattern_texts]
    matcher_patterns = [model.make_doc(text) for text in pattern_texts]
    return (entity_patterns, matcher_patterns)

def generate_data(model, data, matcher=None):
    """Auto-label texts with a matcher and return spaCy-v2 style training tuples.

    Args:
        model: Pipeline providing ``pipe(texts)`` and ``vocab.strings``.
        data: Iterable of raw texts.
        matcher: Callable that returns ``(match_id, start, end)`` token matches
            for a doc. When omitted, the notebook-level ``matcher`` variable is
            used, which keeps existing two-argument calls working.

    Returns:
        list: One ``(text, {'entities': [(start_char, end_char, label), ...]})``
        tuple per input text. Entities never overlap and are sorted by position.

    Raises:
        NameError: If no matcher is passed and no global ``matcher`` exists.
    """
    if matcher is None:
        try:
            matcher = globals()['matcher']
        except KeyError:
            raise NameError("name 'matcher' is not defined") from None

    result = []
    for doc in model.pipe(data):
        # A token may belong to only one entity; overlapping offsets are
        # expected to make spaCy v2 raise E103 when the GoldParse is built.
        # Resolve overlaps up front: longest match wins, then the leftmost;
        # sorted() is stable, so remaining ties keep the matcher's own order.
        candidates = sorted(matcher(doc), key=lambda m: (m[1] - m[2], m[1]))
        taken_tokens = set()
        entities = []
        for match_id, start, end in candidates:
            token_range = range(start, end)
            if any(index in taken_tokens for index in token_range):
                continue
            taken_tokens.update(token_range)
            span = doc[start:end]
            label = model.vocab.strings[match_id]
            entities.append((span.start_char, span.end_char, label))
        entities.sort()

        result.append((doc.text, {'entities': entities}))
    return result

# 1. Generate patterns
We are going to generate patterns that we can use to label our training data automatically with spaCy.

In [None]:
# Plain German language object; used here to turn pattern texts into docs.
nlp = German()

# Drop missing cells first: generate_patterns formats every non-string value
# with ':.2f', which would turn NaN into the bogus pattern 'nan'.
UNITS = ingredients_df['unit'].dropna().unique()

(unit_entity_patterns, unit_matcher_patterns) = generate_patterns(nlp, UNITS, 'UNIT')

# pp() appends the colon itself, so the titles must not end in one.
pp('Unit entity examples', unit_entity_patterns[0:10])
pp('Unit pattern examples', unit_matcher_patterns[0:10])

In [None]:
# Drop missing cells first: generate_patterns formats every non-string value
# with ':.2f', which would turn NaN into the bogus pattern 'nan'.
QUANTITIES = ingredients_df['quantity'].dropna().unique()

(quantity_entity_patterns, quantity_matcher_patterns) = generate_patterns(nlp, QUANTITIES, 'QUANTITY')

# pp() appends the colon itself, so the titles must not end in one.
pp('Quantity entity examples', quantity_entity_patterns[0:10])
pp('Quantity pattern examples', quantity_matcher_patterns[0:10])

In [None]:
# Bracket access makes it explicit that 'name' is a column, and missing cells
# are dropped because split_words applies a regex to every item.
NAMES = ingredients_df['name'].dropna().unique()

# Names are matched word by word, so build one pattern per distinct word.
UNIQUE_WORDS = split_words(NAMES)

(name_entity_patterns, name_matcher_patterns) = generate_patterns(nlp, UNIQUE_WORDS, 'NAME')

# pp() appends the colon itself, so the titles must not end in one.
pp('Name entity examples', name_entity_patterns[0:10])
pp('Name pattern examples', name_matcher_patterns[0:10])

# 2. Define a Metric

Different types of metrics can be used:
1. accuracy:
  - `(true positives + true negatives) / total`
  - In other words: correctly predicted / total
  - Question accuracy answers: How many of all items (entities and non-entities) were categorized correctly?
2. recall:
  - `true positives / (true positives + false negatives)`
  - In other words: words correctly identified as entities / all words that are entities
  - Question recall answers: How many relevant items are selected?
3. precision:
  - `true positives / (true positives + false positives)`
  - In other words: words correctly identified as entities / words correctly and incorrectly identified as entities
  - Question precision answers: How many selected items are relevant?

`Accuracy` can be misleading if we have imbalanced data.

`Recall` is in our case more important than `precision`, since we want to catch all ingredients,
and we don't care if some non-ingredients are marked as such, we can just delete them later in the app with a
single click of a button. On the other hand not recognizing ingredients would force us to type them manually.

We still don't want to have too many non-ingredients in our ingredient list though, so we could use the `F1-Score`,
which combines `recall` and `precision`, as a metric for our problem.

Another approach could be to use precision as a **satisficing metric** and recall as an **optimizing metric**.

# 3. Create Training Data
Now we are going to loop through all the texts in the ingredient column and mark each part (quantity, unit and name) as different entities using the patterns we generated in step 1.

In [None]:
# Fixed seed so the split - and every number derived from it further down -
# is the same on each Restart & Run All.
SPLIT_SEED = 42

# First hold out 20% for testing, then 25% of the remainder for dev
# (25% of 80% = 20% of the total).
# Number of examples - train: 343 - dev: 115 - test: 115
TRAIN_DEV_SET, TEST_SET = train_test_split(ingredients_df, test_size=0.2, random_state=SPLIT_SEED)
TRAIN_SET, DEV_SET = train_test_split(TRAIN_DEV_SET, test_size=0.25, random_state=SPLIT_SEED)

# The intermediate frame is no longer needed; drop the reference.
TRAIN_DEV_SET = None

total = len(ingredients_df)
# Every row must end up in exactly one of the three sets.
assert len(TRAIN_SET) + len(DEV_SET) + len(TEST_SET) == total

train_percent = len(TRAIN_SET) / total
dev_percent = len(DEV_SET) / total
test_percent = len(TEST_SET) / total

print(f'train-dev-test split: {train_percent:.0%} - {dev_percent:.0%} - {test_percent:.0%}')
print(f'train set: {len(TRAIN_SET)} examples') # 343
print(f'  dev set: {len(DEV_SET)} examples')   # 115
print(f' test set: {len(TEST_SET)} examples')  # 115

In [None]:
# Case-insensitive phrase matcher: patterns and texts are compared on LOWER.
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
for label, patterns in (
    ('QUANTITY', quantity_matcher_patterns),
    ('UNIT', unit_matcher_patterns),
    ('NAME', name_matcher_patterns),
):
    matcher.add(label, None, *patterns)

# Auto-label the ingredient text of every split with the matcher above.
TRAIN_DATA, DEV_DATA, TEST_DATA = (
    generate_data(nlp, subset.ingredient)
    for subset in (TRAIN_SET, DEV_SET, TEST_SET)
)

# Show the first five labelled examples of each split.
for title, dataset in (
    ('TRAIN_DATA', TRAIN_DATA),
    ('DEV_DATA', DEV_DATA),
    ('TEST_DATA', TEST_DATA),
):
    pp(title, dataset[0:5])

# Training Model

In [None]:
hyper = {'iterations': 10, 'minibatch_size': 2, 'dropout': 0.2, 'seed': 0}

# Seed before the model is initialised and before any shuffling, so that
# repeated runs start from the same state.
spacy.util.fix_random_seed(hyper['seed'])

# Blank German pipeline with a single, untrained NER component.
nlp = spacy.blank('de')
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)
ner.add_label('QUANTITY')
ner.add_label('UNIT')
ner.add_label('NAME')

# TODO 2. improve training model https://v2.spacy.io/usage/training

optimizer = nlp.begin_training()

# NER loss accumulated over each pass through TRAIN_DATA, one entry per iteration.
loss_history = []
for iteration in range(hyper['iterations']):
    # New example order on every pass (shuffles TRAIN_DATA in place).
    random.shuffle(TRAIN_DATA)

    # nlp.update adds to this dict, so it has to be reset per iteration.
    losses = {}

    batches = spacy.util.minibatch(TRAIN_DATA, size=hyper['minibatch_size'])
    for batch in batches:
        texts = []
        annotations = []
        for text, entity_offsets in batch:
            doc = nlp.make_doc(text)
            gold = GoldParse(doc, entities=entity_offsets['entities'])
            texts.append(doc)
            annotations.append(gold)
        nlp.update(texts, annotations, losses=losses, sgd=optimizer, drop=hyper['dropout'])

    loss_history.append(losses['ner'])
    print(f'iteration {iteration} - loss: {losses}')

In [None]:
# Training loss per iteration, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(loss_history)
ax.set_ylabel('Loss')
ax.set_xlabel('iterations')
plt.show()

# Saving Model to Disk

In [None]:
# Directory name carries a timestamp and the loss of the last training iteration.
now = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
final_loss = loss_history[-1]
model_name = f'model_{now}_loss-{final_loss:.2f}'

# Create the parent folder if it is missing, then write the pipeline below it.
Path('./model').mkdir(parents=True, exist_ok=True)
nlp.to_disk(f'./model/{model_name}')

# Loading Model from Disk

In [None]:
# Reload the pipeline that was just written; set this to another folder name
# below model/ to load a different saved run.
model_to_load = model_name

nlp = spacy.load(f'model/{model_to_load}')

# Testing Model

In [None]:
examples_to_try = 10

# Draw the examples once, without replacement. Reshuffling inside the loop
# could show the same text several times, and indexing TEST_DATA[i] failed
# when fewer than `examples_to_try` examples exist. TEST_DATA stays untouched.
sample_size = min(examples_to_try, len(TEST_DATA))
for text, _annotations in random.sample(TEST_DATA, sample_size):
    doc = nlp(text)
    print(text, '\n\t==>', [(ent.label_, ent.text) for ent in doc.ents])

# Evaluating Model

In [None]:
def evaluate(model, data):
    """Score a pipeline's predictions against annotated examples.

    Args:
        model: Pipeline; ``make_doc(text)`` builds the doc for the gold parse
            and ``model(text)`` produces the prediction.
        data: Iterable of ``(text, {'entities': [...]})`` tuples.

    Returns:
        The ``scores`` of the ``Scorer`` after all examples have been scored.
    """
    scorer = Scorer()
    for text, annotations in data:
        gold = GoldParse(model.make_doc(text), entities=annotations['entities'])
        scorer.score(model(text), gold)
    return scorer.scores

In [None]:
# Same evaluation on every split, so train/dev/test scores can be compared.
for split_name, split_data in (
    ('TRAIN_DATA', TRAIN_DATA),
    ('DEV_DATA', DEV_DATA),
    ('TEST_DATA', TEST_DATA),
):
    pp(f'Scores on {split_name}', evaluate(nlp, split_data))