In [None]:
!python -m pip install --upgrade pip
!pip install spacy==2.3.5
!python -m spacy download en_core_web_sm

In [None]:
import pandas as pd

In [None]:
# Path of the raw data file, relative to the notebook's working directory.
csv_file = 'data/data.csv'
ingredients_df = pd.read_csv(csv_file)

# Report how many rows were loaded.
print(f"Total number of rows: {ingredients_df.shape[0]}")

# Show the first ten rows as the cell's rich output.
ingredients_df.head(10)

In [None]:
import spacy
import en_core_web_sm

# Quick sanity check of the installed English model on a sample sentence.
nlp = en_core_web_sm.load()
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One output line per recognised entity: text, start/end character offsets, label.
for ent in doc.ents:
    print(*(ent.text, ent.start_char, ent.end_char, ent.label_))

In [None]:
import pprint
# Shared pretty-printer used by the pp() helper below; nested structures are
# indented by 4 spaces.
pprinter = pprint.PrettyPrinter(indent=4)

In [None]:
from spacy.lang.de import German
from spacy.matcher import PhraseMatcher

# Rebinds `nlp`: from here on it is a German language object, no longer the
# English model loaded in the earlier smoke-test cell. Later cells use it for
# tokenisation (nlp.make_doc / nlp.pipe) and its vocab.
# NOTE(review): presumably the ingredient texts are German - confirm against the data.
nlp = German()

def pp(title, obj, newline=True):
    """Pretty-print ``obj`` underneath a ``title:`` heading.

    Args:
        title: Heading text; a colon is appended when it is printed.
        obj: Object passed to the notebook-level ``pprinter`` for display.
        newline: When truthy, an empty line is printed after the object.
    """
    print('{}:'.format(title))
    pprinter.pprint(obj)
    if not newline:
        return
    print()

def generate_patterns(nlp, series, label):
    """Build entity-ruler and phrase-matcher patterns from a collection of values.

    Strings are lower-cased; every other value is formatted as a number with
    two decimal places. Missing values (``None``, ``NaN``, ``pd.NA``), which
    ``Series.unique()`` returns for columns with gaps, are skipped so that they
    neither raise a ``TypeError`` nor end up as a literal ``'nan'`` pattern.

    Args:
        nlp: Object providing ``make_doc(text)``, used to turn each pattern
            text into a matcher pattern.
        series: Iterable of raw values (strings or numbers).
        label: Entity label stored with every entity pattern.

    Returns:
        A tuple ``(entity_patterns, matcher_patterns)`` where
        ``entity_patterns`` is a list of ``{'label': ..., 'pattern': ...}``
        dicts and ``matcher_patterns`` is the list of ``nlp.make_doc`` results,
        both in input order.
    """
    entity_patterns = []
    matcher_patterns = []

    for item in series:
        is_text = isinstance(item, str)
        # Skip missing values instead of formatting them into a bogus pattern.
        if not is_text and pd.isna(item):
            continue

        pattern = item.lower() if is_text else f'{item:.2f}'
        entity_patterns.append({'label': label, 'pattern': pattern})
        matcher_patterns.append(nlp.make_doc(pattern))

    return (entity_patterns, matcher_patterns)

# 1. Generate patterns
We are going to generate patterns that we can use to label our training data automatically with spaCy.

In [None]:
# Distinct values of the `unit` column; each becomes one UNIT pattern.
UNITS = ingredients_df['unit'].unique()

(unit_entity_patterns, unit_matcher_patterns) = generate_patterns(nlp, UNITS, 'UNIT')

# pp() appends the colon itself, so the titles must not end with one.
pp('Unit entity examples', unit_entity_patterns[0:10])
pp('Unit pattern examples', unit_matcher_patterns[0:10])

In [None]:
# Distinct values of the `quantity` column; each becomes one QUANTITY pattern.
QUANTITIES = ingredients_df['quantity'].unique()

(quantity_entity_patterns, quantity_matcher_patterns) = generate_patterns(nlp, QUANTITIES, 'QUANTITY')

# pp() appends the colon itself, so the titles must not end with one.
pp('Quantity entity examples', quantity_entity_patterns[0:10])
pp('Quantity pattern examples', quantity_matcher_patterns[0:10])

In [None]:
# Distinct values of the `name` column; each becomes one NAME pattern.
# Bracket access is used because attribute access (`df.name`) silently resolves
# to something else whenever the object has a real attribute of that name.
NAMES = ingredients_df['name'].unique()

(name_entity_patterns, name_matcher_patterns) = generate_patterns(nlp, NAMES, 'NAME')

# pp() appends the colon itself, so the titles must not end with one.
pp('Name entity examples', name_entity_patterns[0:10])
pp('Name pattern examples', name_matcher_patterns[0:10])

# 2. Define a Metric

Different types of metrics can be used:
1. accuracy:
  - `(true positives + true negatives) / total`
  - In other words: correctly predicted / total
  - Question accuracy answers: How many of all items were correctly categorized?
2. recall:
  - `true positives / (true positives + false negatives)`
  - In other words: words correctly identified as entities / all words that are entities
  - Question recall answers: How many relevant items are selected?
3. precision:
  - `true positives / (true positives + false positives)`
  - In other words: words correctly identified as entities / words correctly and incorrectly identified as entities
  - Question precision answers: How many selected items are relevant?

`Accuracy` can be misleading if we have imbalanced data.

`Recall` is in our case more important than `precision`, since we want to catch all ingredients,
and we don't care if some non-ingredients are marked as such, we can just delete them later in the app with a
single click of a button. On the other hand not recognizing ingredients would force us to type them manually.

We still don't want to have too many non-ingredients in our ingredient list though, so we could use the `F1-Score`,
which combines `recall` and `precision`, as a metric for our problem.

Another approach could be to use precision as a **satisficing metric** and recall as an **optimizing metric**.

# 3. Create Training Data
Now we are going to loop through all the texts in the ingredient column and mark each part (quantity, unit and name) as different entities using the patterns we generated in step 1.

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so that the split, and everything derived from it, is identical on
# every "Restart & Run All".
SPLIT_SEED = 42

# Hold out 20% of the rows as the test set, then take 25% of the remaining 80%
# as the dev set, which yields a 60/20/20 train/dev/test split.
# Number of examples - train: 343 - dev: 115 - test: 115
TRAIN_DEV_SET, TEST_SET = train_test_split(
    ingredients_df, test_size=0.2, random_state=SPLIT_SEED
)
TRAIN_SET, DEV_SET = train_test_split(
    TRAIN_DEV_SET, test_size=0.25, random_state=SPLIT_SEED
)

total = len(ingredients_df)
# The three subsets must partition the data: no row lost, none duplicated.
assert len(TRAIN_SET) + len(DEV_SET) + len(TEST_SET) == total

train_percent = len(TRAIN_SET) / total
dev_percent = len(DEV_SET) / total
test_percent = len(TEST_SET) / total

print(f'train-dev-test split: {train_percent:.0%} - {dev_percent:.0%} - {test_percent:.0%}')
print(f'train set: {len(TRAIN_SET)} examples') # 343
print(f'  dev set: {len(DEV_SET)} examples')   # 115
print(f' test set: {len(TEST_SET)} examples')  # 115

In [None]:
from spacy.tokens import Span
from spacy.util import filter_spans

# Phrase matcher that compares lower-cased token text, so the (lower-cased)
# patterns match the ingredient texts regardless of capitalisation.
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
matcher.add('QUANTITY', None, *quantity_matcher_patterns)
matcher.add('UNIT', None, *unit_matcher_patterns)
matcher.add('NAME', None, *name_matcher_patterns)

TRAIN_DATA = []

# NOTE(review): this labels every row of ingredients_df, not only TRAIN_SET -
# confirm that dev/test examples are separated again before training.
for doc in nlp.pipe(ingredients_df['ingredient']):
    # The matcher reports every hit, including overlapping ones (e.g. a short
    # pattern contained in a longer one). Entity annotations must not overlap,
    # so keep only the longest non-overlapping spans.
    candidate_spans = [
        Span(doc, start, end, label=match_id)
        for match_id, start, end in matcher(doc)
    ]
    entities = [
        (span.start_char, span.end_char, span.label_)
        for span in filter_spans(candidate_spans)
    ]

    # Training-example format: (text, {'entities': [(start_char, end_char, label), ...]})
    training_example = (doc.text, {'entities': entities})
    TRAIN_DATA.append(training_example)

    print(training_example)