In [1]:
!python -m pip install --upgrade pip
!pip install spacy==2.3.5
!python -m spacy download en_core_web_sm

Collecting pip
  Using cached pip-21.0.1-py3-none-any.whl (1.5 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 20.3.3
    Uninstalling pip-20.3.3:
      Successfully uninstalled pip-20.3.3
Successfully installed pip-21.0.1
Collecting spacy==2.3.5
  Downloading spacy-2.3.5-cp36-cp36m-manylinux2014_x86_64.whl (10.4 MB)
[K     |████████████████████████████████| 10.4 MB 16.5 MB/s eta 0:00:01
[?25hCollecting srsly<1.1.0,>=1.0.2
  Downloading srsly-1.0.5-cp36-cp36m-manylinux2014_x86_64.whl (184 kB)
[K     |████████████████████████████████| 184 kB 48.5 MB/s eta 0:00:01
[?25hCollecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.5-cp36-cp36m-manylinux2014_x86_64.whl (20 kB)
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.5-cp36-cp36m-manylinux2014_x86_64.whl (35 kB)
Collecting thinc<7.5.0,>=7.4.1
  Downloading thinc-7.4.5-cp36-cp36m-manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 5

In [2]:
import pandas as pd

In [3]:
# Read the ingredient table from the CSV file into a DataFrame.
csv_file = 'data/data.csv'
ingredients_df = pd.read_csv(csv_file)

# Report how many rows were loaded.
print(f"Total number of rows: {ingredients_df.shape[0]}")

# Preview the first ten rows (rendered as the cell's output).
ingredients_df.head(10)

Total number of rows: 573


Unnamed: 0,ingredient,quantity,unit,name
0,2.00 Becher süße Sahne,2.0,Becher,süße Sahne
1,6.00 Blätter Basilikum,6.0,Blätter,Basilikum
2,4.00 Blätter Petersilie,4.0,Blätter,Petersilie
3,0.52 Bund Minze,0.52,Bund,Minze
4,0.52 Bund Petersilie,0.52,Bund,Petersilie
5,1.00 Bund Frühlingszwiebeln,1.0,Bund,Frühlingszwiebeln
6,0.52 Bund Petersilie,0.52,Bund,Petersilie
7,1.00 Bund Frühlingszwiebeln,1.0,Bund,Frühlingszwiebeln
8,1.00 Bund Koriander,1.0,Bund,Koriander
9,0.52 Bund Petersilie,0.52,Bund,Petersilie


In [4]:
import spacy
import en_core_web_sm

# Smoke test: run the English model on a sample sentence.
# The English pipeline gets its own name so that re-running this cell can never
# replace the `nlp` object that the pattern-generation cells pass to
# `generate_patterns`.
nlp_en = en_core_web_sm.load()
doc = nlp_en("Apple is looking at buying U.K. startup for $1 billion")

# Print every detected entity with its character span and label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [5]:
import pprint
# Shared pretty-printer (4-space indent) used by the `pp` helper.
pprinter = pprint.PrettyPrinter(indent=4)

In [6]:
from spacy.lang.de import German
from spacy.matcher import PhraseMatcher

# German language object; its `make_doc` method is what `generate_patterns`
# calls to turn each pattern string into a matcher pattern.
nlp = German()

def pp(title, obj, newline=True):
    """Print a heading line and then pretty-print an object.

    Args:
        title: Heading text; a ``:`` is appended automatically.
        obj: Object rendered with the module-level ``pprinter``.
        newline: When truthy, one blank line is printed after the object.
    """
    print('{}:'.format(title))
    pprinter.pprint(obj)
    if not newline:
        return
    print()

def generate_patterns(nlp, series, label):
    """Build labelled pattern dicts and matcher patterns for every item.

    Args:
        nlp: Object exposing ``make_doc(text)``; called once per item.
        series: Iterable of raw values. Strings are lower-cased, any other
            value is converted with ``str()``.
        label: Value stored under the ``'label'`` key of every dict.

    Returns:
        Tuple ``(entity_patterns, matcher_patterns)``: a list of
        ``{'label': label, 'pattern': text}`` dicts and a list holding the
        ``nlp.make_doc(text)`` result for each item, both in input order.
    """
    # NOTE(review): non-string values go through str(), so the float 2.0
    # becomes '2.0' - verify that this is the textual form found in the data.
    texts = [
        value.lower() if isinstance(value, str) else str(value)
        for value in series
    ]

    entity_patterns = [{'label': label, 'pattern': text} for text in texts]
    matcher_patterns = [nlp.make_doc(text) for text in texts]

    return (entity_patterns, matcher_patterns)

# 1. Generate patterns
We are going to generate patterns that we can use to automatically label our training data with spaCy.

In [7]:
# Distinct unit values from the data. Missing values are dropped so that no
# literal 'nan' pattern is generated for them.
UNITS = ingredients_df['unit'].dropna().unique()

unit_entity_patterns, unit_matcher_patterns = generate_patterns(nlp, UNITS, 'UNIT')

# `pp` appends the colon itself, so the titles are passed without one.
pp('Unit entity examples', unit_entity_patterns[:10])
pp('Unit pattern examples', unit_matcher_patterns[:10])

Unit entity examples::
[   {'label': 'UNIT', 'pattern': 'becher'},
    {'label': 'UNIT', 'pattern': 'blätter'},
    {'label': 'UNIT', 'pattern': 'bund'},
    {'label': 'UNIT', 'pattern': 'cm'},
    {'label': 'UNIT', 'pattern': 'dosen'},
    {'label': 'UNIT', 'pattern': 'el'},
    {'label': 'UNIT', 'pattern': 'g'},
    {'label': 'UNIT', 'pattern': 'gläser'},
    {'label': 'UNIT', 'pattern': 'handvoll'},
    {'label': 'UNIT', 'pattern': 'kannen'}]

Unit pattern examples:
[becher, blätter, bund, cm, dosen, el, g, gläser, handvoll, kannen]



In [8]:
# Distinct quantity values from the data. Missing values are dropped so that no
# literal 'nan' pattern is generated for them.
QUANTITIES = ingredients_df['quantity'].dropna().unique()

quantity_entity_patterns, quantity_matcher_patterns = generate_patterns(nlp, QUANTITIES, 'QUANTITY')

# `pp` appends the colon itself, so the titles are passed without one.
pp('Quantity entity examples', quantity_entity_patterns[:10])
pp('Quantity pattern examples', quantity_matcher_patterns[:10])

Quantity entity examples::
[   {'label': 'QUANTITY', 'pattern': '2.0'},
    {'label': 'QUANTITY', 'pattern': '6.0'},
    {'label': 'QUANTITY', 'pattern': '4.0'},
    {'label': 'QUANTITY', 'pattern': '0.52'},
    {'label': 'QUANTITY', 'pattern': '1.0'},
    {'label': 'QUANTITY', 'pattern': '0.32'},
    {'label': 'QUANTITY', 'pattern': '5.0'},
    {'label': 'QUANTITY', 'pattern': '3.0'},
    {'label': 'QUANTITY', 'pattern': '8.0'},
    {'label': 'QUANTITY', 'pattern': '0.28'}]

Quantity pattern examples:
[2.0, 6.0, 4.0, 0.52, 1.0, 0.32, 5.0, 3.0, 8.0, 0.28]



In [9]:
# Distinct ingredient names from the data. Bracket indexing is used because
# `name` is also an attribute on pandas objects, which makes `df.name` fragile.
# Missing values are dropped so that no literal 'nan' pattern is generated.
INGREDIENTS = ingredients_df['name'].dropna().unique()

ingredient_entity_patterns, ingredient_matcher_patterns = generate_patterns(nlp, INGREDIENTS, 'INGREDIENT')

# `pp` appends the colon itself, so the titles are passed without one.
pp('Ingredient entity examples', ingredient_entity_patterns[:10])
pp('Ingredient pattern examples', ingredient_matcher_patterns[:10])

Ingredient entity examples::
[   {'label': 'INGREDIENT', 'pattern': 'süße sahne'},
    {'label': 'INGREDIENT', 'pattern': 'basilikum'},
    {'label': 'INGREDIENT', 'pattern': 'petersilie'},
    {'label': 'INGREDIENT', 'pattern': 'minze'},
    {'label': 'INGREDIENT', 'pattern': 'frühlingszwiebeln'},
    {'label': 'INGREDIENT', 'pattern': 'koriander'},
    {'label': 'INGREDIENT', 'pattern': 'ingwer'},
    {'label': 'INGREDIENT', 'pattern': 'kichererbsen'},
    {'label': 'INGREDIENT', 'pattern': 'thunfisch'},
    {'label': 'INGREDIENT', 'pattern': 'geschälte tomaten'}]

Ingredient pattern examples:
[   süße sahne,
    basilikum,
    petersilie,
    minze,
    frühlingszwiebeln,
    koriander,
    ingwer,
    kichererbsen,
    thunfisch,
    geschälte tomaten]



# 2. Create Training Data
Now we are going to loop through all the texts in the ingredient column and mark each part (quantity, unit and name) as different entities using the patterns we generated in step 1.

In [11]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split is identical on every Restart & Run All.
SPLIT_SEED = 42

# First hold out 20% of the rows for test, then 25% of the remainder for dev.
# Number of examples - train: 343 - dev: 115 - test: 115
TRAIN_DEV_DATA, TEST_DATA = train_test_split(
    ingredients_df, test_size=0.2, random_state=SPLIT_SEED)
# The train frame is stored as TRAIN_DF so that it survives the reset of
# TRAIN_DATA at the end of this cell.
TRAIN_DF, DEV_DATA = train_test_split(
    TRAIN_DEV_DATA, test_size=0.25, random_state=SPLIT_SEED)

print(f'train set: {len(TRAIN_DF)} examples') # 343
print(f'  dev set: {len(DEV_DATA)} examples')   # 115
print(f' test set: {len(TEST_DATA)} examples')  # 115

# TRAIN_DATA still ends this cell as an empty list.
# NOTE(review): presumably later cells fill it with labelled examples - confirm.
TRAIN_DATA = []

train set: 343 examples
  dev set: 115 examples
 test set: 115 examples


# 3. Define a Metric

Different types of metrics:
1. accuracy:
  - `(true positives + true negatives) / total`
  - correctly predicted / total
2. recall:
  - `true positives / (true positives + false negatives)`
  - words correctly identified as entities / all words that are entities
  - How many relevant items are selected?
3. precision:
  - `true positives / (true positives + false positives)`
  - words correctly identified as entities / words correctly and incorrectly identified as entities
  - How many selected items are relevant?

`Accuracy` can be misleading if we have imbalanced data.

`Recall` is in our case more important than `precision`, since we want to catch all ingredients,
and we don't care if some non-ingredients are marked as such, we can just delete them later in the app with a
single click of a button. On the other hand not recognizing ingredients would force us to type them manually.

We still don't want too many non-ingredients in our ingredient list though, so we could use the `F1-Score`,
which combines `recall` and `precision`, as a metric for our problem.

Another approach could be to use precision as a **satisficing metric** and recall as an **optimizing metric**.