In [None]:
# Environment setup. `%pip` installs into the environment of the running
# kernel, whereas `!pip` / `!python` run whatever executables come first on PATH.
import sys

%pip install --upgrade pip
%pip install spacy==2.3.5
# Download the small English model with the kernel's own interpreter so the
# model package ends up next to the spaCy installation above.
!"{sys.executable}" -m spacy download en_core_web_sm

In [2]:
import pandas as pd

In [3]:
# Relative path of the ingredient table to load.
csv_file = 'data/data.csv'
ingredients_df = pd.read_csv(filepath_or_buffer=csv_file)

# Report how many rows were read.
print(f"Total number of rows: {len(ingredients_df.index)}")

# Show the first ten rows as the cell's displayed result.
ingredients_df.head(n=10)

Total number of rows: 573


Unnamed: 0,ingredient,quantity,unit,name
0,2.00 Becher süße Sahne,2.0,Becher,süße Sahne
1,6.00 Blätter Basilikum,6.0,Blätter,Basilikum
2,4.00 Blätter Petersilie,4.0,Blätter,Petersilie
3,0.52 Bund Minze,0.52,Bund,Minze
4,0.52 Bund Petersilie,0.52,Bund,Petersilie
5,1.00 Bund Frühlingszwiebeln,1.0,Bund,Frühlingszwiebeln
6,0.52 Bund Petersilie,0.52,Bund,Petersilie
7,1.00 Bund Frühlingszwiebeln,1.0,Bund,Frühlingszwiebeln
8,1.00 Bund Koriander,1.0,Bund,Koriander
9,0.52 Bund Petersilie,0.52,Bund,Petersilie


In [4]:
import spacy
import en_core_web_sm

# Smoke test: load the English model and run it on a sample sentence.
nlp = en_core_web_sm.load()
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One output line per entity: text, start offset, end offset, label.
for entity in doc.ents:
    entity_fields = (entity.text, entity.start_char, entity.end_char, entity.label_)
    print(*entity_fields)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [34]:
import pprint
# Shared pretty-printer (4-space indent) used by the `pp` helper to dump
# pattern lists in a readable form.
pprinter = pprint.PrettyPrinter(indent=4)

In [64]:
from spacy.lang.de import German
from spacy.matcher import PhraseMatcher

# Instantiate spaCy's German language class. NOTE: this rebinds `nlp`, which
# the earlier demo cell bound to the English `en_core_web_sm` model; from here
# on `nlp` is what gets passed to `generate_patterns` for `make_doc`.
nlp = German()

def pp(title, obj, newline=True):
    """Print ``title`` followed by a colon, then pretty-print ``obj``.

    Args:
        title: Heading text; a ``:`` is appended when it is printed.
        obj: Object handed to the module-level ``pprinter`` for display.
        newline: When truthy, an empty line is printed after the dump.
    """
    print(f'{title}:')
    pprinter.pprint(obj)
    if not newline:
        return
    # Trailing blank line separates consecutive dumps.
    print()

def generate_patterns(nlp, series, label):
    """Build labelled entity patterns and matcher docs from raw values.

    Each value is normalised to a pattern string: strings are lower-cased,
    any other value is converted with ``str``. Missing values (``None`` or a
    float NaN, which is what ``Series.unique()`` yields for empty cells) are
    skipped so they do not become literal ``'None'`` / ``'nan'`` patterns.

    Args:
        nlp: Object exposing ``make_doc(text)``; called once per pattern.
        series: Iterable of raw values to turn into patterns.
        label: Value stored under the ``'label'`` key of every entity pattern.

    Returns:
        A tuple ``(entity_patterns, matcher_patterns)``: a list of
        ``{'label': label, 'pattern': text}`` dicts and a parallel list holding
        ``nlp.make_doc(text)`` for the same pattern strings.
    """
    entity_patterns = []
    matcher_patterns = []

    for item in series:
        # NaN is the only float that is not equal to itself.
        if item is None or (isinstance(item, float) and item != item):
            continue
        pattern = item.lower() if isinstance(item, str) else str(item)
        entity_patterns.append({'label': label, 'pattern': pattern})
        matcher_patterns.append(nlp.make_doc(pattern))

    return (entity_patterns, matcher_patterns)

In [65]:
# Distinct values of the `unit` column.
UNITS = ingredients_df['unit'].unique()

unit_entity_patterns, unit_matcher_patterns = generate_patterns(nlp, UNITS, 'UNIT')

# `pp` appends the colon itself, so titles are passed without one
# (a trailing ':' here would print as '::').
pp('Unit entity examples', unit_entity_patterns[0:10])
pp('Unit pattern examples', unit_matcher_patterns[0:10])

Unit entity examples::
[   {'label': 'UNIT', 'pattern': 'becher'},
    {'label': 'UNIT', 'pattern': 'blätter'},
    {'label': 'UNIT', 'pattern': 'bund'},
    {'label': 'UNIT', 'pattern': 'cm'},
    {'label': 'UNIT', 'pattern': 'dosen'},
    {'label': 'UNIT', 'pattern': 'el'},
    {'label': 'UNIT', 'pattern': 'g'},
    {'label': 'UNIT', 'pattern': 'gläser'},
    {'label': 'UNIT', 'pattern': 'handvoll'},
    {'label': 'UNIT', 'pattern': 'kannen'}]

Unit pattern examples:
[becher, blätter, bund, cm, dosen, el, g, gläser, handvoll, kannen]



In [66]:
# Distinct values of the `quantity` column.
QUANTITIES = ingredients_df['quantity'].unique()

quantity_entity_patterns, quantity_matcher_patterns = generate_patterns(nlp, QUANTITIES, 'QUANTITY')

# `pp` appends the colon itself, so titles are passed without one
# (a trailing ':' here would print as '::').
pp('Quantity entity examples', quantity_entity_patterns[0:10])
pp('Quantity pattern examples', quantity_matcher_patterns[0:10])

Quantity entity examples::
[   {'label': 'QUANTITY', 'pattern': '2.0'},
    {'label': 'QUANTITY', 'pattern': '6.0'},
    {'label': 'QUANTITY', 'pattern': '4.0'},
    {'label': 'QUANTITY', 'pattern': '0.52'},
    {'label': 'QUANTITY', 'pattern': '1.0'},
    {'label': 'QUANTITY', 'pattern': '0.32'},
    {'label': 'QUANTITY', 'pattern': '5.0'},
    {'label': 'QUANTITY', 'pattern': '3.0'},
    {'label': 'QUANTITY', 'pattern': '8.0'},
    {'label': 'QUANTITY', 'pattern': '0.28'}]

Quantity pattern examples:
[2.0, 6.0, 4.0, 0.52, 1.0, 0.32, 5.0, 3.0, 8.0, 0.28]

