In [14]:
import pandas as pd
import re
import random
import spacy
from spacy.util import minibatch, compounding
import warnings
#import matplotlib.pyplot as plt
# https://en.wikipedia.org/wiki/List_of_cuisines

In [15]:
# read in the food csv file
food_df = pd.read_csv("cuisines.csv")
food_df = food_df.sample(frac=1)

# print row and column information
food_df.head()

# print the size 
food_df["food"].size

# diaqualify foods with special characters, lowercase and extract results from "food" column
foods = food_df[food_df["food"].str.contains("[^a-zA-Z ]") == False]["food"].apply(lambda food: food.lower())

# filter out foods with more than 3 words, drop any duplicates
foods = foods[foods.str.split().apply(len) <= 3].drop_duplicates()
# find one-worded, two-worded and three-worded foods
one_worded_foods = foods[foods.str.split().apply(len) == 1]
two_worded_foods = foods[foods.str.split().apply(len) == 2]
three_worded_foods = foods[foods.str.split().apply(len) == 3]

# total number of foods
total_num_foods = round(one_worded_foods.size / 45 * 100)

# shuffle the 2-worded and 3-worded foods since we'll be slicing them
two_worded_foods = two_worded_foods.sample(frac=1)
three_worded_foods = three_worded_foods.sample(frac=1)

# append the foods together 
foods = one_worded_foods.append(two_worded_foods[:round(total_num_foods * 0.30)]).append(three_worded_foods[:round(total_num_foods * 0.25)])

# print the resulting sizes
for i in range(3):
    print(f"{i+1}-worded food entities:", foods[foods.str.split().apply(len) == i + 1].size)

1-worded food entities: 185
2-worded food entities: 77
3-worded food entities: 30


In [16]:
food_templates = [
    "I ate my {}",
    "I'm eating a {}",
    "I just ate a {}",
    "I only ate the {}",
    "I'm done eating a {}",
    "I've already eaten a {}",
    "I just finished my {}",
    "When I was having lunch I ate a {}",
    "I had a {} and a {} today",
    "I ate a {} and a {} for lunch",
    "I made a {} and {} for lunch",
    "I ate {} and {}",
    "today I ate a {} and a {} for lunch",
    "I had {} with my husband last night",
    "I brought you some {} on my birthday",
    "I made {} for yesterday's dinner",
    "last night, a {} was sent to me with {}",
    "I had {} yesterday and I'd like to eat it anyway",
    "I ate a couple of {} last night",
    "I had some {} at dinner last night",
    "Last night, I ordered some {}",
    "I made a {} last night",
    "I had a bowl of {} with {} and I wanted to go to the mall today",
    "I brought a basket of {} for breakfast this morning",
    "I had a bowl of {}",
    "I ate a {} with {} in the morning",
    "I made a bowl of {} for my breakfast",
    "There's {} for breakfast in the bowl this morning",
    "This morning, I made a bowl of {}",
    "I decided to have some {} as a little bonus",
    "I decided to enjoy some {}",
    "I've decided to have some {} for dessert",
    "I had a {}, a {} and {} at home",
    "I took a {}, {} and {} on the weekend",
    "I ate a {} with {} and {} just now",
    "Last night, I ate an {} with {} and {}",
    "I tasted some {}, {} and {} at the office",
    "There's a basket of {}, {} and {} that I consumed",
    "I devoured a {}, {} and {}",
    "I've already had a bag of {}, {} and {} from the fridge"
]

# create dictionaries to store the generated food combinations. Do note that one_food != one_worded_food. one_food == "barbecue sauce", one_worded_food == "sauce"
TRAIN_FOOD_DATA = {
    "one_food": [],
    "two_foods": [],
    "three_foods": []
}

TEST_FOOD_DATA = {
    "one_food": [],
    "two_foods": [],
    "three_foods": []
}

In [17]:
# one_food, two_food, and three_food combinations will be limited to 167 sentences
FOOD_SENTENCE_LIMIT = 80

# helper function for deciding what dictionary and subsequent array to append the food sentence on to
def get_food_data(count):
    return {
        1: TRAIN_FOOD_DATA["one_food"] if len(TRAIN_FOOD_DATA["one_food"]) < FOOD_SENTENCE_LIMIT else TEST_FOOD_DATA["one_food"],
        2: TRAIN_FOOD_DATA["two_foods"] if len(TRAIN_FOOD_DATA["two_foods"]) < FOOD_SENTENCE_LIMIT else TEST_FOOD_DATA["two_foods"],
        3: TRAIN_FOOD_DATA["three_foods"] if len(TRAIN_FOOD_DATA["three_foods"]) < FOOD_SENTENCE_LIMIT else TEST_FOOD_DATA["three_foods"],
    }[count]

# the pattern to replace from the template sentences
pattern_to_replace = "{}"


# the count that helps us decide when to break from the for loop
food_entity_count = foods.size - 1

In [18]:
idx_lst = foods.index
idx_lst


Int64Index([ 93,   6, 115, 334, 293,  20,  77,  17, 133, 328,
            ...
            201, 250, 230, 274, 286, 291, 265, 258, 228, 273],
           dtype='int64', length=292)

In [19]:
# start the while loop, ensure we don't get an index out of bounds error
while food_entity_count >= 2:
    entities = []

    # pick a random food template
    sentence = food_templates[random.randint(0, len(food_templates) - 1)]

    # find out how many braces "{}" need to be replaced in the template
    matches = re.findall(pattern_to_replace, sentence)

    # for each brace, replace with a food entity from the shuffled food data
    for match in matches:
        food = foods.iloc[food_entity_count]
        lab = food_df.loc[idx_lst[food_entity_count]]['label']
        food_entity_count -= 1
        # replace the pattern, but then find the match of the food entity we just inserted
        sentence = sentence.replace(match, food, 1)
        match_span = re.search(food, sentence).span()

        # use that match to find the index positions of the food entity in the sentence, append
        entities.append((match_span[0], match_span[1], lab))
    # append the sentence and the position of the entities to the correct dictionary and array
    get_food_data(len(matches)).append((sentence, {"entities": entities}))


In [20]:
sentence

'I made a bowl of kimbap for my breakfast'

In [21]:
# print the number of food sentences, as well as an example sentence
for key in TRAIN_FOOD_DATA:
    print("{} {} sentences: {}".format(len(TRAIN_FOOD_DATA[key]), key, TRAIN_FOOD_DATA[key][0]))

for key in TEST_FOOD_DATA:
    print("{} {} items: {}".format(len(TEST_FOOD_DATA[key]), key, TEST_FOOD_DATA[key][0]))

# READ!
# can ignore the error below, some lists are empty because we have not enough data

80 one_food sentences: ("There's steamed fish head for breakfast in the bowl this morning", {'entities': [(8, 25, 'CHINESE')]})
29 two_foods sentences: ('I ate jiyu fried shallots and deep fried sausage', {'entities': [(6, 25, 'CHINESE'), (30, 48, 'WESTERN')]})
37 three_foods sentences: ('I had a kung pao chicken, a cured ham  cowpeas and five colours pearls at home', {'entities': [(8, 24, 'CHINESE'), (28, 46, 'CHINESE'), (51, 70, 'CHINESE')]})
41 one_food items: ('I just finished my ganghoe', {'entities': [(19, 26, 'KOREAN')]})


IndexError: list index out of range

In [22]:
print(len(TEST_FOOD_DATA['one_food']))
TEST_FOOD_DATA

41


{'one_food': [('I just finished my ganghoe',
   {'entities': [(19, 26, 'KOREAN')]}),
  ("I've already eaten a tuna", {'entities': [(21, 25, 'JAPANESE')]}),
  ('I just ate a dangmyeon', {'entities': [(13, 22, 'KOREAN')]}),
  ("I made bagel for yesterday's dinner", {'entities': [(7, 12, 'WESTERN')]}),
  ('When I was having lunch I ate a tonkatsu',
   {'entities': [(32, 40, 'JAPANESE')]}),
  ('I made a soegogi last night', {'entities': [(9, 16, 'KOREAN')]}),
  ('I made a bowl of boksunga for my breakfast',
   {'entities': [(17, 25, 'KOREAN')]}),
  ('I decided to enjoy some daktoritang', {'entities': [(24, 35, 'KOREAN')]}),
  ("I had oiseon yesterday and I'd like to eat it anyway",
   {'entities': [(6, 12, 'KOREAN')]}),
  ("I had calas yesterday and I'd like to eat it anyway",
   {'entities': [(6, 11, 'WESTERN')]}),
  ('When I was having lunch I ate a truffle',
   {'entities': [(32, 39, 'WESTERN')]}),
  ("I'm eating a subak", {'entities': [(13, 18, 'KOREAN')]}),
  ("I made chueotang for ye

In [23]:
len(TRAIN_FOOD_DATA['one_food'])

80

In [24]:
nlp = spacy.load("en_core_web_sm")


In [None]:
# nlp=spacy.blank("en")

# nlp.create_pipe('ner')

<spacy.pipeline.ner.EntityRecognizer at 0x162e385f0>

In [25]:
nlp

<spacy.lang.en.English at 0x1176a117d90>

In [26]:
# read in the revision data (just used a random article dataset from a different course I had taken)
npr_df = pd.read_csv("npr.csv")

# print row and column information
npr_df.head()

Unnamed: 0,Article
0,I would like a place selling sushi at changi
1,show me places selling chinese food around town
2,show me places selling ribs around town
3,show me places selling sushi at town


In [28]:
revision_texts = []

# convert the articles to spacy objects to better identify the sentences. Disabled unneeded components. # takes ~ 4 minutes
for doc in nlp.pipe(npr_df["Article"][:100], batch_size=30, disable=["tagger", "ner"]):
    for sentence in doc.sents:
        revision_texts.append(" ".join(re.split("\s+", sentence.text, flags=re.UNICODE)))

In [29]:
revisions = []

# Use the existing spaCy model to predict the entities, then append them to revision
for doc in nlp.pipe(revision_texts, disable=["tagger", "parser"]):
    
    # don't append sentences that have no entities
    if len(doc.ents) > 0:
        revisions.append((doc.text, {"entities": [(e.start_char, e.end_char, e.label_) for e in doc.ents]}))

In [30]:
# print an example of the revision sentence
print(revisions[0][0])

# print an example of the revision data
print(revisions[0][1])

show me places selling chinese food around town
{'entities': [(23, 30, 'NORP')]}


In [31]:
# create arrays to store the revision data
TRAIN_REVISION_DATA = []
TEST_REVISION_DATA = []

# create dictionaries to keep count of the different entities
TRAIN_ENTITY_COUNTER = {}
TEST_ENTITY_COUNTER = {}

# This will help distribute the entities (i.e. we don't want 1000 PERSON entities, but only 80 ORG entities)
REVISION_SENTENCE_SOFT_LIMIT = 100

# helper function for incrementing the revision counters
def increment_revision_counters(entity_counter, entities):
    for entity in entities:
        label = entity[2]
        if label in entity_counter:
            entity_counter[label] += 1
        else:
            entity_counter[label] = 1

random.shuffle(revisions)
for revision in revisions:
    # get the entities from the revision sentence
    entities = revision[1]["entities"]

    # simple hack to make sure spaCy entities don't get too one-sided
    should_append_to_train_counter = 0
    for _, _, label in entities:
        if label in TRAIN_ENTITY_COUNTER and TRAIN_ENTITY_COUNTER[label] > REVISION_SENTENCE_SOFT_LIMIT:
            should_append_to_train_counter -= 1
        else:
            should_append_to_train_counter += 1

    # simple switch for deciding whether to append to train data or test data
    if should_append_to_train_counter >= 0:
        TRAIN_REVISION_DATA.append(revision)
        increment_revision_counters(TRAIN_ENTITY_COUNTER, entities)
    else:
        TEST_REVISION_DATA.append(revision)
        increment_revision_counters(TEST_ENTITY_COUNTER, entities)

In [32]:
# combine the food training data
TRAIN_FOOD_DATA_COMBINED = TRAIN_FOOD_DATA["one_food"] + TRAIN_FOOD_DATA["two_foods"] + TRAIN_FOOD_DATA["three_foods"]

# print the length of the food training data
print("FOOD", len(TRAIN_FOOD_DATA_COMBINED))

# print the length of the revision training data
print("REVISION", len(TRAIN_REVISION_DATA))

# join and print the combined length
TRAIN_DATA = TRAIN_REVISION_DATA + TRAIN_FOOD_DATA_COMBINED
# TRAIN_DATA = TRAIN_FOOD_DATA_COMBINED
print("COMBINED", len(TRAIN_DATA))

FOOD 146
REVISION 1
COMBINED 147


In [33]:
TRAIN_FOOD_DATA_COMBINED

[("There's steamed fish head for breakfast in the bowl this morning",
  {'entities': [(8, 25, 'CHINESE')]}),
 ("I made dongpo braised pork for yesterday's dinner",
  {'entities': [(7, 26, 'CHINESE')]}),
 ('I made a crispy fish skin last night', {'entities': [(9, 25, 'CHINESE')]}),
 ('I just finished my stirfried duck blood',
  {'entities': [(19, 39, 'CHINESE')]}),
 ('I just finished my fried eel slices', {'entities': [(19, 35, 'CHINESE')]}),
 ("I've already eaten a rice and gravy", {'entities': [(21, 35, 'WESTERN')]}),
 ('I just ate a twice cooked pork', {'entities': [(13, 30, 'CHINESE')]}),
 ('I ate a couple of xiao long bao last night',
  {'entities': [(18, 31, 'CHINESE')]}),
 ('This morning, I made a bowl of stuffed fish balls',
  {'entities': [(31, 49, 'CHINESE')]}),
 ('I decided to enjoy some chili con carne',
  {'entities': [(24, 39, 'WESTERN')]}),
 ("I'm done eating a five colours shrimp",
  {'entities': [(18, 37, 'CHINESE')]}),
 ('I ate my mac and cheese', {'entities': [(9, 23,

In [34]:
# add NER to the pipeline and the new label
ner = nlp.get_pipe("ner")
ner.add_label("FOOD")

# get the names of the components we want to disable during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]


In [35]:
nlp.vocab

<spacy.vocab.Vocab at 0x11758ba1c10>

In [36]:
from spacy.training import Example
from spacy.tokens import Doc

In [37]:
# start the training loop, only training NER
epochs = 30
optimizer = nlp.resume_training()
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
    warnings.filterwarnings("once", category=UserWarning, module='spacy')
    sizes = compounding(1.0, 4.0, 1.001)
    
    # batch up the examples using spaCy's minibatc
    for epoch in range(epochs):
        examples = TRAIN_DATA
        random.shuffle(examples)
        batches = minibatch(examples, size=sizes)
        losses = {}
        
        for batch in batches:
            texts, annotations = zip(*batch)
            doc = nlp.make_doc(texts[0])
            a = annotations[0]
            token_ref = []
            tags_ref = []
            words = []
            for ent1 in a['entities']:
                start = ent1[0]
                end = ent1[1]
                label =texts[0][start:end]
                token = ent1[2]
                token_ref.append(token)
                tags_ref.append(label)
                words.append(token)
            predicted = Doc(nlp.vocab, words)
            example = Example.from_dict(predicted, {"words": token_ref, "tags": tags_ref})
            nlp.update([example], sgd=optimizer, drop=0.1, losses=losses)

        print("Losses ({}/{})".format(epoch + 1, epochs), losses)



Losses (1/30) {'ner': 0.0}
Losses (2/30) {'ner': 0.0}
Losses (3/30) {'ner': 0.0}
Losses (4/30) {'ner': 0.0}
Losses (5/30) {'ner': 0.0}
Losses (6/30) {'ner': 0.0}
Losses (7/30) {'ner': 0.0}
Losses (8/30) {'ner': 0.0}
Losses (9/30) {'ner': 0.0}
Losses (10/30) {'ner': 0.0}
Losses (11/30) {'ner': 0.0}
Losses (12/30) {'ner': 0.0}
Losses (13/30) {'ner': 0.0}
Losses (14/30) {'ner': 0.0}
Losses (15/30) {'ner': 0.0}
Losses (16/30) {'ner': 0.0}
Losses (17/30) {'ner': 0.0}
Losses (18/30) {'ner': 0.0}
Losses (19/30) {'ner': 0.0}
Losses (20/30) {'ner': 0.0}
Losses (21/30) {'ner': 0.0}
Losses (22/30) {'ner': 0.0}
Losses (23/30) {'ner': 0.0}
Losses (24/30) {'ner': 0.0}
Losses (25/30) {'ner': 0.0}
Losses (26/30) {'ner': 0.0}
Losses (27/30) {'ner': 0.0}
Losses (28/30) {'ner': 0.0}
Losses (29/30) {'ner': 0.0}
Losses (30/30) {'ner': 0.0}


In [39]:
# display sentence involving original entities
spacy.displacy.render(nlp("sushi is delicious"), style="ent")

# display sentences involving target entity
spacy.displacy.render(nlp("I had a fish and chips for lunch today."), style="ent")
spacy.displacy.render(nlp("I decided to have kimchi as a little treat for myself."), style="ent")
spacy.displacy.render(nlp("I ordered basmati rice, leaf spinach and cheese from Tesco yesterday"), style="ent")


In [40]:
doc = nlp("I was eating a burger")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities []


In [41]:
texts[0]

'show me places selling chinese food around town'

In [42]:
a

{'entities': [(23, 30, 'NORP')]}

In [43]:
a = annotations[0]
for ent1 in a['entities']:
    print(ent1)
    start = ent1[0]
    end = ent1[1]
    print('word picked out is:', texts[0][start:end])


(23, 30, 'NORP')
word picked out is: chinese
