In [1]:
# Import and load the spacy model
import spacy
from spacy.training import Example

nlp=spacy.blank("en") 

# Getting the ner component
ner=nlp.add_pipe('ner')

In [2]:
# New label to add
LABEL = "FOOD"

# Training examples in the required format
TRAIN_DATA =[ ("Pizza is a common fast food.", {"entities": [(0, 5, "FOOD")]}),
              ("Pasta is an italian recipe", {"entities": [(0, 5, "FOOD")]}),
              ("China's noodles are very famous", {"entities": [(8,14, "FOOD")]}),
              ("Shrimps are famous in China too", {"entities": [(0,7, "FOOD")]}),
              ("Lasagna is another classic of Italy", {"entities": [(0,7, "FOOD")]}),
              ("Sushi is extemely famous and expensive Japanese dish", {"entities": [(0,5, "FOOD")]}),
              ("Unagi is a famous seafood of Japan", {"entities": [(0,5, "FOOD")]}),
              ("Tempura , Soba are other famous dishes of Japan", {"entities": [(0,7, "FOOD")]}),
              ("Udon is a healthy type of noodles", {"entities": [(0,4, "ORG")]}),
              ("Yogurt is a healthy type of food", {"entities": [(0,6, "FOOD")]}),
              ("Chocolate soufflé is extremely famous french cuisine", {"entities": [(0,17, "FOOD")]}),
              ("Flamiche is french pastry", {"entities": [(0,8, "FOOD")]}),
              ("Burgers are the most commonly consumed fastfood", {"entities": [(0,7, "FOOD")]}),
              ("Burgers are the most commonly consumed fastfood", {"entities": [(0,7, "FOOD")]}),
              ("Frenchfries are considered too oily", {"entities": [(0,11, "FOOD")]})
           ]

In [3]:
from sklearn.model_selection import ShuffleSplit
import numpy as np
rs = ShuffleSplit(n_splits=1, test_size=.25, random_state=57)

train = []
test = []

for train_index, test_index in rs.split(TRAIN_DATA):
    train = np.array(TRAIN_DATA)[train_index.astype(int)]
    test = np.array(TRAIN_DATA)[test_index.astype(int)]

print(test)
    
examples = []
for text, annots in train:
    examples.append(Example.from_dict(nlp.make_doc(text), annots))
nlp.initialize(lambda: examples)

[['Chocolate soufflé is extremely famous french cuisine'
  {'entities': [(0, 17, 'FOOD')]}]
 ['Burgers are the most commonly consumed fastfood'
  {'entities': [(0, 7, 'FOOD')]}]
 ['Lasagna is another classic of Italy' {'entities': [(0, 7, 'FOOD')]}]
 ['Shrimps are famous in China too' {'entities': [(0, 7, 'FOOD')]}]]




<thinc.optimizers.Optimizer at 0x1a0fd5f7ea0>

In [4]:
# Add the new label to ner
ner.add_label(LABEL)

# Resume training
optimizer = nlp.resume_training()
move_names = list(ner.move_names)

# List of pipes you want to train
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]

# List of pipes which should remain unaffected in training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [5]:
# Importing requirements
from spacy.util import minibatch, compounding
import random

# Begin training by disabling other pipeline components
with nlp.disable_pipes(*other_pipes) :

    sizes = compounding(1.0, 4.0, 1.001)
    # Training for 30 iterations     
    for itn in range(30):
    # shuffle examples before training
        random.shuffle(examples)
        # batch up the examples using spaCy's minibatch
        batches = minibatch(examples, size=sizes)
        losses = {}
        for batch in batches:
            # Calling update() over the iteration
            nlp.update(batch, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)

Losses {'ner': 2.999999850988388}
Losses {'ner': 9.333915024995804}
Losses {'ner': 15.568691402673721}
Losses {'ner': 23.427887231111526}
Losses {'ner': 27.751150585711002}
Losses {'ner': 31.096242882311344}
Losses {'ner': 37.45279733091593}
Losses {'ner': 42.975265838205814}
Losses {'ner': 46.76392171531916}
Losses {'ner': 51.8276744261384}
Losses {'ner': 56.42733583599329}
Losses {'ner': 5.521767973899841}
Losses {'ner': 9.56801849603653}
Losses {'ner': 11.900128066539764}
Losses {'ner': 15.747126191854477}
Losses {'ner': 17.828722268342972}
Losses {'ner': 20.951374396681786}
Losses {'ner': 23.465959936380386}
Losses {'ner': 25.671693846583366}
Losses {'ner': 27.235261999070644}
Losses {'ner': 28.616341162472963}
Losses {'ner': 30.169143456965685}
Losses {'ner': 1.7895366502925754}
Losses {'ner': 3.6105298111215234}
Losses {'ner': 5.16975699365139}
Losses {'ner': 6.813375455560163}
Losses {'ner': 7.025237644062145}
Losses {'ner': 8.781821798969759}
Losses {'ner': 9.996981482709089}
L

Losses {'ner': 0.0019474280931841212}
Losses {'ner': 0.0019474282036954149}
Losses {'ner': 0.0019474803775781702}
Losses {'ner': 0.0019475452884119824}
Losses {'ner': 0.0019475479956527046}
Losses {'ner': 0.0019478687939710555}
Losses {'ner': 0.0019478687949428118}
Losses {'ner': 0.001947868843095954}
Losses {'ner': 4.456039428223336e-15}
Losses {'ner': 1.8024609830822888e-10}
Losses {'ner': 2.008960936677982e-10}
Losses {'ner': 9.550426058442257e-05}
Losses {'ner': 9.550449745074715e-05}
Losses {'ner': 9.55054563378578e-05}
Losses {'ner': 9.550569354110242e-05}
Losses {'ner': 9.55109151099435e-05}
Losses {'ner': 9.702046123284951e-05}
Losses {'ner': 9.712057823769158e-05}
Losses {'ner': 9.71210626533859e-05}
Losses {'ner': 9.926828928456256e-08}
Losses {'ner': 1.0015722903426194e-07}
Losses {'ner': 1.0015765566599365e-07}
Losses {'ner': 5.177716314724843e-06}
Losses {'ner': 5.177950928427357e-06}
Losses {'ner': 5.236911726486621e-06}
Losses {'ner': 5.2524014412241905e-06}
Losses {'ner

In [6]:
# Testing the NER

for t in test:
    test_text = t[0]
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent)
        print(t[1])

Entities in 'Chocolate soufflé is extremely famous french cuisine'
Chocolate
{'entities': [(0, 17, 'FOOD')]}
Entities in 'Burgers are the most commonly consumed fastfood'
Burgers
{'entities': [(0, 7, 'FOOD')]}
Entities in 'Lasagna is another classic of Italy'
Lasagna
{'entities': [(0, 7, 'FOOD')]}
Entities in 'Shrimps are famous in China too'
Shrimps
{'entities': [(0, 7, 'FOOD')]}


In [None]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
matched_sents = []  # Collect data of matched sentences to be visualized
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]

def collect_sents(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    span = doc[start:end]  # Matched span
    sent = span.sent  # Sentence containing matched span
    # Append mock entity for match in displaCy style to matched_sents
    # get the match span by ofsetting the start and end of the span with the
    # start and end of the sentence in the doc
    match_ents = [{
        "start": span.start_char - sent.start_char,
        "end": span.end_char - sent.start_char,
        "label": "MATCH",
    }]
    matched_sents.append({"text": sent.text, "ents": match_ents})

patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", patterns, on_match=collect_sents)

doc = nlp("German Chancellor Angela Merkel and US President Barack Obama "
          "converse in the Oval Office inside the White House in Washington, D.C.")
matches = matcher(doc)

# Serve visualization of sentences containing match with displaCy
# set manual=True to make displaCy render straight from a dictionary
# (if you're not running the code within a Jupyer environment, you can
# use displacy.serve instead)
displacy.render(matched_sents, style="ent", manual=True)