In [8]:
# Import and load the spacy model
import spacy
from spacy.training import Example

nlp=spacy.blank("en") 

# Getting the ner component
ner=nlp.add_pipe('ner')

In [11]:
# New label to add
LABEL = "FOOD"

# Training examples in the required format
TRAIN_DATA =[ ("Pizza is a common fast food.", {"entities": [(0, 5, "FOOD")]}),
              ("Pasta is an italian recipe", {"entities": [(0, 5, "FOOD")]}),
              ("China's noodles are very famous", {"entities": [(8,14, "FOOD")]}),
              ("Shrimps are famous in China too", {"entities": [(0,7, "FOOD")]}),
              ("Lasagna is another classic of Italy", {"entities": [(0,7, "FOOD")]}),
              ("Sushi is extemely famous and expensive Japanese dish", {"entities": [(0,5, "FOOD")]}),
              ("Unagi is a famous seafood of Japan", {"entities": [(0,5, "FOOD")]}),
              ("Tempura , Soba are other famous dishes of Japan", {"entities": [(0,7, "FOOD")]}),
              ("Udon is a healthy type of noodles", {"entities": [(0,4, "ORG")]}),
              ("Yogurt is a healthy type of food", {"entities": [(0,6, "FOOD")]}),
              ("Chocolate soufflé is extremely famous french cuisine", {"entities": [(0,17, "FOOD")]}),
              ("Flamiche is french pastry", {"entities": [(0,8, "FOOD")]}),
              ("Burgers are the most commonly consumed fastfood", {"entities": [(0,7, "FOOD")]}),
              ("Burgers are the most commonly consumed fastfood", {"entities": [(0,7, "FOOD")]}),
              ("Frenchfries are considered too oily", {"entities": [(0,11, "FOOD")]})
           ]

In [16]:
from sklearn.model_selection import ShuffleSplit
import numpy as np
rs = ShuffleSplit(n_splits=1, test_size=.25, random_state=57)

train = []
test = []

for train_index, test_index in rs.split(TRAIN_DATA):
    train = np.array(TRAIN_DATA)[train_index.astype(int)]
    test = np.array(TRAIN_DATA)[test_index.astype(int)]

print(test)
    
examples = []
for text, annots in train:
    examples.append(Example.from_dict(nlp.make_doc(text), annots))
nlp.initialize(lambda: examples)

[['Chocolate soufflé is extremely famous french cuisine'
  {'entities': [(0, 17, 'FOOD')]}]
 ['Burgers are the most commonly consumed fastfood'
  {'entities': [(0, 7, 'FOOD')]}]
 ['Lasagna is another classic of Italy' {'entities': [(0, 7, 'FOOD')]}]
 ['Shrimps are famous in China too' {'entities': [(0, 7, 'FOOD')]}]]




<thinc.optimizers.Optimizer at 0x127877540>

In [17]:
# Add the new label to ner
ner.add_label(LABEL)

# Resume training
optimizer = nlp.resume_training()
move_names = list(ner.move_names)

# List of pipes you want to train
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]

# List of pipes which should remain unaffected in training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [20]:
# Importing requirements
from spacy.util import minibatch, compounding
import random

# Begin training by disabling other pipeline components
with nlp.disable_pipes(*other_pipes) :

    sizes = compounding(1.0, 4.0, 1.001)
    # Training for 30 iterations     
    for itn in range(30):
    # shuffle examples before training
        random.shuffle(examples)
        # batch up the examples using spaCy's minibatch
        batches = minibatch(examples, size=sizes)
        losses = {}
        for batch in batches:
            # Calling update() over the iteration
            nlp.update(batch, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)

Losses {'ner': 3.022743818141439e-06}
Losses {'ner': 3.022745126082262e-06}
Losses {'ner': 3.022821100921677e-06}
Losses {'ner': 3.0287262932085724e-06}
Losses {'ner': 3.0386742307933816e-06}
Losses {'ner': 3.0386754426287986e-06}
Losses {'ner': 3.03867752197488e-06}
Losses {'ner': 3.0388024391458863e-06}
Losses {'ner': 3.0388024400146825e-06}
Losses {'ner': 9.247937721471119e-07}
Losses {'ner': 9.247939770917272e-07}
Losses {'ner': 9.247987497827011e-07}
Losses {'ner': 9.248718119900304e-07}
Losses {'ner': 9.248718121148857e-07}
Losses {'ner': 1.6757339259980018e-06}
Losses {'ner': 1.691859147311737e-06}
Losses {'ner': 1.8879322359639247e-06}
Losses {'ner': 1.8879322377340774e-06}
Losses {'ner': 1.8879379636695022e-06}
Losses {'ner': 1.89294355833425e-06}
Losses {'ner': 7.404015710911418e-13}
Losses {'ner': 4.049533900257057e-06}
Losses {'ner': 4.124678924683233e-06}
Losses {'ner': 4.124681135122588e-06}
Losses {'ner': 4.125113349406199e-06}
Losses {'ner': 4.125113359289221e-06}
Losse

In [22]:
# Testing the NER

for t in test:
    test_text = t[0]
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent)
        print(t[1])

Entities in 'Chocolate soufflé is extremely famous french cuisine'
Entities in 'Burgers are the most commonly consumed fastfood'
Burgers
{'entities': [(0, 7, 'FOOD')]}
Entities in 'Lasagna is another classic of Italy'
Lasagna
{'entities': [(0, 7, 'FOOD')]}
Entities in 'Shrimps are famous in China too'
Shrimps
{'entities': [(0, 7, 'FOOD')]}


In [7]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
matched_sents = []  # Collect data of matched sentences to be visualized
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]

def collect_sents(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    span = doc[start:end]  # Matched span
    sent = span.sent  # Sentence containing matched span
    # Append mock entity for match in displaCy style to matched_sents
    # get the match span by ofsetting the start and end of the span with the
    # start and end of the sentence in the doc
    match_ents = [{
        "start": span.start_char - sent.start_char,
        "end": span.end_char - sent.start_char,
        "label": "MATCH",
    }]
    matched_sents.append({"text": sent.text, "ents": match_ents})

patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", patterns, on_match=collect_sents)

doc = nlp("German Chancellor Angela Merkel and US President Barack Obama "
          "converse in the Oval Office inside the White House in Washington, D.C.")
matches = matcher(doc)

# Serve visualization of sentences containing match with displaCy
# set manual=True to make displaCy render straight from a dictionary
# (if you're not running the code within a Jupyer environment, you can
# use displacy.serve instead)
displacy.render(matched_sents, style="ent", manual=True)

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.