In [108]:
# Load Packages
from __future__ import unicode_literals, print_function

import plac #  wrapper over argparse
import random
from pathlib import Path
import spacy
from tqdm import tqdm # loading bar

In [109]:
# Load the spaCy pipeline registered under the shortcut name 'en'.
nlp1 = spacy.load('en')

In [110]:
# Run the loaded pipeline on a sample question.
docx1 = nlp1(u"Who was Kofi Annan?")

In [111]:
# Display the entity spans found in docx1.
docx1.ents

(Kofi Annan,)

In [112]:
# Print every detected entity with its character offsets and label.
# (docx1.ents yields entity spans, hence the loop variable name.)
for ent in docx1.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Kofi Annan 8 18 PERSON


In [113]:
# Run the loaded pipeline on a second sample question.
docx2 = nlp1(u"Who is Steve Jobs?")

In [114]:
# Print every detected entity with its character offsets and label.
# (docx2.ents yields entity spans, hence the loop variable name.)
for ent in docx2.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Steve Jobs 7 17 PERSON


In [115]:
# Run the loaded pipeline on a third sample question.
docx3 = nlp1(u"Who is Shaka Khan?")

In [116]:
# Training data: (text, annotations) pairs. Each 'entities' entry is a
# (start_char, end_char, label) span, so text[start_char:end_char] must be
# exactly the entity text.
TRAIN_DATA = [
    ('Who is Kofi Annan?', {
        # "Who is " is 7 characters long, so the name spans 7..17
        # (8..18 only applies to "Who was Kofi Annan?").
        'entities': [(7, 17, 'PERSON')]
    }),
    ('Who is Steve Jobs?', {
        'entities': [(7, 17, 'PERSON')]
    }),
    ('I like London and Berlin.', {
        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
    })
]

In [117]:
# Training configuration
model = None  # None -> a blank 'en' pipeline is created instead of loading one
n_iter = 100  # number of passes over TRAIN_DATA
output_dir = Path("G:\\Datascience_experiments\\")  # where the trained model is saved

In [118]:
# Start from a blank English pipeline unless an existing model was configured.
if model is None:
    nlp = spacy.blank('en')
    print("Created blank 'en' model")
else:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)

Created blank 'en' model


In [119]:
# Get the entity recognizer so labels can be added to it. If the pipeline has
# none yet, create it and append it as the last component; nlp.create_pipe
# works for built-ins that are registered with spaCy.
if 'ner' in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)

In [120]:
# Register every label that occurs in the training annotations.
for _, annotations in TRAIN_DATA:
    for span in annotations.get('entities'):
        ner.add_label(span[2])  # span is (start_char, end_char, label)

# Disable every other pipe so that only the NER component is trained.
other_pipes = [name for name in nlp.pipe_names if name != 'ner']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for epoch in range(n_iter):
        # New example order and a fresh loss accumulator for each pass.
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            # One example per update: single-item batches of texts/annotations.
            nlp.update(
                [text],
                [annotations],
                drop=0.5,  # dropout - make it harder to memorise data
                sgd=optimizer,  # callable to update weights
                losses=losses,
            )
        print(losses)

100%|██████████| 3/3 [00:00<00:00, 10.09it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.02it/s]

{'ner': 11.90903426706791}


100%|██████████| 3/3 [00:00<00:00, 10.94it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.33it/s]

{'ner': 11.001475095748901}


100%|██████████| 3/3 [00:00<00:00, 10.81it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 10.55it/s]

{'ner': 10.173701491206884}


100%|██████████| 3/3 [00:00<00:00, 10.16it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.27it/s]

{'ner': 8.802559848874807}


100%|██████████| 3/3 [00:00<00:00, 10.94it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.59it/s]

{'ner': 7.040969837456942}


100%|██████████| 3/3 [00:00<00:00, 11.18it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.65it/s]

{'ner': 4.5674719519447535}


100%|██████████| 3/3 [00:00<00:00, 11.30it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.66it/s]

{'ner': 4.834288824931718}


100%|██████████| 3/3 [00:00<00:00, 11.36it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.80it/s]

{'ner': 5.669250600210944}


100%|██████████| 3/3 [00:00<00:00, 10.86it/s]
 33%|███▎      | 1/3 [00:00<00:00,  9.83it/s]

{'ner': 5.405205718721845}


100%|██████████| 3/3 [00:00<00:00, 10.13it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.33it/s]

{'ner': 5.524043088882308}


100%|██████████| 3/3 [00:00<00:00, 11.14it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.39it/s]

{'ner': 3.961837227015394}


100%|██████████| 3/3 [00:00<00:00, 11.06it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.39it/s]

{'ner': 4.740958258926895}


100%|██████████| 3/3 [00:00<00:00, 11.02it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.52it/s]

{'ner': 5.192479009567364}


100%|██████████| 3/3 [00:00<00:00, 11.14it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.59it/s]

{'ner': 4.643893905988122}


100%|██████████| 3/3 [00:00<00:00, 11.10it/s]
 33%|███▎      | 1/3 [00:00<00:00,  9.93it/s]

{'ner': 4.801002757704809}


100%|██████████| 3/3 [00:00<00:00,  9.86it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.72it/s]

{'ner': 4.741636088972558}


100%|██████████| 3/3 [00:00<00:00, 11.26it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.59it/s]

{'ner': 2.9274036311184854}


100%|██████████| 3/3 [00:00<00:00, 11.26it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.73it/s]

{'ner': 2.97713293080767}


100%|██████████| 3/3 [00:00<00:00, 11.36it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.66it/s]

{'ner': 3.0397408932803955}


100%|██████████| 3/3 [00:00<00:00, 11.22it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.80it/s]

{'ner': 3.2110125650621058}


100%|██████████| 3/3 [00:00<00:00, 11.39it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.48it/s]

{'ner': 2.5297914076680046}


100%|██████████| 3/3 [00:00<00:00, 11.32it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 10.44it/s]

{'ner': 4.958796734610022}


100%|██████████| 3/3 [00:00<00:00,  9.43it/s]
 33%|███▎      | 1/3 [00:00<00:00,  9.46it/s]

{'ner': 3.6454831073202425}


100%|██████████| 3/3 [00:00<00:00, 10.41it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.26it/s]

{'ner': 2.7698089408357456}


100%|██████████| 3/3 [00:00<00:00, 11.10it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.93it/s]

{'ner': 2.652892311356738}


100%|██████████| 3/3 [00:00<00:00, 11.44it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.39it/s]

{'ner': 2.283830924253378}


100%|██████████| 3/3 [00:00<00:00, 11.25it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.73it/s]

{'ner': 2.607202861749812}


100%|██████████| 3/3 [00:00<00:00, 11.44it/s]
 33%|███▎      | 1/3 [00:00<00:00,  9.93it/s]

{'ner': 1.1859053190395101}


100%|██████████| 3/3 [00:00<00:00,  9.99it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.59it/s]

{'ner': 3.0289168015672985}


100%|██████████| 3/3 [00:00<00:00, 11.35it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.59it/s]

{'ner': 1.927689846064833}


100%|██████████| 3/3 [00:00<00:00, 11.26it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.79it/s]

{'ner': 0.24313056744235245}


100%|██████████| 3/3 [00:00<00:00, 11.48it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.87it/s]

{'ner': 0.7341089364819586}


100%|██████████| 3/3 [00:00<00:00, 11.57it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.59it/s]

{'ner': 0.807311561840526}


100%|██████████| 3/3 [00:00<00:00, 11.35it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 10.78it/s]

{'ner': 0.13383477644792258}


100%|██████████| 3/3 [00:00<00:00, 10.37it/s]
 33%|███▎      | 1/3 [00:00<00:00,  9.93it/s]

{'ner': 0.4323439933882913}


100%|██████████| 3/3 [00:00<00:00, 10.94it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.59it/s]

{'ner': 1.276399183145441}


100%|██████████| 3/3 [00:00<00:00, 11.39it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.52it/s]

{'ner': 0.030350768706331}


100%|██████████| 3/3 [00:00<00:00, 11.35it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.72it/s]

{'ner': 0.0042154442177951765}


100%|██████████| 3/3 [00:00<00:00, 11.44it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.73it/s]

{'ner': 0.0616114894325609}


100%|██████████| 3/3 [00:00<00:00, 11.31it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.59it/s]

{'ner': 0.0015470076381514032}


100%|██████████| 3/3 [00:00<00:00, 11.44it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 10.84it/s]

{'ner': 0.0005584678377720084}


100%|██████████| 3/3 [00:00<00:00, 10.55it/s]
 33%|███▎      | 1/3 [00:00<00:00,  9.73it/s]

{'ner': 0.0010690131356793904}


100%|██████████| 3/3 [00:00<00:00, 10.16it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.08it/s]

{'ner': 0.0002062729669783376}


100%|██████████| 3/3 [00:00<00:00, 10.94it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.33it/s]

{'ner': 1.9516898548932304}


100%|██████████| 3/3 [00:00<00:00, 11.10it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.52it/s]

{'ner': 6.107994844922993e-06}


100%|██████████| 3/3 [00:00<00:00, 11.31it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 10.08it/s]

{'ner': 0.00020762048837573918}


100%|██████████| 3/3 [00:00<00:00, 10.13it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.0005864505309312578}


100%|██████████| 3/3 [00:00<00:00,  9.52it/s]
 33%|███▎      | 1/3 [00:00<00:00,  8.50it/s]

{'ner': 0.5583154141565053}


100%|██████████| 3/3 [00:00<00:00,  9.20it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 10.78it/s]

{'ner': 7.6721614054373e-05}


100%|██████████| 3/3 [00:00<00:00, 10.59it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.59it/s]

{'ner': 0.8449315596520364}


100%|██████████| 3/3 [00:00<00:00, 11.31it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.20it/s]

{'ner': 0.4850019491059173}


100%|██████████| 3/3 [00:00<00:00, 11.06it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.59it/s]

{'ner': 3.1095574563997196e-06}


100%|██████████| 3/3 [00:00<00:00, 11.35it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 10.50it/s]

{'ner': 0.31773168455360207}


100%|██████████| 3/3 [00:00<00:00, 10.09it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.20it/s]

{'ner': 2.0187124253840586e-05}


100%|██████████| 3/3 [00:00<00:00, 11.06it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.53it/s]

{'ner': 0.035686202470647174}


100%|██████████| 3/3 [00:00<00:00, 11.39it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.73it/s]

{'ner': 4.549134446258122e-08}


100%|██████████| 3/3 [00:00<00:00, 11.48it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 12.08it/s]

{'ner': 1.2172580896338535e-06}


100%|██████████| 3/3 [00:00<00:00, 11.66it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.08it/s]

{'ner': 3.345704208465393e-05}


100%|██████████| 3/3 [00:00<00:00, 11.02it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.66it/s]

{'ner': 0.01348796106658078}


100%|██████████| 3/3 [00:00<00:00, 11.48it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.39it/s]

{'ner': 6.821192640303363e-07}


100%|██████████| 3/3 [00:00<00:00, 10.70it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 10.39it/s]

{'ner': 0.0009685848349031925}


100%|██████████| 3/3 [00:00<00:00, 10.34it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.59it/s]

{'ner': 0.00011521209398656916}


100%|██████████| 3/3 [00:00<00:00, 11.44it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.59it/s]

{'ner': 5.789541696212215e-06}


100%|██████████| 3/3 [00:00<00:00, 11.35it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.73it/s]

{'ner': 9.692525030184513e-07}


100%|██████████| 3/3 [00:00<00:00, 11.44it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.87it/s]

{'ner': 2.973331964064253e-06}


100%|██████████| 3/3 [00:00<00:00, 11.52it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.46it/s]

{'ner': 0.0005126107493538908}


100%|██████████| 3/3 [00:00<00:00, 11.14it/s]
 33%|███▎      | 1/3 [00:00<00:00,  9.46it/s]

{'ner': 2.1208838011049863e-05}


100%|██████████| 3/3 [00:00<00:00, 10.20it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.73it/s]

{'ner': 9.437990655345076e-06}


100%|██████████| 3/3 [00:00<00:00, 11.44it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.59it/s]

{'ner': 1.5704481589983336e-07}


100%|██████████| 3/3 [00:00<00:00, 11.31it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.52it/s]

{'ner': 4.2306868380472013e-07}


100%|██████████| 3/3 [00:00<00:00, 10.27it/s]
 33%|███▎      | 1/3 [00:00<00:00,  8.15it/s]

{'ner': 1.0060429408848688e-09}


100%|██████████| 3/3 [00:00<00:00, 10.13it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.59it/s]

{'ner': 5.898405984720241e-08}


100%|██████████| 3/3 [00:00<00:00, 11.02it/s]
 33%|███▎      | 1/3 [00:00<00:00,  9.93it/s]

{'ner': 4.186886011516668e-08}


100%|██████████| 3/3 [00:00<00:00, 10.44it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.26it/s]

{'ner': 0.0010288805865688946}


100%|██████████| 3/3 [00:00<00:00, 11.22it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.72it/s]

{'ner': 3.5734642651588135e-07}


100%|██████████| 3/3 [00:00<00:00, 11.44it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.52it/s]

{'ner': 4.5111849267992654e-07}


100%|██████████| 3/3 [00:00<00:00, 11.31it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.59it/s]

{'ner': 2.0306530257547237e-08}


100%|██████████| 3/3 [00:00<00:00, 11.26it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.66it/s]

{'ner': 1.286730452029141e-08}


100%|██████████| 3/3 [00:00<00:00, 11.35it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.53it/s]

{'ner': 2.1888974140863933e-07}


100%|██████████| 3/3 [00:00<00:00, 11.17it/s]
 33%|███▎      | 1/3 [00:00<00:00,  9.73it/s]

{'ner': 6.711371979938179e-07}


100%|██████████| 3/3 [00:00<00:00,  9.23it/s]
 33%|███▎      | 1/3 [00:00<00:00,  9.46it/s]

{'ner': 4.67457393613981e-06}


100%|██████████| 3/3 [00:00<00:00, 10.44it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.14it/s]

{'ner': 0.008430134690051948}


100%|██████████| 3/3 [00:00<00:00, 11.02it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.52it/s]

{'ner': 3.3625388200530095e-07}


100%|██████████| 3/3 [00:00<00:00, 11.35it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.59it/s]

{'ner': 4.4077928692787255e-05}


100%|██████████| 3/3 [00:00<00:00, 11.18it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.14it/s]

{'ner': 0.0001888932188914457}


100%|██████████| 3/3 [00:00<00:00, 10.82it/s]
 33%|███▎      | 1/3 [00:00<00:00,  9.37it/s]

{'ner': 8.712311455418609e-07}


100%|██████████| 3/3 [00:00<00:00,  9.09it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 10.72it/s]

{'ner': 6.672951763742948e-10}


100%|██████████| 3/3 [00:00<00:00, 10.59it/s]
 33%|███▎      | 1/3 [00:00<00:00,  9.83it/s]

{'ner': 1.1552773705794284e-05}


100%|██████████| 3/3 [00:00<00:00, 10.16it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 10.96it/s]

{'ner': 7.447343151314777e-06}


100%|██████████| 3/3 [00:00<00:00, 10.98it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.66it/s]

{'ner': 1.5673737214456367e-10}


100%|██████████| 3/3 [00:00<00:00, 11.48it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.39it/s]

{'ner': 1.0598017756245761e-08}


100%|██████████| 3/3 [00:00<00:00, 10.82it/s]
 33%|███▎      | 1/3 [00:00<00:00,  9.83it/s]

{'ner': 4.0438972585424516e-07}


100%|██████████| 3/3 [00:00<00:00, 10.26it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.73it/s]

{'ner': 1.4368318243284767e-11}


100%|██████████| 3/3 [00:00<00:00, 11.39it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.73it/s]

{'ner': 1.053270266244409e-08}


100%|██████████| 3/3 [00:00<00:00, 11.48it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.73it/s]

{'ner': 3.1336275490590363e-06}


100%|██████████| 3/3 [00:00<00:00, 11.39it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.80it/s]

{'ner': 7.735367174819894e-07}


100%|██████████| 3/3 [00:00<00:00, 11.41it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.99it/s]

{'ner': 3.0221903860317253e-06}


100%|██████████| 3/3 [00:00<00:00, 11.56it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 11.87it/s]

{'ner': 6.36682072774711e-07}


100%|██████████| 3/3 [00:00<00:00, 11.57it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 10.72it/s]

{'ner': 0.0020008758414002585}


100%|██████████| 3/3 [00:00<00:00, 10.30it/s]

{'ner': 7.407117880786522e-05}





In [121]:
# Sanity check: run the trained pipeline over the training sentences and show
# the entities it finds plus the per-token entity type and IOB code.
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    found = [(ent.text, ent.label_) for ent in doc.ents]
    print('Entities', found)
    per_token = [(t.text, t.ent_type_, t.ent_iob) for t in doc]
    print('Tokens', per_token)

Entities [('Steve Jobs', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Steve', 'PERSON', 3), ('Jobs', 'PERSON', 1), ('?', '', 2)]
Entities [('Kofi Annan', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Kofi', 'PERSON', 3), ('Annan', 'PERSON', 1), ('?', '', 2)]
Entities [('London', 'LOC'), ('Berlin', 'LOC')]
Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]


In [122]:
# Write the trained pipeline to output_dir (skipped when no directory is set).
if output_dir is not None:
    output_dir = Path(output_dir)
    dir_missing = not output_dir.exists()
    if dir_missing:
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to G:\Datascience_experiments


# The saved model directory contains a meta.json file; load the model from that same directory to test it

In [123]:
# Reload the pipeline from disk and repeat the check on the training
# sentences to confirm the saved model behaves like the in-memory one.
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
for text, _ in TRAIN_DATA:
    doc = nlp2(text)
    found = [(ent.text, ent.label_) for ent in doc.ents]
    print('Entities', found)
    per_token = [(t.text, t.ent_type_, t.ent_iob) for t in doc]
    print('Tokens', per_token)

Loading from G:\Datascience_experiments
Entities [('Steve Jobs', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Steve', 'PERSON', 3), ('Jobs', 'PERSON', 1), ('?', '', 2)]
Entities [('Kofi Annan', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Kofi', 'PERSON', 3), ('Annan', 'PERSON', 1), ('?', '', 2)]
Entities [('London', 'LOC'), ('Berlin', 'LOC')]
Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]


In [127]:
from __future__ import unicode_literals, print_function

import plac # no need of this file/import
import random
from pathlib import Path
import spacy

In [128]:
# Label of the new entity type; main() registers it on the NER component.
LABEL = 'ANIMAL'

In [129]:
# Training examples for the new entity type: (text, annotations) pairs whose
# 'entities' entries are (start_char, end_char, label) spans into the text.
# "Do they bite?" is included as an example with no entities.
TRAIN_DATA = [
    ("Horses are too tall and they pretend to care about your feelings",
     {'entities': [(0, 6, 'ANIMAL')]}),
    ("Do they bite?",
     {'entities': []}),
    ("horses are too tall and they pretend to care about your feelings",
     {'entities': [(0, 6, 'ANIMAL')]}),
    ("horses pretend to care about your feelings",
     {'entities': [(0, 6, 'ANIMAL')]}),
    ("they pretend to care about your feelings, those horses",
     {'entities': [(48, 54, 'ANIMAL')]}),
    ("horses?",
     {'entities': [(0, 6, 'ANIMAL')]}),
]

In [130]:
def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
    """Train the NER component to recognise the new entity type ``LABEL``.

    Args:
        model: Name or path passed to ``spacy.load``. When ``None`` a blank
            English pipeline is created instead.
        new_model_name: Value stored in ``nlp.meta['name']`` before saving.
        output_dir: Directory the trained pipeline is written to. When
            ``None`` nothing is saved and the reload check is skipped.
        n_iter: Number of passes over ``TRAIN_DATA``.

    Returns:
        None. Per-pass losses and the entities found in a test sentence are
        printed.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline.
    # nlp.create_pipe works for built-ins that are registered with spaCy.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # Shuffle a private copy so that calling main() does not reorder the
    # module-level TRAIN_DATA as a hidden side effect.
    train_data = list(TRAIN_DATA)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            for text, annotations in tqdm(train_data):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # test the trained model
    test_text = 'Do you like horses?'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True so a nested target (e.g. "models/animal") works
            # even when the intermediate directories do not exist yet.
            output_dir.mkdir(parents=True)
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)



In [131]:
# Run the training function with its defaults: blank 'en' model, 20
# iterations, and output_dir=None so nothing is written to disk.
main()

  0%|          | 0/6 [00:00<?, ?it/s]

Created blank 'en' model


100%|██████████| 6/6 [00:00<00:00, 10.30it/s]
 17%|█▋        | 1/6 [00:00<00:00,  9.83it/s]

{'ner': 32.23149424791336}


100%|██████████| 6/6 [00:00<00:00,  8.91it/s]
 17%|█▋        | 1/6 [00:00<00:00,  9.93it/s]

{'ner': 17.986414566636086}


100%|██████████| 6/6 [00:00<00:00, 10.15it/s]
 33%|███▎      | 2/6 [00:00<00:00, 10.90it/s]

{'ner': 7.502557674139098}


100%|██████████| 6/6 [00:00<00:00, 11.10it/s]
 17%|█▋        | 1/6 [00:00<00:00,  8.15it/s]

{'ner': 8.16169999099175}


100%|██████████| 6/6 [00:00<00:00,  9.39it/s]
 17%|█▋        | 1/6 [00:00<00:00,  9.37it/s]

{'ner': 4.969576364137595}


100%|██████████| 6/6 [00:00<00:00,  9.21it/s]
 17%|█▋        | 1/6 [00:00<00:00,  9.46it/s]

{'ner': 2.6871653651397764}


100%|██████████| 6/6 [00:00<00:00,  9.73it/s]
 33%|███▎      | 2/6 [00:00<00:00, 11.02it/s]

{'ner': 0.9044525255620648}


100%|██████████| 6/6 [00:00<00:00, 11.16it/s]
 33%|███▎      | 2/6 [00:00<00:00, 11.46it/s]

{'ner': 0.2130922406865311}


100%|██████████| 6/6 [00:00<00:00, 11.44it/s]
 33%|███▎      | 2/6 [00:00<00:00, 11.26it/s]

{'ner': 0.05526599752630109}


100%|██████████| 6/6 [00:00<00:00, 11.41it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 0.0022269016962061326}


100%|██████████| 6/6 [00:00<00:00, 10.11it/s]
 17%|█▋        | 1/6 [00:00<00:00,  9.93it/s]

{'ner': 0.0035192315828964653}


100%|██████████| 6/6 [00:00<00:00, 10.90it/s]
 33%|███▎      | 2/6 [00:00<00:00, 11.52it/s]

{'ner': 4.4205517416112466e-05}


100%|██████████| 6/6 [00:00<00:00, 11.39it/s]
 33%|███▎      | 2/6 [00:00<00:00, 10.78it/s]

{'ner': 0.00021026107141211924}


100%|██████████| 6/6 [00:00<00:00, 10.28it/s]
 33%|███▎      | 2/6 [00:00<00:00, 10.96it/s]

{'ner': 0.0006968800082366241}


100%|██████████| 6/6 [00:00<00:00, 11.29it/s]
 33%|███▎      | 2/6 [00:00<00:00, 11.46it/s]

{'ner': 4.447625660591058e-08}


100%|██████████| 6/6 [00:00<00:00, 11.00it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 2.3919675498617756e-06}


100%|██████████| 6/6 [00:00<00:00, 10.65it/s]
 33%|███▎      | 2/6 [00:00<00:00, 11.33it/s]

{'ner': 1.8593096279720764e-06}


100%|██████████| 6/6 [00:00<00:00, 11.29it/s]
 33%|███▎      | 2/6 [00:00<00:00, 11.02it/s]

{'ner': 9.786046356937846e-06}


100%|██████████| 6/6 [00:00<00:00, 11.39it/s]
 33%|███▎      | 2/6 [00:00<00:00, 11.33it/s]

{'ner': 4.57209760260233e-06}


100%|██████████| 6/6 [00:00<00:00, 11.17it/s]

{'ner': 5.373253760739089e-07}
Entities in 'Do you like horses?'
ANIMAL horses



