## Load Packages

In [1]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm 

In [2]:
nlp1 = spacy.load('en_core_web_lg')

## Working of NER

In [3]:
docx1 = nlp1(u"Who is Nishanth?")

In [4]:
for token in docx1.ents:
    print(token.text,token.start_char, token.end_char,token.label_)

Nishanth 7 15 PERSON


In [5]:
docx2 = nlp1(u"Who is Kamal Khumar?")

In [6]:
for token in docx2.ents:
    print(token.text,token.start_char, token.end_char,token.label_)

Kamal Khumar 7 19 PERSON


## Train Data

In [7]:
TRAIN_DATA = [
    ('Who is Nishanth?', {
        'entities': [(7, 15, 'PERSON')]
    }),
     ('Who is Kamal Khumar?', {
        'entities': [(7, 19, 'PERSON')]
    }),
    ('I like London and Berlin.', {
        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
    })
]

## Define our variables

In [8]:
model = None
output_dir=Path("C:\\Users\\nithi\\Documents\\ner")
n_iter=100

## Load the model

In [9]:
if model is not None:
    nlp = spacy.load(model)  
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  
    print("Created blank 'en' model")

Created blank 'en' model


## Set up the pipeline

In [10]:
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe('ner')

## Train the Recognizer

In [11]:
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            nlp.update(
                [text],  
                [annotations],  
                drop=0.5,  
                sgd=optimizer,
                losses=losses)
        print(losses)

  **kwargs
100%|██████████| 3/3 [00:00<00:00, 26.62it/s]


{'ner': 13.289173007011414}


100%|██████████| 3/3 [00:00<00:00, 28.11it/s]


{'ner': 12.414458155632019}


100%|██████████| 3/3 [00:00<00:00, 27.10it/s]


{'ner': 11.202702164649963}


100%|██████████| 3/3 [00:00<00:00, 32.00it/s]


{'ner': 10.165919601917267}


100%|██████████| 3/3 [00:00<00:00, 30.38it/s]


{'ner': 8.44960543513298}


100%|██████████| 3/3 [00:00<00:00, 28.11it/s]


{'ner': 7.798196479678154}


100%|██████████| 3/3 [00:00<00:00, 33.42it/s]


{'ner': 6.569828731939197}


100%|██████████| 3/3 [00:00<00:00, 29.20it/s]


{'ner': 6.784278305480257}


100%|██████████| 3/3 [00:00<00:00, 33.42it/s]


{'ner': 6.996531369164586}


100%|██████████| 3/3 [00:00<00:00, 32.70it/s]


{'ner': 6.852636652998626}


100%|██████████| 3/3 [00:00<00:00, 32.34it/s]


{'ner': 6.637710725655779}


100%|██████████| 3/3 [00:00<00:00, 31.01it/s]


{'ner': 5.308007724117488}


100%|██████████| 3/3 [00:00<00:00, 34.18it/s]


{'ner': 6.261936842012801}


100%|██████████| 3/3 [00:00<00:00, 31.33it/s]


{'ner': 5.696804424747825}


100%|██████████| 3/3 [00:00<00:00, 32.66it/s]


{'ner': 4.2220914154313505}


100%|██████████| 3/3 [00:00<00:00, 29.78it/s]


{'ner': 7.32105001504533}


100%|██████████| 3/3 [00:00<00:00, 29.20it/s]


{'ner': 4.733935753349215}


100%|██████████| 3/3 [00:00<00:00, 33.80it/s]


{'ner': 4.7929259040392935}


100%|██████████| 3/3 [00:00<00:00, 31.66it/s]


{'ner': 3.7480255567934364}


100%|██████████| 3/3 [00:00<00:00, 32.70it/s]


{'ner': 3.6030448971432634}


100%|██████████| 3/3 [00:00<00:00, 31.01it/s]


{'ner': 2.984586422695429}


100%|██████████| 3/3 [00:00<00:00, 35.35it/s]


{'ner': 4.080246267847542}


100%|██████████| 3/3 [00:00<00:00, 35.80it/s]


{'ner': 2.396151978294256}


100%|██████████| 3/3 [00:00<00:00, 33.41it/s]


{'ner': 2.9708919061977213}


100%|██████████| 3/3 [00:00<00:00, 37.58it/s]


{'ner': 3.124516086777021}


100%|██████████| 3/3 [00:00<00:00, 31.33it/s]


{'ner': 2.266252386643032}


100%|██████████| 3/3 [00:00<00:00, 28.65it/s]


{'ner': 2.0699961034052876}


100%|██████████| 3/3 [00:00<00:00, 32.45it/s]


{'ner': 1.2966782864483477}


100%|██████████| 3/3 [00:00<00:00, 35.07it/s]


{'ner': 1.645277187816894}


100%|██████████| 3/3 [00:00<00:00, 35.49it/s]


{'ner': 1.2471649073949607}


100%|██████████| 3/3 [00:00<00:00, 35.80it/s]


{'ner': 1.9767626742924236}


100%|██████████| 3/3 [00:00<00:00, 33.65it/s]


{'ner': 2.2609614619708998}


100%|██████████| 3/3 [00:00<00:00, 25.22it/s]


{'ner': 1.0743873100139631}


100%|██████████| 3/3 [00:00<00:00, 24.06it/s]


{'ner': 1.8448130177425868}


100%|██████████| 3/3 [00:00<00:00, 26.39it/s]


{'ner': 1.357637208115494}


100%|██████████| 3/3 [00:00<00:00, 21.49it/s]


{'ner': 1.8424517484679943}


100%|██████████| 3/3 [00:00<00:00, 23.32it/s]


{'ner': 0.9615059040750317}


100%|██████████| 3/3 [00:00<00:00, 25.93it/s]


{'ner': 0.537510085635887}


100%|██████████| 3/3 [00:00<00:00, 24.66it/s]


{'ner': 0.7948578974412663}


100%|██████████| 3/3 [00:00<00:00, 24.13it/s]


{'ner': 0.1137402939171647}


100%|██████████| 3/3 [00:00<00:00, 24.13it/s]


{'ner': 0.31659301493247805}


100%|██████████| 3/3 [00:00<00:00, 23.38it/s]


{'ner': 0.2985648904777062}


100%|██████████| 3/3 [00:00<00:00, 25.94it/s]


{'ner': 0.005982262522983435}


100%|██████████| 3/3 [00:00<00:00, 25.33it/s]


{'ner': 0.19967248298938595}


100%|██████████| 3/3 [00:00<00:00, 20.23it/s]


{'ner': 0.027748550969521342}


100%|██████████| 3/3 [00:00<00:00, 25.40it/s]


{'ner': 0.0002355359583202347}


100%|██████████| 3/3 [00:00<00:00, 25.93it/s]


{'ner': 0.001245846615631348}


100%|██████████| 3/3 [00:00<00:00, 25.73it/s]


{'ner': 0.1384277629389947}


100%|██████████| 3/3 [00:00<00:00, 23.83it/s]


{'ner': 2.5388879362475033e-06}


100%|██████████| 3/3 [00:00<00:00, 25.71it/s]


{'ner': 0.014741281109069068}


100%|██████████| 3/3 [00:00<00:00, 23.71it/s]


{'ner': 0.7157285214382185}


100%|██████████| 3/3 [00:00<00:00, 25.28it/s]


{'ner': 3.244267929675676e-05}


100%|██████████| 3/3 [00:00<00:00, 25.71it/s]


{'ner': 0.05835713364862018}


100%|██████████| 3/3 [00:00<00:00, 25.93it/s]


{'ner': 0.0002508708162295204}


100%|██████████| 3/3 [00:00<00:00, 34.18it/s]


{'ner': 1.6946091970760512e-05}


100%|██████████| 3/3 [00:00<00:00, 27.85it/s]


{'ner': 9.62541011568001e-06}


100%|██████████| 3/3 [00:00<00:00, 26.38it/s]


{'ner': 0.10284944012563473}


100%|██████████| 3/3 [00:00<00:00, 24.66it/s]


{'ner': 0.0007663793138746722}


100%|██████████| 3/3 [00:00<00:00, 25.27it/s]


{'ner': 3.126371562519889e-07}


100%|██████████| 3/3 [00:00<00:00, 25.07it/s]


{'ner': 2.0217684843293586e-05}


100%|██████████| 3/3 [00:00<00:00, 24.06it/s]


{'ner': 1.218231428182522e-05}


100%|██████████| 3/3 [00:00<00:00, 24.46it/s]


{'ner': 3.181537376351195e-06}


100%|██████████| 3/3 [00:00<00:00, 21.33it/s]


{'ner': 0.00026197582790653343}


100%|██████████| 3/3 [00:00<00:00, 21.80it/s]


{'ner': 0.0003894786458399904}


100%|██████████| 3/3 [00:00<00:00, 22.28it/s]


{'ner': 3.4010406020859926e-05}


100%|██████████| 3/3 [00:00<00:00, 21.65it/s]


{'ner': 1.9612036935329582e-05}


100%|██████████| 3/3 [00:00<00:00, 22.11it/s]


{'ner': 0.004094531692732815}


100%|██████████| 3/3 [00:00<00:00, 19.72it/s]


{'ner': 3.1664290765182284e-07}


100%|██████████| 3/3 [00:00<00:00, 21.91it/s]


{'ner': 7.285047079350139e-06}


100%|██████████| 3/3 [00:00<00:00, 19.66it/s]


{'ner': 2.394377973120872e-07}


100%|██████████| 3/3 [00:00<00:00, 18.12it/s]


{'ner': 0.00022465953246274834}


100%|██████████| 3/3 [00:00<00:00, 22.14it/s]


{'ner': 1.0863004763571723e-06}


100%|██████████| 3/3 [00:00<00:00, 22.74it/s]


{'ner': 0.0023946468426480406}


100%|██████████| 3/3 [00:00<00:00, 20.35it/s]


{'ner': 6.169837382418367e-06}


100%|██████████| 3/3 [00:00<00:00, 22.12it/s]


{'ner': 0.00030678138916277324}


100%|██████████| 3/3 [00:00<00:00, 23.14it/s]


{'ner': 0.00022935201453786304}


100%|██████████| 3/3 [00:00<00:00, 18.57it/s]


{'ner': 6.255226670428841e-06}


100%|██████████| 3/3 [00:00<00:00, 18.99it/s]


{'ner': 4.085394059302123e-08}


100%|██████████| 3/3 [00:00<00:00, 19.44it/s]


{'ner': 6.995940536268303e-07}


100%|██████████| 3/3 [00:00<00:00, 20.89it/s]


{'ner': 4.706886355837702e-07}


100%|██████████| 3/3 [00:00<00:00, 20.32it/s]


{'ner': 0.011415514144148941}


100%|██████████| 3/3 [00:00<00:00, 21.33it/s]


{'ner': 5.458422404451642e-08}


100%|██████████| 3/3 [00:00<00:00, 17.70it/s]


{'ner': 2.5626111289965546e-08}


100%|██████████| 3/3 [00:00<00:00,  9.47it/s]


{'ner': 0.0005705031495488346}


100%|██████████| 3/3 [00:00<00:00, 13.14it/s]


{'ner': 3.657292176990035e-08}


100%|██████████| 3/3 [00:00<00:00, 16.35it/s]


{'ner': 5.172763367355009e-06}


100%|██████████| 3/3 [00:00<00:00, 17.49it/s]


{'ner': 8.243823683565664e-08}


100%|██████████| 3/3 [00:00<00:00, 16.80it/s]


{'ner': 4.928377747025868e-07}


100%|██████████| 3/3 [00:00<00:00, 17.49it/s]


{'ner': 8.718774975073686e-09}


100%|██████████| 3/3 [00:00<00:00, 16.90it/s]


{'ner': 1.1960221041722968e-05}


100%|██████████| 3/3 [00:00<00:00, 20.32it/s]


{'ner': 2.9751551858409105e-05}


100%|██████████| 3/3 [00:00<00:00, 19.16it/s]


{'ner': 2.96942204058517e-06}


100%|██████████| 3/3 [00:00<00:00, 21.49it/s]


{'ner': 0.0016165699260966425}


100%|██████████| 3/3 [00:00<00:00, 21.64it/s]


{'ner': 4.713544226093801e-10}


100%|██████████| 3/3 [00:00<00:00, 22.62it/s]


{'ner': 0.0031288532863410316}


100%|██████████| 3/3 [00:00<00:00, 19.92it/s]


{'ner': 3.34105816504464e-05}


100%|██████████| 3/3 [00:00<00:00, 15.51it/s]


{'ner': 5.541132249760118e-10}


100%|██████████| 3/3 [00:00<00:00, 14.97it/s]


{'ner': 3.6742865249447716e-06}


100%|██████████| 3/3 [00:00<00:00, 15.43it/s]


{'ner': 1.8795149241263365e-05}


100%|██████████| 3/3 [00:00<00:00, 12.23it/s]


{'ner': 2.7214211207259498e-09}


## Test the trained model

In [12]:
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Entities [('Kamal Khumar', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Kamal', 'PERSON', 3), ('Khumar', 'PERSON', 1), ('?', '', 2)]
Entities [('Nishanth', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Nishanth', 'PERSON', 3), ('?', '', 2)]
Entities [('London', 'LOC'), ('Berlin', 'LOC')]
Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]


## Save the model

In [16]:
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)        

Saved model to C:\Users\nithi\Documents\ner


## Test the saved model

In [14]:
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
for text, _ in TRAIN_DATA:
    doc = nlp2(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Loading from C:\Users\nithi\Documents\ner
Entities [('Kamal Khumar', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Kamal', 'PERSON', 3), ('Khumar', 'PERSON', 1), ('?', '', 2)]
Entities [('Nishanth', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Nishanth', 'PERSON', 3), ('?', '', 2)]
Entities [('London', 'LOC'), ('Berlin', 'LOC')]
Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]
