In [0]:
from __future__ import unicode_literals, print_function
import pickle
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
from spacy.gold import GoldParse
from spacy.scorer import Scorer

In [0]:
# Tag inventory that the training cell registers on the NER component via
# ner.add_label(). The order of the entries is unchanged.
LABEL = [
    '<unk>',
    'O',
    'B-TimeAttributes',
    'I-TimeAttributes',
    'B-Diseases',
    'I-Diseases',
    'B-SectionHeader',
    'I-SectionHeader',
    'B-Procedure',
    'I-Procedure',
    'B-StatusCode',
    'B-LabTests',
    'B-Medicines',
    'I-Medicines',
    'B-Observations',
    'I-Observations',
    'I-LabTests',
    'B-BodyParts',
    'I-BodyParts',
    'I-StatusCode',
    '<START>',
    '<STOP>',
]

In [0]:
# Load the pickled training examples from the working directory.
# NOTE(review): pickle.load can execute arbitrary code -- only open files
# from a trusted source.
# Presumably a list of (text, annotations) pairs, since the training loop
# shuffles it in place and unzips each batch into two tuples -- confirm.
with open('spacy_ner_train', 'rb') as train_file:
    TRAIN_DATA = pickle.load(train_file)

In [13]:
# Show how many training examples were loaded.
len(TRAIN_DATA)

6371

In [19]:
# Run configuration read by the rest of this cell.
model = None                  # passed to spacy.load(); None -> start from a blank 'en' pipeline
new_model_name = 'new_model'  # written to nlp.meta['name'] before saving
output_dir = 'content/'       # where the trained pipeline is saved; None skips saving
n_iter = 20                   # number of passes over TRAIN_DATA


"""Setting up the pipeline and entity recognizer, and training the new entity."""
# Build the Language object: a blank English pipeline when no base model is
# configured, otherwise the named/pathed model loaded from disk.
if model is None:
    nlp = spacy.blank('en')
    print("Created blank 'en' model")
else:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)

# Reuse the pipeline's NER component when it has one; otherwise create it and
# append it to the pipeline.
if 'ner' in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)

# Register every tag in LABEL with the entity recognizer.
for label_name in LABEL:
    ner.add_label(label_name)

# A blank pipeline gets its optimizer from begin_training(); a loaded one gets
# a new optimizer from its existing entity recognizer.
optimizer = nlp.begin_training() if model is None else nlp.entity.create_optimizer()

# Disable every pipeline component except 'ner' for the duration of training,
# so only the entity recognizer is updated.
other_pipes = [name for name in nlp.pipe_names if name != 'ner']
with nlp.disable_pipes(*other_pipes):
    for epoch in range(n_iter):
        # Reorder the examples in place before each pass.
        random.shuffle(TRAIN_DATA)
        # Loss dict handed to nlp.update(); it is reset every pass and printed
        # once the pass is finished.
        losses = {}
        # Batch-size schedule, recreated per pass. Per the spaCy docs this
        # yields sizes compounding from 4 up to a cap of 32 -- confirm
        # against the installed version.
        batch_sizes = compounding(4., 32., 1.001)
        for batch in minibatch(TRAIN_DATA, size=batch_sizes):
            # Split the batch's pairs into parallel tuples.
            texts, annotations = zip(*batch)
            nlp.update(
                texts,
                annotations,
                sgd=optimizer,
                drop=0.35,
                losses=losses,
            )
        print('Losses', losses)

# Smoke-test the in-memory pipeline: run it on one sentence and print the
# label and text of each entity it produced.
test_text = 'He was struck by H5N1 virus in 2006.'
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for span in doc.ents:
    print(span.label_, span.text)

# Save the trained pipeline to `output_dir` (skipped when it is None), then
# reload it from disk and rerun the smoke-test sentence to confirm the saved
# copy produces entities as well.
if output_dir is not None:
    output_dir = Path(output_dir)
    # parents=True creates missing intermediate directories, so a nested
    # output path no longer raises FileNotFoundError; exist_ok=True replaces
    # the separate exists() check and its check-then-create race.
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.meta['name'] = new_model_name  # rename model
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # Test the saved model
    print("Loading from", output_dir)
    nlp2 = spacy.load(output_dir)
    doc2 = nlp2(test_text)
    for ent in doc2.ents:
        print(ent.label_, ent.text)

Created blank 'en' model
Losses {'ner': 36616.04152476229}
Losses {'ner': 26494.45594771442}
Losses {'ner': 22903.074168548068}
Losses {'ner': 21131.74600276307}
Losses {'ner': 19658.92136406795}
Losses {'ner': 18960.29967912492}
Losses {'ner': 17792.131396850295}
Losses {'ner': 17269.929143589387}
Losses {'ner': 16526.044357932216}
Losses {'ner': 16196.33894905938}
Losses {'ner': 15514.881357138545}
Losses {'ner': 15105.317374693452}
Losses {'ner': 14825.244452329564}
Losses {'ner': 14739.465681111722}
Losses {'ner': 14082.12670323226}
Losses {'ner': 13792.736044179926}
Losses {'ner': 13449.626141550923}
Losses {'ner': 13201.730488527766}
Losses {'ner': 12979.198371092802}
Losses {'ner': 12725.564523932386}
Entities in 'He was struck by H5N1 virus in 2006.'
B-TimeAttributes 2006.
Saved model to content
Loading from content
B-TimeAttributes 2006.


In [0]:
# Load the pickled evaluation examples from the working directory.
# NOTE(review): pickle.load can execute arbitrary code -- only open files
# from a trusted source.
with open('spacy_ner_test', 'rb') as test_file:
    test_data = pickle.load(test_file)

In [0]:
def evaluate(ner_model, examples):
    """Score ``ner_model`` against gold entity annotations.

    Args:
        ner_model: Pipeline object. It must be callable on a text and
            provide ``make_doc(text)``.
        examples: Iterable of ``(text, annotations)`` pairs, where
            ``annotations['entities']`` holds the gold entities for the text.

    Returns:
        The ``scores`` attribute of the ``Scorer`` once every example has
        been passed to ``Scorer.score``.
    """
    scorer = Scorer()
    for text, annotations in examples:
        gold_entities = annotations['entities']
        # The gold parse is built on a doc from make_doc(), separate from
        # the doc returned by the full pipeline call below.
        gold_doc = ner_model.make_doc(text)
        gold = GoldParse(gold_doc, entities=gold_entities)
        predicted_doc = ner_model(text)
        scorer.score(predicted_doc, gold)
    return scorer.scores

In [0]:
# Write the in-memory pipeline to the "ner_model" directory; the evaluation
# cell reloads it from that path.
nlp.to_disk("ner_model")

In [0]:
# Reload the pipeline saved under "ner_model" and score it on test_data.
ner_model = spacy.load("ner_model") # for spaCy's pretrained use 'en_core_web_sm'
results = evaluate(ner_model, test_data)  # the Scorer's scores, as returned by evaluate()

In [69]:
# Display the scores returned by evaluate().
results

{'ents_f': 75.97616111796135,
 'ents_p': 74.55132083081266,
 'ents_per_type': {'B-BodyParts': {'f': 86.7924528301887,
   'p': 100.0,
   'r': 76.66666666666667},
  'B-Diseases': {'f': 75.71701720841301,
   'p': 71.39423076923077,
   'r': 80.59701492537313},
  'B-LabTests': {'f': 82.02443280977313,
   'p': 80.2047781569966,
   'r': 83.92857142857143},
  'B-Medicines': {'f': 79.15690866510539,
   'p': 82.84313725490196,
   'r': 75.7847533632287},
  'B-Observations': {'f': 23.91304347826087,
   'p': 44.0,
   'r': 16.417910447761194},
  'B-Procedure': {'f': 66.66666666666666,
   'p': 65.29411764705883,
   'r': 68.09815950920245},
  'B-SectionHeader': {'f': 91.8918918918919,
   'p': 91.8918918918919,
   'r': 91.8918918918919},
  'B-StatusCode': {'f': 57.14285714285715, 'p': 66.66666666666666, 'r': 50.0},
  'B-TimeAttributes': {'f': 75.0316856780735,
   'p': 70.64439140811456,
   'r': 80.0},
  'I-BodyParts': {'f': 33.33333333333333, 'p': 50.0, 'r': 25.0},
  'I-Diseases': {'f': 78.979771328056