# Entities Extractor

In [1]:
import pandas as pd
import spacy
import json
import re
import random
from spacy.util import minibatch, compounding
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from sklearn.model_selection import train_test_split
from ipymarkup import show_box_markup
from ipymarkup.palette import palette, BLUE, RED, GREEN, ORANGE, PURPLE, BROWN

### Dataset Preparation

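Load the dataset, split it 70:30 into training and test sets, and convert each split to the training-data format required by spaCy: a list of `(text, {'entities': [(start, end, label), ...]})` tuples. See https://spacy.io/usage/training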

In [2]:
# load dataset: the 'text_intent' column is expanded into one column per
# field (the 'text' and 'intent' columns are the ones used below)
df_intents = pd.read_json('intents.json')
df_intents = df_intents['text_intent'].apply(pd.Series)

# Read the entity definitions explicitly as UTF-8 instead of relying on the
# platform's default encoding, which differs between operating systems.
with open('entities.json', encoding='utf-8') as json_file:
    json_entities = json.load(json_file)

# split dataset 70:30; the fixed random_state keeps the split reproducible
train_x, test_x, train_y, test_y = train_test_split(df_intents['text'], df_intents['intent'], random_state=2, test_size=0.30)

def convert_to_spacy_format(df_intents_text, json_entities):
    '''Convert texts to the NER training format used by spaCy.

    See https://spacy.io/usage/training

    Parameters
    ----------
    df_intents_text : pandas.Series
        The texts to annotate, one string per element.
    json_entities : dict
        Maps each entity label to an iterable of patterns. Every pattern is
        lower-cased and applied with ``re.finditer`` to the lower-cased
        text, so patterns are interpreted as regular expressions.

    Returns
    -------
    list of tuple
        One ``(text, {'entities': [(start, end, label), ...]})`` tuple per
        input text, in input order. ``start``/``end`` are the match offsets
        in the lower-cased text.
    '''
    formatted_json = []
    # Series.items() is the supported spelling; iteritems() was removed in
    # pandas 2.0 and raises AttributeError there.
    for _, text in df_intents_text.items():
        lowered = text.lower()  # lower-case once per text, not once per pattern
        spans = []
        for entity, entity_values in json_entities.items():
            for entity_val in entity_values:
                for m in re.finditer(entity_val.lower(), lowered):
                    spans.append((m.start(), m.end(), entity))
        formatted_json.append((text, {'entities': spans}))
    return formatted_json

# Put the training split first and the test split second through the same
# converter, so both end up in spaCy's (text, annotations) layout.
formatted_train_x, formatted_test_x = (
    convert_to_spacy_format(split, json_entities) for split in (train_x, test_x)
)

### Setting Up Model

Load the pre-trained model if it exists (otherwise create a blank model). Add the new entity labels to the entity recognizer, then initialize the optimizer.

In [3]:
# load pre-trained model
try:
    model = spacy.load('en_core_web_sm')
except OSError:
    # Only a missing/unloadable model triggers the fallback; any other error
    # (e.g. a typo or an interrupted kernel) is no longer silently swallowed.
    print('the pre-trained model is not found')
    model = None

# Record how the pipeline was obtained *before* `model` is replaced by the
# blank fallback; the optimizer set-up below depends on it.
pretrained_loaded = model is not None

# setting up the pipeline and entity recognizer.
if pretrained_loaded:
    print("Loaded model '%s'" % model)
else:
    model = spacy.blank('en')  # create blank Language class
    print("Created blank 'en' model")

if 'ner' not in model.pipe_names:
    ner = model.create_pipe('ner')
    model.add_pipe(ner)
    print("Create new 'ner'")
else:
    ner = model.get_pipe('ner')
    print("Load existing 'ner'")

# add new entity labels to entity recognizer
for entity in json_entities:
    ner.add_label(entity)

# initializing optimizer
# The original check `model is None` could never be true at this point
# (and would have crashed on `None.begin_training()` if it were), so a blank
# pipeline never had its weights initialized.
if pretrained_loaded:
    # keep the pre-trained weights and only create an optimizer for them
    optimizer = model.entity.create_optimizer()
else:
    # a blank pipeline needs begin_training() to initialize its weights
    optimizer = model.begin_training()

Loaded model '<spacy.lang.en.English object at 0x000001BE457C4EB8>'
Load existing 'ner'


### Model Training

Shuffle the training data and iterate over it in mini-batches, calling `model.update` on each batch to make predictions and adjust the model's weights.

In [4]:
n_iter = 200
verbose = 0
# Collect every pipeline component that is not the entity recognizer so it
# can be switched off while the NER weights are updated.
other_pipes = [name for name in model.pipe_names if name != 'ner']
with model.disable_pipes(*other_pipes):
    for itn in range(n_iter):
        # start each pass with an empty loss record and a new example order
        losses = {}
        random.shuffle(formatted_train_x)
        # the batch-size generator is rebuilt on every pass
        batches = minibatch(formatted_train_x, size=compounding(4., 32., 1.001))
        for batch in batches:
            # separate the batch into parallel tuples of texts and annotations
            texts, entities = zip(*batch)
            model.update(texts, entities, sgd=optimizer, drop=0.35, losses=losses)
        if verbose > 0:
            print('Losses', losses)

### Model Evaluation

Evaluate the trained model on the test data, and print the predictions.

In [5]:
def show_markup(pred, text):
    '''Display `text` with the entities predicted in `pred` highlighted.

    Parameters
    ----------
    pred : spacy.tokens.Doc
        The processed document whose ``ents`` are to be shown.
    text : str
        The text to render. It is expected to be the same string `pred` was
        created from, because the highlighted spans are the character
        offsets stored on each predicted entity.

    Each entity is drawn at its own ``start_char``/``end_char`` offsets.
    Searching the text for ``ent.text`` with a regular expression, as done
    previously, treated regex metacharacters in the entity text as syntax,
    highlighted occurrences the model never predicted, and produced duplicate
    spans when the same entity text was predicted more than once.
    '''
    spans = [(ent.start_char, ent.end_char, ent.label_) for ent in pred.ents]
    show_box_markup(text, spans, palette=palette(PERSON=BLUE, TOPIC=RED, DATE=GREEN, DURATION=PURPLE, TIME=BROWN))

def evaluate(model, test):
    '''Score `model` on annotated examples and count gold entities per label.

    `test` is iterated as ``(text, annotations)`` pairs, where
    ``annotations['entities']`` holds the gold ``(start, end, label)``
    spans. For every example the prediction is displayed with
    `show_markup` and added to a `Scorer`.

    Returns a tuple ``(scores, supports)``: the scorer's ``scores`` and a
    dict mapping each gold label to the number of gold spans carrying it.
    '''
    scorer = Scorer()
    supports = {}
    for text, annotations in test:
        gold_doc = model.make_doc(text)
        gold_spans = annotations['entities']
        # tally how many gold spans exist for each label
        for span in gold_spans:
            label = span[2]
            if label in supports:
                supports[label] += 1
            else:
                supports[label] = 1
        gold = GoldParse(gold_doc, entities=gold_spans)
        predicted = model(text)
        show_markup(predicted, text)
        scorer.score(predicted, gold)
    return scorer.scores, supports

def print_score(scores, supports):
    '''Print per-entity and overall precision/recall/F1 as an aligned table.

    `scores` supplies ``scores['ents_per_type'][label]`` with keys ``'p'``,
    ``'r'`` and ``'f'``, plus the overall ``'ents_p'``, ``'ents_r'`` and
    ``'ents_f'`` values. `supports` maps each label to its support count;
    one row is printed per label in `supports`, followed by a totals row.
    '''
    # five right-aligned columns, each 12 characters wide
    row = '{:>12}' * 5
    print(row.format('', 'precision', 'recall', 'f1-score', 'support'))
    for label in supports:
        per_type = scores['ents_per_type'][label]
        cells = [round(per_type[key], 2) for key in ('p', 'r', 'f')]
        print(row.format(str(label), *cells, supports.get(label, 0)))
    totals = [round(scores[key], 2) for key in ('ents_p', 'ents_r', 'ents_f')]
    # the totals row is separated from the table body by a blank line
    print(('\n' + row).format('avg/total', *totals, sum(supports.values())))

# Score the trained model on the held-out examples (evaluate also displays
# each prediction), then print the precision/recall/F1 table.
scores, supports = evaluate(model, formatted_test_x)
print_score(scores, supports)

               precision      recall    f1-score     support
        DATE       100.0       81.82        90.0          11
       TOPIC       100.0       100.0       100.0           3
        TIME       100.0       66.67        80.0           6
    DURATION       66.67       66.67       66.67           4
      PERSON       85.71       100.0       92.31           9

   avg/total       92.31       82.76       87.27          33
