# <font size="10">Custom entity recognition </font>
## Model environment setup

This notebook contains test code that trains the implemented model on the training data generated by
the [ner-train notebook located here](./ner-train-note.ipynb).

For the old simple-loop model training, go [here](#Run-training---using-simple-training-loop-from-blank----Old).

## Setup

In [1]:
import pandas as pd
import random
import warnings
import json
import ast
import datetime as dt
from pathlib import Path
##SpaCy

import en_core_web_sm
import spacy
from spacy import displacy
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
from spacy.pipeline import Sentencizer
from spacy.lemmatizer import Lemmatizer, ADJ, NOUN, VERB
from spacy.util import minibatch, compounding


## Import training data

Let's import the training data we generated:

In [2]:
import os

# Relative location of the generated training CSV.
train_path = r'./train/train.csv'
print('Filepath is :', os.path.join(train_path))

Filepath is : ./train/train.csv


In [3]:
# Read the training CSV into a DataFrame and preview its first ten rows.
DATA = pd.read_csv(train_path)
DATA.head(10)

Unnamed: 0,text,position
0,What are the advantages of including a green r...,"{'entities': [(39, 49, 'SUSTECH')]}"
1,"Lu, J.; Yuan, J.; Yang, J.; Yang, Z. Responses...","{'entities': [(122, 132, 'SUSTECH')]}"
2,"In total, 135 residences and businesses applie...","{'entities': [(56, 66, 'SUSTECH')]}"
3,The substrates consist mainly of mineral mater...,"{'entities': [(126, 136, 'SUSTECH')]}"
4,8\n\nF\nO\nO\nR\n\n \n\nN\nE\nE\nR\nG\n\n \n\...,"{'entities': [(150, 160, 'SUSTECH')]}"
5,"[CrossRef]\nSpeak, A.F.; Rothwell, J.J.; Lindl...","{'entities': [(124, 134, 'SUSTECH')]}"
6,"In addition, it should be considered that the ...","{'entities': [(135, 145, 'SUSTECH')]}"
7,"Until then, there are many niche opportunities...","{'entities': [(60, 70, 'SUSTECH')]}"
8,If a green roof is part of the initial design ...,"{'entities': [(5, 15, 'SUSTECH')]}"
9,Some of the \nnew membranes developed specific...,"{'entities': [(54, 64, 'SUSTECH')]}"


In [4]:
# Convert to a list of [text, annotations] rows for model intake. The two
# columns are selected by name so that an extra column in the CSV (for example
# a saved index) cannot shift the annotations away from position 1.
TRAIN_DATA = DATA[['text', 'position']].values.tolist()

# The annotations are stored in the CSV as the string form of a dict; parse
# each one back into a real dict so the training function can read it.
for row in TRAIN_DATA:
    row[1] = ast.literal_eval(row[1])

# Check our input list
print(TRAIN_DATA[0:2])

[['What are the advantages of including a green roof in the design phase of a project,\n\nrather than in a retrofit situation?', {'entities': [(39, 49, 'SUSTECH')]}], ['Lu, J.; Yuan, J.; Yang, J.; Yang, Z. Responses of morphology and drought tolerance of Sedum lineare to\nwatering regime in green roof system: A root perspective.', {'entities': [(122, 132, 'SUSTECH')]}]]


## Run a test before training
### Test existing default spacy model 

In [5]:
# Load the stock en_core_web_sm model to use as a pre-training baseline.
nlp = spacy.load("en_core_web_sm")
# Show which pipeline components the loaded model contains.
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x164ab1da340>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x164aaf12100>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x164aaf12b80>)]

In [6]:
# Run the stock model on a sample sentence that mentions the target phrase twice
# and render whatever entities it finds.
doc = nlp('Here is a green roof on this house. A green roof is good.')
displacy.render(doc, style="ent")
# Verified: "green roof" does not match any entity in the default NER.



## TRAINING THE MODEL
## Train model setup

In [11]:
# Legacy helper kept for reference only: the whole cell is a string literal,
# so nothing below is executed and blank_nlp_model is not defined by this cell.
'''# setup our new label
LABEL = 'SUSTECH'

# create blank nlp model
def blank_nlp_model(train_data):
    nlp = spacy.blank("en")
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
    ner = nlp.get_pipe("ner")
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    return nlp'''


'# setup our new label\nLABEL = \'SUSTECH\'\n\n# create blank nlp model\ndef blank_nlp_model(train_data):\n    nlp = spacy.blank("en")\n    ner = nlp.create_pipe("ner")\n    nlp.add_pipe(ner, last=True)\n    ner = nlp.get_pipe("ner")\n    for _, annotations in train_data:\n        for ent in annotations.get("entities"):\n            ner.add_label(ent[2])\n    return nlp'

## Run training - using simple training loop from blank -- Old

In [13]:
# Legacy simple training loop kept for reference only: the cell is a string
# literal and is not executed. NOTE(review): as written the snippet reads
# `model`, which it never assigns, so it would need that defined before reuse.
'''if model is None:
    nlp = blank_nlp_model(TRAIN_DATA)
    optimizer = nlp.begin_training()
for i in range(20):
    losses = {}
    batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
            texts,  # batch of texts
            annotations,  # batch of annotations
            drop=0.1,  # dropout - make it harder to memorise data
            losses=losses,
        )
    print(f"Losses at iteration {i} - {dt.datetime.now()} {losses}")'''

'if model is None:\n    nlp = blank_nlp_model(TRAIN_DATA)\n    optimizer = nlp.begin_training()\nfor i in range(20):\n    losses = {}\n    batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))\n    for batch in batches:\n        texts, annotations = zip(*batch)\n        nlp.update(\n            texts,  # batch of texts\n            annotations,  # batch of annotations\n            drop=0.1,  # dropout - make it harder to memorise data\n            losses=losses,\n        )\n    print(f"Losses at iteration {i} - {dt.datetime.now()} {losses}")'

## Run training - performance enhancements -- Old

In [15]:
# Legacy call kept for reference only: a string literal, so no model is created here.
'''nlp = blank_nlp_model(TRAIN_DATA)'''

'nlp = blank_nlp_model(TRAIN_DATA)'

In [16]:
# Legacy inspection kept for reference only: a string literal, so the pipeline is not queried here.
'''nlp.pipeline'''

[('tagger', <spacy.pipeline.pipes.Tagger at 0x1dfbe73ff10>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x1dfbe39b400>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1dfbe39b1c0>)]

In [17]:
# Legacy training loop kept for reference only: the cell is a string literal
# and is not executed.
'''optimizer = nlp.begin_training()
for i in range(20):
    losses = {}
    batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
            texts,  # batch of texts
            annotations,  # batch of annotations
            drop=0.1,  # dropout - make it harder to memorise data
            losses=losses,
        )
    print(f"Losses at iteration {i} - {dt.datetime.now()} {losses}")'''

'optimizer = nlp.begin_training()\nfor i in range(20):\n    losses = {}\n    batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))\n    for batch in batches:\n        texts, annotations = zip(*batch)\n        nlp.update(\n            texts,  # batch of texts\n            annotations,  # batch of annotations\n            drop=0.1,  # dropout - make it harder to memorise data\n            losses=losses,\n        )\n    print(f"Losses at iteration {i} - {dt.datetime.now()} {losses}")'

# Model Training

In [13]:
# Training loop that either loads an existing model or, if no model is given, creates a blank NLP model using the English vocab

def train_model(**model_params):
    """Train (or continue training) a spaCy NER pipe on annotated examples.

    Keyword Args:
        model: Name or path handed to ``spacy.load``. ``None`` (also the
            default when the key is omitted) starts from a blank English
            pipeline.
        iterations: Number of passes over the training data.
        train_data: Sequence of ``(text, annotations)`` pairs, where
            ``annotations["entities"]`` is a list of ``(start, end, label)``
            tuples. The caller's sequence is left in its original order.

    Returns:
        The ``nlp`` pipeline object after training.

    Raises:
        KeyError: If ``iterations`` or ``train_data`` is not supplied.
    """
    model = model_params.get('model')
    iterations = model_params['iterations']
    # Shuffle a shallow copy during training so the caller's list (e.g. the
    # notebook-level TRAIN_DATA) is not reordered as a side effect.
    train_data = list(model_params['train_data'])

    # Seed Python's RNG so the per-iteration shuffle order is repeatable.
    random.seed(0)

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")
        print("Created blank 'en' model")

    # Create the NER pipe if the pipeline lacks one; otherwise fetch the
    # existing pipe so labels can be added to it.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Register every label that occurs in the annotations. Examples without
    # an "entities" key simply contribute no labels.
    for _, annotations in train_data:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])

    # Adding extraneous labels shouldn't mess anything up.
    # NOTE(review): "VEGETABLE" never appears in the training data shown in
    # this notebook; kept so the trained model's label set is unchanged.
    ner.add_label("VEGETABLE")

    # Names of the pipes to disable so that only NER is trained.
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # Show warnings for misaligned entity spans once.
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # One shared batch-size generator: sizes keep compounding across
        # iterations rather than restarting at 4.0 each pass.
        sizes = compounding(4.0, 32.0, 1.001)

        # Start with a fresh optimizer only when training a new model;
        # an existing model resumes training instead.
        if model is None:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.resume_training()

        for itn in range(iterations):
            random.shuffle(train_data)
            # Batch up the examples using spaCy's minibatch.
            batches = minibatch(train_data, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer, drop=0.1,
                           losses=losses)
            print(f"Losses at iteration {itn} - {dt.datetime.now()} {losses}")

    print('Model training completed')
    return nlp

In [14]:
# Keyword arguments that are unpacked into train_model.
model_params = dict(
    model=None,  # None -> train_model starts from a blank 'en' pipeline
    iterations=20,
    train_data=TRAIN_DATA,
)

In [15]:
# Train with the parameters defined above; this rebinds `nlp` to the trained pipeline.
nlp = train_model(**model_params)

Created blank 'en' model


  proc.begin_training(


Losses at iteration 0 - 2020-10-20 15:06:50.747508 {'ner': 1002.4988477857921}
Losses at iteration 1 - 2020-10-20 15:06:56.844070 {'ner': 0.3382388563186799}
Losses at iteration 2 - 2020-10-20 15:07:03.009030 {'ner': 2.314843076717813e-06}
Losses at iteration 3 - 2020-10-20 15:07:08.145071 {'ner': 1.4389402114935229e-06}
Losses at iteration 4 - 2020-10-20 15:07:13.444033 {'ner': 6.970059375317793e-08}
Losses at iteration 5 - 2020-10-20 15:07:18.763029 {'ner': 4.415401743625468e-09}
Losses at iteration 6 - 2020-10-20 15:07:23.210071 {'ner': 1.0562531943584313e-09}
Losses at iteration 7 - 2020-10-20 15:07:28.005063 {'ner': 2.4602129852783327e-08}
Losses at iteration 8 - 2020-10-20 15:07:32.702033 {'ner': 1.4264602461743666e-08}
Losses at iteration 9 - 2020-10-20 15:07:36.751033 {'ner': 1.4091154084281965e-08}
Losses at iteration 10 - 2020-10-20 15:07:40.465065 {'ner': 2.3211939862835905e-09}
Losses at iteration 11 - 2020-10-20 15:07:44.450028 {'ner': 2.7417796637739753e-09}
Losses at ite

## Test the trained model

In [16]:
# Inspect which components the trained pipeline contains.
nlp.pipeline

[('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x164aada3c40>)]

In [17]:
# Run the trained model on the same sample sentence used for the pre-training check.
doc = nlp('Here is a green roof on this house. A green roof is good.')


In [18]:
# Render the entities found in the sample sentence.
displacy.render(doc, style="ent")

## Save model for testing

In [19]:
# Directory the trained pipeline is written to; set to None to skip saving.
output_dir = r'./model'

if output_dir is not None:
    output_dir = Path(output_dir)
    # Create the directory together with any missing parents; an existing
    # directory is not an error.
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to model


## Loading and testing the saved model

In [22]:
# Same directory the trained pipeline was saved to in the save step.
output_dir = r'./model'
# Sample sentences to run through the reloaded model.
x = ['i am a green roof']
print("Loading from", output_dir)
# Load from disk into a separate variable so the check uses the saved model,
# not the in-memory `nlp`.
nlp2 = spacy.load(output_dir)
for text in x:
    doc = nlp2(text)
    # Entity spans with their labels.
    print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
    # Per-token text, entity type and integer IOB code.
    print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Loading from ./model
Entities [('green roof', 'SUSTECH')]
Tokens [('i', '', 2), ('am', '', 2), ('a', '', 2), ('green', 'SUSTECH', 3), ('roof', 'SUSTECH', 1)]
