# <font size="10">Custom entity recognition </font>
## Model environment setup

This notebook contains test code to train the implemented model in the generated training data outputted from
the [ner-train notebook located here](./ner-train-note.ipynb).

For simple loop model training (old), go [here](## Run training - using simple training loop from blank -- Old)

## Setup

In [1]:
!pip install -U spacy
!pip install spacy[lookups]


Requirement already up-to-date: spacy in /usr/local/lib/python3.6/dist-packages (2.3.2)


In [2]:
!python -m spacy download en

!python -m spacy download en_core_web_md
!python -m spacy validate

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')
[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation: /usr/local/lib/python3.6/dist-packages/spacy[0m

TYPE      NAME             MODEL            VERSION                            
package   en-core-web-sm   en_core_web_sm   [38;5;2m2.3.1[0m   [38;5;2m✔[0m
package   en-core-web-md   en_core_web_md   [38;5;2m2.3.1[0m   [38;5;2m✔[0m
link      en               en_core_web_sm   [38;5;2m2.3.1[0m   [38;5;2m✔[0m



In [3]:
import pandas as pd
import random
import warnings
import json
import ast
import datetime as dt
from pathlib import Path
import os
import glob

##SpaCy

import en_core_web_sm
import en_core_web_md

import spacy
from spacy import displacy
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
from spacy.pipeline import Sentencizer
from spacy.lemmatizer import Lemmatizer, ADJ, NOUN, VERB
from spacy.util import minibatch, compounding
from spacy.util import decaying
from spacy.pipeline import Tagger
from spacy.pipeline import DependencyParser




# For Colab

In [4]:
#mount drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Import training data

lets import the training data we generated:

In [5]:
train_path = r'/content/drive/My Drive/Colab Notebooks/nlp-ner-sustain-notebook/train/'

csvs = glob.glob(train_path + "/*.csv")


print('Filepath is :',(csvs))

train_list = []

for filename in csvs:
    df = pd.read_csv(filename, index_col=None, header=0)
    train_list.append(df)


Filepath is : ['/content/drive/My Drive/Colab Notebooks/nlp-ner-sustain-notebook/train/trainset0.csv', '/content/drive/My Drive/Colab Notebooks/nlp-ner-sustain-notebook/train/trainset1.csv', '/content/drive/My Drive/Colab Notebooks/nlp-ner-sustain-notebook/train/trainset2.csv']


In [6]:
DATA = pd.concat(train_list, axis=0, ignore_index=True)
DATA = DATA.drop_duplicates()
DATA[0:10]
print(len(DATA))

2101


In [7]:
#convert to list for model intake
TRAIN_DATA = DATA.values.tolist()

#for element in index 1 convert string (Entity position) to dictionary to be able to read by the model function
for position in TRAIN_DATA:
    position[1]=ast.literal_eval(position[1])
    
#Check our input list
print(TRAIN_DATA[0:2])

[['With increasing severe weather events and disasters \ntriggering greater numbers of costly power outages, there is a growing interest in generators for \nreliable backup power.', {'entities': [(161, 173, 'SUSTECH')]}], ['$3,153 \n\nNPC of Backup Power per Unit ($/kW)', {'entities': [(16, 28, 'SUSTECH')]}]]


## Run a test before training
### Test existing default spacy model 

In [8]:
nlp = en_core_web_md.load()
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7fdd7349df60>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7fdd74303e88>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fdd7462c888>)]

In [9]:
doc = nlp('Here is a green roof on this house. A green roof is good.')
displacy.render(doc, style="ent")
# verified green roof does not match an entity in the NER



'<div class="entities" style="line-height: 2.5; direction: ltr">Here is a green roof on this house. A green roof is good.</div>'

## TRAINING THE MODEL
## Train model setup

### Define compounding batch size

In [10]:
def get_batches(train_data, model_type):
    max_batch_sizes = {"tagger": 32, "parser": 16, "ner": 16, "textcat": 64}
    max_batch_size = max_batch_sizes[model_type]
    if len(train_data) < 1000:
        max_batch_size /= 2
    if len(train_data) < 500:
        max_batch_size /= 2
    batch_size = compounding(1, max_batch_size, 1.001)
    batches = minibatch(train_data, size=batch_size)
    return batches

In [11]:
#New training model loop to either accept existing model, if not model is not defined then create a blank nlp model using english vocab

def train_model(**model_params):

    model = model_params['model']
    iterations = model_params['iterations']
    train_data = model_params['train_data']
    dropout = model_params['dropout']
    
    
    random.seed(0)
    
    if model is not None:
        nlp = model.load() #load existing spacy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")
        #lang = "en"
        #cls = spacy.util.get_lang_class(lang)   # 1. Get Language instance, e.g. English()
        #nlp = cls() 
        print("Created blank 'en' model")
    
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise,get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    #ner.add_label(LABEL)  # add new entity label to entity recognizer
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
            #print(ent[2])

    # Adding extraneous labels shouldn't mess anything up
    ner.add_label("VEGETABLE")
    move_names = list(ner.move_names)
    # get names of other pipes to disable them during training
    
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    
    # only train NER
    
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        sizes=compounding(4.0, 32.0, 1.001)
        # batch up the examples using spaCy's minibatch
                  
        if model is None:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.resume_training()

        # reset and initialize the weights randomly – but only if we're
        # training a new model
        for itn in range(iterations):
            random.shuffle(train_data)
            #batches = get_batches(TRAIN_DATA, 'ner') resulted in poor loss
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, 
                           annotations,
                           sgd=optimizer, 
                           drop = 0.1, 
                           losses=losses)
            print(f"Losses at iteration {itn} - {dt.datetime.now()} {losses}")
    
    print('Model training completed')
    return nlp

In [12]:
model_params = {
    'model': None,
    'iterations': 30,
    'train_data': TRAIN_DATA,
    'dropout': decaying(0.6, 0.2, 1e-4)
}

In [13]:
nlp = train_model(**model_params)

Created blank 'en' model
Losses at iteration 0 - 2020-10-29 01:02:38.173322 {'ner': 977.3275402284257}
Losses at iteration 1 - 2020-10-29 01:03:01.085662 {'ner': 64.18739407680947}
Losses at iteration 2 - 2020-10-29 01:03:23.086181 {'ner': 48.141407746686156}
Losses at iteration 3 - 2020-10-29 01:03:47.210654 {'ner': 823.6130996842343}
Losses at iteration 4 - 2020-10-29 01:04:12.147487 {'ner': 597.8629093681793}
Losses at iteration 5 - 2020-10-29 01:04:36.772256 {'ner': 48.08922797004742}
Losses at iteration 6 - 2020-10-29 01:05:01.359702 {'ner': 75.93506657107862}
Losses at iteration 7 - 2020-10-29 01:05:26.126669 {'ner': 24.702418292899008}
Losses at iteration 8 - 2020-10-29 01:05:52.067602 {'ner': 24.269425496126562}
Losses at iteration 9 - 2020-10-29 01:06:16.940329 {'ner': 16.38765447758055}
Losses at iteration 10 - 2020-10-29 01:06:45.743376 {'ner': 26.064786924687056}
Losses at iteration 11 - 2020-10-29 01:07:16.866072 {'ner': 10.721990758914256}
Losses at iteration 12 - 2020-10

## Test the trained model

In [14]:
nlp.pipeline

[('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fdd7041be28>)]

In [15]:

#tagger = Tagger(nlp.vocab)
#parser = DependencyParser(nlp.vocab)



In [16]:
nlp.pipeline

[('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fdd7041be28>)]

In [17]:
doc = nlp('Here is a green roof on this house. A green roof is good. water piping, I have alot of battery packs')


In [18]:
displacy.render(doc, style="ent")

'<div class="entities" style="line-height: 2.5; direction: ltr">Here is a \n<mark class="entity" style="background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    green roof\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">SUSTECH</span>\n</mark>\n on this house. A \n<mark class="entity" style="background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    green roof\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">SUSTECH</span>\n</mark>\n is good. water piping, I have alot of battery packs</div>'

## Save model for testing

In [19]:
output_dir = r'/content/drive/My Drive/Colab Notebooks/nlp-ner-sustain-notebook/model'

if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to /content/drive/My Drive/Colab Notebooks/nlp-ner-sustain-notebook/model


## Loading and testing the saved model

In [20]:
x = ['i am a green roof']
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)

for text in x:
    doc = nlp2(text)
    print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
    print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Loading from /content/drive/My Drive/Colab Notebooks/nlp-ner-sustain-notebook/model
Entities [('green roof', 'SUSTECH')]
Tokens [('i', '', 2), ('am', '', 2), ('a', '', 2), ('green', 'SUSTECH', 3), ('roof', 'SUSTECH', 1)]
