<a href="https://colab.research.google.com/github/sunc-dev/spaCY-ner-sustain/blob/main/ner-model-note.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# <font size="10">Custom entity recognition </font>
## Model environment setup

This notebook contains test code for training the custom NER model on the training data generated by
the [ner-train notebook located here](./ner-train-note.ipynb).

For the older, simple training-loop approach, go [here](#Run-training---using-simple-training-loop-from-blank----Old).

## Setup

In [None]:
import pandas as pd
import random
import warnings
import json
import ast
import datetime as dt
from pathlib import Path
import os
import glob

##SpaCy

import en_core_web_sm
import spacy
from spacy import displacy
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
from spacy.pipeline import Sentencizer
from spacy.lemmatizer import Lemmatizer, ADJ, NOUN, VERB
from spacy.util import minibatch, compounding
from spacy.util import decaying


# For Colab

In [None]:
# Mount Google Drive when running on Colab. The google.colab package only
# exists inside Colab, so the import is guarded to keep the notebook runnable
# (Restart & Run All) in a local Jupyter environment as well.
try:
    from google.colab import drive
except ImportError:
    print("Not running on Colab - skipping Google Drive mount")
else:
    drive.mount('/content/drive')

## Import training data

Let's import the training data we generated:

In [None]:
train_path = r'./train/'

# Collect every CSV in the training folder. The matches are sorted so the row
# order of the combined training set is the same on every run.
csvs = sorted(glob.glob(os.path.join(train_path, "*.csv")))
if not csvs:
    # Fail here with a clear message instead of later inside pd.concat
    raise FileNotFoundError(f"No CSV training files found in {train_path!r}")

print('Filepath is :', csvs)

# One DataFrame per CSV file; they are combined in a later cell
train_list = [
    pd.read_csv(filename, index_col=None, header=0) for filename in csvs
]


Filepath is : ./train/train.csv


In [None]:
# Stack all loaded frames into one and drop exact duplicate rows
DATA = pd.concat(train_list, axis=0, ignore_index=True).drop_duplicates()
print(len(DATA))

# Preview the first rows; this must be the last expression of the cell,
# otherwise the notebook does not display it
DATA[0:10]

Unnamed: 0,text,position
0,A Comparison of Fuel Choice for \nBackup Gener...,"{'entities': [(33, 50, 'SUSTECH')]}"
1,A Comparison of Fuel Choice for \nBackup Gener...,"{'entities': [(33, 50, 'SUSTECH')]}"
2,A Comparison of Fuel Choice for \nBackup Gener...,"{'entities': [(33, 50, 'SUSTECH')]}"
3,Businesses are either considering installing b...,"{'entities': [(45, 62, 'SUSTECH')]}"
4,This report discusses the costs and benefits o...,"{'entities': [(48, 64, 'SUSTECH')]}"
5,We discuss how to assign value to the reliabil...,"{'entities': [(101, 118, 'SUSTECH')]}"
6,"At the same time, backup generators are not \n...","{'entities': [(18, 35, 'SUSTECH')]}"
7,This makes backup generators best suited \nfor...,"{'entities': [(11, 28, 'SUSTECH')]}"
8,"Regions with \ncoincident peak charges, along ...","{'entities': [(161, 178, 'SUSTECH')]}"
9,connected natural gas backup generators to eco...,"{'entities': [(22, 39, 'SUSTECH')]}"


In [None]:
# Build the model input: one list per DataFrame row, with the second element
# (the entity positions, stored as a string) parsed into a Python dictionary
# so the training function can read it.
TRAIN_DATA = [
    [row[0], ast.literal_eval(row[1]), *row[2:]]
    for row in DATA.values.tolist()
]

# Check our input list
print(TRAIN_DATA[:2])

[['A Comparison of Fuel Choice for \nBackup Generators   \n\nSean Ericson and Dan Olis', {'entities': [(33, 50, 'SUSTECH')]}], ['A Comparison of Fuel Choice for \nBackup Generators   \n\nSean Ericson and Dan Olis', {'entities': [(33, 50, 'SUSTECH')]}]]


## Run a test before training
### Test the existing default spaCy model

In [None]:
# Load the stock "en_core_web_sm" spaCy model and list its pipeline components
nlp = spacy.load("en_core_web_sm")
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x229098b5820>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x229095267c0>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x22909526580>)]

In [None]:
# Run the stock pipeline on a sample sentence and render the entities it finds
doc = nlp('Here is a green roof on this house. A green roof is good.')
displacy.render(doc, style="ent")
# Verified: "green roof" does not match any entity in the stock NER



## TRAINING THE MODEL
## Train model setup

### Define compounding batch size

In [None]:
def get_batches(train_data, model_type):
    """Split ``train_data`` into minibatches with a compounding batch size.

    The maximum batch size is looked up by ``model_type`` ("tagger",
    "parser", "ner" or "textcat"), halved when there are fewer than 1000
    examples and halved again when there are fewer than 500. The batch size
    schedule is ``compounding(1, <maximum>, 1.001)``.

    Args:
        train_data: sized collection of training examples.
        model_type: key selecting the maximum batch size.

    Returns:
        The batches produced by ``minibatch``.

    Raises:
        KeyError: if ``model_type`` is not one of the known keys.
    """
    ceiling = {"tagger": 32, "parser": 16, "ner": 16, "textcat": 64}[model_type]
    example_count = len(train_data)
    # Small data sets get a smaller ceiling: one halving per threshold missed
    for threshold in (1000, 500):
        if example_count < threshold:
            ceiling /= 2
    return minibatch(train_data, size=compounding(1, ceiling, 1.001))

In [None]:
# Training loop: continue training an existing spaCy model when one is given,
# otherwise create a blank pipeline using the English vocab.

def _next_dropout(dropout):
    """Return the dropout rate to use for the next update.

    ``dropout`` may be a plain number (used as-is for every update) or an
    iterator of rates, such as the generator returned by ``decaying``, which
    is advanced by one step.
    """
    if isinstance(dropout, (int, float)):
        return float(dropout)
    return next(dropout)


def train_model(**model_params):
    """Train the NER component of a spaCy pipeline and return the pipeline.

    Keyword Args:
        model: name or path passed to ``spacy.load``; ``None`` (the default)
            creates a blank 'en' pipeline instead.
        iterations: number of passes over the training data.
        train_data: sequence of ``(text, annotations)`` pairs, where
            ``annotations`` is a dict whose "entities" entry lists
            ``(start, end, label)`` tuples. The sequence is copied, so the
            caller's list is never reordered.
        dropout: dropout rate for ``nlp.update`` - either a number or an
            iterator of rates (e.g. from ``decaying``). Defaults to 0.1.

    Returns:
        The trained ``nlp`` pipeline object.

    Raises:
        KeyError: if ``iterations`` or ``train_data`` is missing.
    """
    model = model_params.get('model')
    iterations = model_params['iterations']
    # Work on a copy: shuffling below must not mutate the caller's list,
    # otherwise re-running training starts from a different order each time
    # even though the random seed is fixed.
    train_data = list(model_params['train_data'])
    dropout = model_params.get('dropout', 0.1)

    random.seed(0)

    if model is not None:
        nlp = spacy.load(model)  # load existing spacy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")
        print("Created blank 'en' model")

    # Create the NER pipe if the pipeline has none; otherwise fetch the
    # existing one so labels can be added to it
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Register every entity label that occurs in the training annotations
    for _, annotations in train_data:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])

    # Adding extraneous labels shouldn't mess anything up.
    # NOTE(review): "VEGETABLE" does not occur in the training data shown in
    # this notebook; it is kept only to preserve the existing label set -
    # confirm whether it is still wanted.
    ner.add_label("VEGETABLE")

    # Names of all other pipes, so they can be disabled during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    # Only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # Show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # A new model gets a fresh optimizer via begin_training; an existing
        # model continues via resume_training
        if model is None:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.resume_training()

        for itn in range(iterations):
            random.shuffle(train_data)
            # Batch up the examples using spaCy's minibatch.
            # get_batches(TRAIN_DATA, 'ner') was tried and resulted in poor loss.
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=_next_dropout(dropout),
                           losses=losses)
            print(f"Losses at iteration {itn} - {dt.datetime.now()} {losses}")

    print('Model training completed')
    return nlp

In [None]:
# Keyword arguments for train_model(): start from a blank model (model=None),
# run 30 iterations over TRAIN_DATA, and pass a dropout schedule built with
# spaCy's decaying helper.
model_params = dict(
    model=None,
    iterations=30,
    train_data=TRAIN_DATA,
    dropout=decaying(0.6, 0.2, 1e-4),
)

In [None]:
# Train the NER model with the parameters defined above.
# Note: this rebinds `nlp`, replacing the stock pipeline loaded earlier.
nlp = train_model(**model_params)

Created blank 'en' model


  proc.begin_training(


Losses at iteration 0 - 2020-10-27 10:52:56.484202 {'ner': 800.8414212201719}
Losses at iteration 1 - 2020-10-27 10:53:06.224824 {'ner': 7.533502997782789}
Losses at iteration 2 - 2020-10-27 10:53:17.922434 {'ner': 7.245988433990591}
Losses at iteration 3 - 2020-10-27 10:53:28.529365 {'ner': 11.463132024309004}
Losses at iteration 4 - 2020-10-27 10:53:40.717586 {'ner': 5.186865078675736}
Losses at iteration 5 - 2020-10-27 10:53:51.362593 {'ner': 1.523608087809979}
Losses at iteration 6 - 2020-10-27 10:54:02.307085 {'ner': 3.0806059114671296}
Losses at iteration 7 - 2020-10-27 10:54:17.089014 {'ner': 1.040647210005701e-08}
Losses at iteration 8 - 2020-10-27 10:54:30.373195 {'ner': 2.6109129437929435e-08}
Losses at iteration 9 - 2020-10-27 10:54:44.277446 {'ner': 7.540048324779354e-09}
Losses at iteration 10 - 2020-10-27 10:54:59.476116 {'ner': 0.00981159075042022}
Losses at iteration 11 - 2020-10-27 10:55:13.385301 {'ner': 1.5563027785160132e-05}
Losses at iteration 12 - 2020-10-27 10:5

## Test the trained model

In [None]:
# Inspect the components of the pipeline returned by train_model
nlp.pipeline

[('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x22978692d60>)]

In [None]:
# Run the trained pipeline on a sample text
doc = nlp('Here is a green roof on this house. A green roof is good. water piping, I have alot of battery packs')


In [None]:
# Render the entities recognised in the sample text
displacy.render(doc, style="ent")

## Save model for testing

In [None]:
# Write the trained pipeline to disk so it can be reloaded for testing
output_dir = Path(r'./model')

# Create the target folder (and any missing parents); an existing folder is fine
output_dir.mkdir(parents=True, exist_ok=True)
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

Saved model to model


## Loading and testing the saved model

In [None]:
# Reload the saved pipeline from disk and run it on sample texts
output_dir = r'./model'
x = ['i am a green roof']

print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)

for text in x:
    doc = nlp2(text)
    # Entity spans found in the text, with their labels
    entities = [(span.text, span.label_) for span in doc.ents]
    print("Entities", entities)
    # Per-token view: text, entity type and IOB code
    tokens = [(token.text, token.ent_type_, token.ent_iob) for token in doc]
    print("Tokens", tokens)

Loading from ./model
Entities [('green roof', 'SUSTECH')]
Tokens [('i', '', 2), ('am', '', 2), ('a', '', 2), ('green', 'SUSTECH', 3), ('roof', 'SUSTECH', 1)]
