# <font size="10">Custom entity recognition </font>
## Model environment setup

This notebook contains test code for training the implemented model on the generated training data produced by
the [ner-train notebook located here](./ner-train-note.ipynb).

For the older, simple training-loop version of model training, go [here](#Run-training---using-simple-training-loop-from-blank----Old).

## Setup

In [None]:
!pip install -U spacy
!pip install spacy[lookups]


Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/10/b5/c7a92c7ce5d4b353b70b4b5b4385687206c8b230ddfe08746ab0fd310a3a/spacy-2.3.2-cp36-cp36m-manylinux1_x86_64.whl (9.9MB)
[K     |████████████████████████████████| 10.0MB 2.7MB/s 
Collecting thinc==7.4.1
[?25l  Downloading https://files.pythonhosted.org/packages/10/ae/ef3ae5e93639c0ef8e3eb32e3c18341e511b3c515fcfc603f4b808087651/thinc-7.4.1-cp36-cp36m-manylinux1_x86_64.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 48.2MB/s 
Installing collected packages: thinc, spacy
  Found existing installation: thinc 7.4.0
    Uninstalling thinc-7.4.0:
      Successfully uninstalled thinc-7.4.0
  Found existing installation: spacy 2.2.4
    Uninstalling spacy-2.2.4:
      Successfully uninstalled spacy-2.2.4


In [None]:
# Download the English models used by this notebook.
# NOTE(review): presumably the `en` shortcut is what provides the en_core_web_sm package
# imported in the import cell — confirm.
!python -m spacy download en

# en_core_web_md is the model loaded with en_core_web_md.load() further down.
!python -m spacy download en_core_web_md
# Check that the installed models are compatible with the installed spaCy version.
!python -m spacy validate

In [None]:
import pandas as pd
import random
import warnings
import json
import ast
import datetime as dt
from pathlib import Path
import os
import glob
from __future__ import unicode_literals, print_function

##SpaCy

import en_core_web_sm
import en_core_web_md

import spacy
from spacy import displacy
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
from spacy.pipeline import Sentencizer
from spacy.lemmatizer import Lemmatizer, ADJ, NOUN, VERB
from spacy.util import minibatch, compounding
from spacy.util import decaying
from spacy.pipeline import Tagger
from spacy.pipeline import DependencyParser
from thinc.neural.optimizers import Adam



# For Colab

In [None]:
#mount drive
# Mounts Google Drive at /content/drive; the training-data and model paths used in the
# Colab cells of this notebook live under "/content/drive/My Drive/...".
from google.colab import drive
drive.mount('/content/drive')

# For Local

In [None]:
# Disabled local-filesystem variant of the training-data loading cell (reads ./train/*.csv).
# The whole cell is a single string literal, so none of the code inside it is executed;
# remove the surrounding triple quotes to use it when running outside Colab.
'''train_path = r'./train/'

csvs = glob.glob(train_path + "/*.csv")


print('Filepath is :',(csvs))

train_list = []

for filename in csvs:
    df = pd.read_csv(filename, index_col=None, header=0)
    train_list.append(df)'''


## Import training data

Let's import the training data we generated:

In [None]:
# Folder (on the mounted Google Drive) holding the generated training CSV files.
train_path = r'/content/drive/My Drive/Colab Notebooks/nlp-ner-sustain-notebook/train/'

# glob returns matches in arbitrary order; sorting keeps the concatenated data (and therefore
# which duplicate row survives drop_duplicates) the same from run to run.
csvs = sorted(glob.glob(os.path.join(train_path, "*.csv")))

print('Filepath is :',(csvs))

# Fail here with a clear message instead of later inside pd.concat, which raises an
# unhelpful error when it is given an empty list.
if not csvs:
    raise FileNotFoundError(f"No CSV training files found in {train_path!r}")

# One DataFrame per CSV file; they are concatenated in the next cell.
train_list = []

for filename in csvs:
    df = pd.read_csv(filename, index_col=None, header=0)
    train_list.append(df)


In [None]:
# Stack all per-file frames into one and drop exact duplicate rows.
DATA = pd.concat(train_list, axis=0, ignore_index=True)
DATA = DATA.drop_duplicates()
print(len(DATA))
# Preview the first rows. This has to be the last expression of the cell to be rendered;
# as a bare expression in the middle of the cell it was evaluated and silently discarded.
DATA.head(10)

In [None]:
# Turn the DataFrame into plain Python rows for the training function. The second column
# holds the entity annotations as a string, so it is parsed back into a Python object
# with ast.literal_eval; every other column is passed through untouched.
TRAIN_DATA = [
    [row[0], ast.literal_eval(row[1]), *row[2:]]
    for row in DATA.values.tolist()
]

# Sanity check: show the first two converted rows
print(TRAIN_DATA[:2])

## Run a test before training
### Test existing default spacy model 

In [None]:
# Load the packaged en_core_web_md pipeline as the untrained baseline for the test below.
nlp = en_core_web_md.load()
# Last expression of the cell: displays the (name, component) pairs of the loaded pipeline.
nlp.pipeline

In [None]:
# Run the baseline pipeline on a sample sentence and visualise the entities it finds.
doc = nlp('Here is a green roof on this house. A green roof is good.')
# jupyter=True forces notebook rendering instead of relying on auto-detection.
# NOTE(review): spaCy 2.x auto-detection looks for the standard Jupyter kernel shell and
# appears not to recognise Colab, in which case render() just returns the markup — confirm.
displacy.render(doc, style="ent", jupyter=True)
# verified green roof does not match an entity in the NER

## TRAINING THE MODEL
## Train model setup

### Define compounding batch size

In [None]:
def get_batches(train_data, model_type):
    """Split ``train_data`` into minibatches with a compounding batch size.

    Args:
        train_data: Sized collection of training examples.
        model_type: One of "tagger", "parser", "ner" or "textcat"; selects the
            maximum batch size. Any other value raises ``KeyError``.

    Returns:
        Whatever ``minibatch`` returns for ``train_data`` when given the size
        schedule ``compounding(1, cap, 1.001)``.
    """
    size_caps = {"tagger": 32, "parser": 16, "ner": 16, "textcat": 64}
    cap = size_caps[model_type]
    n_examples = len(train_data)
    # Small data sets get smaller batches: the cap is halved once below 1000
    # examples and halved a second time below 500.
    for threshold in (1000, 500):
        if n_examples < threshold:
            cap /= 2
    return minibatch(train_data, size=compounding(1, cap, 1.001))

## Define custom Adam optimizer





In [None]:

def custom_optimizer(optimizer, learn_rate=0.001, beta1=0.9, beta2=0.999, eps=1e-8, L2=1e-6, max_grad_norm=1.0):
    """Customize the hyper-parameters of an existing (spaCy default) optimizer in place.

    Args:
        optimizer: Optimizer object to modify, e.g. the one returned by
            ``nlp.begin_training()``.
        learn_rate: Learning rate.
        beta1: Adam first-moment decay.
        beta2: Adam second-moment decay.
        eps: Adam epsilon.
        L2: L2 penalty.
        max_grad_norm: Gradient clipping threshold.

    Returns:
        The same ``optimizer`` object, after its attributes have been overwritten.
    """
    optimizer.learn_rate = learn_rate
    optimizer.beta1 = beta1
    optimizer.beta2 = beta2
    optimizer.eps = eps
    optimizer.L2 = L2
    optimizer.max_grad_norm = max_grad_norm

    # NOTE(review): thinc 7.x's Optimizer appears to keep the learning rate and the Adam
    # betas under the attribute names `alpha`, `b1` and `b2`, in which case assigning
    # `beta1`/`beta2` alone has no effect on training — confirm against the installed thinc.
    # Mirroring is only done when the attribute already exists, so other optimizer
    # objects are left exactly as the assignments above set them.
    for alias, value in (("alpha", learn_rate), ("b1", beta1), ("b2", beta2)):
        if hasattr(optimizer, alias):
            setattr(optimizer, alias, value)

    return optimizer

## Training Loop

In [None]:

def train_model(**model_params):
    """Load the model, set up the pipeline and train the entity recognizer.

    Keyword Args:
        train_data (list): Required. Rows of ``[text, {"entities": [(start, end, label), ...]}]``.
        iterations (int): Required. Number of passes over ``train_data``.
        model: Base pipeline. ``None`` (default) creates a blank English pipeline;
            an object with a callable ``load`` attribute (e.g. an imported model
            package such as ``en_core_web_md``) is loaded via ``model.load()``;
            anything else is handed to ``spacy.load`` (model name or path).
        dropout: Iterator yielding one dropout rate per update, e.g.
            ``decaying(0.6, 0.2, 1e-4)``. Defaults to exactly that schedule.
        learn_rate, beta1, beta2, eps, L2, max_grad_norm: Optional optimizer
            settings forwarded to ``custom_optimizer``; omitted keys fall back to
            that function's defaults.

    Returns:
        The pipeline (``nlp``) with a newly trained "ner" component.

    Raises:
        KeyError: If ``train_data`` or ``iterations`` is missing.
    """
    model = model_params.get('model')
    iterations = model_params['iterations']
    train_data = model_params['train_data']
    dropout = model_params.get('dropout')

    # Collect only the optimizer settings the caller supplied. (The original assignments
    # ended in trailing commas, which silently turned every value into a 1-tuple.)
    optimizer_keys = ('learn_rate', 'beta1', 'beta2', 'eps', 'L2', 'max_grad_norm')
    optimizer_kwargs = {key: model_params[key] for key in optimizer_keys if key in model_params}

    if model is None:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    elif callable(getattr(model, "load", None)):
        # An imported model package: spacy.load() only accepts a name or a path.
        nlp = model.load()
        print("Loaded model '%s'" % getattr(model, "__name__", model))
    else:
        nlp = spacy.load(model)  # load existing spaCy model by name or path
        print("Loaded model '%s'" % model)

    # Always train a brand-new entity recognizer: drop the one shipped with a
    # loaded model (if any) and append a fresh "ner" component.
    if "ner" in nlp.pipe_names:
        nlp.remove_pipe("ner")
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])

    # Shuffle a private copy so the caller's list is not reordered in place.
    examples = list(train_data)

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    with nlp.disable_pipes(*other_pipes):  # only train NER
        # The "ner" component is newly created in every case, so its weights must be
        # initialised with begin_training(); resume_training() does not initialise new
        # components. The other pipes are disabled here, so only "ner" is affected.
        optimizer = nlp.begin_training(component_cfg={"ner": {"conv_window": 3}})
        optimizer = custom_optimizer(optimizer, **optimizer_kwargs)

        # Fall back to a decaying dropout schedule when the caller gave none.
        if dropout is None:
            dropout = decaying(0.6, 0.2, 1e-4)

        for itn in range(iterations):
            random.shuffle(examples)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=next(dropout),  # dropout - make it harder to memorise data
                    sgd=optimizer,
                    losses=losses,
                )
            # Report once per iteration (inside the loop), not only after the last one.
            print(f"Losses at iteration {itn} - {dt.datetime.now()} {losses}")

    print('Model training completed')
    return nlp

In [None]:
# Settings passed to train_model(**model_params).
model_params = {
    # Model *name* rather than the imported package module: train_model hands this
    # value to spacy.load(), which expects a name or a path.
    'model': 'en_core_web_md',
    'iterations': 40,
    'train_data': TRAIN_DATA,
    'dropout': decaying(0.6, 0.2, 1e-4),
    'learn_rate': 0.001,
    'beta1': 0.9,
    'beta2': 0.999,
    'eps': 1e-8,
    'L2': 1e-6,
    'max_grad_norm': 1.0
}

In [None]:
# Train with the settings above; train_model returns the trained pipeline, which replaces
# the baseline pipeline previously bound to `nlp`.
nlp = train_model(**model_params)

## Test the trained model

In [None]:
# Display the components of the pipeline returned by train_model.
nlp.pipeline

In [None]:

#tagger = Tagger(nlp.vocab)
#parser = DependencyParser(nlp.vocab)



In [None]:
# Display the pipeline components again (same inspection as the earlier cell).
nlp.pipeline

In [None]:
# Run the trained pipeline on a sample text; the resulting doc is rendered in the next cell.
doc = nlp('Here is a green roof on this house. A green roof is good. water piping, I have alot of battery packs')


In [None]:
# Visualise the entities found in `doc`. jupyter=True forces notebook rendering instead of
# relying on auto-detection.
# NOTE(review): spaCy 2.x auto-detection appears not to recognise Colab's shell, in which
# case render() only returns the markup — confirm.
displacy.render(doc, style="ent", jupyter=True)

## Save model for testing

In [None]:
# Destination folder for the trained pipeline (on the mounted Google Drive). It is kept as
# a Path because the loading cell below reuses `output_dir`.
output_dir = Path(r'/content/drive/My Drive/Colab Notebooks/nlp-ner-sustain-notebook/model')

# parents=True creates missing intermediate folders (a bare mkdir() raises if the parent is
# absent); exist_ok=True makes re-running the cell safe.
output_dir.mkdir(parents=True, exist_ok=True)
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

In [None]:
# Disabled local-filesystem variant of the save cell (writes to ./model). The whole cell is
# a single string literal, so none of the code inside it is executed.
'''output_dir = r'./model'

if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)'''

## Loading and testing the saved model

In [None]:
# Sample texts for a quick smoke test of the saved pipeline.
x = ['i am a green roof']
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)

# Run every sample through the reloaded pipeline and show what it recognised.
for doc in map(nlp2, x):
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    print("Entities", entities)
    # Per-token view: text, entity type and IOB code.
    tokens = [(t.text, t.ent_type_, t.ent_iob) for t in doc]
    print("Tokens", tokens)

## For Local

In [None]:
# Disabled local-filesystem variant of the save cell (writes to ./model). The whole cell is
# a single string literal, so none of the code inside it is executed; remove the surrounding
# triple quotes to use it when running outside Colab.
'''output_dir = r'./model'

if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)'''

In [None]:
# Disabled local-filesystem variant of the load-and-test cell (reads the pipeline from
# ./model). The whole cell is a single string literal, so none of the code inside it is
# executed; remove the surrounding triple quotes to use it when running outside Colab.
'''output_dir = r'./model'
x = ['rainwater harvesting is great']
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)

for text in x:
    doc = nlp2(text)
    print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
    print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])'''