## Wavelength - Fine Tuning a Spacy NER Model - Final Training ready for Test

Import Libraries

In [43]:
import spacy
from spacy.matcher import Matcher
from spacy.pipeline import EntityRuler
from spacy.tokens import Token, Span, Doc
from spacy import displacy
# from openpyxl import load_workbook
import numpy as np
import pandas as pd
from dateutil import parser
import wavefunctions as wf
import re
import random
from sklearn.model_selection import train_test_split

In [44]:
nlp = spacy.load('en_core_web_sm')


In [45]:
train_raw = pd.read_excel('data\Employee Train.xlsx',sheet_name="Training Dataset")
train_raw.sort_values(by='Employee ID',inplace=True)
times_extract = train_raw['Times [Extract]']

Export text in JSON lines format to be annotated in annotation software Doccano installed locally in Docker

In [48]:
#times_export = train_raw[['Employee ID','Times [Extract]','Days per week specified','Hours per week specified']]
#times_export.to_json('training_raw.json',orient="records",lines=True)
times_extract.to_json('JSON/training_raw.json',orient="records",lines=True)

Declare custom Document and Span extensions to store relevant days and hours figures for downstream calculation

In [49]:
BASE_DATE = "13th October 2019 "
Doc.set_extension("specified_days",default=0, force=True)
Doc.set_extension("specified_hours",default=0, force=True)
Doc.set_extension("lunch_hours",default=0, force=True)
Span.set_extension("saved_hours",default=0, force=True)
Span.set_extension("saved_days",default=0, force=True)

In [50]:
docs = nlp.pipe(times_extract)

Import annotated JSON lines file from Doccano

In [52]:
TRAIN_IMPORT = pd.read_json("file.json1",orient="records",lines=True)

Check annotations

In [53]:
TRAIN_IMPORT.sample(8)

Unnamed: 0,id,text,meta,annotation_approver,labels
64,643,,{},,[]
6,579,6.1 Your normal working hours are 08.45 to 17....,{},,"[[34, 48, TIME], [68, 76, TIME], [89, 102, DATE]]"
38,611,,{},,[]
18,591,6.1 Your normal working hours are 08.45 to 17....,{},,"[[34, 48, TIME], [50, 67, DATE], [87, 95, TIME]]"
26,599,,{},,[]
0,626,,{},,[]
54,633,,{},,[]
37,610,,{},,[]


Reformat Annotations into Entity Labels for each text example, labels are stored as a dictionary of tuples.

In [54]:
def create_entity_tags(labels_list):
    '''Format imported labels from Doccano into entity tags'''
    entity_dict = {} 
    entity_dict["entities"] = tuple(labels_list)
    
    return entity_dict
        

In [55]:
TRAIN_IMPORT['entities'] = TRAIN_IMPORT['labels'].apply(create_entity_tags)

Combine text and entity tags into the correct training data format for a Spacy model

In [56]:
TRAINING_DATA_2 = []
for _, row in TRAIN_IMPORT[['text','entities']].iterrows():
    training_example = (row['text'], row['entities'])
    TRAINING_DATA_2.append(training_example)

Create named Entity Recognition model inside a spacy pipeline

In [57]:
ner = nlp.create_pipe("ner")

In [38]:
nlp.begin_training()

<thinc.neural.optimizers.Optimizer at 0x21ec00ddf70>

In [39]:
train_data, test_data = train_test_split(TRAINING_DATA_2,test_size=0.1)

In [58]:
for itn in range(10):
    random.shuffle(train_data)
    for batch in spacy.util.minibatch(TRAINING_DATA_2, size=2):
        texts = [text for text,annotation in batch]
        annotations = [annotation for text, annotation in batch]
        nlp.update(texts, annotations)
        

  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)


In [59]:
test_text = [text for text, _ in test_data]

In [60]:
len(test_text)

8

In [61]:
docs = nlp.pipe(times_extract)

In [62]:
test = next(docs)

In [63]:
test = nlp("Your normal working hours are 40 hours per week Your specific working hours within this will be instructed by your line manager and will be in the campus routines with breaks and a 1 hour unpaid lunch break.")

In [64]:
test

Your normal working hours are 40 hours per week Your specific working hours within this will be instructed by your line manager and will be in the campus routines with breaks and a 1 hour unpaid lunch break.

In [71]:
nlp.to_disk("final model/")