## Wavelength - Fine Tuning a Spacy NER Model 

Import Libraries

In [33]:
import spacy
from spacy.matcher import Matcher
from spacy.pipeline import EntityRuler
from spacy.tokens import Token, Span, Doc
from spacy import displacy
# from openpyxl import load_workbook
import numpy as np
import pandas as pd
from dateutil import parser
import wavefunctions as wf
import re
import random
from sklearn.model_selection import train_test_split

In [3]:
nlp = spacy.load('en_core_web_sm')


In [4]:
train_raw = pd.read_excel('data\Employee Train.xlsx',sheet_name="Training Dataset")
train_raw.sort_values(by='Employee ID',inplace=True)
times_extract = train_raw['Times [Extract]']

Export text in JSON lines format to be annotated in annotation software Doccano installed locally in Docker

In [18]:
#times_export = train_raw[['Employee ID','Times [Extract]','Days per week specified','Hours per week specified']]
#times_export.to_json('training_raw.json',orient="records",lines=True)
times_extract.to_json('training_raw.json',orient="records",lines=True)

Declare custom Document and Span extensions to store relevant days and hours figures for downstream calculation

In [20]:
BASE_DATE = "13th October 2019 "
Doc.set_extension("specified_days",default=0, force=True)
Doc.set_extension("specified_hours",default=0, force=True)
Doc.set_extension("lunch_hours",default=0, force=True)
Span.set_extension("saved_hours",default=0, force=True)
Span.set_extension("saved_days",default=0, force=True)

In [21]:
docs = nlp.pipe(times_extract)

Import annotated JSON lines file from Doccano

In [22]:
TRAIN_IMPORT = pd.read_json("file.json1",orient="records",lines=True)

Check annotations

In [29]:
TRAIN_IMPORT.sample(8)

Unnamed: 0,id,text,meta,annotation_approver,labels
1,627,,{},,[]
12,585,6.1 Your normal working hours are 25 hours per...,{},,"[[34, 51, TIME], [60, 72, TIME], [76, 92, DATE..."
34,607,Your normal working hours are between 08:45am ...,{},,"[[38, 56, TIME], [60, 74, DATE], [80, 88, TIME..."
75,654,The normal working week is 37.5 hours. You are...,{},,"[[27, 37, TIME], [104, 120, DATE], [194, 210, ..."
56,635,,{},,[]
39,612,,{},,[]
26,599,,{},,[]
67,646,Your normal working hours are 21 hours each we...,{},,"[[30, 48, TIME], [61, 92, DATE], [146, 162, TI..."


Reformat Annotations into Entity Labels for each text example, labels are stored as a dictionary of tuples.

In [30]:
def create_entity_tags(labels_list):
    '''Format imported labels from Doccano into entity tags'''
    entity_dict = {} 
    entity_dict["entities"] = tuple(labels_list)
    
    return entity_dict
        

In [31]:
TRAIN_IMPORT['entities'] = TRAIN_IMPORT['labels'].apply(create_entity_tags)

Combine text and entity tags into the correct training data format for a Spacy model

In [36]:
TRAINING_DATA_2 = []
for _, row in TRAIN_IMPORT[['text','entities']].iterrows():
    training_example = (row['text'], row['entities'])
    TRAINING_DATA_2.append(training_example)

Create named Entity Recognition model inside a spacy pipeline

In [37]:
ner = nlp.create_pipe("ner")

Train the model

In [38]:
nlp.begin_training()

<thinc.neural.optimizers.Optimizer at 0x21ec00ddf70>

In [39]:
train_data, test_data = train_test_split(TRAINING_DATA_2,test_size=0.1)

In [40]:
for itn in range(10):
    random.shuffle(train_data)
    for batch in spacy.util.minibatch(train_data, size=2):
        texts = [text for text,annotation in batch]
        annotations = [annotation for text, annotation in batch]
        nlp.update(texts, annotations)
        

  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)


Check how model is working

In [13]:
docs = nlp.pipe(times_extract)

In [31]:
test = next(docs)

In [80]:
test = nlp("Your normal working hours are 40 hours per week Your specific working hours within this will be instructed by your line manager and will be in the campus routines with breaks and a 1 hour unpaid lunch break.")

In [62]:
test

• The normal working hours are 8.45am to 5.3opm), Monday to Friday, with breaks, and a one hour lunch.

In [81]:
test.ents

(40 hours per week, 1 hour)

In [82]:
wf.extract_specified_times(test)

(0, 1.0)

In [75]:
nlp.to_disk("")