### Wavelength - Testing a Rule based approach with Spacy Entity Ruler and Matcher 

In [1]:
import spacy
from spacy.matcher import Matcher
from spacy.pipeline import EntityRuler
from spacy.tokens import Token, Span, Doc
from spacy import displacy
from openpyxl import load_workbook
import numpy as np
import pandas as pd
from dateutil import parser
import wavefunctions as wf
import re

In [2]:
BASE_DATE = "13th October 2019 "
Doc.set_extension("specified_days",getter=wf.get_days_per_week_new)
Span.set_extension("saved_days",default=0)
Span.set_extension("previous_ent",default=0)
Span.set_extension("next_ent",default=0)

In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
train_raw = pd.read_excel('data\Employee Train.xlsx',sheet_name="Training Dataset")

In [5]:
times_extract = train_raw['Times [Extract]']
times_extract.columns = ["text"]

Define Entity Ruler Patterns

In [6]:
#Entity Ruler Patterns

patterns = [{"label" : "TIMES", 
             "pattern" : [# {"LOWER" : {"IN" : ["monday","tuesday","wednesday","thursday","friday","saturday","sunday"]}, 
                          #  "OP" : "?"},
                          {"IS_PUNCT" : True, "OP" : "?"},            
                          {"TEXT" : {"REGEX" : "\w?\d+\w?[.:]?\w\d+\w?"}},
                          # {"SHAPE" : {"IN" :["dddd","ddd","d:dd","d:ddxx","dd:dx","dd:dd","dd:ddxx",
                          #                    "dd:ddxxx","d.dd","d.ddxx","dd.dx","dd.dd","dd.ddxx",
                          #                    "dd.ddxxx"]}},
                          {"LOWER" : {"IN" : ["to", "-"]}},
                          {"TEXT" : {"REGEX" : "\d+[.:]?\d+\w+"}}],
             "id" : "hours_range"},
            
            {"label" : "DATE", 
             "pattern" : [{"LOWER" : {"IN" : ["monday", "tuesday", "wednesday","thursday","friday",
                                              "saturday","sunday"]}},
                          {"LOWER" : {"IN" : ["to", "-"]}},
                          {"LOWER" : {"IN" : ["monday", "tuesday", "wednesday","thursday","friday",
                                              "saturday","sunday"]}}],
             "id" : "days_range"},
            
            {"label" : "DATE",
             "pattern" : [{"LIKE_NUM" : True},
                          {"LOWER" : {"IN" : ["hour","hours"]}},
                          {"IS_ALPHA" : True, "OP" : "?"},
                          {"LOWER" : "lunch"}],
             "id" : "lunch_break"},
            {"label" : "DATE",
             "pattern" : [{"LIKE_NUM" : True},
                          {"LOWER" : {"IN" : ["day","days"]}}],
             "id" : "days_worked"}
           ]

Patterns for Spacy Matcher

In [7]:


hours_pattern = [#{"LOWER" : {"IN" : ["monday","tuesday","wednesday","thursday","friday","saturday","sunday"]}, 
                 #          "OP" : "?"},
                 #         {"IS_PUNCT" : True, "OP" : "?"},            
                          {"TEXT" : {"REGEX" : "\w?\d+\w?[.:]?\w\d+\w?"}},
                          {"LOWER" : {"IN" : ["to", "-"]}},
                          {"TEXT" : {"REGEX" : "\d+[.:]?\d+\w+"}}]

days_pattern = [{"LOWER" : {"IN" : ["monday", "tuesday", "wednesday","thursday","friday","saturday","sunday"]}},
               {"LOWER" : {"IN" : ["to", "-"]}},
               {"LOWER" : {"IN" : ["monday", "tuesday", "wednesday","thursday","friday","saturday","sunday"]}}] 
                
lunch_pattern = [{"LIKE_NUM" : True},
                          {"LOWER" : {"IN" : ["hour","hours"]}},
                          {"IS_ALPHA" : True, "OP" : "?"},
                          {"LOWER" : "lunch"}]

work_pattern = [{"LIKE_NUM" : True},
                          {"LOWER" : {"IN" : ["day","days"]}}]

    
patterns = [{"label" : "TIMES", 
             "pattern" : [        
                          {"TEXT" : {"REGEX" : "\w?\d+\w?[.:]?\w\d+\w?"}},
                          {"LOWER" : {"IN" : ["to", "-"]}},
                          {"TEXT" : {"REGEX" : "\d+[.:]?\d+\w+"}}],
             "id" : "hours_range"},
            
            {"label" : "DATE", 
             "pattern" : [{"LOWER" : {"IN" : ["monday", "tuesday", "wednesday","thursday","friday",
                                              "saturday","sunday"]}},
                          {"LOWER" : {"IN" : ["to", "-"]}},
                          {"LOWER" : {"IN" : ["monday", "tuesday", "wednesday","thursday","friday",
                                              "saturday","sunday"]}}],
             "id" : "days_range"},
            
            {"label" : "DATE",
             "pattern" : [{"LIKE_NUM" : True},
                          {"LOWER" : {"IN" : ["hour","hours"]}},
                          {"IS_ALPHA" : True, "OP" : "?"},
                          {"LOWER" : "lunch"}],
             "id" : "lunch_break"},
            {"label" : "DATE",
             "pattern" : [{"LIKE_NUM" : True},
                          {"LOWER" : {"IN" : ["day","days"]}}],
             "id" : "days_worked"}
           ]

Create entity ruler for custom entity recognition

In [8]:
ruler = EntityRuler(nlp)
ruler.add_patterns(patterns)
nlp.add_pipe(ruler,before='ner')

Testing

In [9]:
docs = nlp.pipe(times_extract)

In [10]:
temp_doc = nlp(train_raw['Times [Extract]'][3])

In [11]:
temp_doc

6.1 Your normal working hours are 08.45 to 17.45 with breaks, and a one hour lunch break 3 days a week.

In [12]:
[print(e,e.label_) for e in temp_doc.ents] # issues with ents

6.1 CARDINAL
normal working hours TIME
08.45 to 17.45 TIMES
one hour lunch DATE
3 days DATE


[None, None, None, None, None]

In [13]:
prev_ent = 0
next_ent = 1
for e in temp_doc.ents:
    e._.previous_ent = prev_ent
    e._.next_ent = next_ent
    prev_ent += 1
    next_ent += 1
    
    

In [14]:
displacy.serve(temp_doc, style="dep")




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [15]:
for temp_doc in docs:
    for e in temp_doc.ents:
        if e.ent_id_ == "hours_range":
            print(e),
            print(wf.convert_time(e[0])),
            print(wf.convert_time(e[2]))
            print(wf.calc_hours(e))

08.45 to 17.45
2019-10-13 08:45:00
2019-10-13 17:45:00
9.0
08.45 to 17.45
2019-10-13 08:45:00
2019-10-13 17:45:00
9.0
8.45am to 5.3opm
2019-10-13 08:45:00
2019-10-13 17:30:00
8.75
08.45 to 17.45
2019-10-13 08:45:00
2019-10-13 17:45:00
9.0
08.45 to 17.45
2019-10-13 08:45:00
2019-10-13 17:45:00
9.0
08.45 to 17.45
2019-10-13 08:45:00
2019-10-13 17:45:00
9.0
0845 to 1745
2019-10-13 08:45:00
2019-10-13 17:45:00
9.0
08.45 to 17.45
2019-10-13 08:45:00
2019-10-13 17:45:00
9.0
08.45 to 17.45
2019-10-13 08:45:00
2019-10-13 17:45:00
9.0
08.45 to 17.45
2019-10-13 08:45:00
2019-10-13 17:45:00
9.0
08.45 to 17.45
2019-10-13 08:45:00
2019-10-13 17:45:00
9.0
0845 - 1745
2019-10-13 08:45:00
2019-10-13 17:45:00
9.0
08.45 to 17.45
2019-10-13 08:45:00
2019-10-13 17:45:00
9.0
08.45 to 17.45
2019-10-13 08:45:00
2019-10-13 17:45:00
9.0
08.45 to 17.45
2019-10-13 08:45:00
2019-10-13 17:45:00
9.0
08.45 to 12.45
2019-10-13 08:45:00
2019-10-13 12:45:00
4.0
09.00 to 18.00
2019-10-13 09:00:00
2019-10-13 18:00:00
9.0

In [16]:
for doc in docs:
    for e in temp_doc.ents:
        if e.ent_id_ == "hours_range":
            print(e,e[0].head,e.ent_id_)

In [17]:
from spacy import displacy
displacy.render(temp_doc, style='dep')

In [18]:
temp_doc.ents[0]


IndexError: tuple index out of range

In [None]:
round((convert_time(temp_doc.ents[2][2]) - convert_time(temp_doc.ents[2][0])).seconds/3600,2)

In [None]:
print(temp_doc.ents[2][0].head)

In [None]:
nlp(train_raw['Times [Extract]'][2])

In [None]:
train_raw['Times [Extract]'][1]

In [None]:
train_raw[train_raw['Days per week specified'].fillna(0) != pd.Series(preds).fillna(0)][['Times [Extract]','Days per week specified',"preds"]]

In [None]:
doc = next(docs)
print(doc)

In [None]:
train_raw["preds"] = preds