### Part 2: Contract Data Binning

Import libraries

In [77]:
import spacy
from spacy.matcher import Matcher
from spacy.pipeline import EntityRuler
from spacy.tokens import Token, Span, Doc
from spacy import displacy
from openpyxl import load_workbook
import numpy as np
import pandas as pd
from dateutil import parser
import wavefunctions as wf
import re
import random
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

Load spacy and pretrained model from pickle

In [78]:
nlp = spacy.load('en_core_web_sm')

nlp.from_disk("")


<spacy.lang.en.English at 0x1c514cf2a90>

Load data into pandas and start by looking at the times [extract] free text 

In [80]:
contract_df = pd.read_excel('data\Employee Train.xlsx',sheet_name="Training Dataset")
times_extract = contract_df['Times [Extract]']
employee_ids = contract_df['Employee ID']

Create custom extensions in Spacy to store structured information before adding it to 

In [5]:
# Time extract extensions
Doc.set_extension("specified_days",getter=wf.get_days_per_week_model, force=True)
Doc.set_extension("specified_hours",default=0, force=True)
Doc.set_extension("lunch_hours",default=0,force=True)
Span.set_extension("saved_hours",default=0,force=True)
Span.set_extension("saved_days",default=0,force=True)

#work on public holidays etxensions
Doc.set_extension("work_none",default=0, force=True)
Doc.set_extension("work_normal",default=0, force=True)
Doc.set_extension("work_required",default=0, force=True)
Doc.set_extension("work_easter",default=0, force=True)

#Holiday extensions
Doc.set_extension("hol_30x",default=0, force=True)
Doc.set_extension("dec_jan",default=0, force=True)
Doc.set_extension("leave_as_salary",default=0, force=True)
Doc.set_extension("pro_rata",default=0, force=True)


In [6]:
docs = nlp.pipe(times_extract)

In [7]:
def get_assumed(specified_time,timeframe="days"):
    
    multiplier = 1
        
    if timeframe == "hours":
        multiplier = 7.5
        
    if specified_time == 0:
        return 5 * multiplier
    else: return 0

def get_blended(specified_time,timeframe="days"):
    
    multiplier = 1
        
    if timeframe == "hours":
        multiplier = 7.5
    
    if specified_time == 0:
        return 5 * multiplier
    else: return specified_time


preds = pd.DataFrame(columns = ["text",
            'Days per week specified', 'Days per week assumed',
            'Days per week blended', 'Hours per week specified',
            'Hours per week assumed', 'Hours per week blended'])
  
for idx,doc in enumerate(docs):
    
    for ent in doc.ents:
        
        wf.extract_specified_times(ent)
    
    wf.get_days_per_week_model(doc)
    wf.get_hours_from_doc_ents(doc)
    
    spec_days = doc._.specified_days
    spec_hours = doc._.specified_hours
    
    preds.loc[idx] = [doc.text,spec_days,
                      get_assumed(spec_days),
                      get_blended(spec_days),
                      spec_hours,
                      get_assumed(spec_hours,"hours"),
                      get_blended(spec_hours,"hours")]

In [8]:
def check_results(preds, actuals):
    '''
    Function to check accuracy the of extracted data to the actual values stored in contract_df
    '''
    cols = preds.columns
    
    for t in ["text","days_holiday","pro_rata","days_entitled","days_phol"]: 
        if t in cols:
            cols = cols.drop(t)
    
    actuals = actuals[cols].fillna(0).reset_index()
    
    rows = len(preds)
    features = len(cols)
    score_matrix = np.zeros((rows,features))
    
    for idx,col in enumerate(cols):
        score_matrix[:,idx] = (preds[col] == actuals[col])
    
    return f"{np.sum(score_matrix) / (rows*features) * 100:.1f}% accuracy"

Checking how well the extraction is performing shows that we are getting classifying 99% of the training data correctly. 

In [9]:
check_results(preds,contract_df)

'98.9% accuracy'

In [124]:
preds[~(preds["Days per week blended"] == contract_df["Days per week blended"].fillna(0))]

Unnamed: 0,text,Days per week specified,Days per week assumed,Days per week blended,Hours per week specified,Hours per week assumed,Hours per week blended
10,"The normal working hours are to be arranged, a...",1.0,0,1.0,0.0,37.5,37.5


In [120]:
preds.loc[10,"text"]

'The normal working hours are to be arranged, and will include a one hour lunch break. One day each week starts with a staff meeting at 8.3o during term time and 9.00 outside term time'

In [15]:
test = contract_df.loc[17,'Times [Extract]']

In [16]:
export_df = contract_df
export_df[['Days per week specified', 'Days per week assumed',
            'Days per week blended', 'Hours per week specified',
            'Hours per week assumed', 'Hours per week blended']] = preds[['Days per week specified', 'Days per week assumed',
            'Days per week blended', 'Hours per week specified',
            'Hours per week assumed', 'Hours per week blended']]

In [17]:
path = "Employee Train.xlsx"
writer = pd.ExcelWriter(path, engine='openpyxl')
writer.book = load_workbook(path)
writer.sheets = dict((ws.title,ws) for ws in writer.book.worksheets)
contract_df.to_excel(writer,sheet_name="Training Dataset", startrow=2,index=False, header=False)
writer.save()

---------------------------------------------

### Extracting Data from the work on public holidays [Extract]

Use Spacy and the Matcher class to allow extra flexibility in matching text patterns

In [18]:
public_extract = contract_df["Work on Public Holidays [Extract]"]

In [19]:
public_matcher = Matcher(nlp.vocab)

In [20]:
def on_normal_match(matcher, doc, id, matches):
      doc._.work_normal = 1
        
def on_required_match(matcher, doc, id, matches):
      doc._.work_required = 1
        
def on_easter_match(matcher, doc, id, matches):
      doc._.work_easter = 1

public_matcher.add("normal_days", on_normal_match, [{"LOWER": "normal"},{"LOWER": "working"}, {"LOWER": "days"}])
public_matcher.add("required_to_work", on_required_match, [{"LOWER": "required"}, {"lower": "to"},{"lower": "work"}])
public_matcher.add("easter_work", on_easter_match, [{"LOWER": "easter"}, {"lower": "revision"}])

preds_public = pd.DataFrame(columns = ["text",
            'Silent re Public Holiday Arrangements', 'Public Holidays during term are normal working days',
            'Public Holidays during Easter Revision course are normal working days', 
            'Working on Public Holidays may be required'])


In [21]:
public_docs = nlp.pipe(public_extract)

In [22]:
for idx,doc in enumerate(public_docs):
    if doc.text == "None":
        doc._.work_none = 1
        
    matches = public_matcher(doc)
    
    preds_public.loc[idx] = [doc.text, doc._.work_none, doc._.work_normal,
                       doc._.work_easter, doc._.work_required]

Simple matching of text phrases gives 100% accuracy, it may be worth reviewing and making this more robust to text/spelling errors

In [23]:
check_results(preds_public,contract_df)

'100.0% accuracy'

-------------

### Extracting Data from the Holiday Entitelment [Extract]

Try a bag of words model with TfidfVecotrizer and a Decision Tree Classifier to predict the type of holiday clause for each document

In [24]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

Holiday entitlement helper functions

In [25]:
#not ideal repeated calls to doc.ents need to have this saved.

def get_days_holiday(doc):
    '''
    '''
    for ent in doc.ents:
        if("days" in ent.lower_ and check_nearby_tokens(ent[0],"entitled",[4,0])):
            return wf.get_number_from_token(ent[0])
        
def get_days_entitled(doc):
    
    for ent in doc.ents:
        if("days" in ent.lower_ and check_nearby_tokens(ent[0],"entitlement",[4,0])):
            return wf.get_number_from_token(ent[0])
        
def get_days_phol(doc):
    
    for ent in doc.ents:
        if("days" in ent.lower_ and (check_nearby_tokens(ent[0],"public",[0,4]) or 
           check_nearby_tokens(ent[0],"bank",[0,4]))):
            return wf.get_number_from_token(ent[0])

def check_nearby_tokens(token,search_word,n_range):
    '''
    For a given token, check if a given search_word string is present within
    a given n_range of surrounding tokens, return True if found otherwise False
    
    token: a spacy token object
    search_word: a string object to search for.
    n_range: list of 2 ints, giving the number of tokens before and after to 
             check
    '''
    if token.i < n_range[0]:
        start = 0
    else: start = token.i - n_range[0]
    
    finish = token.i + n_range[1]
    
    return (search_word in token.doc[start:finish].lower_)
    

Get text for information extraction

In [26]:
holidays_extract = contract_df["Holiday Entitlement [Extract]"]

consider removing common words via a stop words list to bring down the size of sparse matrices after tfidf processing, check if this improves accuracy as opposed to using max_df for a similar outcome.

In [27]:
stop_words0 = ["you","are","to","at","where","we","will","be","your"]
stop_words1 = ["depending","actually","entitlement","equivalent","calculated","calculating","involve","depending","you","are","entitled","to","taken","at","where","we","require","work","will","be","paid","your"]

In [28]:
tf_vect = TfidfVectorizer(ngram_range = (1,3),stop_words=stop_words1,use_idf=False)

Add categories to the target variable for each of the different common holiday clauses

In [29]:
y = contract_df["Holiday pro-rata to full-time 35 days plus 8 public holidays (43)"].fillna(0) * 6
is_hol_pro_33 = contract_df["Holiday pro-rata to full-time 33 days including 8 public holidays (or 25+8) (33)"]==1
is_hol_30x = contract_df["Holiday 6.12 weeks including public holidays (30.x)"]==1
is_hol_pro_28 = contract_df["Holiday pro-rata by hrs to full-time 5.6 weeks including public holidays (or 20+8) (28)"]==1
is_hol_pro_44 = contract_df["Holiday pro-rata to full-time 35 days plus 9 public holidays (44)"]==1

Combine similar and rare categories and correct an unclassified training example

In [30]:
y[is_hol_pro_33] = 5 # (33)
y[is_hol_pro_28] = 3 # (28)
y[is_hol_pro_44] = 6 # combined with (43) as very similar except for days
y[is_hol_30x] = 4 # (30.x)
y[y==0] = 4 # set remaining unclassified example to (30x) as it was not categorized in the xls file  
y_prep = y[~(y==4)] # remove all (30.x) examples from y that can be categorized from text lookup.
X_prep = holidays_extract[~(y==4)] # remove all (30.x) examples from holidays extract

Check that counts of values match those in excel

In [1]:
y_prep.value_counts()

NameError: name 'y_prep' is not defined

Pre Process the training set through the TfIdf vectorizer

In [32]:
holiday_vectors = tf_vect.fit_transform(X_prep)

Do a train test split to gauge initial accuracy of a tf-idf and DTC based model

In [33]:
X_train, X_test, y_train, y_test = train_test_split(holiday_vectors,y_prep,test_size=0.25)

In [34]:
dtc = DecisionTreeClassifier()

In [35]:
dtc.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [36]:
hol_preds = dtc.predict(X_test)

In [37]:
print(classification_report(y_test,hol_preds))

              precision    recall  f1-score   support

         3.0       0.92      1.00      0.96        12
         5.0       1.00      0.50      0.67         2
         6.0       1.00      1.00      1.00         5

    accuracy                           0.95        19
   macro avg       0.97      0.83      0.88        19
weighted avg       0.95      0.95      0.94        19



Check this accuracy works well across all the data by using cross validation include the tfidf vectorizer params in a gridsearch

Use a pipeline to allow tfidf settings to be tuned at the same time as the DTC

In [38]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('dtc', DecisionTreeClassifier())
])

In [39]:
param_grid = { 
    
    "dtc__max_depth" : [5,7,None],
    "dtc__max_features" : [None,0.8],
    "tfidf__ngram_range" : [(1,3),(1,4)],
    "tfidf__stop_words" : [None,stop_words1],
    # "tfidf__use_idf" : [True, False],
     "tfidf__max_df" : [1.0,0.9],
    # "tfidf__min_df" : [0.0,0.25]
}

gridCV = GridSearchCV(pipeline,param_grid)

In [40]:
gridCV.fit(X_prep,y_prep)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                      

Check the best score from the Grid Search

In [41]:
gridCV.best_score_

0.96

Save the best estimator

In [42]:
model = gridCV.best_estimator_

Check detailed Gridsearch results, a large variety of parameters give the same top score of 96 using a decision tree. Manual examination of the cases that fail to be classified properly in split0 could be useful

In [43]:
pd.DataFrame(gridCV.cv_results_).sort_values(by="mean_test_score",ascending=False)[1:10]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_dtc__max_depth,param_dtc__max_features,param_tfidf__max_df,param_tfidf__ngram_range,param_tfidf__stop_words,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
41,0.017401,0.003843,0.004606,0.000794,,0.8,1.0,"(1, 3)","[depending, actually, entitlement, equivalent,...","{'dtc__max_depth': None, 'dtc__max_features': ...",0.8,1.0,1.0,1.0,1.0,0.96,0.08,1
28,0.0196,0.001026,0.00582,0.00136,7.0,0.8,0.9,"(1, 3)",,"{'dtc__max_depth': 7, 'dtc__max_features': 0.8...",0.8,1.0,1.0,1.0,0.933333,0.946667,0.077746,3
44,0.018197,0.000755,0.0046,0.00102,,0.8,0.9,"(1, 3)",,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.8,1.0,1.0,1.0,0.933333,0.946667,0.077746,3
14,0.028192,0.003126,0.008214,0.001313,5.0,0.8,0.9,"(1, 4)",,"{'dtc__max_depth': 5, 'dtc__max_features': 0.8...",0.8,1.0,1.0,1.0,0.933333,0.946667,0.077746,3
31,0.019209,0.001598,0.005191,0.000973,7.0,0.8,0.9,"(1, 4)","[depending, actually, entitlement, equivalent,...","{'dtc__max_depth': 7, 'dtc__max_features': 0.8...",0.8,0.933333,1.0,0.933333,1.0,0.933333,0.07303,6
36,0.0194,0.001851,0.004999,0.001276,,,0.9,"(1, 3)",,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.8,1.0,1.0,0.933333,0.933333,0.933333,0.07303,6
40,0.019992,0.001115,0.005206,0.000757,,0.8,1.0,"(1, 3)",,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.8,1.0,1.0,1.0,0.866667,0.933333,0.084327,8
39,0.021392,0.003541,0.004793,0.000768,,,0.9,"(1, 4)","[depending, actually, entitlement, equivalent,...","{'dtc__max_depth': None, 'dtc__max_features': ...",0.8,1.0,1.0,1.0,0.866667,0.933333,0.084327,8
38,0.025108,0.002056,0.006787,0.000754,,,0.9,"(1, 4)",,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.8,1.0,1.0,1.0,0.866667,0.933333,0.084327,8


Set up a predictions dataframe for Holiday Entitlement

In [98]:
preds_holiday = pd.DataFrame(columns = [
            'Number of days excluding public holidays expressed if part-time',
            'Number of days including public holidays expressed if part-time',
            'Holiday pro-rata to full-time 25 days (silent as to public holidays)(25)',
            'Holiday pro-rata by hrs to full-time 5.6 weeks including public holidays (or 20+8) (28)',
            'Holiday 6.12 weeks including public holidays (30.x)',
            'Holiday pro-rata to full-time 33 days including 8 public holidays (or 25+8) (33)',
            'Holiday pro-rata to full-time 35 days plus 8 public holidays (43)',
            'Holiday pro-rata to full-time 35 days plus 9 public holidays (44)',
            'Holiday Entitlement paid as part of annual salary',
            'Holiday Entitlement paid Dec and Jun',
            'days_holiday',
            'pro_rata',
            'days_entitled',
            'days_phol'])


Create rules in the matcher to match rule based categorizing several items to be extracted

In [99]:
hol_matcher = Matcher(nlp.vocab)

In [100]:
def on_dec_match(matcher, doc, id, matches):
      doc._.dec_jan = "yes"
        
def on_leave_match(matcher, doc, id, matches):
      doc._.leave_as_salary = "yes"
        
def on_hol30x_match(matcher, doc, id, matches):
      doc._.hol_30x = 1
        
def on_pro_match(matcher, doc, id, matches):
      doc._.pro_rata = 1

hol_matcher.add("dec_jan", on_dec_match, [{"LOWER": "twice"},{"LOWER": "yearly"}, {"LOWER": "in"},{"LOWER": "december"},{"LOWER": "and"}, {"LOWER": "june"}])
hol_matcher.add("leave_as_salary", on_leave_match, [{"LOWER": "leave"}, {"lower": "will"},{"lower": "be"},{"lower": "paid"},{"lower": "as"},{"lower": "part"}])
hol_matcher.add("hol_30x", on_hol30x_match, [{"LOWER": "6.12"}, {"lower": "weeks"}])
hol_matcher.add("pro_rata", on_pro_match, [{"LOWER": "pro"},{"LOWER": "-", "OP": "?"}, {"LOWER": "rata"}])


Get Spacy Doc objects from holiday extract text

In [101]:
hol_docs = nlp.pipe(holidays_extract)

In [102]:
DTC_predictions = model.predict(holidays_extract)

In [103]:
one_hot_dtc = pd.get_dummies(DTC_predictions)

In [104]:
for idx,doc in enumerate(hol_docs):
           
    matches = hol_matcher(doc)
    
    preds_holiday.loc[idx] = [0,0,0,0,
                              doc._.hol_30x,
                              0,0,0,
                              doc._.leave_as_salary, 
                              doc._.dec_jan,
                              get_days_holiday(doc),
                              doc._.pro_rata,
                              get_days_entitled(doc),
                              get_days_phol(doc)]

In [105]:
check_results(preds_holiday,contract_df)

'89.0% accuracy'

the holiday category (30x) days was not included in the decision tree classifier as there were too few examples and it hurt accuracy, also it seems that it could be easily predicted by a rule so we need to zero predictions from the DTC for examples that can be predicted as 30x by a simple Matcher rule.

In [106]:

hol30x = "Holiday 6.12 weeks including public holidays (30.x)" #4
is_hol30x = (preds_holiday.loc[:,hol30x] == 1)

#clear predictions for examples
one_hot_dtc[is_hol30x] = 0

days_exc = 'Number of days excluding public holidays expressed if part-time' #0
days_inc = 'Number of days including public holidays expressed if part-time' #1
hol25 = 'Holiday pro-rata to full-time 25 days (silent as to public holidays)(25)' #2
hol28 = 'Holiday pro-rata by hrs to full-time 5.6 weeks including public holidays (or 20+8) (28)' #3rd 

hol33 = 'Holiday pro-rata to full-time 33 days including 8 public holidays (or 25+8) (33)' #5
hol43 = 'Holiday pro-rata to full-time 35 days plus 8 public holidays (43)' #6
hol44 = "Holiday pro-rata to full-time 35 days plus 9 public holidays (44)" #7

dec_jan = 'Holiday Entitlement paid Dec and Jun' #8
leave_as_sal = 'Holiday Entitlement paid as part of annual salary' #9

preds_holiday.loc[:,hol28] = one_hot_dtc.loc[:,3]
preds_holiday.loc[:,hol33] = one_hot_dtc.loc[:,5]
preds_holiday.loc[:,hol43] = one_hot_dtc.loc[:,6]
                 

In [107]:
check_results(preds_holiday,contract_df)

'98.2% accuracy'

In [108]:
has_days_inc = ((preds_holiday[hol43]==1) & (preds_holiday["days_holiday"]<43) & 
               (preds_holiday["pro_rata"] ==1) & 
               (preds_holiday["days_entitled"]==43)) | ((preds_holiday[hol33]==1) & 
                (preds_holiday["days_holiday"]<33) &  (preds_holiday["pro_rata"] ==1) & 
                                                       (preds_holiday["days_entitled"]==33)) 

has_days_exc = ((preds_holiday[hol43]==1) & (preds_holiday["days_holiday"]<43) & 
               (preds_holiday["pro_rata"] ==1) & 
               (preds_holiday["days_entitled"]<43)) | ((preds_holiday[hol33]==1) & 
                (preds_holiday["days_holiday"]<33) &  (preds_holiday["pro_rata"] ==1) & 
                                                       (preds_holiday["days_entitled"]<33)) 

has_44_days = (preds_holiday[hol43]==1) & (preds_holiday["days_phol"] > 8)


In [109]:
preds_holiday.loc[has_days_exc,days_exc] = preds_holiday.loc[has_days_exc,"days_holiday"]
preds_holiday.loc[has_days_inc,days_inc] = preds_holiday.loc[has_days_inc,"days_holiday"]


Seperate hol44 class and hol43 class previously joined together for processing by the DTC, by using nr bank/public holidays included

In [None]:
preds_holiday.loc[has_44_days,[hol43,hol44]] = [0,1]

In [110]:
check_results(preds_holiday,contract_df)

'99.9% accuracy'

------

Possible future work , if more samples maybe Spacy text cat or a Deep learning approach with Keras 