# Modelling Experiments
1. Spacy - from scratch, fine-tuning
2. Flair - from scratch, fine-tuning
3. Bert - fine-tuning of FinBERT
4. NER Bert trained on GMB data

# Evaluation Strategy
1. leave one out validation

# Evaluation Metrics
1. Precision, recall and f1 score for every class

In [2]:
import pandas as pd
import numpy as np
import os, sys, swifter, re, json
from constants import *
from utility import *
from preprocess_utils import _remove_non_ascii_characters

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/varunnathan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/varunnathan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/varunnathan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/varunnathan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/varunnathan/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [3]:
# read data
df = json.load(open(TRAIN_FN))

print(len(df))

50


In [4]:
# find instances of overlapping entities and address them - could be because of wrong tagging
from itertools import combinations

out = []
for i, row in enumerate(df):
    spans = [x['span'] for x in row['entity']]
    for l1, l2 in combinations(spans, 2):
        if set(range(*l1)).intersection(set(range(*l2))):
            out.append((i, l1, l2))

print(out)

[]


### this is an issue with tagging. Manually updated with the correct tagging

In [5]:
# replace PAYMEN_AMOUNT with PAYMENT_AMOUNT
for i, row in enumerate(df):
    for j, ent in enumerate(row['entity']):
        if ent['type'] == 'PAYMEN_AMOUNT':
            df[i]['entity'][j]['type'] = 'PAYMENT_AMOUNT'

## Preprocessing
1. removing non-ascii characters
2. Contraction

In [6]:
preprocess_obj = Text_Preprocessing(keep_eng=False, remove_nonalpha=False, lower_case=False,
                         remove_punkt=False, remove_stop=False, remove_numerals=False,
                         spell_check=False, contraction=True,
                         contraction_var=CONTRACTIONS, stem=False,
                         lem=False, filter_pos=False, pos_var=('N', 'J'),
                         tokenize=False, template_removal=False,
                         template_start_string='', regex_cleaning=False,
                         remove_ignore_words=False, ignore_words=IGNORE_WORDS,
                         custom_stoplist=[], word_size=2, word_size_filter=False)

In [66]:
def preprocess(text):
    text_pre = _remove_non_ascii_characters(text)
    text_pre = preprocess_obj.fit_transform(pd.Series([text_pre])).values[0]
    return text_pre

def preprocess_row_A1(row):
    row1 = row.copy()
    row1['text'] = preprocess(row1['text'])
    out = []
    for ent in row1['entity']:
        string_pre = preprocess(ent['string'])
        pattern = re.compile(r'\b({})\b'.format(string_pre))
        _iter = pattern.finditer(row1['text'])
        if _iter:
            tmp_out = []
            for item in _iter:
                s_span, e_span = item.span()
                span = [s_span, e_span]
                tmp_out.append({'string': string_pre, 'span': span, 'type': ent['type']})
        for item in tmp_out:
            if item not in out:
                out.append(item)
    row1['entity'] = out
    return row1


def convert_to_seq_format(text, entity):
    spans = [row['span'][0] for row in entity]
    idxs = np.argsort(spans)
    entity_new = [entity[i] for i in idxs]
    text1 = text
    new_chars_len = 0
    for ent in entity_new:
        s_span, e_span = ent['span'][0], ent['span'][-1]
        s_span += new_chars_len
        e_span += new_chars_len
        _type = ent['type']
        new_chars_len += len('[]({}) '.format(_type))
        text1 = text1[:s_span] + '[' + text1[s_span:e_span] + ']({}) '.format(
            _type) + text1[e_span:]
    return text1


def convert_from_seq_original_format(utterance):
    start_pos_lst = []
    end_pos_lst = []
    value_lst = []
    entity_lst = []
    start_iters = [x.start() for x in re.finditer('\[', utterance)]
    end_iters = [x.start() for x in re.finditer('\]', utterance)]
    start_iters_ent = [x.start() for x in re.finditer('\(', utterance)]
    end_iters_ent = [x.start() for x in re.finditer('\)', utterance)]
    for i, start_it in enumerate(start_iters):
        start_pos_lst.append(start_it+1)
        end_pos_lst.append(end_iters[i])
        value_lst.append(utterance[start_it+1:end_iters[i]])
        entity_lst.append(utterance[start_iters_ent[i]+1:end_iters_ent[i]])

    utterance1 = utterance
    n_start_pos_lst, n_end_pos_lst = [], []
    new_chars_len = 0
    for i, s_span in enumerate(start_pos_lst):
        e_span = end_pos_lst[i]
        v = value_lst[i]
        ent = entity_lst[i]
        a_v = '[{}]({})'.format(v, ent)
        #new_chars_len += len('[]({})'.format(ent))
        #print(i, s_span, e_span, new_chars_len)
        utterance1 = utterance1.replace(a_v, v)
        if i == 0:
            n_start_pos_lst.append(s_span-1)
            n_end_pos_lst.append(e_span-1)
        else:
            n_start_pos_lst.append(s_span-new_chars_len-1)
            n_end_pos_lst.append(e_span-new_chars_len-1)
        new_chars_len += len('[]({})'.format(ent))
        #print(n_start_pos_lst, n_end_pos_lst)
    return value_lst, entity_lst, n_start_pos_lst, n_end_pos_lst, utterance1


def preprocess_row_A2(row):
    row1 = row.copy()
    
    text = row['text']
    entity = row['entity']
    
    # convert to seq format
    text1 = convert_to_seq_format(text, entity)
    
    # preprocess
    text1_pre = preprocess(text1)
    
    # convert to original format
    values, ents, s_spans, e_spans, text2 = convert_from_seq_original_format(text1_pre)
    
    # create desired format
    spans = list(zip(s_spans, e_spans))
    out = []
    for i, span in enumerate(spans):
        d = {}
        d['span'] = list(span)
        d['string'] = values[i]
        d['type'] = ents[i]
        out.append(d)
    row1['text'] = text2
    row1['entity'] = out
    
    return row1

In [8]:
%%time
row = df[17]
row1 = preprocess_row_A2(row)
print(row)
print()
print(row1)

contraction
{'_id': '34e575a0-d5c3-47a0-b856-2a3b85dff90a', 'text': "awesome what if it wasn't that it wouldn't go through would give an invalid next the expiration date is been inquiry entered at least one digit it won't go through alright wouldn't give a transaction right so every time let's review the arrangement must so today is the twenty first of december two thousand twenty you have offers authorized debit transaction in the amount of four hundred dollars to be taken from your debit card and it will be as such one hundred on the of january twenty twenty one one hundred on the eighth of february twenty twenty one one hundred and the of march twenty twenty one and one hundred on the eighth of april twenty twenty one you understand that the payment you're are authorizing will be office as an electronic debit to your account and you calling to this regarding service as the electronic signature for this payment arrangement please state your name and gray", 'entity': [{'span': [270, 3

In [9]:
%%time
df_pre = []
for i, row in enumerate(df):
    if i % 10 == 0:
        print(i)
    row1 = preprocess_row_A2(row)
    df_pre.append(row1)
    
assert len(df) == len(df_pre)

0
contraction
contraction
contraction
contraction
contraction
contraction
contraction
contraction
contraction
contraction
10
contraction
contraction
contraction
contraction
contraction
contraction
contraction
contraction
contraction
contraction
20
contraction
contraction
contraction
contraction
contraction
contraction
contraction
contraction
contraction
contraction
30
contraction
contraction
contraction
contraction
contraction
contraction
contraction
contraction
contraction
contraction
40
contraction
contraction
contraction
contraction
contraction
contraction
contraction
contraction
contraction
contraction
CPU times: user 32.4 ms, sys: 2.94 ms, total: 35.4 ms
Wall time: 33.8 ms


In [10]:
print(df[9])
print()
print(df_pre[9])

{'_id': 'cdec241d-a995-480a-be18-baf2ef873315', 'text': "alright very good so let me go ahead ahead and get these payments logged into the system alright so we have successfully set up six payments to be taken on your debit card on the fourteenth of each month starting with the first month bank in january of the new year and i have copied that information over to your files so that any agent that looks at your account can see that alright now what i'm gonna do charles is review what we've done today today's date is the twenty first of december and you've authorize six payments the debit transactions in amount fifty three eighty five with the first one being fifty three eighty seven to be taken from your debit regions bank account you understand that the payment you are will be processed as a electronic a ach debit to your account and you consent to this recording service visa is your electronic signature for this payment arrangement please state your name and state i agree charles", 'e

In [11]:
out = []
for i, row in enumerate(df_pre):
    spans = [x['span'] for x in row['entity']]
    for l1, l2 in combinations(spans, 2):
        if set(l1).intersection(set(l2)):
            out.append((i, l1, l2))

print(out)

[]


In [12]:
# support for each class
from collections import defaultdict

label_support_dct = defaultdict(list)
for row in df_pre:
    for ent in row['entity']:
        cur_cnt = label_support_dct.get(ent['type'], 0)
        label_support_dct[ent['type']] = cur_cnt + 1
    
for key, cnt in label_support_dct.items():
    print(key, '\t', cnt)

DATE 	 51
NUM_PAYMENTS 	 38
PAYMENT_AMOUNT 	 59
PAYMENT_DATE 	 46


## Experiment 1 - Spacy

### Spacy Pretrained model

In [12]:
%time nlp = spacy.load("en_core_web_trf", disable=["lemmatizer"])
nlp.pipe_names

CPU times: user 1.87 s, sys: 447 ms, total: 2.31 s
Wall time: 2.39 s


['transformer', 'tagger', 'parser', 'ner', 'attribute_ruler']

In [13]:
row = df_pre[0]
print(row)
print('\nXXXXXXXXXX\n')
doc = nlp(row['text'])
for ent in doc.ents:
    print(ent.text, '\t', ent.label_)

{'_id': 'fd69a1f8-6bb0-4624-8052-b6e46fe7622a', 'text': 'nine seven three and then let us just go over with this arrangement and then we will be completely set please state mister is december twenty first two thousand twenty authorize authorized one time transaction and of twelve dollar and ninety seven cents be address from your card will be processed as an electronic and solutions and your stevens if you just need to cause it has  recording shipping is your electronic signature can please speak your full name ai i agree', 'entity': [{'span': [126, 167], 'string': 'december twenty first two thousand twenty', 'type': 'DATE'}, {'span': [189, 197], 'string': 'one time', 'type': 'NUM_PAYMENTS'}, {'span': [217, 253], 'string': 'twelve dollar and ninety seven cents', 'type': 'PAYMENT_AMOUNT'}]}

XXXXXXXXXX

december twenty first 	 DATE
twelve dollar 	 MONEY


#### Custom tags are not being recognized which is logical

In [14]:
del nlp, doc, row

### Fine-tuning NER en_core_web_lg model

In [15]:
import spacy
from spacy.training.example import Example
from spacy.scorer import Scorer

EVAL_KEYS = ['token_acc', 'token_p', 'token_r', 'token_f', 'ents_p', 'ents_r', 'ents_f',
             'ents_per_type']
def evaluate(ner_model, examples):
    scorer = Scorer()
    out = []
    for input_, annotations in examples:
        pred_value = ner_model(input_)
        example = Example.from_dict(pred_value, annotations)
        out.append(example)
    _score = scorer.score(out)
    _score = {k: _score[k] for k in EVAL_KEYS}
    return _score

In [16]:
%%time
nlp = spacy.load("en_core_web_lg")
ner = nlp.get_pipe("ner")

CPU times: user 1.11 s, sys: 595 ms, total: 1.7 s
Wall time: 1.71 s


In [17]:
def prepare_data_for_spacy(row):
    out = []
    for item in row['entity']:
        out.append(tuple(item['span']+[item['type']]))
    return (row['text'], {'entities': out})

In [18]:
spacy_df = [prepare_data_for_spacy(x) for x in df_pre]
assert len(spacy_df) == len(df_pre)
spacy_df[0]

('nine seven three and then let us just go over with this arrangement and then we will be completely set please state mister is december twenty first two thousand twenty authorize authorized one time transaction and of twelve dollar and ninety seven cents be address from your card will be processed as an electronic and solutions and your stevens if you just need to cause it has  recording shipping is your electronic signature can please speak your full name ai i agree',
 {'entities': [(126, 167, 'DATE'),
   (189, 197, 'NUM_PAYMENTS'),
   (217, 253, 'PAYMENT_AMOUNT')]})

In [19]:
# Adding labels to the `ner` model

for _, annotations in spacy_df:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

In [20]:
# get the names of the components we want to disable during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [21]:
# start the training loop, only training NER

import random
import copy
import warnings
from spacy.util import minibatch, compounding
from spacy.training.example import Example
from pathlib import Path

epochs = 50
optimizer = nlp.resume_training()
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
    warnings.filterwarnings("once", category=UserWarning, module='spacy')
    sizes = compounding(1.0, 4.0, 1.001)
    
    # batch up the examples using spaCy's minibatc
    for epoch in range(epochs):
        examples = copy.deepcopy(spacy_df)
        random.shuffle(examples)
        batches = minibatch(examples, size=sizes)
        losses = {}
        
        for batch in batches:
            for text, annotations in batch:
                #print(text, annotations)
                doc = nlp.make_doc(text)
                ex = Example.from_dict(doc, annotations)
                # Update the model
                nlp.update([ex], sgd=optimizer, losses=losses, drop=0.3)

        print("Losses ({}/{})".format(epoch + 1, epochs), losses)

Losses (1/50) {'ner': 455.39905203371165}
Losses (2/50) {'ner': 511.9060763431808}
Losses (3/50) {'ner': 547.4542082586504}
Losses (4/50) {'ner': 219.97052309654129}
Losses (5/50) {'ner': 204.88349174312606}
Losses (6/50) {'ner': 164.85501753203914}
Losses (7/50) {'ner': 97.49430116069409}
Losses (8/50) {'ner': 102.6803981965313}
Losses (9/50) {'ner': 88.37078322858133}
Losses (10/50) {'ner': 80.1702332334518}
Losses (11/50) {'ner': 81.95071236586948}
Losses (12/50) {'ner': 64.78147103266346}
Losses (13/50) {'ner': 80.45692318131589}
Losses (14/50) {'ner': 68.95678567991344}
Losses (15/50) {'ner': 69.0375943711553}
Losses (16/50) {'ner': 57.02439057136955}
Losses (17/50) {'ner': 74.95906127518977}
Losses (18/50) {'ner': 58.90506626584116}
Losses (19/50) {'ner': 70.55116239806561}
Losses (20/50) {'ner': 42.95145840332149}
Losses (21/50) {'ner': 34.401681720884376}
Losses (22/50) {'ner': 48.73060784359411}
Losses (23/50) {'ner': 34.356468189956566}
Losses (24/50) {'ner': 26.9415134442046

#### Evaluation

In [83]:
text = "let's review the arrangement that he all have set up today today's date \
is monday december the twenty first and you have authorized the total i of two debit \
transactions in the amount of one hundred dollars should be taken from your debit card \
on today's date monday the twenty first and on january the sixteenth you understand that \
the payment you are authorizing will be processed as electronic service to your \
account and you can send to this recording that trying to signature for this payment \
arrangement please state your name"
print(text)
print()
spacy.displacy.render(nlp(text), style="ent")

let's review the arrangement that he all have set up today today's date is monday december the twenty first and you have authorized the total i of two debit transactions in the amount of one hundred dollars should be taken from your debit card on today's date monday the twenty first and on january the sixteenth you understand that the payment you are authorizing will be processed as electronic service to your account and you can send to this recording that trying to signature for this payment arrangement please state your name



In [85]:
for token in nlp(text):
    print(token.text, token.ent_)

AttributeError: 'spacy.tokens.token.Token' object has no attribute 'ent_'

In [23]:
example = [('nine seven three and then let us just go over with this arrangement and then we will be completely set please state mister is december twenty first two thousand twenty authorize authorized one time transaction and of twelve dollar and ninety seven cents be address from your card will be processed as an electronic and solutions and your stevens if you just need to cause it has  recording shipping is your electronic signature can please speak your full name ai i agree',
 {'entities': [(126, 167, 'DATE'),
   (189, 197, 'NUM_PAYMENTS'),
   (217, 253, 'PAYMENT_AMOUNT')]}), 
          ('nine seven three and then let us just go over with this arrangement and then we will be completely set please state mister is december twenty first two thousand twenty authorize authorized one time transaction and of twelve dollar and ninety seven cents be address from your card will be processed as an electronic and solutions and your stevens if you just need to cause it has  recording shipping is your electronic signature can please speak your full name ai i agree',
 {'entities': [(126, 167, 'DATE'),
   (189, 197, 'NUM_PAYMENTS'),
   (217, 253, 'PAYMENT_AMOUNT')]})]
%time evaluate(nlp, example)

CPU times: user 152 ms, sys: 4.11 ms, total: 156 ms
Wall time: 154 ms


{'token_acc': 1.0,
 'token_p': 1.0,
 'token_r': 1.0,
 'token_f': 1.0,
 'ents_p': 1.0,
 'ents_r': 1.0,
 'ents_f': 1.0,
 'ents_per_type': {'DATE': {'p': 1.0, 'r': 1.0, 'f': 1.0},
  'NUM_PAYMENTS': {'p': 1.0, 'r': 1.0, 'f': 1.0},
  'PAYMENT_AMOUNT': {'p': 1.0, 'r': 1.0, 'f': 1.0}}}

In [24]:
nlp.meta["name"] = "en_core_web_lg_entity_extractor_v1"
nlp.to_disk(os.path.join(MODEL_DIR, 'spacy_en_lg_v1'))

In [79]:
%%time
import spacy
nlp = spacy.load(SPACY_MODEL_DIR)

CPU times: user 1.11 s, sys: 610 ms, total: 1.72 s
Wall time: 1.77 s


In [25]:
# Leave one out validation
from sklearn.model_selection import LeaveOneOut
import time

pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]


def train_spacy_model(nlp, other_pipes, train_lst, epochs):
    optimizer = nlp.resume_training()
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        warnings.filterwarnings("once", category=UserWarning, module='spacy')
        sizes = compounding(1.0, 4.0, 1.001)

        # batch up the examples using spaCy's minibatc
        overall_losses = []
        for epoch in range(epochs):
            examples = copy.deepcopy(train_lst)
            random.shuffle(examples)
            batches = minibatch(examples, size=sizes)
            losses = {}

            for batch in batches:
                for text, annotations in batch:
                    #print(text, annotations)
                    doc = nlp.make_doc(text)
                    ex = Example.from_dict(doc, annotations)
                    # Update the model
                    nlp.update([ex], sgd=optimizer, losses=losses, drop=0.3)
            print("Losses ({}/{})".format(epoch + 1, epochs), losses)

            overall_losses.append(losses)
    return nlp, overall_losses[-1]


def cross_val(spacy_df, epochs, pipe_exceptions=pipe_exceptions, num_evals=10):
    loo = LeaveOneOut()
    cnt = 0
    nlps = []
    losses = []
    test_eval_metrics = []
    start = time.time()
    for train_index, test_index in loo.split(spacy_df):
        if cnt > num_evals:
            break
        X_train = [x for i, x in enumerate(spacy_df) if i not in test_index]
        X_test = [spacy_df[test_index[0]]]
        
        print('Model instantiation\n')
        nlp = spacy.load("en_core_web_lg")
        ner = nlp.get_pipe("ner")

        for _, annotations in X_train:
            for ent in annotations.get("entities"):
                ner.add_label(ent[2])

        other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
        
        print('Training Loop : {}'.format(cnt + 1))
        nlp, loss = train_spacy_model(nlp, other_pipes, X_train, epochs)
        print('final loss: ', loss)
        
        # evaluation on test set
        test_eval_metric = evaluate(nlp, X_test)
        print('test metrics: ', test_eval_metric)
        
        nlps.append(nlp)
        losses.append(loss)
        test_eval_metrics.append(test_eval_metric)
        cnt += 1
        print('time taken: {}'.format(time.time() - start))
        print('\nXXXXXXXXX\n')
    return nlps, losses, test_eval_metrics

In [26]:
%%time
print('Training Begins...\n')
nlps, losses, test_eval_metrics = cross_val(spacy_df, epochs=50, num_evals=10)

Training Begins...

Model instantiation

Training Loop : 1
Losses (1/50) {'ner': 455.667057522839}
Losses (2/50) {'ner': 741.5296075648171}
Losses (3/50) {'ner': 456.570073298722}
Losses (4/50) {'ner': 227.11811829042102}
Losses (5/50) {'ner': 181.2081088586001}
Losses (6/50) {'ner': 186.0459412480571}
Losses (7/50) {'ner': 117.4174967823696}
Losses (8/50) {'ner': 110.53142329123787}
Losses (9/50) {'ner': 79.11333160062871}
Losses (10/50) {'ner': 97.98347648066097}
Losses (11/50) {'ner': 88.84846267244208}
Losses (12/50) {'ner': 56.93974718336851}
Losses (13/50) {'ner': 74.50347032708991}
Losses (14/50) {'ner': 65.63352601655396}
Losses (15/50) {'ner': 39.96174946544512}
Losses (16/50) {'ner': 37.40590125565213}
Losses (17/50) {'ner': 106.00054680152435}
Losses (18/50) {'ner': 46.70422535125666}
Losses (19/50) {'ner': 39.21400689052706}
Losses (20/50) {'ner': 37.59950011699613}
Losses (21/50) {'ner': 30.18409829745803}
Losses (22/50) {'ner': 59.35291658944035}
Losses (23/50) {'ner': 24

Losses (15/50) {'ner': 51.27409966114628}
Losses (16/50) {'ner': 61.25571166837362}
Losses (17/50) {'ner': 68.23742915854206}
Losses (18/50) {'ner': 53.097624605755016}
Losses (19/50) {'ner': 36.038190105497065}
Losses (20/50) {'ner': 31.360904458988355}
Losses (21/50) {'ner': 41.93411175237115}
Losses (22/50) {'ner': 34.267241475008845}
Losses (23/50) {'ner': 29.223945708913963}
Losses (24/50) {'ner': 34.40556635198311}
Losses (25/50) {'ner': 35.333258753802454}
Losses (26/50) {'ner': 33.84417025612629}
Losses (27/50) {'ner': 56.11097774581406}
Losses (28/50) {'ner': 26.839122420006284}
Losses (29/50) {'ner': 40.74904427719278}
Losses (30/50) {'ner': 37.66504127334491}
Losses (31/50) {'ner': 44.037696409754965}
Losses (32/50) {'ner': 32.04673041046788}
Losses (33/50) {'ner': 32.36273056861725}
Losses (34/50) {'ner': 25.276302871300583}
Losses (35/50) {'ner': 26.866953923853295}
Losses (36/50) {'ner': 36.72776589783007}
Losses (37/50) {'ner': 22.417257431339436}
Losses (38/50) {'ner': 

Losses (28/50) {'ner': 37.49977325242533}
Losses (29/50) {'ner': 57.90144826187794}
Losses (30/50) {'ner': 54.42428903899563}
Losses (31/50) {'ner': 42.845764172666904}
Losses (32/50) {'ner': 28.59525468677697}
Losses (33/50) {'ner': 39.90007421352513}
Losses (34/50) {'ner': 34.73126976741644}
Losses (35/50) {'ner': 25.182112354077255}
Losses (36/50) {'ner': 29.455407713713807}
Losses (37/50) {'ner': 30.231883408372507}
Losses (38/50) {'ner': 27.42094649845676}
Losses (39/50) {'ner': 22.019005664525082}
Losses (40/50) {'ner': 25.334188507835684}
Losses (41/50) {'ner': 43.10863948646541}
Losses (42/50) {'ner': 40.38405749123197}
Losses (43/50) {'ner': 27.84715623525123}
Losses (44/50) {'ner': 40.42559535160814}
Losses (45/50) {'ner': 36.49871629775001}
Losses (46/50) {'ner': 38.83127087194581}
Losses (47/50) {'ner': 34.598245798669566}
Losses (48/50) {'ner': 22.650398048742076}
Losses (49/50) {'ner': 43.557711509068724}
Losses (50/50) {'ner': 28.178007650354882}
final loss:  {'ner': 28.

Losses (41/50) {'ner': 20.102469723626143}
Losses (42/50) {'ner': 22.522330598451365}
Losses (43/50) {'ner': 24.00248281691108}
Losses (44/50) {'ner': 18.204749406594626}
Losses (45/50) {'ner': 21.100572618700394}
Losses (46/50) {'ner': 24.784985576473467}
Losses (47/50) {'ner': 49.913118532176156}
Losses (48/50) {'ner': 17.072393444128377}
Losses (49/50) {'ner': 30.30205692629643}
Losses (50/50) {'ner': 22.361713695643136}
final loss:  {'ner': 22.361713695643136}
test metrics:  {'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.6, 'ents_r': 0.5, 'ents_f': 0.5454545454545454, 'ents_per_type': {'NUM_PAYMENTS': {'p': 1.0, 'r': 1.0, 'f': 1.0}, 'PAYMENT_DATE': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'DATE': {'p': 1.0, 'r': 1.0, 'f': 1.0}, 'PAYMENT_AMOUNT': {'p': 0.0, 'r': 0.0, 'f': 0.0}}}
time taken: 1382.0181481838226

XXXXXXXXX

Model instantiation

Training Loop : 11
Losses (1/50) {'ner': 425.5082741729374}
Losses (2/50) {'ner': 756.5349874952275}
Losses (3/50) {'ner

In [27]:
len(nlps), len(losses), len(test_eval_metrics)

(11, 11, 11)

In [30]:
losses

[{'ner': 41.87135923200714},
 {'ner': 27.10235894175824},
 {'ner': 29.52039819118861},
 {'ner': 15.149938515304576},
 {'ner': 27.315871089080584},
 {'ner': 25.553136613272546},
 {'ner': 28.178007650354882},
 {'ner': 37.34293901327665},
 {'ner': 24.32594618982242},
 {'ner': 22.361713695643136},
 {'ner': 20.913982963928987}]

In [32]:
test_eval_metrics = pd.DataFrame(test_eval_metrics)
print(test_eval_metrics.shape)

(11, 8)


In [43]:
test_eval_metrics

Unnamed: 0,token_acc,token_p,token_r,token_f,ents_p,ents_r,ents_f,ents_per_type
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"{'DATE': {'p': 1.0, 'r': 1.0, 'f': 1.0}, 'NUM_..."
1,1.0,1.0,1.0,1.0,0.75,1.0,0.857143,"{'DATE': {'p': 1.0, 'r': 1.0, 'f': 1.0}, 'NUM_..."
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"{'DATE': {'p': 1.0, 'r': 1.0, 'f': 1.0}, 'NUM_..."
3,1.0,1.0,1.0,1.0,0.625,0.625,0.625,"{'DATE': {'p': 1.0, 'r': 1.0, 'f': 1.0}, 'NUM_..."
4,1.0,1.0,1.0,1.0,0.75,1.0,0.857143,"{'DATE': {'p': 1.0, 'r': 1.0, 'f': 1.0}, 'NUM_..."
5,1.0,1.0,1.0,1.0,0.333333,0.5,0.4,"{'DATE': {'p': 0.5, 'r': 1.0, 'f': 0.666666666..."
6,1.0,1.0,1.0,1.0,0.75,1.0,0.857143,"{'DATE': {'p': 1.0, 'r': 1.0, 'f': 1.0}, 'NUM_..."
7,1.0,1.0,1.0,1.0,0.5,0.4,0.444444,"{'DATE': {'p': 0.5, 'r': 0.5, 'f': 0.5}, 'PAYM..."
8,1.0,1.0,1.0,1.0,0.4,0.666667,0.5,"{'PAYMENT_AMOUNT': {'p': 0.5, 'r': 1.0, 'f': 0..."
9,1.0,1.0,1.0,1.0,0.6,0.5,0.545455,"{'NUM_PAYMENTS': {'p': 1.0, 'r': 1.0, 'f': 1.0..."


In [42]:
test_eval_metrics['ents_per_type'][10]

{'DATE': {'p': 1.0, 'r': 1.0, 'f': 1.0},
 'NUM_PAYMENTS': {'p': 0.5, 'r': 1.0, 'f': 0.6666666666666666},
 'PAYMENT_AMOUNT': {'p': 1.0, 'r': 1.0, 'f': 1.0},
 'PAYMENT_DATE': {'p': 1.0, 'r': 1.0, 'f': 1.0}}

In [34]:
print('Average Precision across 11 runs: ', test_eval_metrics['ents_p'].mean())
print('Average Recall across 11 runs: ', test_eval_metrics['ents_r'].mean())
print('Average F-Score across 11 runs: ', test_eval_metrics['ents_f'].mean())

Average Precision across 11 runs:  0.6825757575757575
Average Recall across 11 runs:  0.7901515151515152
Average F-Score across 11 runs:  0.7250196772924046


## Experiment 2 - Flair

In [18]:
# data preparation
def _tokenize(text, case_sensitive): 
      
    if not case_sensitive: 
        text = text.lower() 

    words = re.sub( 
         # there is a space or an end of a string after it 
         r"[^\w#@&]+(?=\s|$)|" 
         # there is a space or beginning of a string before it 
         # not followed by a number 
         r"(\s|^)[^\w#@&]+(?=[^0-9\s])|" 
         # not in between numbers and not . or @ or & or - or # 
         # e.g. 10'000.00 or blabla@gmail.com 
         # and not url characters 
         r"(?<=[^0-9\s])[^\w._~:/?#\[\]()@!$&*+,;=-]+(?=[^0-9\s])", 
         " ", 
         text, 
    ).split() 

    return words


def _format_input(example, case_sensitive):
    inp, entities = example.get('text'), example.get('entities')
    if entities:
        texts = [] 
        new_start = 0
        for entity in entities: 
            start, end = entity['start'], entity['end'] 
            other_text_tokens = [(x, 'O') for x in
                                 _tokenize(inp[new_start:start],
                                                case_sensitive)] 
            if other_text_tokens: 
                texts += other_text_tokens
            text_tokens = []
            for token in _tokenize(entity['value'], case_sensitive):
                text_tokens.append((token, entity['entity']))
            texts += text_tokens
            #value = entity['value'].replace(' ', '_')
            #texts += [(value, entity['entity'])] 
            new_start = end
        other_text_tokens = [(x, 'O') for x in _tokenize(inp[new_start:], case_sensitive)]
        if other_text_tokens:
            texts += other_text_tokens
    else:
        texts = [(x, 'O') for x in _tokenize(inp, case_sensitive)]

    return texts


def prepare_data_for_flair(row):
    out = []
    for ent in row['entity']:
        ent_n = {'start': ent['span'][0],
                 'end': ent['span'][-1],
                 'value': ent['string'],
                 'entity': ent['type']}
        out.append(ent_n)
    return {'text': row['text'], 'entities': out}


def _save_data_for_training(examples, file_name, model_dir, case_sensitive=True):
        
    file_name = os.path.join(model_dir, file_name)
    fn = open(file_name, 'w')
    for example in examples:
        example_prep = prepare_data_for_flair(example)
        out = _format_input(example_prep, case_sensitive)
        fn.write('\n'.join(str(x[0])+' '+str(x[1]) for x in out))
        fn.write('\n\n')
    fn.close()

In [19]:
FLAIR_FT_MODEL_DIR = os.path.join(MODEL_DIR, "flair_ft_v1")
FLAIR_MODEL_DIR = os.path.join(MODEL_DIR, "flair_v1")
_save_data_for_training(df_pre, 'train.txt', FLAIR_FT_MODEL_DIR, True)

In [20]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

# define columns
columns = {0: 'text', 1: 'ner'}

# this is the folder in which train, test and dev files reside
data_folder = FLAIR_FT_MODEL_DIR

# init a corpus using column format, data folder and the names of the train, dev and test files
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file='train.txt')

2021-06-20 20:12:28,512 Reading data from /Users/varunnathan/Documents/General/ExternalTest/Prodigal/model/flair_ft_v1
2021-06-20 20:12:28,513 Train: /Users/varunnathan/Documents/General/ExternalTest/Prodigal/model/flair_ft_v1/train.txt
2021-06-20 20:12:28,513 Dev: None
2021-06-20 20:12:28,514 Test: None


In [21]:
print(len(corpus.train))
print(corpus.train[0].to_tagged_string('ner'))

41
nine seven three and then let us just go over with this arrangement and then we will be completely set please state mister is december <DATE> twenty <DATE> first <DATE> two <DATE> thousand <DATE> twenty <DATE> authorize authorized one <NUM_PAYMENTS> time <NUM_PAYMENTS> transaction and of twelve <PAYMENT_AMOUNT> dollar <PAYMENT_AMOUNT> and <PAYMENT_AMOUNT> ninety <PAYMENT_AMOUNT> seven <PAYMENT_AMOUNT> cents <PAYMENT_AMOUNT> be address from your card will be processed as an electronic and solutions and your stevens if you just need to cause it has recording shipping is your electronic signature can please speak your full name ai i agree


In [33]:
import flair
from flair.embeddings import (TokenEmbeddings,StackedEmbeddings, FlairEmbeddings)
from flair.trainers import ModelTrainer
from flair.models import SequenceTagger
import torch


def train(corpus, model_dir, fine_tuning=True, config={"use_crf": True,
                "hidden_size": 256,
                "learning_rate": 0.1,
                "mini_batch_size": 32,
                "max_epochs": 75}):

    flair.device = torch.device('cpu')
    
    print('create tag dictionary')
    tag_type = 'ner'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    if not fine_tuning:
        print('training from scratch\n')
        
        print('Embeddings init')
        embedding_types: List[TokenEmbeddings] = [
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
        ]

        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)

        print('Sequence tagger init')
        tagger: SequenceTagger = SequenceTagger(hidden_size=config['hidden_size'],
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=config['use_crf'])
    else:
        print('fine-tuning\n')
        tagger: SequenceTagger = SequenceTagger.load('ner')

    print('Model Trainer')
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    print('Training begins')
    trainer.train(model_dir, learning_rate=config['learning_rate'],
                  mini_batch_size=config['mini_batch_size'],
                  max_epochs=config['max_epochs'])

    print('load model')
    model = SequenceTagger.load(os.path.join(model_dir, 'final-model.pt'))
    
    return model

In [74]:
from flair.data import Sentence
import copy
flair.device = torch.device('cpu')


def combine_contiguous_words(entity_n, label_k='pred_label', start_pos_k='start_pos',
                             end_pos_k='end_pos', text_k='text', score_k='pred_score'):
    out = []
    for i, ent in enumerate(entity_n[1:]):
        if i == 0:
            prev_ent = entity_n[0]
        else:
            if out:
                prev_ent = out[-1]
            else:
                prev_ent = entity_n[i-1]
        if ((prev_ent[label_k] == ent[label_k])
            and (prev_ent[end_pos_k] == ent[start_pos_k] - 1)):
            d = {text_k: prev_ent[text_k] + ' ' + ent[text_k],
                 start_pos_k: prev_ent[start_pos_k], end_pos_k: ent[end_pos_k],
                 label_k: prev_ent[label_k],
                 score_k: min([prev_ent[score_k], ent[score_k]])}
            if i == 0:
                out.append(d)
            else:
                out[-1] = d
        else:
            out.append(ent)
    return out


def predict(model, text):
    text_pre = preprocess(text)
    sentence = Sentence(text_pre)
    model.predict(sentence)
    out = sentence.to_dict(tag_type='ner')
    entity = out.get('entities', [])
    entity_n = []
    for row in entity:
        row1 = copy.deepcopy(row)
        row1['pred_label'] = row['labels'][0]._value
        row1['pred_score'] = row['labels'][0]._score
        entity_n.append(row1)
    
    # combine contiguous parts
    out = combine_contiguous_words(entity_n)
    
    return entity_n, out

In [35]:
%%time
config = {"use_crf": True, "hidden_size": 100, "learning_rate": 0.1, 
          "mini_batch_size": 8, "max_epochs": 75}
model = train(corpus, FLAIR_FT_MODEL_DIR, fine_tuning=True, config=config)

create tag dictionary
fine-tuning

2021-06-20 21:22:46,095 loading file /Users/varunnathan/.flair/models/en-ner-conll03-v0.4.pt
Model Trainer
Training begins
2021-06-20 21:22:49,359 ----------------------------------------------------------------------------------------------------
2021-06-20 21:22:49,362 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('/home/aakbik/.flair/embeddings/glove.gensim')
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
  )


2021-06-20 21:24:06,961 epoch 6 - iter 4/6 - loss 19.28216553 - samples/sec: 4.86 - lr: 0.100000
2021-06-20 21:24:09,067 epoch 6 - iter 5/6 - loss 18.70411987 - samples/sec: 3.80 - lr: 0.100000
2021-06-20 21:24:09,393 epoch 6 - iter 6/6 - loss 16.83625793 - samples/sec: 24.61 - lr: 0.100000
2021-06-20 21:24:09,394 ----------------------------------------------------------------------------------------------------
2021-06-20 21:24:09,395 EPOCH 6 done: loss 16.8363 - lr 0.1000000
2021-06-20 21:24:09,751 DEV : loss 6.406829833984375 - score 0.0
2021-06-20 21:24:09,753 BAD EPOCHS (no improvement): 1
2021-06-20 21:24:09,754 ----------------------------------------------------------------------------------------------------
2021-06-20 21:24:12,042 epoch 7 - iter 1/6 - loss 9.27673340 - samples/sec: 3.50 - lr: 0.100000
2021-06-20 21:24:14,472 epoch 7 - iter 2/6 - loss 13.48899841 - samples/sec: 3.29 - lr: 0.100000
2021-06-20 21:24:16,323 epoch 7 - iter 3/6 - loss 19.89616394 - samples/sec: 4.

2021-06-20 21:25:33,400 epoch 14 - iter 4/6 - loss 7.19572067 - samples/sec: 4.03 - lr: 0.025000
2021-06-20 21:25:35,468 epoch 14 - iter 5/6 - loss 7.56903687 - samples/sec: 3.87 - lr: 0.025000
2021-06-20 21:25:35,815 epoch 14 - iter 6/6 - loss 9.77489726 - samples/sec: 23.12 - lr: 0.025000
2021-06-20 21:25:35,816 ----------------------------------------------------------------------------------------------------
2021-06-20 21:25:35,817 EPOCH 14 done: loss 9.7749 - lr 0.0250000
2021-06-20 21:25:36,174 DEV : loss 3.838226318359375 - score 0.0
2021-06-20 21:25:36,176 BAD EPOCHS (no improvement): 0
saving best model
2021-06-20 21:25:38,716 ----------------------------------------------------------------------------------------------------
2021-06-20 21:25:40,943 epoch 15 - iter 1/6 - loss 6.75820923 - samples/sec: 3.59 - lr: 0.025000
2021-06-20 21:25:42,586 epoch 15 - iter 2/6 - loss 6.39347839 - samples/sec: 4.87 - lr: 0.025000
2021-06-20 21:25:44,520 epoch 15 - iter 3/6 - loss 7.0668233

2021-06-20 21:27:01,465 epoch 22 - iter 4/6 - loss 6.63114166 - samples/sec: 3.65 - lr: 0.025000
2021-06-20 21:27:03,317 epoch 22 - iter 5/6 - loss 6.86806335 - samples/sec: 4.32 - lr: 0.025000
2021-06-20 21:27:03,677 epoch 22 - iter 6/6 - loss 6.08947500 - samples/sec: 22.23 - lr: 0.025000
2021-06-20 21:27:03,678 ----------------------------------------------------------------------------------------------------
2021-06-20 21:27:03,679 EPOCH 22 done: loss 6.0895 - lr 0.0250000
2021-06-20 21:27:04,040 DEV : loss 4.98663330078125 - score 0.0
Epoch    22: reducing learning rate of group 0 to 1.2500e-02.
2021-06-20 21:27:04,042 BAD EPOCHS (no improvement): 4
2021-06-20 21:27:04,043 ----------------------------------------------------------------------------------------------------
2021-06-20 21:27:06,041 epoch 23 - iter 1/6 - loss 7.31309509 - samples/sec: 4.01 - lr: 0.012500
2021-06-20 21:27:08,320 epoch 23 - iter 2/6 - loss 7.21899796 - samples/sec: 3.51 - lr: 0.012500
2021-06-20 21:27:

2021-06-20 21:28:23,723 epoch 30 - iter 3/6 - loss 6.73493958 - samples/sec: 5.00 - lr: 0.006250
2021-06-20 21:28:25,639 epoch 30 - iter 4/6 - loss 6.71533966 - samples/sec: 4.18 - lr: 0.006250
2021-06-20 21:28:27,317 epoch 30 - iter 5/6 - loss 6.07390442 - samples/sec: 4.77 - lr: 0.006250
2021-06-20 21:28:27,581 epoch 30 - iter 6/6 - loss 6.46247609 - samples/sec: 30.48 - lr: 0.006250
2021-06-20 21:28:27,582 ----------------------------------------------------------------------------------------------------
2021-06-20 21:28:27,582 EPOCH 30 done: loss 6.4625 - lr 0.0062500
2021-06-20 21:28:27,942 DEV : loss 3.510650634765625 - score 0.0
2021-06-20 21:28:27,944 BAD EPOCHS (no improvement): 3
2021-06-20 21:28:27,945 ----------------------------------------------------------------------------------------------------
2021-06-20 21:28:29,603 epoch 31 - iter 1/6 - loss 4.28936768 - samples/sec: 4.83 - lr: 0.006250
2021-06-20 21:28:31,728 epoch 31 - iter 2/6 - loss 5.86947632 - samples/sec: 3

2021-06-20 21:29:51,970 epoch 38 - iter 3/6 - loss 5.23704020 - samples/sec: 4.00 - lr: 0.003125
2021-06-20 21:29:53,561 epoch 38 - iter 4/6 - loss 5.18378067 - samples/sec: 5.03 - lr: 0.003125
2021-06-20 21:29:55,568 epoch 38 - iter 5/6 - loss 5.46337585 - samples/sec: 3.99 - lr: 0.003125
2021-06-20 21:29:55,838 epoch 38 - iter 6/6 - loss 5.80941518 - samples/sec: 29.73 - lr: 0.003125
2021-06-20 21:29:55,839 ----------------------------------------------------------------------------------------------------
2021-06-20 21:29:55,840 EPOCH 38 done: loss 5.8094 - lr 0.0031250
2021-06-20 21:29:56,196 DEV : loss 3.452545166015625 - score 0.0
2021-06-20 21:29:56,198 BAD EPOCHS (no improvement): 1
2021-06-20 21:29:56,198 ----------------------------------------------------------------------------------------------------
2021-06-20 21:29:58,243 epoch 39 - iter 1/6 - loss 4.82949829 - samples/sec: 3.92 - lr: 0.003125
2021-06-20 21:29:59,847 epoch 39 - iter 2/6 - loss 4.91387939 - samples/sec: 4

2021-06-20 21:31:12,713 epoch 46 - iter 3/6 - loss 5.26794434 - samples/sec: 5.05 - lr: 0.000781
2021-06-20 21:31:14,793 epoch 46 - iter 4/6 - loss 5.83195114 - samples/sec: 3.85 - lr: 0.000781
2021-06-20 21:31:16,849 epoch 46 - iter 5/6 - loss 6.28192444 - samples/sec: 3.89 - lr: 0.000781
2021-06-20 21:31:17,277 epoch 46 - iter 6/6 - loss 5.73929087 - samples/sec: 18.80 - lr: 0.000781
2021-06-20 21:31:17,278 ----------------------------------------------------------------------------------------------------
2021-06-20 21:31:17,279 EPOCH 46 done: loss 5.7393 - lr 0.0007813
2021-06-20 21:31:17,672 DEV : loss 3.328399658203125 - score 0.0
2021-06-20 21:31:17,674 BAD EPOCHS (no improvement): 1
2021-06-20 21:31:17,675 ----------------------------------------------------------------------------------------------------
2021-06-20 21:31:19,773 epoch 47 - iter 1/6 - loss 6.28884888 - samples/sec: 3.82 - lr: 0.000781
2021-06-20 21:31:21,490 epoch 47 - iter 2/6 - loss 5.91800690 - samples/sec: 4

2021-06-20 21:32:37,281 epoch 54 - iter 3/6 - loss 5.30440776 - samples/sec: 3.70 - lr: 0.000195
2021-06-20 21:32:38,783 epoch 54 - iter 4/6 - loss 6.16687393 - samples/sec: 5.33 - lr: 0.000195
2021-06-20 21:32:40,397 epoch 54 - iter 5/6 - loss 5.63813477 - samples/sec: 4.96 - lr: 0.000195
2021-06-20 21:32:40,782 epoch 54 - iter 6/6 - loss 5.43918864 - samples/sec: 20.80 - lr: 0.000195
2021-06-20 21:32:40,783 ----------------------------------------------------------------------------------------------------
2021-06-20 21:32:40,784 EPOCH 54 done: loss 5.4392 - lr 0.0001953
2021-06-20 21:32:41,143 DEV : loss 3.343048095703125 - score 0.0
2021-06-20 21:32:41,145 BAD EPOCHS (no improvement): 1
2021-06-20 21:32:41,146 ----------------------------------------------------------------------------------------------------
2021-06-20 21:32:42,750 epoch 55 - iter 1/6 - loss 3.80401611 - samples/sec: 4.99 - lr: 0.000195
2021-06-20 21:32:44,739 epoch 55 - iter 2/6 - loss 5.90249634 - samples/sec: 4

In [36]:
text = "let's review the arrangement that he all have set up today today's date \
is monday december the twenty first and you have authorized the total i of two debit \
transactions in the amount of one hundred dollars should be taken from your debit card \
on today's date monday the twenty first and on january the sixteenth you understand that \
the payment you are authorizing will be processed as electronic service to your \
account and you can send to this recording that trying to signature for this payment \
arrangement please state your name"
predict(model, text)

contraction


[{'text': 'december',
  'start_pos': 83,
  'end_pos': 91,
  'labels': [<unk> (0.9993)]},
 {'text': 'the', 'start_pos': 92, 'end_pos': 95, 'labels': [<unk> (0.9804)]},
 {'text': 'twenty', 'start_pos': 96, 'end_pos': 102, 'labels': [<unk> (1.0)]},
 {'text': 'first',
  'start_pos': 103,
  'end_pos': 108,
  'labels': [<unk> (0.9987)]},
 {'text': 'one', 'start_pos': 188, 'end_pos': 191, 'labels': [<unk> (0.9673)]},
 {'text': 'hundred',
  'start_pos': 192,
  'end_pos': 199,
  'labels': [<unk> (1.0)]},
 {'text': 'dollars',
  'start_pos': 200,
  'end_pos': 207,
  'labels': [<unk> (1.0)]},
 {'text': 'twenty',
  'start_pos': 272,
  'end_pos': 278,
  'labels': [<unk> (0.9989)]},
 {'text': 'first',
  'start_pos': 279,
  'end_pos': 284,
  'labels': [<unk> (0.9988)]},
 {'text': 'january',
  'start_pos': 292,
  'end_pos': 299,
  'labels': [<unk> (0.9985)]},
 {'text': 'the', 'start_pos': 300, 'end_pos': 303, 'labels': [<unk> (0.9435)]},
 {'text': 'sixteenth',
  'start_pos': 304,
  'end_pos': 313,
  'l

In [37]:
%%time
config = {"use_crf": True, "hidden_size": 50, "learning_rate": 0.1, 
          "mini_batch_size": 8, "max_epochs": 75}
model1 = train(corpus, FLAIR_MODEL_DIR, fine_tuning=False, config=config)

create tag dictionary
training from scratch

Embeddings init
2021-06-20 21:41:05,242 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/news-forward-0.4.1.pt not found in cache, downloading to /var/folders/r7/l1g4z9lj3xj_hznsvvcdkzzh0000gn/T/tmpi_kkdwj0


100%|██████████| 73034624/73034624 [00:07<00:00, 9586994.37B/s] 

2021-06-20 21:41:13,530 copying /var/folders/r7/l1g4z9lj3xj_hznsvvcdkzzh0000gn/T/tmpi_kkdwj0 to cache at /Users/varunnathan/.flair/embeddings/news-forward-0.4.1.pt
2021-06-20 21:41:13,577 removing temp file /var/folders/r7/l1g4z9lj3xj_hznsvvcdkzzh0000gn/T/tmpi_kkdwj0





2021-06-20 21:41:14,497 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/news-backward-0.4.1.pt not found in cache, downloading to /var/folders/r7/l1g4z9lj3xj_hznsvvcdkzzh0000gn/T/tmp9lhlrglq


100%|██████████| 73034575/73034575 [00:07<00:00, 9213175.41B/s] 

2021-06-20 21:41:23,123 copying /var/folders/r7/l1g4z9lj3xj_hznsvvcdkzzh0000gn/T/tmp9lhlrglq to cache at /Users/varunnathan/.flair/embeddings/news-backward-0.4.1.pt
2021-06-20 21:41:23,153 removing temp file /var/folders/r7/l1g4z9lj3xj_hznsvvcdkzzh0000gn/T/tmp9lhlrglq





Sequence tagger init
Model Trainer
Training begins
2021-06-20 21:41:23,440 ----------------------------------------------------------------------------------------------------
2021-06-20 21:41:23,441 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4096, out_features=4096, bias=True)
  (rnn): LSTM(4096, 50, batch_first=True,

2021-06-20 21:43:54,911 epoch 6 - iter 6/6 - loss 22.31955274 - samples/sec: 30.37 - lr: 0.100000
2021-06-20 21:43:54,912 ----------------------------------------------------------------------------------------------------
2021-06-20 21:43:54,913 EPOCH 6 done: loss 22.3196 - lr 0.1000000
2021-06-20 21:43:55,238 DEV : loss 18.203536987304688 - score 0.9368
2021-06-20 21:43:55,240 BAD EPOCHS (no improvement): 0
saving best model
2021-06-20 21:43:55,479 ----------------------------------------------------------------------------------------------------
2021-06-20 21:43:56,993 epoch 7 - iter 1/6 - loss 13.47252655 - samples/sec: 5.29 - lr: 0.100000
2021-06-20 21:43:58,255 epoch 7 - iter 2/6 - loss 16.32611465 - samples/sec: 6.34 - lr: 0.100000
2021-06-20 21:43:59,582 epoch 7 - iter 3/6 - loss 16.23280970 - samples/sec: 6.04 - lr: 0.100000
2021-06-20 21:44:00,507 epoch 7 - iter 4/6 - loss 16.54917526 - samples/sec: 8.65 - lr: 0.100000
2021-06-20 21:44:01,778 epoch 7 - iter 5/6 - loss 17.286

2021-06-20 21:44:47,297 epoch 14 - iter 6/6 - loss 7.68620555 - samples/sec: 32.17 - lr: 0.100000
2021-06-20 21:44:47,298 ----------------------------------------------------------------------------------------------------
2021-06-20 21:44:47,298 EPOCH 14 done: loss 7.6862 - lr 0.1000000
2021-06-20 21:44:47,592 DEV : loss 6.638580322265625 - score 0.9782
2021-06-20 21:44:47,594 BAD EPOCHS (no improvement): 0
saving best model
2021-06-20 21:44:47,821 ----------------------------------------------------------------------------------------------------
2021-06-20 21:44:49,090 epoch 15 - iter 1/6 - loss 11.48314667 - samples/sec: 6.31 - lr: 0.100000
2021-06-20 21:44:50,196 epoch 15 - iter 2/6 - loss 10.72390366 - samples/sec: 7.24 - lr: 0.100000
2021-06-20 21:44:51,351 epoch 15 - iter 3/6 - loss 9.68699137 - samples/sec: 6.93 - lr: 0.100000
2021-06-20 21:44:52,381 epoch 15 - iter 4/6 - loss 8.83640289 - samples/sec: 7.77 - lr: 0.100000
2021-06-20 21:44:53,271 epoch 15 - iter 5/6 - loss 8.43

2021-06-20 21:45:38,603 epoch 22 - iter 6/6 - loss 5.70087687 - samples/sec: 29.11 - lr: 0.100000
2021-06-20 21:45:38,604 ----------------------------------------------------------------------------------------------------
2021-06-20 21:45:38,605 EPOCH 22 done: loss 5.7009 - lr 0.1000000
2021-06-20 21:45:38,892 DEV : loss 4.4131317138671875 - score 0.9869
2021-06-20 21:45:38,894 BAD EPOCHS (no improvement): 0
saving best model
2021-06-20 21:45:39,131 ----------------------------------------------------------------------------------------------------
2021-06-20 21:45:40,690 epoch 23 - iter 1/6 - loss 6.30076981 - samples/sec: 5.13 - lr: 0.100000
2021-06-20 21:45:41,687 epoch 23 - iter 2/6 - loss 7.02824593 - samples/sec: 8.03 - lr: 0.100000
2021-06-20 21:45:42,730 epoch 23 - iter 3/6 - loss 5.70746740 - samples/sec: 7.68 - lr: 0.100000
2021-06-20 21:45:43,828 epoch 23 - iter 4/6 - loss 5.44606113 - samples/sec: 7.29 - lr: 0.100000
2021-06-20 21:45:44,998 epoch 23 - iter 5/6 - loss 5.597

2021-06-20 21:46:30,332 epoch 30 - iter 6/6 - loss 4.76271375 - samples/sec: 19.16 - lr: 0.050000
2021-06-20 21:46:30,333 ----------------------------------------------------------------------------------------------------
2021-06-20 21:46:30,334 EPOCH 30 done: loss 4.7627 - lr 0.0500000
2021-06-20 21:46:30,727 DEV : loss 3.1699676513671875 - score 0.9891
2021-06-20 21:46:30,730 BAD EPOCHS (no improvement): 0
saving best model
2021-06-20 21:46:31,037 ----------------------------------------------------------------------------------------------------
2021-06-20 21:46:32,184 epoch 31 - iter 1/6 - loss 2.58766174 - samples/sec: 6.98 - lr: 0.050000
2021-06-20 21:46:33,564 epoch 31 - iter 2/6 - loss 3.73662567 - samples/sec: 5.80 - lr: 0.050000
2021-06-20 21:46:34,710 epoch 31 - iter 3/6 - loss 3.95170339 - samples/sec: 7.00 - lr: 0.050000
2021-06-20 21:46:36,392 epoch 31 - iter 4/6 - loss 3.56549263 - samples/sec: 4.76 - lr: 0.050000
2021-06-20 21:46:37,871 epoch 31 - iter 5/6 - loss 3.446

2021-06-20 21:47:22,192 epoch 38 - iter 6/6 - loss 2.65050507 - samples/sec: 28.36 - lr: 0.025000
2021-06-20 21:47:22,193 ----------------------------------------------------------------------------------------------------
2021-06-20 21:47:22,193 EPOCH 38 done: loss 2.6505 - lr 0.0250000
2021-06-20 21:47:22,495 DEV : loss 3.3042449951171875 - score 0.9891
2021-06-20 21:47:22,498 BAD EPOCHS (no improvement): 3
2021-06-20 21:47:22,499 ----------------------------------------------------------------------------------------------------
2021-06-20 21:47:23,689 epoch 39 - iter 1/6 - loss 2.75505066 - samples/sec: 6.73 - lr: 0.025000
2021-06-20 21:47:24,686 epoch 39 - iter 2/6 - loss 3.60279083 - samples/sec: 8.03 - lr: 0.025000
2021-06-20 21:47:25,723 epoch 39 - iter 3/6 - loss 3.09437815 - samples/sec: 7.72 - lr: 0.025000
2021-06-20 21:47:26,855 epoch 39 - iter 4/6 - loss 2.63043404 - samples/sec: 7.07 - lr: 0.025000
2021-06-20 21:47:28,071 epoch 39 - iter 5/6 - loss 2.73184204 - samples/se

2021-06-20 21:48:10,359 epoch 46 - iter 5/6 - loss 2.62784424 - samples/sec: 6.83 - lr: 0.006250
2021-06-20 21:48:10,639 epoch 46 - iter 6/6 - loss 2.46308390 - samples/sec: 28.65 - lr: 0.006250
2021-06-20 21:48:10,640 ----------------------------------------------------------------------------------------------------
2021-06-20 21:48:10,641 EPOCH 46 done: loss 2.4631 - lr 0.0062500
2021-06-20 21:48:10,932 DEV : loss 3.3594818115234375 - score 0.9869
2021-06-20 21:48:10,934 BAD EPOCHS (no improvement): 3
2021-06-20 21:48:10,935 ----------------------------------------------------------------------------------------------------
2021-06-20 21:48:12,169 epoch 47 - iter 1/6 - loss 2.92898560 - samples/sec: 6.49 - lr: 0.006250
2021-06-20 21:48:13,137 epoch 47 - iter 2/6 - loss 2.90645981 - samples/sec: 8.28 - lr: 0.006250
2021-06-20 21:48:14,119 epoch 47 - iter 3/6 - loss 3.05461884 - samples/sec: 8.15 - lr: 0.006250
2021-06-20 21:48:15,279 epoch 47 - iter 4/6 - loss 3.10236549 - samples/se

2021-06-20 21:49:00,242 epoch 54 - iter 4/6 - loss 3.25652504 - samples/sec: 8.21 - lr: 0.001563
2021-06-20 21:49:01,185 epoch 54 - iter 5/6 - loss 3.01940460 - samples/sec: 8.49 - lr: 0.001563
2021-06-20 21:49:01,463 epoch 54 - iter 6/6 - loss 2.69608180 - samples/sec: 28.89 - lr: 0.001563
2021-06-20 21:49:01,464 ----------------------------------------------------------------------------------------------------
2021-06-20 21:49:01,464 EPOCH 54 done: loss 2.6961 - lr 0.0015625
2021-06-20 21:49:01,759 DEV : loss 3.32586669921875 - score 0.9869
2021-06-20 21:49:01,761 BAD EPOCHS (no improvement): 3
2021-06-20 21:49:01,762 ----------------------------------------------------------------------------------------------------
2021-06-20 21:49:02,695 epoch 55 - iter 1/6 - loss 1.96430969 - samples/sec: 8.58 - lr: 0.001563
2021-06-20 21:49:03,658 epoch 55 - iter 2/6 - loss 1.77180862 - samples/sec: 8.31 - lr: 0.001563
2021-06-20 21:49:04,885 epoch 55 - iter 3/6 - loss 2.48303986 - samples/sec:

2021-06-20 21:49:47,801 epoch 62 - iter 3/6 - loss 1.93868128 - samples/sec: 6.71 - lr: 0.000391
2021-06-20 21:49:49,031 epoch 62 - iter 4/6 - loss 1.95111370 - samples/sec: 6.51 - lr: 0.000391
2021-06-20 21:49:50,087 epoch 62 - iter 5/6 - loss 2.11245651 - samples/sec: 7.58 - lr: 0.000391
2021-06-20 21:49:50,363 epoch 62 - iter 6/6 - loss 1.96470579 - samples/sec: 29.08 - lr: 0.000391
2021-06-20 21:49:50,364 ----------------------------------------------------------------------------------------------------
2021-06-20 21:49:50,364 EPOCH 62 done: loss 1.9647 - lr 0.0003906
2021-06-20 21:49:50,652 DEV : loss 3.2808074951171875 - score 0.9869
2021-06-20 21:49:50,654 BAD EPOCHS (no improvement): 3
2021-06-20 21:49:50,655 ----------------------------------------------------------------------------------------------------
2021-06-20 21:49:51,845 epoch 63 - iter 1/6 - loss 2.08188629 - samples/sec: 6.73 - lr: 0.000391
2021-06-20 21:49:52,962 epoch 63 - iter 2/6 - loss 1.76362991 - samples/se

In [13]:
import flair
from flair.models import SequenceTagger
import torch
from flair.data import Sentence
import copy
flair.device = torch.device('cpu')

model = SequenceTagger.load(os.path.join(FLAIR_MODEL_DIR, 'final-model.pt'))

2021-06-21 06:59:42,042 loading file /Users/varunnathan/Documents/General/ExternalTest/Prodigal/model/flair_v1/final-model.pt


In [26]:
text = "let's review the arrangement that he all have set up today today's date \
is monday december the twenty first and you have authorized the total i of two debit \
transactions in the amount of one hundred dollars should be taken from your debit card \
on today's date monday the twenty first and on january the sixteenth you understand that \
the payment you are authorizing will be processed as electronic service to your \
account and you can send to this recording that trying to signature for this payment \
arrangement please state your name"
out1, out2 = predict(model, text)

contraction


In [27]:
out1, out2

([{'text': 'december',
   'start_pos': 83,
   'end_pos': 91,
   'labels': [DATE (0.9815)],
   'pred_label': 'DATE',
   'pred_score': 0.9815053343772888},
  {'text': 'the',
   'start_pos': 92,
   'end_pos': 95,
   'labels': [DATE (0.9951)],
   'pred_label': 'DATE',
   'pred_score': 0.9951266050338745},
  {'text': 'twenty',
   'start_pos': 96,
   'end_pos': 102,
   'labels': [DATE (0.9991)],
   'pred_label': 'DATE',
   'pred_score': 0.9991453886032104},
  {'text': 'first',
   'start_pos': 103,
   'end_pos': 108,
   'labels': [DATE (0.9979)],
   'pred_label': 'DATE',
   'pred_score': 0.9979279041290283},
  {'text': 'two',
   'start_pos': 148,
   'end_pos': 151,
   'labels': [NUM_PAYMENTS (0.844)],
   'pred_label': 'NUM_PAYMENTS',
   'pred_score': 0.843999981880188},
  {'text': 'one',
   'start_pos': 188,
   'end_pos': 191,
   'labels': [PAYMENT_AMOUNT (0.989)],
   'pred_label': 'PAYMENT_AMOUNT',
   'pred_score': 0.9890416264533997},
  {'text': 'hundred',
   'start_pos': 192,
   'end_pos':

In [28]:
### free up memory
del model, model1

## Experiment 3 - Bert

In [51]:
MODEL_NAME = 'bert-base-cased'
BERT_MODEL_DIR = os.path.join(MODEL_DIR, "bert_base_cased_v1")

In [52]:
from transformers import AutoConfig, TFAutoModelForTokenClassification

config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=6)
model = TFAutoModelForTokenClassification.from_pretrained(MODEL_NAME, config=config)
model.summary()

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertForTokenClassification: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['dropout_113', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_token_classification_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
_________________________________________________________________
dropout_113 (Dropout)        multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  4614      
Total params: 108,314,886
Trainable params: 108,314,886
Non-trainable params: 0
_________________________________________________________________


In [53]:
def load_data(filename: str):
    with open(filename, 'r') as file:
        lines = [line[:-1].split() for line in file]
    samples, start = [], 0
    for end, parts in enumerate(lines):
        if not parts:
            sample = [(token, tag) for token, tag in lines[start:end]]
            samples.append(sample)
            start = end + 1
    if start < end:
        samples.append(lines[start:end])
    return samples

train_samples = load_data(os.path.join(FLAIR_FT_MODEL_DIR, 'train.txt'))
samples = train_samples
schema = ['_'] + sorted({tag for sentence in samples for _, tag in sentence})

In [54]:
len(samples), samples[0], schema

(50,
 [('nine', 'O'),
  ('seven', 'O'),
  ('three', 'O'),
  ('and', 'O'),
  ('then', 'O'),
  ('let', 'O'),
  ('us', 'O'),
  ('just', 'O'),
  ('go', 'O'),
  ('over', 'O'),
  ('with', 'O'),
  ('this', 'O'),
  ('arrangement', 'O'),
  ('and', 'O'),
  ('then', 'O'),
  ('we', 'O'),
  ('will', 'O'),
  ('be', 'O'),
  ('completely', 'O'),
  ('set', 'O'),
  ('please', 'O'),
  ('state', 'O'),
  ('mister', 'O'),
  ('is', 'O'),
  ('december', 'DATE'),
  ('twenty', 'DATE'),
  ('first', 'DATE'),
  ('two', 'DATE'),
  ('thousand', 'DATE'),
  ('twenty', 'DATE'),
  ('authorize', 'O'),
  ('authorized', 'O'),
  ('one', 'NUM_PAYMENTS'),
  ('time', 'NUM_PAYMENTS'),
  ('transaction', 'O'),
  ('and', 'O'),
  ('of', 'O'),
  ('twelve', 'PAYMENT_AMOUNT'),
  ('dollar', 'PAYMENT_AMOUNT'),
  ('and', 'PAYMENT_AMOUNT'),
  ('ninety', 'PAYMENT_AMOUNT'),
  ('seven', 'PAYMENT_AMOUNT'),
  ('cents', 'PAYMENT_AMOUNT'),
  ('be', 'O'),
  ('address', 'O'),
  ('from', 'O'),
  ('your', 'O'),
  ('card', 'O'),
  ('will', 'O'),
  ('

In [55]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [56]:
tokenizer('recording')['input_ids'], tokenizer('speak')['input_ids']

([101, 2730, 102], [101, 2936, 102])

In [57]:
import numpy as np
import tqdm

def tokenize_sample(sample):
    seq = [
            (subtoken, tag)
            for token, tag in sample
            for subtoken in tokenizer(token)['input_ids'][1:-1]
           ]
    return [(101, 'O')] + seq + [(102, 'O')]


def preprocess_bert(samples, schema):
    tag_index = {tag: i for i, tag in enumerate(schema)}
    tokenized_samples = list(map(tokenize_sample, samples))
    max_len = max(map(len, tokenized_samples))
    X = np.zeros((len(samples), max_len), dtype=np.int32)
    y = np.zeros((len(samples), max_len), dtype=np.int32)
    for i, sentence in enumerate(tokenized_samples):
        for j, (subtoken_id, tag) in enumerate(sentence):
            X[i, j] = subtoken_id
            y[i, j] = tag_index[tag]
    return X, y


%time X_train, y_train = preprocess_bert(train_samples, schema)

CPU times: user 506 ms, sys: 2.4 ms, total: 509 ms
Wall time: 507 ms


In [58]:
X_train.shape, y_train.shape

((50, 191), (50, 191))

In [59]:
X_train[0], len(train_samples[0]), y_train[0]

(array([  101,  2551,  1978,  1210,  1105,  1173,  1519,  1366,  1198,
         1301,  1166,  1114,  1142,  6204,  1105,  1173,  1195,  1209,
         1129,  2423,  1383,  4268,  1352, 12791,  1200,  1110,  1260,
         2093, 10615,  2570,  1148,  1160,  4032,  2570,  2351,  3708,
         9320,  1141,  1159, 13618,  1105,  1104,  4030,  8876,  1105,
        16696,  1978, 18748,  1129,  4134,  1121,  1240,  3621,  1209,
         1129, 14659,  1112,  1126,  4828,  1105,  7995,  1105,  1240,
          188,  1566,  7912,  1116,  1191,  1128,  1198,  1444,  1106,
         2612,  1122,  1144,  2730,  8629,  1110,  1240,  4828,  8250,
         1169,  4268,  2936,  1240,  1554,  1271,   170,  1182,   178,
         5340,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [60]:
tag_index = {tag: i for i, tag in enumerate(schema)}

In [61]:
%%time
# fine-tuning
import tensorflow as tf


EPOCHS = 25
BATCH_SIZE = 8

optimizer = tf.keras.optimizers.Adam(lr=0.00001)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics='accuracy')
history = model.fit(tf.constant(X_train), tf.constant(y_train),
                    validation_split=0.1, epochs=EPOCHS, 
                    batch_size=BATCH_SIZE)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
CPU times: user 1h 20min 58s, sys: 20min 20s, total: 1h 41min 18s
Wall time: 12min 25s


In [62]:
# save
BERT_MODEL_DIR_V2 = os.path.join(MODEL_DIR, "bert_base_cased_v2")
model.save_pretrained(BERT_MODEL_DIR_V2)
tokenizer.save_pretrained(BERT_MODEL_DIR_V2)

('/Users/varunnathan/Documents/General/ExternalTest/Prodigal/model/bert_base_cased_v2/tokenizer_config.json',
 '/Users/varunnathan/Documents/General/ExternalTest/Prodigal/model/bert_base_cased_v2/special_tokens_map.json',
 '/Users/varunnathan/Documents/General/ExternalTest/Prodigal/model/bert_base_cased_v2/vocab.txt',
 '/Users/varunnathan/Documents/General/ExternalTest/Prodigal/model/bert_base_cased_v2/added_tokens.json')

In [63]:
# prediction
def aggregate(sample, predictions, schema):
    results = []
    i = 1
    for token, y_true in sample:
        nr_subtoken = len(tokenizer(token)['input_ids']) - 2
        pred = predictions[i: i+nr_subtoken]
        i += nr_subtoken
        y_pred = schema[np.argmax(np.sum(pred, axis=0))]
        results.append((token, y_true, y_pred))
    return results


y_probs = model.predict(X_train[1])[0]
print(y_probs.shape)
predictions = [aggregate(sample, predictions, schema)
               for sample, predictions in zip([train_samples[1]], y_probs)]

(191, 1, 6)


In [64]:
from transformers import pipeline, TFAutoModelForTokenClassification, AutoTokenizer

model1 = TFAutoModelForTokenClassification.from_pretrained(BERT_MODEL_DIR_V2)
tokenizer1 = AutoTokenizer.from_pretrained(BERT_MODEL_DIR_V2)

ner_model = pipeline('ner', model=model1, tokenizer=tokenizer1)

Some layers from the model checkpoint at /Users/varunnathan/Documents/General/ExternalTest/Prodigal/model/bert_base_cased_v2 were not used when initializing TFBertForTokenClassification: ['dropout_113']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at /Users/varunnathan/Documents/General/ExternalTest/Prodigal/model/bert_base_cased_v2 and are newly initialized: ['dropout_151']
You should probably TRAIN this model on a down-stream task to be able to use it for p

In [76]:
tag_index_inv = {'LABEL_'+str(v): k for k, v in tag_index.items()}


def predict_bert(ner_model, tag_index_inv, text):
    
    # preprocess
    text_pre = preprocess(text)
    
    # prediction
    pred = ner_model(text_pre)
    
    # formatting the prediction
    pred_n = []
    for row in pred:
        if row['entity'] in ['LABEL_'+str(x) for x in [1, 2, 4, 5]]:
            row1 = copy.deepcopy(row)
            row1['entity'] = tag_index_inv[row1['entity']]
            row1['start_pos'] = row1['end_pos'] = row1['index']
            pred_n.append(row1)
    
    # combining contiguous predictions
    out = combine_contiguous_words(pred_n, label_k='entity', start_pos_k='start_pos',
                                   end_pos_k='end_pos', text_k='word', score_k='score')
    
    return pred_n, out

In [81]:
import json
json.dump(tag_index_inv, open(os.path.join(BERT_MODEL_DIR_V2, 'tag_index_inv.json'), 'w'))

In [77]:
%%time
text = "let's review the arrangement that he all have set up today today's date \
is monday december the twenty first and you have authorized the total i of two debit \
transactions in the amount of one hundred dollars should be taken from your debit card \
on today's date monday the twenty first and on january the sixteenth you understand that \
the payment you are authorizing will be processed as electronic service to your \
account and you can send to this recording that trying to signature for this payment \
arrangement please state your name"
out1, out2 = predict_bert(ner_model, tag_index_inv, text)

contraction
CPU times: user 765 ms, sys: 234 ms, total: 1e+03 ms
Wall time: 256 ms


In [78]:
print(out1)
print()
print(out2)

[{'word': 'de', 'score': 0.9796044826507568, 'entity': 'DATE', 'index': 20, 'start_pos': 20, 'end_pos': 20}, {'word': '##ce', 'score': 0.9726102948188782, 'entity': 'DATE', 'index': 21, 'start_pos': 21, 'end_pos': 21}, {'word': '##mber', 'score': 0.9820989370346069, 'entity': 'DATE', 'index': 22, 'start_pos': 22, 'end_pos': 22}, {'word': 'the', 'score': 0.9865468740463257, 'entity': 'DATE', 'index': 23, 'start_pos': 23, 'end_pos': 23}, {'word': 'twenty', 'score': 0.9836658239364624, 'entity': 'DATE', 'index': 24, 'start_pos': 24, 'end_pos': 24}, {'word': 'first', 'score': 0.983189046382904, 'entity': 'DATE', 'index': 25, 'start_pos': 25, 'end_pos': 25}, {'word': 'two', 'score': 0.4268897771835327, 'entity': 'NUM_PAYMENTS', 'index': 34, 'start_pos': 34, 'end_pos': 34}, {'word': 'one', 'score': 0.9655413627624512, 'entity': 'PAYMENT_AMOUNT', 'index': 42, 'start_pos': 42, 'end_pos': 42}, {'word': 'hundred', 'score': 0.9662790298461914, 'entity': 'PAYMENT_AMOUNT', 'index': 43, 'start_pos':