In [11]:
from __future__ import unicode_literals, print_function
import os
import pickle
import plac
import random
import warnings
from pathlib import Path
import docx
import spacy
from spacy.util import minibatch, compounding

In [2]:
# Entity label(s) for this notebook (rebound to a plain string further down,
# right before the training function is defined).
LABEL = ["Agreement_start_date"]
# Read the pickled training examples back from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load training data that comes from a trusted source.
with open('./Train_Data_final', 'rb') as train_file:
    TRAIN_DATA = pickle.load(train_file)

In [14]:
# Adapted from the spaCy "train a new entity type" example.
LABEL = "Agreement_start_date"
def ner_train(model=None, new_model_name="agreement_date", output_dir=None, n_iter=30,
              train_data=None, drop=0.35):
    """Train the pipeline's entity recognizer on ``LABEL`` and optionally save it.

    NOTE(review): this uses the spaCy v2-style training API (``create_pipe``,
    ``begin_training``, ``update(texts, annotations)``) -- confirm the installed
    spaCy version before running.

    Parameters
    ----------
    model : str or None
        Passed to ``spacy.load`` to continue training an existing model. When
        None, a blank English pipeline is created instead.
    new_model_name : str
        Written to ``nlp.meta["name"]`` before the model is saved.
    output_dir : str, Path or None
        Directory the trained model is saved to (created if missing). Nothing
        is saved when None.
    n_iter : int
        Number of passes over the training examples.
    train_data : iterable or None
        Training examples, each a ``(text, annotations)`` pair. Defaults to the
        module-level ``TRAIN_DATA``. The input is never reordered; a private
        copy is shuffled instead.
        (presumably annotations look like ``{"entities": [(start, end, label)]}``
        -- verify against the pickled data)
    drop : float
        Dropout rate forwarded to ``nlp.update``.

    Returns
    -------
    None. Losses per iteration and the entities found in a fixed sample
    sentence are printed.
    """
    random.seed(0)  # fixed seed so the shuffle order is the same on every call
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # Make sure the pipeline has an entity recognizer we can add the label to.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe("ner")

    ner.add_label(LABEL)  # register the new entity label

    # A blank model needs fresh weights; an existing one keeps what it learned.
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    # Remembered so the reloaded model can be checked against it after saving.
    move_names = list(ner.move_names)

    # Work on a private copy: shuffling the caller's (or the module-level) list
    # in place would change the starting order of every later call.
    examples = list(TRAIN_DATA if train_data is None else train_data)

    # Disable every other pipe so that only NER is updated.
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # One generator shared by all iterations, so batch sizes keep growing
        # from 1.0 towards 4.0 across the whole run.
        sizes = compounding(1.0, 4.0, 1.001)
        for itn in range(n_iter):
            random.shuffle(examples)
            losses = {}
            for batch in minibatch(examples, size=sizes):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=drop, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_text = "THIS RENTAL AGREEMENT is made and executed into on this first day of September 2011 (01-09-2011) at Bangalore"
    print("********************")
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents=True / exist_ok=True: nested targets work and re-runs don't fail
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        # Check the classes have loaded back consistently
        assert nlp2.get_pipe("ner").move_names == move_names
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

In [15]:
# Train from a blank English model and save the result under ./output.
ner_train(output_dir="./output")

Created blank 'en' model
Losses {'ner': 325.84619755458}
Losses {'ner': 607.0687399369272}
Losses {'ner': 388.9864876166245}
Losses {'ner': 1095.3322046393891}
Losses {'ner': 369.0838201758012}
Losses {'ner': 118.78022780315874}
Losses {'ner': 46.827406995610076}
Losses {'ner': 22.119562175418736}
Losses {'ner': 6.944464746641154}
Losses {'ner': 8.539448460979234}
Losses {'ner': 19.300927756450218}
Losses {'ner': 13.12943070552477}
Losses {'ner': 6.476039304835066}
Losses {'ner': 10.572703672951816}
Losses {'ner': 11.690214143992803}
Losses {'ner': 6.022254871822959}
Losses {'ner': 9.090501485605287}
Losses {'ner': 5.752521776278737}
Losses {'ner': 5.117576874731772}
Losses {'ner': 6.0765005574275675}
Losses {'ner': 15.494011484397273}
Losses {'ner': 8.321595338182908}
Losses {'ner': 2.76467962094256}
Losses {'ner': 5.112262194129607}
Losses {'ner': 7.852200318392593}
Losses {'ner': 5.796727666946247}
Losses {'ner': 5.495921599590493}
Losses {'ner': 7.210881793490394}
Losses {'ner': 7.

In [28]:
# Smoke-test the saved model on one sentence.
output_dir = "./output"
test_text = (
    "This Rental Agreement is made and executed at Bangalore "
    "on this the 1st May 2005 by and between:"
)
nlp2 = spacy.load(output_dir)
doc2 = nlp2(test_text)
# One line per recognised entity: "<label> --> <text>".
for entity in doc2.ents:
    print(entity.label_, "-->", entity.text)

Agreement_start_date --> 1st May 2005


In [27]:
# Let's try to find the agreement start date in the Validation folder.
# Result so far: 5 out of 8 files came out right -- we need more training data
# and some regex rules.
Data_Folder = "./Validation_Data/"
output_dir = "./output"
nlp2 = spacy.load(output_dir)
def extract_entities(Data_Folder, nlp=None):
    """Print the entities of the first entity-bearing paragraph of each .docx file.

    Files are visited in sorted name order so the output is reproducible
    (``os.listdir`` makes no ordering guarantee). For every document the
    paragraphs are scanned top to bottom; as soon as one paragraph yields at
    least one entity, all entities of that paragraph are printed as
    ``<label> --> <text>`` and the rest of the document is skipped.

    Parameters
    ----------
    Data_Folder : str
        Directory that is scanned (non-recursively) for ``.docx`` files. The
        extension check is case-insensitive.
    nlp : callable or None
        Pipeline applied to each paragraph's text. When None, the module-level
        ``nlp2`` is used.

    Returns
    -------
    None. Results are only printed; documents without any entity print nothing.
    """
    docx_names = sorted(
        name
        for name in os.listdir(Data_Folder)
        # "~$name.docx" files are Word's owner-lock files for documents that
        # are currently open; they are not real .docx packages, so skip them.
        if name.lower().endswith(".docx") and not name.startswith("~$")
    )
    if not docx_names:
        return

    # Resolved only once there is work to do, so the global is not required
    # when the folder holds no documents or a pipeline is passed explicitly.
    pipeline = nlp2 if nlp is None else nlp

    for filename in docx_names:
        document = docx.Document(os.path.join(Data_Folder, filename))
        for para in document.paragraphs:
            parsed = pipeline(para.text)
            if parsed.ents:
                for ent in parsed.ents:
                    print(ent.label_, "-->", ent.text)
                # Only the first paragraph containing entities is reported.
                break
# Run the extraction over every document in the validation folder.
extract_entities(Data_Folder)

Agreement_start_date --> first day of September 2011
Agreement_start_date --> 1st May 2005
Agreement_start_date --> 15th of December 2012
Agreement_start_date --> 2ndth day of July, 2013
Agreement_start_date --> 06th day of March 2013
Agreement_start_date --> 1st day of April 2008
Agreement_start_date --> 11 (Eleven
Agreement_start_date --> 07-072014
Agreement_start_date --> 2/
