# In [None]:
"""
__author__ = "Varun Reddy"
__author_email__ = "varunreddymullangi@gmail.com"
"""

# imports
import os
import numpy as np
import pandas as pd
import docxpy
import spacy
nlp = spacy.load('en_core_web_sm')
import datefinder
import datetime
from dateutil.relativedelta import relativedelta
from docx import Document
from transformers import pipeline
nlp_qa = pipeline('question-answering')

# In [None]:
# Module-level accumulators: one entry is appended to each per processed file.
file_names, Agreement_value_, Renewal_Notice_ = [], [], []
Party_One_, Party_Two_ = [], []
Agreement_Start_Date_, Agreement_End_Date_ = [], []

# Prompts sent to the question-answering model; the position in this list
# determines how the answer is filtered and where it is stored.
questions = [
    "what is the owner name?",
    "what is the tenant or resident name?",
    "what is the rent money?",
    "what is the notice period?",
]

# In [31]:
# Spacy NER model for getting start date and end date for a given document 
def _parse_start_date(date_texts):
    """Return a datetime parsed from the first entity text that yields one, else None."""
    for text in date_texts:
        try:
            parsed = list(datefinder.find_dates(text))
        except Exception as exc:
            # Best effort: an entity the parser chokes on is skipped, not fatal.
            print(f"Could not parse date entity {text!r}: {exc}")
            continue
        if parsed:
            # When one entity text contains several dates, the last one is used.
            # NOTE(review): confirm whether the first date would be the better
            # choice for range-like entities.
            return parsed[-1]
    return None


def _rental_months(date_texts):
    """Return the first integer token found in an entity mentioning 'month', else 0."""
    for text in date_texts:
        if 'month' not in text.lower():
            continue
        for tok in text.split():
            if tok.isdigit():
                return int(tok)
    return 0


# Spacy NER model for getting start date and end date for a given document
def getDates(filename):
    """
    Extract the agreement start and end dates from a document.

    The start date comes from the first DATE entity that can be parsed; the
    end date is the start date plus the rental duration (in months) minus
    one day.

    Arguments:
        filename - input file(docx)
    returns:
        (startDate, endDate) as "dd.mm.YYYY" strings. startDate is '' when no
        date could be parsed; endDate is 'na' when either the start date or
        the rental duration is missing.
    """
    text = docxpy.process(filename)
    doc = nlp(text)
    allDates = [entity.text for entity in doc.ents if entity.label_ == "DATE"]
    print(f"All extracted date entities : {allDates}")

    start = _parse_start_date(allDates)
    add_months = _rental_months(allDates)
    print(f"Rental duration: {add_months} months")

    startDate = ''
    endDate = 'na'
    if start is not None:
        startDate = start.strftime("%d.%m.%Y")
        if add_months:
            try:
                # The agreement ends the day before the same day-of-month
                # add_months later.
                end = start + relativedelta(months=add_months)
                end = end + relativedelta(days=-1)
                endDate = end.strftime("%d.%m.%Y")
            except (ValueError, OverflowError):
                # Result falls outside the representable date range.
                endDate = 'na'
    print(f"Start: {startDate}, End: {endDate}")
    return startDate, endDate


def qa(context, question):
    """
    Ask the question-answering pipeline a single question about a passage.

    Arguments:
        context - paragraph to be searched
        question - question we want to find answer to in context
    returns:
        dictionary containing answer and confidence score
    """
    prediction = nlp_qa(context=context, question=question)
    return prediction

def hasNumbers(inputString):
    """
    Check whether a string contains at least one digit character.

    Arguments:
        inputString - string
    return:
        True if input string has any digits, False otherwise
    """
    for character in inputString:
        if character.isdigit():
            return True
    return False

def text2int (textnum, numwords=None):
    """
    Convert the first number found in a phrase to a number of days.

    Number words ("two", "twenty-one", "one hundred and five"), ordinals
    ("third", "sixth") and plain digit tokens are recognised. The phrase is
    assumed to express months, so the first number found is multiplied by 30.
    Adapted from a Stack Overflow snippet.

    Arguments:
        textnum - phrase to scan (matched case-insensitively)
        numwords - optional mapping of number word -> (scale, increment);
                   an empty mapping is filled in place with the English
                   defaults, None builds a fresh default mapping
    return:
        first number in the phrase * 30, or -1 when the phrase has no number
    """
    if numwords is None:
        # A fresh dict per call avoids the shared mutable default argument.
        numwords = {}
    if not numwords:
        units = [
            "zero", "one", "two", "three", "four",
            "five", "six", "seven", "eight",
            "nine", "ten", "eleven", "twelve",
            "thirteen", "fourteen", "fifteen",
            "sixteen", "seventeen", "eighteen",
            "nineteen",
        ]

        tens = ["", "", "twenty", "thirty", "forty",
                "fifty", "sixty", "seventy", "eighty",
                "ninety"]

        scales = ["hundred", "thousand", "million",
                  "billion", "trillion"]

        numwords["and"] = (1, 0)
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            # Skip the blank placeholders: registering "" as a number word
            # would make a stray "th" token (stripped to "") count as 10.
            if word:
                numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales):
            numwords[word] = (10 ** (idx * 3 or 2), 0)

    ordinal_words = {'first':1, 'second':2, 'third':3,
                     'fifth':5, 'eighth':8, 'ninth':9,
                     'twelfth':12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    textnum = textnum.lower().replace('-', ' ')

    current = result = 0
    tokens = []
    onnumber = False
    for word in textnum.split():
        if word in ordinal_words:
            current += ordinal_words[word]
            onnumber = True
            continue

        # Reduce regular ordinals to their cardinal form ("sixth" -> "six",
        # "twentieth" -> "twenty") before the lookup.
        for ending, replacement in ordinal_endings:
            if word.endswith(ending):
                word = word[:-len(ending)] + replacement

        # "and" only joins parts of a number ("one hundred and five"); on its
        # own it is an ordinary word and must not produce a spurious 0.
        is_number_word = word in numwords and (word != "and" or onnumber)

        if not is_number_word:
            if onnumber:
                tokens.append(str(result + current))
            tokens.append(word)
            result = current = 0
            onnumber = False
        else:
            scale, increment = numwords[word]
            current = current * scale + increment
            if scale > 100:
                result += current
                current = 0
            onnumber = True

    if onnumber:
        tokens.append(str(result + current))

    # The first numeric token is taken as a month count and converted to days.
    for token in tokens:
        if token.isdigit():
            return int(token) * 30
    return -1

# In [32]:
def _is_plausible_answer(question_index, answer):
    """Return True when a QA answer passes the sanity filter for its question."""
    if question_index in (0, 1):
        # Party names: ideally PERSON, but GPE and ORG are accepted as well
        # because of shortcomings of the spaCy model.
        entities = nlp(answer.lower()).ents
        return any(ent.label_ in ("PERSON", "GPE", "ORG") for ent in entities)
    if question_index == 2:
        # Rental amount: must contain a digit, be at most two tokens long and
        # not be a percentage.
        if not hasNumbers(answer):
            return False
        normalized = answer.replace("-", " ")
        return len(normalized.split()) <= 2 and "%" not in normalized
    # Notice period: must be expressed in months.
    return "month" in answer.lower()


def _best_answer(candidates):
    """Return the text of the highest-scoring (answer, score) pair, or '' if there is none."""
    return max(candidates, key=lambda pair: pair[1], default=("", 0))[0]


def extract_entities(Data_Folder):
    """
    Extract parties, rent, notice period and agreement dates from each document.

    Every .docx file in the folder is split into paragraphs; each paragraph
    longer than five words is queried with the predefined questions and the
    highest-scoring accepted answer per question is kept. A field for which
    no answer was accepted is stored as '' (-1 for the renewal notice).

    Arguments:
        Data_Folder - folder containing documents
    return:
        None. One row per document is appended to the module-level result
        lists (so rows accumulate across calls) and all rows are written to
        "results.csv" in the current folder.
    """
    for filename in os.listdir(Data_Folder):
        # Skip anything that is not a Word document (sub-folders, notebook
        # checkpoints, "~$" lock files) instead of crashing on it.
        if filename.startswith("~$") or not filename.lower().endswith(".docx"):
            continue

        path = os.path.join(Data_Folder, filename)
        document = Document(path)
        Party_One = []
        Party_Two = []
        Agreement_value = []
        Renewal_Notice = []
        ans_lists = [Party_One, Party_Two, Agreement_value, Renewal_Notice]

        # As BERT QA is optimal for paragraph input, each doc is divided into
        # paragraphs and the model is applied to each of them.
        for para in document.paragraphs:
            context = para.text
            # Only paragraphs with more than 5 words are worth querying.
            if len(context.split()) <= 5:
                continue
            print("context", context)
            for i, question in enumerate(questions):
                print("question", question)
                try:
                    res = qa(context, question)
                    answer, score = res["answer"], res["score"]
                except KeyError:
                    # Placeholder with score 0: it only wins when nothing
                    # better exists for this question.
                    ans_lists[i].append(("", 0))
                    continue
                print("answer", answer)
                if _is_plausible_answer(i, answer):
                    ans_lists[i].append((answer, score))

        # get the Agreement start and end dates
        startDate, endDate = getDates(path)

        file_names.append(filename)
        Party_One_.append(_best_answer(Party_One))
        Party_Two_.append(_best_answer(Party_Two))
        Agreement_value_.append(_best_answer(Agreement_value))
        Renewal_Notice_.append(text2int(_best_answer(Renewal_Notice).lower()))
        Agreement_Start_Date_.append(startDate)
        Agreement_End_Date_.append(endDate)

    # Load into a data frame and save the csv. Column names (including the
    # "Aggrement" spelling) are kept as-is so existing consumers of
    # results.csv keep working.
    df = pd.DataFrame({
        "File Name": file_names,
        "Aggrement Value": Agreement_value_,
        "Aggrement Start Date": Agreement_Start_Date_,
        "Aggrement End Date": Agreement_End_Date_,
        "Renewal Notice (Days)": Renewal_Notice_,
        "Party One": Party_One_,
        "Party Two": Party_Two_,
    })
    df.to_csv("results.csv")

# In [None]:
# Run the entity extraction over every document in the validation folder.
# Results are written to "results.csv" in the current working directory.
Data_Folder = "./Validation_Data"
extract_entities(Data_Folder)