In [20]:
from azure.core.credentials import AzureKeyCredential
import openai

endpoint = "https://di-document-prod.cognitiveservices.azure.com/"
az_key = "4aeec35765f342a7a29bf6044bccd31d"

openai.api_key = "sk-svcacct-WqOWrSDc4rSGBDG7-QcN4cZqh6SiGuIEPb44EL3_nhxAaoeeMpOhTLbbhA8CrwT3BlbkFJ1RwNoTMV9rjb57Ir5WtfPhMB54f7MKYGjaPgLdsPAHTplF-u-6m3Gb0R-_3-0A"



In [11]:
import pandas as pd


#------ Paragraph Class ----------------
class Paragraph:
    """
    Class to store all Paragraphs present in a document
    """    
       
    def __init__(self,data :str, coordinates:dict):       
        self.content = data
        self.content_type = "Paragraph"
        self.content_coordinates = coordinates         
#------------------------------------------

# ------ Table Class -----------------------
class Table:
    """
    Class to store all Tables present in a document. All tables are converted to string inorder to send the table to GPT
    """
    
    def __init__(self, cell_data:list, coordinates:dict) -> None:
        self.content = """Below is a table in the form of double pipe seperated values denoted by triple $ signs
$$$
#-TABLE-#
$$$
        """
        self.content_type = "Table" #Variable that stores type of content extracted from document
        self.table_info = "  " #Variable that store the table in a string format
        self.content_coordinates = coordinates #Variable that store the location of the extracted table
        self.process_cells(cell_data)
        
        
    def process_cells(self,cell_data: list)->None:
        """
        Function that processes the table and stores it in a string format

        Inputs:
            cell_data: Its a list that stores all the cell related information extracted using Azure Document Intelligence service

        Output:
            None 
        """
        
        # Fetching cell Header
        for cell in cell_data:
            if cell.kind == 'columnHeader':
                self.table_info += cell.content.replace("\n", "")+" || "
                
        self.table_info += '\n'
        
        #Fetching cell content
        row_index=0
        for index in range(len(cell_data)):
                if cell_data[index].kind == 'content':
                    if cell_data[index].row_index != row_index:
                        data = cell_data[index].content.replace('\n', '')
                        self.table_info += f"\n {row_index+1}) {data}"
                        row_index=cell_data[index].row_index
                    else:
                        self.table_info += f" || {cell_data[index].content}" 
                        
        self.content = self.content.replace("#-TABLE-#", self.table_info+"\n")
#----------------------------------------------


#--------- Page Class -------------------------
class Page:
    """
    Class to store all the contents that a page has in a page object
    """
    def __init__(self, page_number):
        self.page_number = page_number
        self.content_list = []
        self.content_dataframe = pd.DataFrame()
        
    def convert_to_dataframe(self)->None:
        """
        Function to convert a list of contents into a dataframe
        """
        self.content_dataframe = pd.concat(self.content_list, ignore_index=True)
        self.content_dataframe.reset_index(inplace=True, drop=True)
#----------------------------------------------


#--------- Document Class -----------------
class Document:
    """
    Class that store all the page details in a document object
    """
    def __init__(self) -> None:
        self.page_list = []

    def add_page(self, page:Page)->None:
        """
        Function to add a new page object to page_list variable when a perticular page is processed
        """
        self.page_list.append(page)
#-------------------------------------------

In [12]:
import logging
import traceback

import pandas as pd
from azure.ai.formrecognizer import DocumentAnalysisClient


class DocumentParser:
    
    def __init__(self):
        pass
        
        
    #----------------------------------------------------
    def overlap_percentage(self, box1:list, box2:list)->float:
        """
        Function that calculates overlaps percentage between 2 boxes

        Inputs:
            box1: A List that consists of coordinates of X0,Y0,X2,Y2
            box2: A List that consists of coordinates of X0,Y0,X2,Y2
        
        Output:
            overlap_percentage: Float value that measures the overlap percentage
        """
        # box = [x1, y1, x2, y2], where (x1, y1) and (x2, y2) are opposite corners of the bounding box

        # Calculate the intersection rectangle
        x_overlap = max(0, min(box1[2], box2[2]) - max(box1[0], box2[0]))
        y_overlap = max(0, min(box1[3], box2[3]) - max(box1[1], box2[1]))

        # Calculate the areas of the two bounding boxes and the intersection
        area_box1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
        area_box2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
        intersection_area = x_overlap * y_overlap

        # Calculate the overlap percentage
        overlap_percentage = (intersection_area / min(area_box1, area_box2)) * 100

        return overlap_percentage
    #----------------------------------------------------
        
    def parse_document(self, azure_analyse_result : DocumentAnalysisClient):
        
        if azure_analyse_result!=None:
            #Getting unique pages
            unique_page_numbers = set(region.page_number for paragraph in azure_analyse_result.paragraphs for region in paragraph.bounding_regions)
            
            #Total Page count
            total_pages = len(unique_page_numbers)
            
            #New Document Object
            doc = Document()
            for index in range(total_pages):
                doc.page_list.append(Page(page_number = index+1))
                
                
            #Fetching per page Paragraphs
            #----------------------------------------------------------------------
            for para in azure_analyse_result.paragraphs:
                
                coordinates =  {
                    
                    "X0" : para.bounding_regions[0].polygon[0].x,
                    "Y0" : para.bounding_regions[0].polygon[0].y,
                    "X1" : para.bounding_regions[0].polygon[1].x,
                    "Y1" : para.bounding_regions[0].polygon[1].y,
                    "X2" : para.bounding_regions[0].polygon[2].x,
                    "Y2" : para.bounding_regions[0].polygon[2].y,
                    "X3" : para.bounding_regions[0].polygon[3].x,
                    "Y3" : para.bounding_regions[0].polygon[3].y,     
                }
                
                paragraph_object = Paragraph(data=para.content, coordinates=coordinates)
                paragraph_page_index = para.bounding_regions[0].page_number - 1 
                doc.page_list[paragraph_page_index].content_list.append(
                    pd.DataFrame(
                        {
                            "object":[paragraph_object],
                            "content_type":[paragraph_object.content_type],
                            "X0":[paragraph_object.content_coordinates["X0"]],
                            "Y0":[paragraph_object.content_coordinates["Y0"]],
                            "X1":[paragraph_object.content_coordinates["X1"]],
                            "Y1":[paragraph_object.content_coordinates["Y1"]],
                            "X2":[paragraph_object.content_coordinates["X2"]],
                            "Y2":[paragraph_object.content_coordinates["Y2"]],
                            "X3":[paragraph_object.content_coordinates["X3"]],
                            "Y3":[paragraph_object.content_coordinates["Y3"]],
                        }
                    )
                )
                        
            #Fetching per page tables
            #------------------------------------------------------------------------
            for table in azure_analyse_result.tables:
                coordinates = {
                    
                    "X0" : table.bounding_regions[0].polygon[0].x,
                    "Y0" : table.bounding_regions[0].polygon[0].y,
                    "X1" : table.bounding_regions[0].polygon[1].x,
                    "Y1" : table.bounding_regions[0].polygon[1].y,
                    "X2" : table.bounding_regions[0].polygon[2].x,
                    "Y2" : table.bounding_regions[0].polygon[2].y,
                    "X3" : table.bounding_regions[0].polygon[3].x,
                    "Y3" : table.bounding_regions[0].polygon[3].y, 
                }
                
                table_object = Table(cell_data=table.cells, coordinates=coordinates)
                table_page_index = table.bounding_regions[0].page_number - 1
                
                doc.page_list[table_page_index].content_list.append(
                    pd.DataFrame(
                        {
                            "object":[table_object],
                            "content_type":[table_object.content_type],
                            "X0":[table_object.content_coordinates["X0"]],
                            "Y0":[table_object.content_coordinates["Y0"]],
                            "X1":[table_object.content_coordinates["X1"]],
                            "Y1":[table_object.content_coordinates["Y1"]],
                            "X2":[table_object.content_coordinates["X2"]],
                            "Y2":[table_object.content_coordinates["Y2"]],
                            "X3":[table_object.content_coordinates["X3"]],
                            "Y3":[table_object.content_coordinates["Y3"]],
                        }
                    )
                )
                
            #------------------------------------------------------------------------ 
            
                
            for index in range(total_pages):
                doc.page_list[index].convert_to_dataframe()
                
            for index in range(len(doc.page_list)):
            
                per_page = doc.page_list[index]
                page_df = per_page.content_dataframe #Getting per page dataframe
                table_count = page_df.loc[page_df['content_type']=='Table',:].shape[0] #Counting how many tables are there per page
                
                if table_count!=0:
                    
                    table_df = page_df.loc[page_df['content_type']=='Table',:]# Table dataframe
                    paragraph_df = page_df.loc[page_df['content_type']=='Paragraph',:]# Paragraph dataframe
                    indexes_to_drop = []
                    
                    for idx, row in table_df.iterrows():
                        
                        table_coords = [ row['X0'], row['Y0'], row['X2'], row['Y2'] ]
                        # Creating new percentage column
                        paragraph_df['overlap_percentage'] = paragraph_df.apply(lambda row_para: self.overlap_percentage([row_para["X0"], row_para["Y0"], row_para["X2"], row_para["Y2"]], table_coords), axis=1)
                        
                        indices_to_drop = paragraph_df[(paragraph_df['overlap_percentage'] > 20)].index            
                        paragraph_df.drop(indices_to_drop, inplace=True)# Drop the rows from paragraph_rows
                        indexes_to_drop.extend(indices_to_drop.tolist())
                    
                    doc.page_list[index].content_dataframe.drop(indexes_to_drop, inplace=True)
                    indexes_to_drop = []
                    doc.page_list[index].content_dataframe.sort_values(by='Y0', inplace=True)

            
            # Looping over per page
            doc_text_list = [] # a list to store per page text
            for index in range(len(doc.page_list)):
                
                logging.info(f"started processing pageno:- {index+1}")
                data = None
                per_page = doc.page_list[index]
                data_text = "\n"
                # looping over per content in a page
                for idx,row in per_page.content_dataframe.iterrows():
                    
                    data_text += row['object'].content + "\n"
                
                # adding data
                doc_text_list.append(data_text)

            return doc_text_list
    

In [4]:
file_path = r'D:\E\vat_table_extraction\MVF-Test-Files\02.jpg'

In [6]:
def azure_data_extraction(file_path,endpoint,key):
    document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint, credential=AzureKeyCredential(az_key))
    print("Started")

    # Sending file for Data Extraction to Azure
    with open(file_path, "rb") as f:
        poller = document_analysis_client.begin_analyze_document(
        "prebuilt-invoice", document=f
    )

    result = poller.result()

    return result,result.key_value_pairs, result.tables

res,invoice_data,invoice_tabels = azure_data_extraction(file_path,endpoint,az_key)


Started


In [13]:
text_azure_data = DocumentParser().parse_document(res)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  paragraph_df['overlap_percentage'] = paragraph_df.apply(lambda row_para: self.overlap_percentage([row_para["X0"], row_para["Y0"], row_para["X2"], row_para["Y2"]], table_coords), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  paragraph_df.drop(indices_to_drop, inplace=True)# Drop the rows from paragraph_rows


In [25]:
len(text_azure_data[0])

2782

TypeError: RecursiveCharacterTextSplitter.split_text() takes 2 positional arguments but 4 were given

1

'["\\nRoger Skinner Limited The Mills Stradbroke Eye Suffolk IP21 5HL\\nEST. 1688\\nSKINNER\'S\\nPhone: 01379 384247 Fax: 01379 388143 Email: info@skinnerspetfoods.co.uk Website: www.skinnerspetfoods.co.uk VAT Reg No. : 291 8176 74 Company Reg. No. : 1272854\\nBORN TO BE OUTDOORS\\nINVOICE\\nYour VAT No. GB 143 2150 14\\nInvoice Address:\\nDelivery Address:\\nMole Valley Farmers Ltd. Holsworthy 6\\nMole Valley Farmers Ltd. Holsworthy 6\\nExmoor House\\nUnderlane\\nLime Way\\nHolsworthy\\nPathfields Business Park\\nDevon\\nSouth Molton Devon\\nEX22 6BL\\nEX36 3LH\\nBelow is a table in the form of double pipe seperated values denoted by triple $ signs\\n$$$\\n  Your Ref || Account || Our Ref || Type || Date || Number || \\n\\n 1) PO 200366454 || MOL06 || 386112 || INV Page: 1 || 21/12/2022 || 315406\\n\\n$$$\\n        \\nBelow is a table in the form of double pipe seperated values denoted by triple $ signs\\n$$$\\n  Product || Description || Unit || Quantity || Price £ || Line Total £ * 