In [12]:
from loguru import logger
import boto3
import numpy as np
import cv2

In [13]:
import numpy as np
import cv2
import fitz  # PyMuPDF

def _get_image(file_path):
    # Determine the file type from the extension
    file_type = file_path.split('.')[-1].lower()
    
    if file_type in ['png', 'jpg', 'jpeg', 'gif']:
        # Read the image using OpenCV
        image = cv2.imread(file_path)
        if image is None:
            return None
        _, im_buf_arr = cv2.imencode(f".{file_type}", image)
        byte_img = im_buf_arr.tobytes()
        return [byte_img]  # Return as a list for consistency with PDF handling
    
    elif file_type == 'pdf':
        # Open the PDF file
        doc = fitz.open(file_path)
        images = []
        for page in doc:
            pix = page.get_pixmap()
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
            _, im_buf_arr = cv2.imencode(".jpg", img)
            byte_img = im_buf_arr.tobytes()
            images.append(byte_img)
        doc.close()
        return images
    
    else:
        return None


In [14]:
def make_request(image, client=None, feature_types=("TABLES",)) -> dict:
    """Send one encoded page to the AWS Textract ``analyze_document`` API.

    Args:
        image: Encoded document bytes, passed as ``Document={"Bytes": image}``.
        client: Optional Textract client to reuse. When omitted, a new
            ``boto3.client("textract")`` is created for this call.
        feature_types: Analysis features to request; defaults to tables only.

    Returns:
        The response returned by ``analyze_document``, or an empty dict if
        creating the client or making the request raised. Callers should check
        for the empty dict before parsing the result.
    """
    try:
        if client is None:
            # Created inside the try block so credential/region errors are
            # logged and reported the same way as request failures.
            client = boto3.client("textract")
        response = client.analyze_document(
            Document={"Bytes": image}, FeatureTypes=list(feature_types)
        )
    except Exception as e:
        message = f"[Error] while making request to AWS Textract client. Exception: {str(e)}"
        logger.error(message)
        response = {}
    return response

In [15]:
# #without row numbers

# from trp import Document
# doc = Document(document)

# for page_number, page in enumerate(doc.pages):
#     print(f"### Page {page_number + 1}\n")  # Optional: Page header in Markdown
#     for table_number, table in enumerate(page.tables):
#         print(f"**Table {table_number + 1}**\n")  # Optional: Table header in Markdown

#         # Assuming the first row is the header, we process it separately
#         headers = [cell.text.strip() for cell in table.rows[0].cells]
        
#         # Create the header row
#         header_row = "| " + " | ".join(headers) + " |"
#         print(header_row)
        
#         # Create the separator row
#         separator_row = "| " + " | ".join(["---"] * len(headers)) + " |"
#         print(separator_row)

#         # Print each row of the table, starting from the second row since the first row is used as the header
#         for row in table.rows[1:]:
#             row_data = "| " + " | ".join([cell.text.strip() for cell in row.cells]) + " |"
#             print(row_data)
        
#         print("\n")  # Add a newline for better separation between tables

In [16]:
def get_table_markdown(doc):
    """Print every table in ``doc`` as a Markdown table with row/column labels.

    For each table the output is: a ``Row\\Col | Col 1 | ...`` label row, the
    Markdown separator row, the table's first row (treated as its header) with
    a blank row label, then each remaining row prefixed with ``Row N``. Two
    newlines follow each table. Tables without rows are skipped.

    Args:
        doc: Object exposing ``pages``; each page exposes ``tables``, each table
            ``rows``, each row ``cells``, and each cell a ``text`` string.

    Returns:
        None. The Markdown is written to standard output.
    """

    def _cell_text(cell):
        # A literal pipe in the cell text would be read as a column delimiter.
        return cell.text.strip().replace("|", "\\|")

    for page in doc.pages:
        for table in page.tables:
            if not table.rows:
                # Nothing to render, and rows[0] below would raise IndexError.
                continue

            header_cells = table.rows[0].cells

            # Label row: one "Col N" per cell of the first row.
            column_numbers = ["Row\\Col"] + [f"Col {c + 1}" for c in range(len(header_cells))]
            print("| " + " | ".join(column_numbers) + " |")

            # Markdown separator between the label row and the table body.
            print("| " + " | ".join(["---"] * len(column_numbers)) + " |")

            # The first row of the table is treated as its header; it gets a
            # blank row label.
            headers = [" "] + [_cell_text(cell) for cell in header_cells]
            print("| " + " | ".join(headers) + " |")

            # Remaining rows are data rows, labelled starting from 1.
            for r, row in enumerate(table.rows[1:], start=1):
                row_data = "| " + f"Row {r} " + "| " + " | ".join(_cell_text(cell) for cell in row.cells) + " |"
                print(row_data)

            print("\n")  # Blank lines separate consecutive tables

In [17]:
# Sample freight-invoice PDF; _get_image returns one encoded image per page.
# NOTE(review): absolute, machine-specific path — this cell only runs where the file exists.
sample_pdf_path = '/Users/avinash/Desktop/Personal projects/ocr_to_layout-text/test_dataset/Demo Documents/Freight Invoice/1270802_EMLM_CROWN_IMPORTS_LLC-1.pdf'
images = _get_image(sample_pdf_path)

In [18]:
from trp import Document

# Only the first element of `images` is sent to Textract; the raw response is
# kept in `response` and wrapped in a trp Document for table access.
first_page_bytes = images[0]
response = make_request(first_page_bytes)
doc = Document(response)

In [19]:
# Print every table found in `doc` as Markdown.
get_table_markdown(doc)

| Row\Col | Col 1 | Col 2 | Col 3 | Col 4 |
| --- | --- | --- | --- | --- |
|   | BILL OF LADING DATE | VESSEL | VOY | SHIPMENT NUMBER |
| Row 1 | 09-07-2021 | MATSONIA | 016 | 1804244-000 |


| Row\Col | Col 1 | Col 2 |
| --- | --- | --- |
|   | PORT OF LOADING | PORT OF DISCHARGE |
| Row 1 | LOS ANGELES | HONOLULU |
| Row 2 | LOAD/DISCHARGE SERVICE | PLACE OF RECEIPT/PLACE OF DELIVERY |
| Row 3 | PJT /CY | ONTARIO |
| Row 4 | SHIPPER'S REFERENCE | P.O.NUMBER |
| Row 5 | 0080470320 | 15978 |


| Row\Col | Col 1 | Col 2 | Col 3 | Col 4 | Col 5 | Col 6 | Col 7 |
| --- | --- | --- | --- | --- | --- | --- | --- |
|   | No Pkgs | Kind Pkg | Commodity Description | - | Quantity | Rate | Charges |
| Row 1 |  |  | SHIPPER'S LOAD AND COUNT SAID BEER, ALCOHOL OR NON ALCOHOL BEER | TO EA | CONTAIN 1 | 2,332.00 | 2,332.00 |
| Row 2 |  |  | 37631 LBS 0 CFT |  |  |  |  |
| Row 3 |  |  | INVASIVE SPECIES FEE | UNIT | 38 | 0.75 | 28.50 |
| Row 4 |  |  | PORT SECURITY CHARGE | EA | 1 | 21.68 | 21.68 |

In [20]:
# Bounding-box height of the first table on the first page.
doc.pages[0].tables[0].geometry.boundingBox.height

0.047688886523246765

In [21]:
# Bounding-box top coordinate of the third table on the first page.
doc.pages[0].tables[2].geometry.boundingBox.top

0.3333817720413208

In [22]:
# Dump the raw Textract blocks whose type is TABLE.
table_blocks = (block for block in response['Blocks'] if block['BlockType'] == 'TABLE')
for block in table_blocks:
    print(block)

{'BlockType': 'TABLE', 'Confidence': 84.375, 'Geometry': {'BoundingBox': {'Width': 0.530988872051239, 'Height': 0.047688886523246765, 'Left': 0.4356519281864166, 'Top': 0.0694987028837204}, 'Polygon': [{'X': 0.43565499782562256, 'Y': 0.0694987028837204}, {'X': 0.9666302800178528, 'Y': 0.0696440190076828}, {'X': 0.966640830039978, 'Y': 0.11718758940696716}, {'X': 0.4356519281864166, 'Y': 0.11703479290008545}]}, 'Id': '8191a1e6-b52e-46ca-a5a9-731e2debef2a', 'Relationships': [{'Type': 'CHILD', 'Ids': ['ccc820b6-20b8-4dfd-85f2-2cc13584086d', '923ac41b-35b8-4bd1-9caa-cec207eb642e', 'af82af32-2db4-4d85-9b79-6016f40710ff', 'e9a4a50c-f12b-4555-ac4f-7b960dfcf218', '06394628-ba63-4e82-897f-990b80e98697', '143a7d4e-05c0-455c-ad6e-dd5d81cdd2a1', 'c5e97405-89db-4e9a-bcaf-ffc87e1ccfd3', '921639eb-846a-46b0-a8c4-380b5de9e0d2']}], 'EntityTypes': ['SEMI_STRUCTURED_TABLE']}
{'BlockType': 'TABLE', 'Confidence': 76.806640625, 'Geometry': {'BoundingBox': {'Width': 0.5313464999198914, 'Height': 0.1553191542