In [3]:
import numpy as np

def process_text_data(bounding_boxes, recognized_strings, rt, rr):
    """Build a simplified TextLattice-style character grid from OCR output.

    Each recognized string is split into equally wide per-character boxes,
    the coordinates are normalised by the average box height / character
    width, and every character is written into a 2-D matrix at the integer
    position of its box centre.

    Args:
        bounding_boxes: Sequence of ``[xmin, ymin, xmax, ymax]`` boxes.
            The input is not modified.
        recognized_strings: Text recognised in each box, in the same order
            as ``bounding_boxes``. Boxes and strings are paired with ``zip``,
            so surplus entries on either side are ignored. Boxes whose text
            is empty contribute to the line layout but produce no tokens.
        rt: Threshold on the gap between consecutive distinct line centres
            (measured in units of the average box height). Gaps up to and
            including ``rt`` are placed exactly one row apart.
        rr: Divisor applied to gaps larger than ``rt``; such lines are
            placed ``max(1, gap / rr)`` rows apart.

    Returns:
        A 2-D float ``numpy.ndarray`` holding ``ord(char)`` at each occupied
        cell and ``0`` elsewhere. An array of shape ``(0, 0)`` is returned
        when there is nothing to place (no boxes, or only empty strings).

    Note:
        Cell indices are obtained by truncating the centre coordinates, so
        two characters can land in the same cell; the later one overwrites
        the earlier one. Collision resolution is not implemented here.
    """
    # Keep each box together with its text and work on copies: the boxes are
    # normalised and re-ordered below, which must neither leak into the
    # caller's lists nor separate a box from the string recognised in it.
    paired = [(list(box), text)
              for box, text in zip(bounding_boxes, recognized_strings)]
    if not paired:
        return np.zeros((0, 0))

    # Step 1: Calculate average height of bounding boxes
    rh = np.mean([box[3] - box[1] for box, _ in paired])

    # Step 2: Normalize y coordinates
    for box, _ in paired:
        box[1] /= rh
        box[3] /= rh

    # Step 3: Sort (box, text) pairs by y and then x
    paired.sort(key=lambda item: (item[0][1], item[0][0]))

    # Step 4: Get unique y values of center points
    y_unique = sorted({(box[1] + box[3]) / 2 for box, _ in paired})

    # Step 5-6: Adjust y coordinates so close lines are one row apart and
    # larger gaps are scaled down by rr (but never below one row)
    ye = [y_unique[0]]
    for i in range(1, len(y_unique)):
        gap = y_unique[i] - y_unique[i - 1]
        dy = 1 if gap <= rt else max(1, gap / rr)
        ye.append(ye[-1] + dy)

    # Step 7: Map old y centers to new ones
    # The lookup key is recomputed with the same expression used in step 4,
    # so the float keys match exactly.
    y_map = dict(zip(y_unique, ye))
    for box, _ in paired:
        center_y = (box[1] + box[3]) / 2
        offset = y_map[center_y] - center_y
        box[1] += offset
        box[3] += offset

    # Step 8: Split into token-level boxes (simplified: equal-width characters)
    token_boxes = []
    token_chars = []
    for box, text in paired:
        if not text:
            # Nothing to place, and len(text) == 0 would divide by zero.
            continue
        step = (box[2] - box[0]) / len(text)
        for i, char in enumerate(text):
            token_boxes.append(
                [box[0] + i * step, box[1], box[0] + (i + 1) * step, box[3]]
            )
            token_chars.append(char)
    if not token_boxes:
        return np.zeros((0, 0))

    # Step 9: Calculate average width of token boxes
    rw = np.mean([box[2] - box[0] for box in token_boxes])

    # Step 10: Normalize x coordinates
    for box in token_boxes:
        box[0] /= rw
        box[2] /= rw

    # Step 11-32: Initialize and fill the TextLattice matrix
    x_centers = [(box[0] + box[2]) / 2 for box in token_boxes]
    y_centers = [(box[1] + box[3]) / 2 for box in token_boxes]
    xmin, xmax = min(x_centers), max(x_centers)
    ymin, ymax = min(y_centers), max(y_centers)

    # Initialize the TextLattice matrix
    I = np.zeros((int(ymax - ymin + 1), int(xmax - xmin + 1)))

    # Fill the TextLattice matrix
    for x_c, y_c, char in zip(x_centers, y_centers, token_chars):
        # Simple embedding: the character's Unicode code point
        I[int(y_c - ymin), int(x_c - xmin)] = ord(char)

    return I

In [4]:
# Example usage: fifteen 100x50 boxes arranged as a grid of 5 rows by
# 3 columns, listed row by row. Rows start at y = 20 and are 60 apart;
# columns start at x = 10 and are 110 apart.
bounding_boxes = [
    [left, top, left + 100, top + 50]
    for top in range(20, 261, 60)
    for left in range(10, 231, 110)
]

# One recognised word per box, in the same row-by-row order as the boxes.
recognized_strings = (
    "Sample text data "
    "for testing the "
    "Python code above "
    "This is a "
    "simple example input"
).split()

# Line-gap threshold and gap divisor passed to process_text_data.
rt, rr = 10, 1.5

processed_data = process_text_data(bounding_boxes, recognized_strings, rt, rr)
print(processed_data)

[[ 97. 109. 108. 101.   0. 116. 101. 120. 116.   0. 100.  97. 116.  97.]
 [102. 111.   0. 114. 116. 101. 116. 105. 103.   0. 116. 104.   0. 101.]
 [121. 116. 111. 110.   0.  99. 111. 100. 101.  97.  98. 111. 118. 101.]
 [ 84. 104. 105. 115.   0. 105.   0. 115.   0.   0.   0.  97.   0.   0.]
 [105. 109. 108. 101. 101. 120. 109. 112. 101. 105. 110. 112. 117. 116.]]


# Layout parser

In [1]:
import layoutparser as lp
import cv2

In [2]:
# cv2.imread does not raise when the file is missing or unreadable; it
# returns None, which would only surface later as a confusing error.
# Fail here with the offending path instead.
image_path = "/Users/avinash/Desktop/Personal projects/ocr_to_layout-text/dataset_images/Year Ending Cash Flow Statement/page_1.jpg"
image = cv2.imread(image_path)
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

In [3]:
# Load the PubLayNet Faster R-CNN layout detector, keeping detections with a
# score of at least 0.8 and mapping class ids to readable labels.
# NOTE(review): the AttributeError recorded for this cell usually indicates
# that the optional Detectron2 backend is not installed in this environment,
# so layoutparser does not expose Detectron2LayoutModel -- confirm the install.
model = lp.Detectron2LayoutModel('lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
                                 extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
                                 label_map={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"})

AttributeError: module layoutparser has no attribute Detectron2LayoutModel