## LayoutParser

In [2]:
# Install dependencies
!apt-get install -qq poppler-utils  # pdf2image dependency
!apt-get install -qq tesseract-ocr  # Tesseract OCR Engine
!pip install layoutparser torchvision pdf2image
!pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.5#egg=detectron2"
!pip install "layoutparser[ocr]"

# Import libraries
import pdf2image
import numpy as np
import layoutparser as lp
import torchvision.ops.boxes as bops
import torch

# Layout Detection
pdf_file = '/content/tiff2pdf.pdf'  # Adjust the filepath of your input PDF accordingly

# Only the first page is analysed, so rasterisation is limited to page 1
# instead of converting every page and discarding all but the first.
# The PIL image is turned into a NumPy array for the detection models.
img = np.asarray(pdf2image.convert_from_path(pdf_file, first_page=1, last_page=1)[0])

# Minimum detection score, shared by all four models.
SCORE_THRESHOLD_CONFIG = ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5]

# A checkpoint must be paired with the label map of the dataset it was trained
# on; otherwise the predicted class ids are translated to the wrong names.
PUBLAYNET_LABEL_MAP = {0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}
PRIMA_LABEL_MAP = {1: "TextRegion", 2: "ImageRegion", 3: "TableRegion",
                   4: "MathsRegion", 5: "SeparatorRegion", 6: "OtherRegion"}

# Three PubLayNet checkpoints with different backbones / heads.
model1 = lp.Detectron2LayoutModel('lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config',
                                  extra_config=SCORE_THRESHOLD_CONFIG,
                                  label_map=PUBLAYNET_LABEL_MAP)

model2 = lp.Detectron2LayoutModel('lp://PubLayNet/mask_rcnn_R_50_FPN_3x/config',
                                  extra_config=SCORE_THRESHOLD_CONFIG,
                                  label_map=PUBLAYNET_LABEL_MAP)

model3 = lp.Detectron2LayoutModel('lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
                                  extra_config=SCORE_THRESHOLD_CONFIG,
                                  label_map=PUBLAYNET_LABEL_MAP)

# PrimaLayout checkpoint: uses the PrimaLayout class names, not the PubLayNet ones.
model4 = lp.Detectron2LayoutModel('lp://PrimaLayout/mask_rcnn_R_50_FPN_3x/config',
                                  extra_config=SCORE_THRESHOLD_CONFIG,
                                  label_map=PRIMA_LABEL_MAP)

# Loading Images to the Detectron models
# Every model is run on the same page image. The results keep separate names
# because the following steps process each of them.
layout_result1 = model1.detect(img)
layout_result2 = model2.detect(img)
layout_result3 = model3.detect(img)
layout_result4 = model4.detect(img)

# A notebook cell only shows its last expression automatically, so a bare
# lp.draw_box(...) call in the middle of a cell produces no visible output.
# Each rendering is therefore passed to display(), which the notebook kernel
# provides.
for layout_result in (layout_result1, layout_result2, layout_result3, layout_result4):
    display(lp.draw_box(img, layout_result, box_width=5, box_alpha=0.2, show_element_type=True))

# Drawing boxes
# For each model: copy all detected blocks into a fresh Layout, separately
# collect the blocks whose type is 'Title' (a name that comes from the label
# map given to the model), and render the boxes with their element ids.
# A notebook cell only shows its last expression automatically, so each
# rendering is passed to display(), which the notebook kernel provides.
text_blocks1 = lp.Layout(list(layout_result1))
title_blocks1 = lp.Layout([b for b in layout_result1 if b.type == 'Title'])

display(lp.draw_box(img, text_blocks1, box_width=5, box_alpha=0.2, show_element_type=True, show_element_id=True))

text_blocks2 = lp.Layout(list(layout_result2))
title_blocks2 = lp.Layout([b for b in layout_result2 if b.type == 'Title'])

display(lp.draw_box(img, text_blocks2, box_width=5, box_alpha=0.2, show_element_type=True, show_element_id=True))

text_blocks3 = lp.Layout(list(layout_result3))
title_blocks3 = lp.Layout([b for b in layout_result3 if b.type == 'Title'])

display(lp.draw_box(img, text_blocks3, box_width=5, box_alpha=0.2, show_element_type=True, show_element_id=True))

text_blocks4 = lp.Layout(list(layout_result4))
title_blocks4 = lp.Layout([b for b in layout_result4 if b.type == 'Title'])

display(lp.draw_box(img, text_blocks4, box_width=5, box_alpha=0.2, show_element_type=True, show_element_id=True))

# Sorting the Text
# The Tesseract OCR agent (configured with the 'eng' language) is created here,
# ahead of the sorting helper; it is passed to perform_ocr further down.
ocr_agent = lp.TesseractAgent(languages='eng')

def sort_blocks(text_blocks, img):
    """Order blocks for a two-column page: left column first, then the rest.

    Blocks selected by filtering on the left half of the image (with
    ``center=True``) form the left column; every other block is treated as
    the right column. Each column is sorted by ``coordinates[1]`` (the top
    edge), and the two columns are concatenated left-then-right.

    Args:
        text_blocks: ``lp.Layout`` holding the blocks to order.
        img: Page image as an array; its row length is used as the width.

    Returns:
        A new ``lp.Layout`` built from ``b.set(id=idx)`` for each block,
        where ``idx`` is the block's position in the resulting order.
    """
    image_width = len(img[0])

    left_interval = lp.Interval(0, image_width / 2, axis='x').put_on_canvas(img)

    # Iterate the filtered Layout through its public sequence interface
    # instead of reaching into its private ``_blocks`` list.
    left_blocks = sorted(
        text_blocks.filter_by(left_interval, center=True),
        key=lambda b: b.coordinates[1],
    )

    # Anything that was not selected for the left column goes to the right.
    right_blocks = sorted(
        (b for b in text_blocks if b not in left_blocks),
        key=lambda b: b.coordinates[1],
    )

    return lp.Layout([b.set(id=idx) for idx, b in enumerate(left_blocks + right_blocks)])

# Re-order the blocks of every model's layout; each name is rebound to the
# sorted layout returned by sort_blocks.
text_blocks1, text_blocks2, text_blocks3, text_blocks4 = (
    sort_blocks(unsorted_blocks, img)
    for unsorted_blocks in (text_blocks1, text_blocks2, text_blocks3, text_blocks4)
)

# Performing OCR
def perform_ocr(text_blocks, img, ocr_agent):
    """Run OCR on every block and store the recognised text on the block.

    Args:
        text_blocks: Iterable of blocks offering ``pad``, ``crop_image`` and
            ``set``.
        img: Page image the blocks are cropped from.
        ocr_agent: Object whose ``detect`` method turns a cropped image into
            text.

    Returns:
        None. Each block is updated in place via ``set(..., inplace=True)``.
    """
    for region in text_blocks:
        # Enlarge the box (15 left/right, 5 top/bottom) before cropping,
        # presumably so characters at the box edge are not cut off.
        padded_region = region.pad(left=15, right=15, top=5, bottom=5)
        cropped_pixels = padded_region.crop_image(img)

        recognised = ocr_agent.detect(cropped_pixels)
        region.set(text=recognised, inplace=True)

# Attach OCR text to the blocks of every model's layout, in model order.
for blocks_to_read in (text_blocks1, text_blocks2, text_blocks3, text_blocks4):
    perform_ocr(blocks_to_read, img, ocr_agent)

# Printing the Text
def print_text_blocks(text_blocks):
    """Print the text and the ``block`` attribute of every element.

    For each element two lines are written to standard output: its ``text``
    after the label ``Text =`` and its ``block`` after the label ``x_1=``,
    followed by a ``---`` separator line.

    Args:
        text_blocks: Iterable of elements exposing ``text`` and ``block``.

    Returns:
        None.
    """
    record_terminator = '\n---\n'
    for entry in text_blocks:
        print("Text =", entry.text)
        print("x_1=", entry.block, end=record_terminator)

# Dump the recognised text of every model's layout, in model order.
for blocks_to_print in (text_blocks1, text_blocks2, text_blocks3, text_blocks4):
    print_text_blocks(blocks_to_print)


Collecting detectron2@ git+https://github.com/facebookresearch/detectron2.git@v0.5#egg=detectron2
  Cloning https://github.com/facebookresearch/detectron2.git (to revision v0.5) to /tmp/pip-install-todf2a3y/detectron2_3907b2751f56439baf157fac5e26f3c7
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git /tmp/pip-install-todf2a3y/detectron2_3907b2751f56439baf157fac5e26f3c7
  Running command git checkout -q 82a57ce0b70057685962b352535147d9a8118578
  Resolved https://github.com/facebookresearch/detectron2.git to commit 82a57ce0b70057685962b352535147d9a8118578
  Preparing metadata (setup.py) ... [?25l[?25hdone


  proposal_generator.anchor_generator.cell_anchors.{0, 1, 2, 3, 4}
  proposal_generator.anchor_generator.cell_anchors.{0, 1, 2, 3, 4}
  proposal_generator.anchor_generator.cell_anchors.{0, 1, 2, 3, 4}
  pixel_mean
  pixel_std
  proposal_generator.anchor_generator.cell_anchors.{0, 1, 2, 3, 4}


Text = UCC FINANCING STATEMENT

x_1= Rectangle(x_1=105.67134094238281, y_1=237.61947631835938, x_2=543.77294921875, y_2=279.7265930175781)
---
Text = B NAME & PHONE OF CONTACT AT FILER (optional)

x_1= Rectangle(x_1=116.15306091308594, y_1=316.5672607421875, x_2=587.778564453125, y_2=342.9100646972656)
---
Text =  

1a. ORGANIZATION'S NAME

OR

 

 

1o. INDIVIDUAL'S SURNAME FIRST PERSONAL NAME ADDITIONAL NAME(S)/INITIAL(S) SUFFIX
HUEBNER RICHIE HICKS
1c. MAILING ADDRESS: CITY, STATE |POSTAL CODE COUNTRY
LEAD HILL AR | 72644 USA

12282 N RIGGS

   

 

 

x_1= Rectangle(x_1=93.34196472167969, y_1=738.3434448242188, x_2=1604.1690673828125, y_2=952.584716796875)
---
Text =  

 
 
   

2a. ORGANIZATION'S NAME

   
 
 
 

OR

   

20. INDIVIDUAL'S SURNAME FIRST PERSONAL NAME ADDITIONAL NAME(S)/INITIAL(S) SUFFIX

 

 

2c. MAILING ADDRESS: POSTAL CODE

   

3. SECURED PARTY'S NAME jor NAME of ASSIGNEE of ASSIGNOR SECURED PARTY): Provide only one Secured Party name (3a or 3b)
3a. ORGANIZA

In [5]:
import json

# Function to convert text blocks to JSON
def text_blocks_to_json(text_blocks):
    """Turn blocks into a list of plain dictionaries.

    Args:
        text_blocks: Iterable of blocks exposing ``text`` and an indexable
            ``coordinates`` whose first four items are read.

    Returns:
        A list with one dict per block: ``"text"`` holds the block's text and
        ``"coordinates"`` maps ``x1``, ``y1``, ``x2``, ``y2`` to items 0-3 of
        the block's coordinates.
    """
    corner_keys = ("x1", "y1", "x2", "y2")
    return [
        {
            "text": entry.text,
            "coordinates": {
                key: entry.coordinates[position]
                for position, key in enumerate(corner_keys)
            },
        }
        for entry in text_blocks
    ]

# Convert each layout model's text blocks to JSON
json_output1, json_output2, json_output3, json_output4 = map(
    text_blocks_to_json,
    (text_blocks1, text_blocks2, text_blocks3, text_blocks4),
)

# Combine all JSON outputs into one JSON structure if needed
# (one top-level key per model's result, in model order)
combined_json_output = {
    "block1": json_output1,
    "block2": json_output2,
    "block3": json_output3,
    "block4": json_output4,
}

# Save the JSON data to a file
# The document is serialised with a 4-space indent and written in one go.
with open('output.json', 'w') as json_file:
    json_file.write(json.dumps(combined_json_output, indent=4))

print("JSON data saved to output.json")


JSON data saved to output.json
