In [9]:
import layoutparser as lp
import fitz
from PIL import Image
import cv2


In [10]:
# Render a single page of the source PDF into a PIL image for a quick visual check.
pdf_path = "data/NAB/Berichte_NAB 10-025_Kurzarbeitsprogram Geothermiebohrung Schlattingen.PDF"

with fitz.Document(pdf_path) as doc:
    # Page index 2 is requested; get_pixmap() is called with its defaults.
    page = doc.load_page(2)
    pixmap = page.get_pixmap()
    # The "RGB" mode assumes the pixmap holds three channels without alpha
    # (presumably the get_pixmap() default — confirm if options are added).
    image = Image.frombytes(
        "RGB",
        (pixmap.width, pixmap.height),
        pixmap.samples,
    )
    image.show()

In [15]:
# Load the page scan that the layout model and the OCR step operate on.
image_path = "data/NAB/Berichte_NAB 10-025_Kurzarbeitsprogram Geothermiebohrung Schlattingen/1738573781993-dd7bddee-9b9c-48fb-9564-a697e39665ab_14.jpg"
image = cv2.imread(image_path)

# cv2.imread does not raise when the file is missing or cannot be decoded
# (it cannot decode PDFs, for instance); it returns None. Fail here with a
# clear message instead of a "NoneType is not subscriptable" error below.
if image is None:
    raise FileNotFoundError(f"cv2.imread could not read an image from {image_path!r}")

# OpenCV returns channels in BGR order; reverse the last axis to obtain RGB.
image = image[..., ::-1]


In [12]:
# Outline colour per detected block type, used when drawing the layout.
# The keys must match the block type names exactly (they are case-sensitive):
# the detector labels blocks "Text", "Title", "List", "Table" and "Figure",
# so lowercase keys would never be found and every box would fall back to the
# same default colour.
color_map = {
    'Text':   'red',
    'Title':  'blue',
    'List':   'green',
    'Table':  'purple',
    'Figure': 'pink',
}

In [13]:
# Layout detector: Faster R-CNN (R50-FPN, 3x schedule) from the PubLayNet model zoo.
model = lp.Detectron2LayoutModel(
    "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config",
    # Score threshold passed through to the Detectron2 config.
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
    # Class index -> block type name; later cells compare against these names.
    label_map={
        0: "Text",
        1: "Title",
        2: "List",
        3: "Table",
        4: "Figure",
    },
)

The checkpoint state_dict contains keys that are not used by the model:
  proposal_generator.anchor_generator.cell_anchors.{0, 1, 2, 3, 4}


In [16]:
# Run the layout detector on the page image; `layout` holds the detected
# blocks that the filtering, ordering and OCR cells below work from.
layout = model.detect(image)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [17]:
# Reuse the detections already stored in `layout` for this same image rather
# than running the model a second time. The drawing cell only reads these
# blocks (its b.set(...) call is not in-place), so sharing the object is safe.
layout_predicted = layout

In [None]:
# Overlay the detected blocks on the page, labelling each one "<type>/<score>".
lp.draw_box(
    image,
    [blk.set(id=f"{blk.type}/{blk.score:.2f}") for blk in layout_predicted],
    color_map=color_map,
    show_element_id=True,
    id_font_size=10,
    id_text_background_color="grey",
    id_text_color="white",
)

In [39]:
# Peek at the first detected block to see its fields (coordinates, type, score).
layout[0]

TextBlock(block=Rectangle(x_1=101.998779296875, y_1=695.2202758789062, x_2=623.79296875, y_2=814.9324951171875), text=None, id=None, type=Text, parent=None, next=None, score=0.9967831373214722)

In [40]:
# Split the detections by predicted type; only text and figure blocks are used below.
text_blocks = lp.Layout([blk for blk in layout if blk.type == "Text"])
figure_blocks = lp.Layout([blk for blk in layout if blk.type == "Figure"])

In [41]:
# Keep only the text blocks that are not contained in any figure block.
text_blocks = lp.Layout(
    [
        blk
        for blk in text_blocks
        if all(not blk.is_in(fig) for fig in figure_blocks)
    ]
)

In [42]:
# Approximate a two-column reading order: left column first, then the right
# one, each read from top to bottom.
h, w = image.shape[:2]

# Horizontal band from x=0 to slightly past the middle of the page (5% slack).
left_interval = lp.Interval(0, w/2*1.05, axis="x").put_on_canvas(image)

# Select left-column blocks by their centre (center=True) and order them by
# coordinates[1], the y coordinate of the region, to mimic top-down reading.
left_blocks = text_blocks.filter_by(left_interval, center=True)
left_blocks.sort(key=lambda blk: blk.coordinates[1], inplace=True)

# Every remaining text block is treated as belonging to the right column.
right_blocks = lp.Layout([blk for blk in text_blocks if blk not in left_blocks])
right_blocks.sort(key=lambda blk: blk.coordinates[1], inplace=True)

# Join the two columns and use the position in the combined list as block id.
text_blocks = lp.Layout(
    [blk.set(id=idx) for idx, blk in enumerate(left_blocks + right_blocks)]
)

In [43]:
# OCR agent backed by Tesseract, configured with the 'eng' language.
# NOTE(review): the source document's file name is German — confirm that 'eng'
# (rather than e.g. 'deu') is the intended OCR language for these pages.
ocr_agent = lp.TesseractAgent(languages='eng') 

In [44]:
# OCR each text block and store the recognised string on the block itself.
for block in text_blocks:
    # Pad the region by 5 on every side before cropping; a little margin
    # around the text can make the OCR more robust.
    segment_image = block.pad(left=5, right=5, top=5, bottom=5).crop_image(image)

    text = ocr_agent.detect(segment_image)
    block.set(text=text, inplace=True)

In [None]:
# Print the OCR result of each block in reading order, with a "---" line after each.
for txt in text_blocks.get_texts():
    print(txt, end="\n---\n")