In [1]:
import os
import pymupdf
from PIL import Image
import io


from text import  create_text_lines, create_text_blocks
from bounding_box import cluster_drawings

In [2]:
# Input configuration: the PDF to process is looked up under
# <working directory>/data/input/text/.
base_dir = os.getcwd()

# Document to process. Swap in another file name to run the notebook on a
# different PDF, e.g.
# "Berichte_NAB 10-025_Kurzarbeitsprogram Geothermiebohrung Schlattingen.PDF"
filename = "1062_6.pdf"

pdf_path = os.path.join(base_dir, "data/input/text", filename)

In [3]:
# Per-document output folder: data/test/<file name without extension>.
document_stem = os.path.splitext(filename)[0]
out_dir = os.path.join(base_dir, "data/test", document_stem)
# Create the folder (and any missing parents); a pre-existing folder is fine.
os.makedirs(out_dir, exist_ok=True)

In [4]:
## Extract embedded images and vector drawings from every page of the document.
# Images are saved as image_<page>_<index>.<ext>; every page that has drawings
# gets its own single-page PDF named drawing_page_<page>.pdf. All files are
# written to out_dir.

with pymupdf.Document(pdf_path) as doc:
    for page_index, page in enumerate(doc):
        page_number = page_index + 1  # 1-based page number used in file names

        # --- embedded images ---
        image_list = page.get_images()
        for image_idx, img in enumerate(image_list):
            xref = img[0]  # first tuple entry is passed to extract_image as the xref
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            # NOTE(review): decoding and re-saving through PIL re-encodes the image;
            # writing image_bytes directly would keep the original data — confirm
            # whether the re-encode is intended.
            pil_image = Image.open(io.BytesIO(image_bytes))
            image_path = os.path.join(out_dir, f"image_{page_number}_{image_idx}.{image_ext}")
            pil_image.save(image_path)

        # --- vector drawings ---
        drawings = page.get_drawings()
        rotation = page.rotation
        if drawings:
            # The context manager closes the temporary document once it is saved;
            # previously one unclosed document was left behind per page.
            with pymupdf.open() as out_doc:
                outpage = out_doc.new_page(width=page.mediabox_size.x, height=page.mediabox_size.y)
                shape = outpage.new_shape()
                for drawing in drawings:
                    for item in drawing["items"]:  # these are the draw commands
                        if item[0] == "l":  # line
                            shape.draw_line(item[1], item[2])
                        elif item[0] == "re":  # rectangle
                            shape.draw_rect(item[1])
                        elif item[0] == "qu":  # quad
                            shape.draw_quad(item[1])
                        elif item[0] == "c":  # curve
                            shape.draw_bezier(item[1], item[2], item[3], item[4])
                        else:
                            raise ValueError("unhandled drawing", item)

                    # Only fill colour, line colour and dashing are carried over
                    # from the source path.
                    shape.finish(
                        fill=drawing.get("fill", None),  # Fill color
                        color=drawing.get("color", None),  # Line color
                        dashes=drawing.get("dashes", None),  # Line dashing
                    )
                shape.commit()
                # Apply the source page's rotation to the output page.
                outpage.set_rotation(rotation)
                out_drawing = os.path.join(out_dir, f"drawing_page_{page_number}.pdf")
                out_doc.save(out_drawing)
print(f"Images and drawings extracted to: {out_dir}")




Images and drawings extracted to: /home/lillemor/Documents/lgd-utils/asset-data-extraction/data/test/1062_6


In [5]:

## Overlay bounding boxes on every page and save an annotated copy of the PDF:
## blue = padded text blocks, red = text lines, violet = clustered drawings.
with pymupdf.open(pdf_path) as doc:
    for page_number, page in enumerate(doc, start=1):
        derotate = page.derotation_matrix

        lines_per_page = create_text_lines(page, page_number)
        reading_blocks = create_text_blocks(lines_per_page)
        shape = page.new_shape()

        # Rectangles are multiplied by the derotation matrix only on rotated pages.
        is_rotated = page.rotation != 0
        pad = 3  # margin added on every side of a text block

        # Text blocks (blue)
        for block in reading_blocks:
            box = block.rect
            padded = pymupdf.Rect(box.x0 - pad, box.y0 - pad, box.x1 + pad, box.y1 + pad)
            shape.draw_rect(padded * derotate if is_rotated else padded)
        shape.finish(color=(0, 0, 1), width=1)

        # Individual text lines (red)
        for text_line in lines_per_page:
            line_rect = text_line.rect
            shape.draw_rect(line_rect * derotate if is_rotated else line_rect)
        shape.finish(color=(1, 0, 0), width=1)

        # Clustered drawings (violet); cluster rectangles are drawn exactly as
        # returned by cluster_drawings, with no derotation applied here.
        drawings = page.get_drawings()
        if drawings:
            clustered_boxes = cluster_drawings(drawings)
            print(f"Page number:{page_number}, drawings with length {len(drawings)}, clustered {len(clustered_boxes)}")
            for cluster in clustered_boxes:
                shape.draw_rect(cluster)
            shape.finish(color=(148/255, 0, 211/255), width=1)

        shape.commit()

    # Write the annotated document next to the other outputs, under its original name.
    doc.save(os.path.join(out_dir, filename))