In [1]:
import os
import pymupdf
from PIL import Image
import io

from text import create_text_lines
from bounding_box import cluster_drawings

In [2]:
# Input selection: the report is read from data/NAB below the current working directory.
base_dir = os.getcwd()

# Other reports that can be swapped in for `filename`:
# filename = "Berichte_NAB 12-051_Sedimentologie und Stratigraphie des ‘Brauner Doggers’ und seiner westlichen.PDF"
# filename = "Berichte_NAB 12-040_Untere Rahmengesteine des Opalinustons  Hydrogeologische Einheiten, Gesteinsparam.PDF"
filename = "Berichte_NAB 10-025_Kurzarbeitsprogram Geothermiebohrung Schlattingen.PDF"

pdf_path = os.path.join(base_dir, "data/NAB", filename)

In [3]:
# One output folder per report, named after the PDF without its extension.
report_stem, _ = os.path.splitext(filename)
out_dir = os.path.join(base_dir, "data/test", report_stem)
os.makedirs(out_dir, exist_ok=True)  # safe to re-run: an existing folder is reused

In [4]:
## Extract the embedded raster images of every page
#
# Each image is written to `out_dir` as image_<page>_<index>.<ext>, where <page> is the
# zero-based page index and <index> the position in the page's image list.

with pymupdf.open(pdf_path) as doc:
    for page_number, page in enumerate(doc):
        for image_idx, img in enumerate(page.get_images()):
            xref = img[0]  # first entry of the image tuple is the xref passed to extract_image

            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            image_path = os.path.join(out_dir, f"image_{page_number}_{image_idx}.{image_ext}")
            # Write the extracted bytes unchanged instead of decoding and re-saving them:
            # this avoids re-encoding the image and does not depend on an image library
            # being able to write the reported extension.
            with open(image_path, "wb") as image_file:
                image_file.write(image_bytes)

print(f"Images extracted to: {out_dir}")




Images extracted to: /home/lillemor/Documents/lgd-utils/asset-data-extraction/data/test/Berichte_NAB 10-025_Kurzarbeitsprogram Geothermiebohrung Schlattingen


In [5]:
## Extract the vector drawings of every page
#
# For each page that has drawings, the draw commands are replayed onto a blank page of
# the same size and saved to `out_dir` as drawing_page_<page_number>.pdf (1-based).
# Only fill colour, line colour and dashing are carried over to the copy; other path
# properties (e.g. line width) are left at the Shape defaults.

with pymupdf.open(pdf_path) as doc:
    for page_number, page in enumerate(doc, start=1):
        drawings = page.get_drawings()
        if not drawings:
            continue  # nothing to export for this page

        # The context manager closes the per-page output document once it is saved,
        # so handles do not accumulate over the length of the report.
        with pymupdf.open() as out_doc:
            outpage = out_doc.new_page(width=page.rect.width, height=page.rect.height)
            shape = outpage.new_shape()

            for drawing in drawings:
                for item in drawing["items"]:  # these are the draw commands
                    if item[0] == "l":  # line
                        shape.draw_line(item[1], item[2])
                    elif item[0] == "re":  # rectangle
                        shape.draw_rect(item[1])
                    elif item[0] == "qu":  # quad
                        shape.draw_quad(item[1])
                    elif item[0] == "c":  # curve
                        shape.draw_bezier(item[1], item[2], item[3], item[4])
                    else:
                        raise ValueError("unhandled drawing", item)

                # Finish once per drawing so each path keeps its own colours and dashing.
                shape.finish(
                    fill=drawing.get("fill", None),  # Fill color
                    color=drawing.get("color", None),  # Line color
                    dashes=drawing.get("dashes", None),  # Line dashing
                )

            shape.commit()
            out_drawing = os.path.join(out_dir, f"drawing_page_{page_number}.pdf")
            out_doc.save(out_drawing)

In [6]:
## Draw bounding boxes of text lines and clustered drawings
#
# Every page gets an overlay: one rectangle per text line and one rectangle per drawing
# cluster. The annotated document is saved under its original file name in `out_dir`.

TEXT_BOX_COLOR = (1, 0, 0)
DRAWING_BOX_COLOR = (148/255, 0, 211/255)

with pymupdf.open(pdf_path) as doc:
    for page_number, page in enumerate(doc, start=1):
        lines_per_page = create_text_lines(page, page_number)
        drawings = page.get_drawings()

        overlay = page.new_shape()

        # Text lines: each entry exposes its bounding rectangle as `.rect`.
        for text_line in lines_per_page:
            overlay.draw_rect(text_line.rect)
            overlay.finish(color=TEXT_BOX_COLOR, width=1)

        # Drawings: box the clusters returned by cluster_drawings
        # (presumably rectangles -- they are passed straight to draw_rect).
        if drawings:
            clustered_boxes = cluster_drawings(drawings)
            print(f"Page number:{page_number}, drawings with length {len(drawings)}, clustered {len(clustered_boxes)}")

            for cluster_box in clustered_boxes:
                overlay.draw_rect(cluster_box)
                overlay.finish(color=DRAWING_BOX_COLOR, width=1)

        overlay.commit()

    annotated_path = os.path.join(out_dir, filename)
    doc.save(annotated_path)

Page number:1, drawings with length 7, clustered 2
Page number:3, drawings with length 6, clustered 1
Page number:14, drawings with length 44, clustered 7
Page number:17, drawings with length 1, clustered 1
Page number:23, drawings with length 171, clustered 1
Page number:27, drawings with length 16, clustered 8
