In [12]:
import os
import pymupdf
from PIL import Image
import io
import sys
from pathlib import Path

repo_root = Path.cwd().parent.resolve()

src_path = repo_root / "src"
sys.path.append(str(src_path))


from text import  create_text_lines, create_text_blocks
from bounding_box import cluster_drawings
from bounding_box import expand_bbox
from utils import cluster_text_elements

In [23]:
def get_pdf_files(input_path: Path) -> list[Path]:
    if input_path.is_dir():
        return [f for f in input_path.rglob("*.pdf")]
    elif input_path.is_file() and input_path.suffix.lower() == '.pdf':
        return [input_path]
    return []

In [24]:
input_path = Path(repo_root / "data/input/single_pages/unknown/1801_35.pdf" )
file_paths = get_pdf_files(input_path)
print(input_path)

/home/lillemor/Documents/lgd-utils/asset-data-extraction/data/input/single_pages/unknown/1801_35.pdf


In [20]:
##extract images and drawings from document
for pdf_path in file_paths:
    filename = pdf_path.name
    out_dir = repo_root / "data/test/" / filename
    out_dir.mkdir(parents=True, exist_ok=True)
    with pymupdf.Document(pdf_path) as doc:
        for page_index,page in enumerate(doc):
            page_number = page_index + 1
            #extract images
            image_list = page.get_images()        
            for image_idx, img in enumerate(image_list):
                xref = img[0] 
            # extract image bytes 
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                # get image extension
                image_ext = base_image["ext"]
                
                pil_image = Image.open(io.BytesIO(image_bytes))
                # Save the image to disk
                image_path = os.path.join(out_dir,f"image_{page_number}_{image_idx}.{image_ext}")
                pil_image.save(image_path)

            #extract drawings
            drawings = page.get_drawings()
            rotation = page.rotation
            if drawings:
                out_doc = pymupdf.open()
                outpage = out_doc.new_page(width=page.mediabox_size.x, height=page.mediabox_size.y)
                shape = outpage.new_shape() 
                for draw_idx,drawing in enumerate(drawings):
            
                    for item in drawing["items"]:  # these are the draw commands
                        if item[0] == "l":  # line
                            shape.draw_line(item[1], item[2])
                        elif item[0] == "re":  # rectangle
                            shape.draw_rect(item[1])
                        elif item[0] == "qu":  # quad
                            shape.draw_quad(item[1])
                        elif item[0] == "c":  # curve
                            shape.draw_bezier(item[1], item[2], item[3], item[4])
                        else:
                            raise ValueError("unhandled drawing", item)
                            
                    shape.finish(
                        fill=drawing.get("fill", None),  # Fill color
                        color=drawing.get("color", None),  # Line color
                        dashes=drawing.get("dashes", None),  # Line dashing
                    )
                shape.commit()
                outpage.set_rotation(rotation)
                out_drawing= os.path.join(out_dir,filename,f"drawing_page_{page_number}.pdf")
                out_doc.save(out_drawing)
    print(f"Images and drawings extracted to: {out_dir}")

Images and drawings extracted to: /home/lillemor/Documents/lgd-utils/asset-data-extraction/data/test/1801_35.pdf


In [None]:
##draw bounding boxes of text and drawings and texblocks
for pdf_path in file_paths:
    filename = pdf_path.name
    out_dir = repo_root / "data/test" / pdf_path.stem
    out_dir.mkdir(parents=True, exist_ok=True)
    with pymupdf.open(pdf_path) as doc:
        for page_index,page in enumerate(doc):
            page_number = page_index +1
            matrix = page.derotation_matrix

            lines_per_page= create_text_lines(page,page_number)
            reading_blocks = create_text_blocks(lines_per_page)
            # filtered_lines = [line for line in lines_per_page if len(line.words) < 4]
            # clusters = cluster_text_elements(filtered_lines,
            #                              key_fn= lambda line:line.rect.x0)
            shape = page.new_shape()

            for block in reading_blocks:
                padded_block = expand_bbox(block.rect, 3)
                shape.draw_rect(padded_block * matrix if page.rotation else padded_block)
            
            shape.finish(color=(0, 0, 1), width=1)

            # if filtered_lines:
            #     for cluster in clusters:
                    
            #         cluster_rect =pymupdf.Rect(min([line.rect.x0 for line in cluster]),
            #                                     min([line.rect.y0 for line in cluster]),
            #                                     max([line.rect.x1 for line in cluster]),
            #                                     max([line.rect.y1 for line in cluster]))
            #         shape.draw_rect(cluster_rect* matrix if page.rotation else cluster_rect) 
                
            #     shape.finish(color=(0, 0, 1), width=1)

            
            for text_line in lines_per_page:
                shape.draw_rect(text_line.rect * matrix if page.rotation else text_line.rect)

            shape.finish(color=(1, 0, 0), width=1)
            
            drawings = page.get_drawings()
            if drawings:
                clustered_boxes = cluster_drawings(drawings)
                print(f"Page number:{page_number}, drawings with length {len(drawings)}, clustered {len(clustered_boxes)}")
                for cluster in clustered_boxes:
                    shape.draw_rect(cluster)
                
            shape.finish(color =(148/255, 0, 211/255),width = 1)
            shape.commit()

        doc.save(os.path.join(out_dir, f"{filename}"))    
        print(f"{os.path.join(out_dir, f'{filename}')}")    


/home/lillemor/Documents/lgd-utils/asset-data-extraction/data/test/1801_35/1801_35.pdf
