In [4]:
from pdf2image import convert_from_path
import os
from PIL import Image

In [5]:
def is_white(pixel):
    """Tell whether a single pixel value counts as (near-)white.

    A pixel is white when every colour channel is at least 250.

    Args:
        pixel: An ``int`` (single-band / grayscale value), a 3-tuple (RGB)
            or a 4-tuple (RGBA; the fourth channel is not inspected).

    Returns:
        bool: ``True`` for a near-white pixel, ``False`` otherwise --
        including for any value that is not one of the supported shapes.
    """
    threshold = 250

    # Grayscale: a single intensity value.
    if isinstance(pixel, int):
        return pixel >= threshold

    # Anything that is not a 3- or 4-channel tuple is unsupported.
    if not isinstance(pixel, tuple) or len(pixel) not in (3, 4):
        return False

    # RGB / RGBA: only the first three channels decide; alpha is ignored.
    for channel in pixel[:3]:
        if not channel >= threshold:
            return False
    return True

In [6]:
def _is_blank_pixel(pixel, white_threshold):
    """Return True when every colour channel of ``pixel`` is >= ``white_threshold``."""
    if isinstance(pixel, tuple):
        # Compare at most the first three channels so the alpha value of a
        # 4-channel (RGBA) pixel does not influence the result.
        return all(channel >= white_threshold for channel in pixel[:3])
    # Single-band images (e.g. grayscale) yield one number per pixel.
    return pixel >= white_threshold


def split_col_by_height_rgb(cols, crop_height=40, white_threshold=255):
    """Cut each column image into content boxes separated by white strips.

    Every column is scanned top to bottom in horizontal strips of
    ``crop_height`` pixels. Consecutive strips containing at least one
    non-white pixel form one run; a run is emitted as a box as soon as a
    fully white strip follows it.

    A run that is still open when the bottom of the column is reached is
    intentionally dropped: the last block on a page is the page number.

    Args:
        cols: Iterable of image objects exposing ``size``, ``crop(box)`` and
            ``getdata()`` (PIL images).
        crop_height: Height in pixels of each scanned strip. Box boundaries
            are therefore aligned to multiples of this value.
        white_threshold: Minimum channel value for a pixel to count as white.
            The default of 255 keeps the exact-white test for 8-bit images.

    Returns:
        list[list]: One list per input column, holding the cropped box
        images in top-to-bottom order. Boxes are cropped directly from the
        column, so they keep the column's image mode.

    Raises:
        ValueError: If ``crop_height`` is not positive.
    """
    if crop_height <= 0:
        raise ValueError("crop_height must be a positive number of pixels")

    cols_segments = []

    for col in cols:
        width, height = col.size
        segments = []
        run_top = None  # y of the first strip of the currently open run

        for y in range(0, height, crop_height):
            # Box is (left, upper, right, lower); the last strip may be shorter.
            strip = col.crop((0, y, width, min(y + crop_height, height)))

            # Generator + all() stops at the first non-white pixel.
            strip_is_white = all(
                _is_blank_pixel(pixel, white_threshold)
                for pixel in strip.getdata()
            )

            if strip_is_white:
                if run_top is not None:
                    # A white strip closes the run: crop it once from the
                    # column instead of re-pasting the box strip by strip.
                    segments.append(col.crop((0, run_top, width, y)))
                    run_top = None
            elif run_top is None:
                # First non-white strip after white space opens a new run.
                run_top = y

        # A run reaching the bottom edge is not appended (page number).
        cols_segments.append(segments)

    return cols_segments

In [7]:
def split_image_by_columns(image, col_count=2, divider_width=3):
    """Slice a page image into ``col_count`` equally wide, full-height columns.

    The columns are assumed to be separated by a vertical divider line of
    ``divider_width`` pixels, which is left out of every returned column.

    Args:
        image: Image object exposing ``size`` and ``crop(box)`` (PIL image).
        col_count: Number of columns on the page; must be at least 1.
        divider_width: Width in pixels of the divider between neighbouring
            columns; must not be negative.

    Returns:
        list: The cropped column images, ordered left to right.

    Raises:
        ValueError: If ``col_count`` is below 1 or ``divider_width`` is
            negative.
    """
    if col_count < 1:
        raise ValueError("col_count must be at least 1")
    if divider_width < 0:
        raise ValueError("divider_width must not be negative")

    width, height = image.size
    # Usable width is the page width minus the dividers between columns.
    column_width = (width - (col_count - 1) * divider_width) // col_count

    cropped_col_images = []
    for index in range(col_count):
        # Each column starts one column width plus one divider after the previous one.
        left = index * (column_width + divider_width)
        cropped_col_images.append(image.crop((left, 0, left + column_width, height)))

    return cropped_col_images

In [8]:
def pdf_to_images(pdf_path, output_folder):
    """Render a PDF into images and make sure the output folder exists.

    Args:
        pdf_path: Path of the PDF file handed to ``convert_from_path``.
        output_folder: Directory that is created (including missing parents)
            if it does not exist yet. Nothing is written to it here.

    Returns:
        Whatever ``convert_from_path(pdf_path)`` returns (the rendered pages).
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    return convert_from_path(pdf_path)

In [9]:
def pdf_to_split_boxes(pdf_path, output_folder, col_count):
    """Split every page of a PDF into columns and boxes, saving each box as PNG.

    Args:
        pdf_path: Path of the PDF to process.
        output_folder: Directory the box images are written to, as
            ``page_<p>_C<cc>B<bbbb>.png``.
        col_count: Number of columns each page is split into.

    Returns:
        dict: Nested mapping
        ``page_N -> {'image', col_M -> {'col_image', box_K -> {'box_image'}}}``
        with 1-based page, column and box numbers.
    """
    pages = pdf_to_images(pdf_path, output_folder)
    result = {}

    for page_no, page_image in enumerate(pages, start=1):
        page_key = f'page_{page_no}'
        page_entry = {'image': page_image}
        result[page_key] = page_entry

        # Per-page sub-folders and saving of whole column images are
        # currently disabled; only the individual boxes are written out.
        columns = split_image_by_columns(page_image, col_count)
        boxes_per_column = split_col_by_height_rgb(columns)

        for col_no, (column_image, column_boxes) in enumerate(
            zip(columns, boxes_per_column), start=1
        ):
            col_entry = {'col_image': column_image}
            page_entry[f'col_{col_no}'] = col_entry

            for box_no, box_image in enumerate(column_boxes, start=1):
                col_entry[f'box_{box_no}'] = {'box_image': box_image}

                # Zero-padded column/box tag, e.g. C01B0003.
                format_nums = f"C{col_no:02d}B{box_no:04d}"
                target = os.path.join(output_folder, f'{page_key}_{format_nums}.png')
                box_image.save(target, 'PNG')

    return result

In [10]:
def convert_all_pdfs_in_folder(folder_path, col_count=2):
    """Run the box-splitting pipeline on every PDF file directly inside a folder.

    For each PDF, the boxes are written to a sibling folder named after the
    file without its extension, and the resulting structure is printed.

    Args:
        folder_path: Directory that is scanned (non-recursively) for PDFs.
        col_count: Number of columns each page is split into.

    Returns:
        None
    """
    # sorted(): os.listdir returns entries in arbitrary order, so sort for a
    # reproducible processing and log order.
    for file_name in sorted(os.listdir(folder_path)):
        # Match the extension case-insensitively so "scan.PDF" is not skipped.
        if not file_name.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(folder_path, file_name)
        # A directory whose name ends in ".pdf" is not a document.
        if not os.path.isfile(pdf_path):
            continue

        output_folder = os.path.join(folder_path, os.path.splitext(file_name)[0])

        print(f"{pdf_path} 작업 시작")
        pdf_dict = {"pdf_name": file_name, "pdf_val": pdf_to_split_boxes(pdf_path, output_folder, col_count)}

        for key, val in pdf_dict.items():
            print(f'{key}: {val}')

In [11]:
# Directory passed to convert_all_pdfs_in_folder, which lists it for ".pdf" files.
folder_path = "../test_data/test_pdf"
# Number of columns each page is split into.
pdf_col_count = 2

# Process every PDF in the folder; the resulting structure is printed per file.
convert_all_pdfs_in_folder(folder_path, pdf_col_count)

../test_data/test_pdf/test_pdf_2.pdf 작업 시작
pdf_name: test_pdf_2.pdf
pdf_val: {'page_1': {'image': <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=2023x2864 at 0x11AD0D5A0>, 'col_1': {'col_image': <PIL.Image.Image image mode=RGB size=1010x2864 at 0x11AD38190>, 'box_1': {'box_image': <PIL.Image.Image image mode=RGB size=1010x480 at 0x11ABBE5C0>}, 'box_2': {'box_image': <PIL.Image.Image image mode=RGB size=1010x240 at 0x11AB89780>}, 'box_3': {'box_image': <PIL.Image.Image image mode=RGB size=1010x200 at 0x11ABBE740>}, 'box_4': {'box_image': <PIL.Image.Image image mode=RGB size=1010x240 at 0x11A8AAC20>}, 'box_5': {'box_image': <PIL.Image.Image image mode=RGB size=1010x200 at 0x11AB4DEA0>}, 'box_6': {'box_image': <PIL.Image.Image image mode=RGB size=1010x80 at 0x11AA2E2C0>}}, 'col_2': {'col_image': <PIL.Image.Image image mode=RGB size=1010x2864 at 0x11AD3B4F0>, 'box_1': {'box_image': <PIL.Image.Image image mode=RGB size=1010x200 at 0x11AA2F2B0>}, 'box_2': {'box_image': <PIL.Image.Image