# Convert PDF files to jpg format 
#### (PP has problems with png format)
#### then save in folder named after the pdf file name
#### also mark if the pdf file is scanned image or adobe format

## 1. Define utility functions

In [1]:
# utility functions
import fitz
import os
import pikepdf
# import loguru
from pathlib import Path

def check_pdf_lock(doc):
    """ check if a pdf document is locked

    Args:
        doc (_fitz.Document_): pymupdf doc object

    ref: (0: allowed, 1: not allowed)

        0b00000001 (1): Print permission
        0b00000010 (2): Modify permission
        0b00000100 (4): Copy permission
        0b00001000 (8): Annotate permission
        0b00010000 (16): Form filling permission
        0b00100000 (32): Accessibility permission
        0b01000000 (64): Document assembly permission
        0b10000000 (128): Print high-quality permission
    """

    # Check if the document is encrypted (password-protected)
    if doc.is_encrypted:
        print("The PDF document is encrypted (password-protected).")
        locked = True

    # Check the document's permissions
    permissions = doc.permissions # int. See https://pymupdf.readthedocs.io/en/latest/document.html#Document.permissions

    # convert to binary
    permissions_bin = bin(permissions)[2:].zfill(8)

    print_permission = bool(permissions & 1)
    modify_permission = bool(permissions & 2)
    copy_permission = bool(permissions & 4)
    annotate_permission = bool(permissions & 8)
    form_fill_permission = bool(permissions & 16)
    accessibility_permission = bool(permissions & 32)
    assemble_permission = bool(permissions & 64)
    print_hq_permission = bool(permissions & 128)

    # Print the individual permissions
    print("Print Permission:", print_permission)
    print("Modify Permission:", modify_permission)
    print("Copy Permission:", copy_permission)
    print("Annotate Permission:", annotate_permission)
    print("Form Fill Permission:", form_fill_permission)
    print("Accessibility Permission:", accessibility_permission)
    print("Document Assembly Permission:", assemble_permission)
    print("Print High-Quality Permission:", print_hq_permission)

    return permissions, permissions_bin


def unlock_pdf(pdf_path, out_path):
    """unlock pdf modification permissions

    Args:
        doc (_fitz.Document_): pymupdf doc object
    """

    # Open the PDF file with PyMuPDF
    doc = fitz.open(pdf_path)

    # Define the permission flags
    NO_PERMISSION = 0
    PRINT_PERMISSION = 1
    MODIFY_PERMISSION = 2
    COPY_PERMISSION = 4
    ANNOTATE_PERMISSION = 8
    FORM_FILL_PERMISSION = 16
    ACCESSIBILITY_PERMISSION = 32
    ASSEMBLE_PERMISSION = 64
    PRINT_HQ_PERMISSION = 128

    # Set the desired permissions
    permissions = PRINT_PERMISSION | MODIFY_PERMISSION | COPY_PERMISSION | ANNOTATE_PERMISSION| ASSEMBLE_PERMISSION

    # Save the modified PDF with the new permissions
    doc.save(
        out_path, #incremental=True,
        encryption=fitz.PDF_ENCRYPT_KEEP,
        permissions=permissions)

    # Close the document
    doc.close()


def convert_pdf_to_images(
        pdf_folder:str = '',
        output_folder:str = "",
        scan_threshold:int = 100,
        format = 'jpg'
        ) -> None:
    """converts all pdfs in a folder to images and saves them in individual folders

    Args:
        pdf_folder (str): where PDFs located
        output_folder (str): where to save imaes (can be subfolder or pdf_folder)
        scan_threshold (int): length of text to determine if page is scanned image
    """

    doclist = os.listdir(pdf_folder)

    for pdf_path in doclist:

        try:
            if not pdf_path.endswith('.pdf'):
                continue

            pdf_name = os.path.splitext(pdf_path)[0]
            # output_folder = os.path.join(pdf_folder, pdf_name)
            # if not os.path.isdir(output_folder):
            #     os.mkdir(output_folder)

            open_path = os.path.join(pdf_folder, pdf_path)
            doc = fitz.open(open_path)

            # if check_pdf_lock(doc):
            #     print(f"----------------------- PDF {pdf_name} is locked, skipping-----------------")
            #     continue

            for page_num in range(len(doc)):
                page = doc.load_page(page_num)

                # check for scanned image wrapped in PDF foramt by extracting texts
                # extract all texts from page
                text = page.get_text()
                doctype = 'scan' if len(text) < scan_threshold else 'adobe'

                # save_path = os.path.join(output_folder, f'{doctype}_{pdf_name}', f'{doctype}_{pdf_name}_{page_num}.png')
                save_path = os.path.join(output_folder, f'{doctype}_{pdf_name}_{page_num}.{format}')
                # if not os.path.isdir(os.path.join(output_folder, f'{doctype}_{pdf_name}')):
                #     os.mkdir(os.path.join(output_folder, f'{doctype}_{pdf_name}'))
                if not os.path.isdir(output_folder):
                    os.mkdir(output_folder)
                if not os.path.isfile(save_path):
                    page.get_pixmap(dpi=200).save(save_path)
                    print(f"{pdf_name}/{page_num} saved.")
                else:
                    print(f"------------------ {save_path} exists! ---------------------")

            doc.close()
        except Exception as e:
            print(f"ERROR on {pdf_path}: {e}")


def get_image_from_pdf(doc):

    # import display from ipython
    from IPython.display import display, Image

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        pixmap = page.get_pixmap()
        # return pixmap

        # display pixmap in Jupyter notebook
        img = Image(data=pixmap.samples, format='png')
        # img.save(os.path.join())
        display(img)

        # Get the list of images on the page
        image_list = page.get_images()

        # Iterate over each image
        for image_index, img in enumerate(image_list, start=1):
            # Get the image's bounding box
            xref = img[0]  # Xref number of the image
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]

            # Get the image's dimensions
            image_width = base_image["width"]
            image_height = base_image["height"]

            # Save the image to a file
            with open(f"image_{page_num}_{image_index}.png", "wb") as f:
                f.write(image_bytes)

            print(f"Image {image_index} saved")


## 2. Unlock PDFs if locked (PASS. No effect)
#### If already unlocked, copy to the folder where the unlocked PDFs are.

In [10]:
pdf_folder = './testdata/misc'  # PDFs_original
# pdf_folder = '/home/lstm/fiftyone/열람판례_PDF_20240416-20240417'
# out_folder = '/home/lstm/fiftyone/temp'
out_folder = './testdata/misc/output'
pdf_list = [file for file in os.listdir(pdf_folder) if os.path.isfile(os.path.join(pdf_folder, file))]

# check permission and unlock if locked
for idx, pdf_path in enumerate(pdf_list):

    filename = Path(pdf_path).name
    print(filename)

    # check permission
    check_pdf_lock(fitz.open(os.path.join(pdf_folder, filename)))

    unlock_pdf(
        os.path.join(pdf_folder, filename),
        os.path.join(out_folder, filename)
        )

    if idx >= 5: break


lawnb_01031929931.pdf
Print Permission: False
Modify Permission: False
Copy Permission: True
Annotate Permission: True
Form Fill Permission: True
Accessibility Permission: True
Document Assembly Permission: True
Print High-Quality Permission: True


In [None]:
check_pdf_lock(os.path.join(pdf_folder, filename))

## 3. Convert PDF to JPEG (PP not fully supporting PNG)

### in single folder

In [5]:

# pdf_folder = '/home/lstm/fiftyone/열람판례_PDF_20240416-20240417'
# output_folder = '/home/lstm/fiftyone/_img_열람판례_PDF_20240416-20240417'

# pdf_folder = './testdata/PDFs_original'
# output_folder = './testdata/CONVERTED_JPEG'

# pdf_folder = '/home/lstm/fiftyone/열람판례_PDF_20240416-20240417'
# output_folder = './testdata/CONVERTED_JPEG_batch2'

# pdf_folder = './testdata/PDFs_original/patent_cases'
# output_folder = './testdata/CONVERTED_JPEG_batch_patent'

pdf_folder = '/home/lstm/KR case dataset/열람판례 PDF_20240521-20240522'
output_folder = '/home/lstm/KR case dataset/_img_열람판례 PDF_20240521-20240522'

convert_pdf_to_images(pdf_folder, output_folder)


서울중앙지법_2014나71025_판결서/0 saved.
서울중앙지법_2014나71025_판결서/1 saved.
서울중앙지법_2014나71025_판결서/2 saved.
서울중앙지법_2014나71025_판결서/3 saved.
서울중앙지법_2014나71025_판결서/4 saved.
서울중앙지법_2014나71025_판결서/5 saved.
진주지원_2014가단13675_판결서/0 saved.
진주지원_2014가단13675_판결서/1 saved.
진주지원_2014가단13675_판결서/2 saved.
진주지원_2014가단13675_판결서/3 saved.
진주지원_2014가단13675_판결서/4 saved.
진주지원_2014가단13675_판결서/5 saved.
진주지원_2014가단13675_판결서/6 saved.
진주지원_2014가단13675_판결서/7 saved.
진주지원_2014가단13675_판결서/8 saved.
부산고법_2014나2211_판결서/0 saved.
부산고법_2014나2211_판결서/1 saved.
부산고법_2014나2211_판결서/2 saved.
부산고법_2014나2211_판결서/3 saved.
부산고법_2014나2211_판결서/4 saved.
부산고법_2014나2211_판결서/5 saved.
부산고법_2014나2211_판결서/6 saved.
부산고법_2014나2211_판결서/7 saved.
부산고법_2014나2211_판결서/8 saved.
부산고법_2014나2211_판결서/9 saved.
부산고법_2014나2211_판결서/10 saved.
부산고법_2014나2211_판결서/11 saved.
창원지법_2014가단77003_판결서/0 saved.
창원지법_2014가단77003_판결서/1 saved.
창원지법_2014가단77003_판결서/2 saved.
창원지법_2014가단77003_판결서/3 saved.
창원지법_2014가단77003_판결서/4 saved.
창원지법_2014가단77003_판결서/5 saved.
서울중앙지법_2016가단4165_판결서/0 sa

### in subfolders

In [None]:
pdf_folder = './testdata/PDFs_original'
output_folder = './testdata/CONVERTED_JPEG'

convert_pdf_to_images(pdf_folder, output_folder)