In [None]:
import io
import shutil
from pathlib import Path
from typing import Tuple

import numpy as np
from docx import Document
from docx.shared import Inches
from docx.shared import Inches, Pt
from pdf2image import convert_from_path
from PIL import Image

In [None]:
def crop_to_content_box(image: Image.Image, threshold: int = 245) -> Image.Image:
    """
    Crop the bright (near-white) margins off an image.

    The image is converted to 8-bit grayscale and every pixel whose value is
    strictly below `threshold` counts as content. The result is the tightest
    axis-aligned box that contains all content pixels.

    Args:
        image (PIL.Image): The image to crop. It is not modified.
        threshold (int): Brightness cut-off (0–255); lower is darker.
                         Pixels below this value are content, pixels at or
                         above it are treated as margin.

    Returns:
        PIL.Image: Cropped image containing only the main content, or `image`
        itself (the same object) when no pixel is darker than `threshold`.
    """
    gray = np.asarray(image.convert("L"))
    content = gray < threshold

    # Project the mask onto each axis instead of listing every content pixel
    # with np.argwhere: the bounding box is the same, but the intermediate
    # arrays are only one row/column long rather than one entry per dark pixel.
    rows = np.flatnonzero(content.any(axis=1))
    if rows.size == 0:
        # Nothing darker than the threshold: there is no content to crop to.
        return image
    cols = np.flatnonzero(content.any(axis=0))

    # Convert to plain ints so no numpy scalar types reach the PIL call;
    # the +1 makes the lower/right edge exclusive, as crop() expects.
    top, bottom = int(rows[0]), int(rows[-1]) + 1
    left, right = int(cols[0]), int(cols[-1]) + 1
    return image.crop((left, top, right, bottom))


def auto_crop_and_embed_to_word_fit_page(
    pdf_filename: str,
    input_dir: str = "inputs",
    output_dir: str = "outputs",
    dpi: int = 400,
    page_margins: Tuple[float, float, float, float] = (1.0, 1.0, 1.0, 1.0),  # top, bottom, left, right
    shrink_scale: float = 1.0,
    threshold: int = 245
):
    """
    Convert a multi-page PDF into a Word document where:
    - Each page is auto-cropped based on visible content (removes white margins)
    - Each cropped page is inserted as an image on its own Word page
    - The image is resized to fit within the printable area (A4) after accounting for custom margins

    The output is written to `<output_dir>/<pdf stem with spaces replaced by
    underscores>_autocrop.docx`. `input_dir` and `output_dir` are created if
    they do not exist. If the PDF is missing, a message is printed and the
    function returns without writing anything.

    Args:
        pdf_filename (str): PDF file to process (should be in `input_dir`).
        input_dir (str): Folder where input PDF is located.
        output_dir (str): Folder to save output Word document.
        dpi (int): Resolution for converting PDF to images.
        page_margins (Tuple[float, float, float, float]):
            Page margins in inches — order is (top, bottom, left, right).
        shrink_scale (float): Optional extra downscale factor (e.g. 0.90 = reduce image size by 10%).
        threshold (int): Brightness threshold for auto-cropping margins (0–255).

    Returns:
        None

    Raises:
        ValueError: If `dpi` or `shrink_scale` is not positive, or if the
            margins leave no printable area on an A4 page.
    """
    # A4 portrait page size in inches.
    page_width_in, page_height_in = 8.27, 11.69

    top, bottom, left, right = page_margins
    printable_width = page_width_in - (left + right)
    printable_height = page_height_in - (top + bottom)

    # Reject inputs that would otherwise yield zero or negative picture sizes.
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi}")
    if shrink_scale <= 0:
        raise ValueError(f"shrink_scale must be positive, got {shrink_scale}")
    if printable_width <= 0 or printable_height <= 0:
        raise ValueError(
            f"page_margins {page_margins} leave no printable area on an A4 page"
        )

    input_path = Path(input_dir)
    output_path = Path(output_dir)
    # parents=True so nested folders such as "data/in" can be created too.
    input_path.mkdir(parents=True, exist_ok=True)
    output_path.mkdir(parents=True, exist_ok=True)

    pdf_path = input_path / pdf_filename
    output_docx_path = output_path / f"{pdf_path.stem.replace(' ', '_')}_autocrop.docx"

    if not pdf_path.exists():
        print(f"❌ PDF not found: {pdf_path}")
        return

    print("📄 Converting PDF to high-res images...")
    images = convert_from_path(str(pdf_path), dpi=dpi)

    print("✂️ Auto-cropping and resizing images...")
    doc = Document()

    # Set default font
    style = doc.styles['Normal']
    font = style.font
    font.name = 'Arial'
    font.size = Pt(11)

    section = doc.sections[0]
    # A new Document() takes its page size from python-docx's default
    # template (US Letter), so the A4 size the scaling below relies on has to
    # be set explicitly or tall images can overflow the page.
    section.page_width = Inches(page_width_in)
    section.page_height = Inches(page_height_in)

    # Apply actual Word page margins
    section.top_margin = Inches(top)
    section.bottom_margin = Inches(bottom)
    section.left_margin = Inches(left)
    section.right_margin = Inches(right)

    for i, img in enumerate(images):
        cropped = crop_to_content_box(img, threshold=threshold)

        # Convert pixel size to inches using known DPI
        w_px, h_px = cropped.size
        w_in = w_px / dpi
        h_in = h_px / dpi

        # Scale to fit inside printable area, then shrink by user-specified factor
        scale = min(printable_width / w_in, printable_height / h_in) * shrink_scale
        target_width = Inches(w_in * scale)
        target_height = Inches(h_in * scale)

        # Encode to JPEG in memory: no scratch directory is left behind on
        # errors, and no pre-existing folder can be deleted by cleanup.
        buffer = io.BytesIO()
        cropped.save(buffer, format="JPEG")
        buffer.seek(0)

        # Same structure doc.add_picture() produces (a picture in its own
        # paragraph), but built by hand so the paragraph can be told to start
        # a new page; otherwise small images could share one page.
        paragraph = doc.add_paragraph()
        if i > 0:
            paragraph.paragraph_format.page_break_before = True
        paragraph.add_run().add_picture(buffer, width=target_width, height=target_height)

        # Release pixel data early; high-DPI pages are large.
        cropped.close()
        if cropped is not img:
            img.close()

    doc.save(output_docx_path)
    print(f"✅ Saved to: {output_docx_path}")


In [None]:
# Example run: looks for "your_file.pdf" in the default input folder and
# writes the resulting Word document to the default output folder.
auto_crop_and_embed_to_word_fit_page(pdf_filename="your_file.pdf")