In [1]:
!pip install requests PyMuPDF Pillow

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.7-cp312-none-macosx_10_9_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.6 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.6-py3-none-macosx_10_9_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.7-cp312-none-macosx_10_9_x86_64.whl (3.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.2/3.2 MB 10.1 MB/s eta 0:00:00
Downloading PyMuPDFb-1.24.6-py3-none-macosx_10_9_x86_64.whl (15.3 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 15.3/15.3 MB 11.0 MB/s eta 0:00:00
Installing collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.7 PyMuPDFb-1.24.6


In [8]:
import requests
import fitz  # PyMuPDF
from PIL import Image, ImageChops
import io
import logging
import os

# Configure the root logger at INFO level so the logging.info() progress
# messages emitted by the functions below are displayed.
logging.basicConfig(level=logging.INFO)

def fetch_pdf_from_arxiv(url: str, timeout: float = 30.0) -> bytes:
    """
    Fetches the PDF file from the given arXiv URL.

    Args:
        url (str): The URL of the arXiv PDF.
        timeout (float): Seconds to wait for the server to connect or to send
            data before giving up. This is passed straight to ``requests.get``.

    Returns:
        bytes: The content of the PDF file.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error status.
        requests.exceptions.Timeout: If the server does not respond within ``timeout``.
    """
    logging.info("Fetching PDF from URL: %s", url)
    # requests applies no timeout by default; without one a stalled
    # connection would block the notebook kernel indefinitely.
    response = requests.get(url, timeout=timeout)
    # Turn 4xx/5xx responses into exceptions instead of returning an error page.
    response.raise_for_status()
    logging.info("PDF fetched successfully")
    return response.content

def trim_white_margins(img: Image.Image, padding: int = 10) -> Image.Image:
    """
    Crops the uniform margin off an image, leaving some padding around the content.

    The margin colour is taken from the top-left pixel of the image; the
    content area is the bounding box of everything that differs from it.

    Args:
        img (Image.Image): The input image.
        padding (int): Number of pixels to keep around the content box,
            clamped so the crop never extends past the image edges.

    Returns:
        Image.Image: The cropped image, or the input image itself when no
        bounding box is found.
    """
    # Solid canvas in the margin colour, same mode and size as the input.
    background = Image.new(img.mode, img.size, img.getpixel((0, 0)))
    content_box = ImageChops.difference(img, background).getbbox()
    if not content_box:
        return img

    x0, y0, x1, y1 = content_box
    padded_box = (
        max(0, x0 - padding),
        max(0, y0 - padding),
        min(img.width, x1 + padding),
        min(img.height, y1 + padding),
    )
    return img.crop(padded_box)

def remove_vertical_text(img: Image.Image, vertical_text_width: int = 100) -> Image.Image:
    """
    Drops a fixed-width strip from the left edge of the image.

    Args:
        img (Image.Image): The input image.
        vertical_text_width (int): Width in pixels of the left-hand strip
            (where the vertical text sits) to cut away.

    Returns:
        Image.Image: The part of the image to the right of the strip, at full height.
    """
    full_width, full_height = img.size
    # Crop box is (left, upper, right, lower): keep everything right of the strip.
    keep_box = (vertical_text_width, 0, full_width, full_height)
    return img.crop(keep_box)

def convert_pdf_to_images(pdf_content: bytes, padding: int = 10, vertical_text_width: int = 100) -> list:
    """
    Converts each page of the PDF content to a cropped PNG image.

    Every page is rendered with PyMuPDF's default pixmap settings, decoded
    into a PIL image, stripped of its left-hand strip and then trimmed of
    its uniform margins.

    Args:
        pdf_content (bytes): The content of the PDF file.
        padding (int): The padding to add around the cropped area.
        vertical_text_width (int): The width of the vertical text to be removed.

    Returns:
        list: A list of PIL Image objects representing each page, in page order.
    """
    images = []
    # Open the in-memory PDF with explicit keywords, and as a context manager
    # so the document is closed even if rendering a page raises.
    with fitz.open(stream=pdf_content, filetype="pdf") as pdf_document:
        logging.info("Converting PDF to images")
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap()
            # Round-trip through PNG bytes to obtain a PIL image of the page.
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            img_no_vertical_text = remove_vertical_text(img, vertical_text_width)
            cropped_img = trim_white_margins(img_no_vertical_text, padding)
            images.append(cropped_img)
            logging.info("Converted and cropped page %d to image", page_num + 1)
    logging.info("All pages converted and cropped successfully")
    return images

def save_images(images: list, output_dir: str):
    """
    Saves the list of images to the specified output directory.

    Files are named ``page_<n>.png`` with ``n`` starting at 1, following the
    order of ``images``. Existing files with the same name are overwritten.

    Args:
        images (list): A list of PIL Image objects.
        output_dir (str): The directory to save the images. It is created,
            including missing parent directories, if it does not exist.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    for page_number, img in enumerate(images, start=1):
        img_path = os.path.join(output_dir, f"page_{page_number}.png")
        img.save(img_path, "PNG")
        logging.info("Saved image: %s", img_path)

def main(url: str, output_dir: str, padding: int = 10, vertical_text_width: int = 100):
    """
    Runs the whole pipeline: download the PDF, render its pages, save them as PNGs.

    Args:
        url (str): The URL of the PDF to download.
        output_dir (str): The directory the page images are written to.
        padding (int): The padding kept around the cropped area of each page.
        vertical_text_width (int): The width of the left-hand strip removed from each page.
    """
    pdf_bytes = fetch_pdf_from_arxiv(url)
    page_images = convert_pdf_to_images(pdf_bytes, padding, vertical_text_width)
    save_images(page_images, output_dir)

if __name__ == "__main__":
    # Tunable settings for this run.
    padding_amount = 10  # pixels of padding kept around the cropped content
    vertical_text_width = 100  # pixels removed from the left edge of each page
    output_directory = "output"  # directory the page images are written to
    arxiv_url = "https://arxiv.org/pdf/2405.17401"
    main(
        url=arxiv_url,
        output_dir=output_directory,
        padding=padding_amount,
        vertical_text_width=vertical_text_width,
    )


INFO:root:Fetching PDF from URL: https://arxiv.org/pdf/2405.17401
INFO:root:PDF fetched successfully
INFO:root:Converting PDF to images
INFO:root:Converted and cropped page 1 to image
INFO:root:Converted and cropped page 2 to image
INFO:root:Converted and cropped page 3 to image
INFO:root:Converted and cropped page 4 to image
INFO:root:Converted and cropped page 5 to image
INFO:root:Converted and cropped page 6 to image
INFO:root:Converted and cropped page 7 to image
INFO:root:Converted and cropped page 8 to image
INFO:root:Converted and cropped page 9 to image
INFO:root:Converted and cropped page 10 to image
INFO:root:Converted and cropped page 11 to image
INFO:root:Converted and cropped page 12 to image
INFO:root:Converted and cropped page 13 to image
INFO:root:Converted and cropped page 14 to image
INFO:root:Converted and cropped page 15 to image
INFO:root:Converted and cropped page 16 to image
INFO:root:Converted and cropped page 17 to image
INFO:root:Converted and cropped page 18 