In [None]:
def extract_text_and_images_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text_content = ""
    images = []

    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text_content += page.get_text("text")  # 提取文字

        # 提取图片
        image_list = page.get_images(full=True)
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image_path = f"image_page_{page_num+1}_{img_index}.{image_ext}"
            with open(image_path, "wb") as image_file:
                image_file.write(image_bytes)
            images.append(image_path)
    
    return text_content, images

In [None]:
import pymupdf4llm
md_text = pymupdf4llm.to_markdown("btp.pdf")
md_text

In [None]:
import os
import fitz  # PyMuPDF
from pymupdf4llm import to_markdown  # PyMuPDF4LLM

def process_pdf_folder(pdf_folder):
    # Iterate over all PDF files in the folder
    for file_name in os.listdir(pdf_folder):
        if file_name.endswith('.pdf'):
            # Open the PDF file
            pdf_path = os.path.join(pdf_folder, file_name)
            doc = fitz.open(pdf_path)

            # Create a folder for each PDF file based on its name (without extension)
            base_name = os.path.splitext(file_name)[0]
            output_folder = os.path.join(pdf_folder, base_name)
            os.makedirs(output_folder, exist_ok=True)

            # Set the image output folder
            image_folder = os.path.join(output_folder, "images")
            os.makedirs(image_folder, exist_ok=True)

            # Process the PDF and extract the Markdown and images
            markdown_text = to_markdown(doc, 
                                        write_images=True, 
                                        image_path=image_folder, 
                                        image_format='png')

            # Save the Markdown content in the corresponding folder
            markdown_file = os.path.join(output_folder, f"{base_name}.md")
            with open(markdown_file, 'w', encoding='utf-8') as md_file:
                md_file.write(markdown_text)

            print(f"Processed {file_name}, Markdown and images saved to {output_folder}")

# Example usage
pdf_folder_path = 'Input'  # Specify the folder containing PDFs
process_pdf_folder(pdf_folder_path)


In [23]:
import os
import fitz  # PyMuPDF
from PIL import Image
import openai

# Step 1: Convert PDF pages to images
def save_pdf_as_images(pdf_path, output_folder, dpi=120):
    """
    Convert each page of a PDF to an image and save it to the output folder.
    
    Args:
    pdf_path (str): Path to the PDF file.
    output_folder (str): Folder where the images will be saved.
    dpi (int): Dots per inch for the generated images.
    
    Returns:
    List of image file paths.
    """
    doc = fitz.open(pdf_path)
    os.makedirs(output_folder, exist_ok=True)
    
    image_files = []
    for page_number in range(len(doc)):
        page = doc.load_page(page_number)
        pix = page.get_pixmap(dpi=dpi)  # Convert page to image
        image_path = os.path.join(output_folder, f"page_{page_number + 1}.png")
        pix.save(image_path)
        image_files.append(image_path)
        print(f"Saved {image_path}")
    
    return image_files

# Step 2: Call OpenAI API to extract text and image coordinates
def extract_text_from_image(image_path):
    """
    Send an image to OpenAI's API to extract text and image coordinates.
    
    Args:
    image_path (str): Path to the image.
    
    Returns:
    dict: Extracted Markdown and image coordinates.
    """
    with open(image_path, 'rb') as image_file:
        # Call OpenAI API (Assume task supports this functionality)
        response = openai.Image.create(
            file=image_file,
            model="text-davinci-003",  # Use appropriate OpenAI model
            task="extract-text",  # Hypothetical task
            return_coordinators=True  # Request coordinates of images
        )
    
    # Assuming the response contains 'markdown' and 'coordinates'
    markdown_text = response['markdown']
    image_coordinates = response.get('coordinates', [])
    
    return markdown_text, image_coordinates

# Step 3: Process all images and extract Markdown and image coordinates
def process_images(image_folder):
    """
    Process all images in a folder, extract text, and return results.
    
    Args:
    image_folder (str): Folder containing image files.
    
    Returns:
    List of dicts containing extracted Markdown and image coordinates.
    """
    markdown_results = []
    for image_file in os.listdir(image_folder):
        if image_file.endswith('.png'):
            image_path = os.path.join(image_folder, image_file)
            markdown_text, image_coordinates = extract_text_from_image(image_path)
            markdown_results.append({
                'image_file': image_file,
                'markdown': markdown_text,
                'coordinates': image_coordinates
            })
            print(f"Processed {image_file}")
    
    return markdown_results

# Step 4: Crop images based on coordinates
def crop_image(image_path, coordinates, output_folder):
    """
    Crop the image based on the provided coordinates and save it.
    
    Args:
    image_path (str): Path to the image file.
    coordinates (tuple): (x1, y1, x2, y2) coordinates of the image to crop.
    output_folder (str): Folder where cropped images will be saved.
    
    Returns:
    str: Path to the saved cropped image.
    """
    with Image.open(image_path) as img:
        cropped_img = img.crop(coordinates)
        cropped_img_path = os.path.join(output_folder, f"cropped_{os.path.basename(image_path)}")
        cropped_img.save(cropped_img_path)
        print(f"Cropped image saved at {cropped_img_path}")
    
    return cropped_img_path

# Step 5: Process cropped images for all pages
def process_cropped_images(image_folder, markdown_results):
    """
    Process cropped images based on the coordinates from Markdown results.
    
    Args:
    image_folder (str): Folder where original images are stored.
    markdown_results (list): List of extracted markdown and coordinates.
    """
    for result in markdown_results:
        image_file = result['image_file']
        coordinates_list = result['coordinates']
        
        image_path = os.path.join(image_folder, image_file)
        for coordinates in coordinates_list:
            crop_image(image_path, coordinates, image_folder)

# Main function to run the entire workflow
def process_pdf(pdf_path, output_folder):
    """
    Main function to process a PDF and extract text and images.
    
    Args:
    pdf_path (str): Path to the PDF file.
    output_folder (str): Folder where the results (images and markdown) will be saved.
    """
    # Step 1: Convert PDF to images
    image_files = save_pdf_as_images(pdf_path, output_folder)
    
    # Step 2: Extract text and image coordinates using OpenAI API
    markdown_results = process_images(output_folder)
    
    # Step 3: Save Markdown content for each page
    for result in markdown_results:
        markdown_file = os.path.join(output_folder, f"{os.path.splitext(result['image_file'])[0]}.md")
        with open(markdown_file, 'w', encoding='utf-8') as md_file:
            md_file.write(result['markdown'])
        print(f"Saved Markdown for {result['image_file']} at {markdown_file}")
    
    # Step 4: Crop images based on coordinates
    process_cropped_images(output_folder, markdown_results)

# Example usage
pdf_path = 'Input/btp.pdf'  # Specify your PDF file path
output_folder = 'Output'  # Specify where you want to save results
save_pdf_as_images(pdf_path, output_folder)

Saved Output/page_1.png
Saved Output/page_2.png
Saved Output/page_3.png
Saved Output/page_4.png
Saved Output/page_5.png
Saved Output/page_6.png
Saved Output/page_7.png
Saved Output/page_8.png
Saved Output/page_9.png
Saved Output/page_10.png
Saved Output/page_11.png


['Output/page_1.png',
 'Output/page_2.png',
 'Output/page_3.png',
 'Output/page_4.png',
 'Output/page_5.png',
 'Output/page_6.png',
 'Output/page_7.png',
 'Output/page_8.png',
 'Output/page_9.png',
 'Output/page_10.png',
 'Output/page_11.png']

In [None]:
from openai import OpenAI

client = OpenAI()

response = client.chat.completions.create(
  model="gpt-4o",
  messages=[
    {
      "role": "user",
      "content": [
        {"type": "text", "text": "What’s in this image?"},
        {
          "type": "image_url",
          "image_url": {
            "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
          },
        },
      ],
    }
  ],
  max_tokens=300,
)

print(response.choices[0])

In [25]:
import base64
import requests
import json

# OpenAI API Key (Remember to keep your key secure and not hardcoded in production)
api_key = "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"

# Function to encode the image in Base64
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

# Path to your image
image_path = "Output/page_2.png"

# Get the Base64-encoded string of the image
base64_image = encode_image(image_path)

# OpenAI API request headers
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}"
}

# Payload for the request
payload = {
    "model": "gpt-4o-mini-2024-07-18",
    "messages": [
        {
            "role": "system",
            "content": "The uploaded file is a screenshot for one page of a pdf file."
        },
        {
            "role": "user",
            "content": (
                "Return me the following: \n"
                "1. The coordinates for each image if detected. \n"
                "All the images will have a caption description, which could help you figure out the boundary. "
                "The coordinates should be in JSON format. Do not provide any information beyond the requested data. Thanks!"
            )
        },
        {
            "role": "user",
            "content": f"Here is the image in base64: {base64_image}"
        }
    ],
    "max_tokens": 4096
}

# Make the request to the OpenAI API
response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

# Check if the request was successful
if response.status_code == 200:
    # Save the response to a file
    with open("response_output.json", "w", encoding='utf-8') as f:
        json.dump(response.json(), f, ensure_ascii=False, indent=4)
    print("Response has been saved successfully!")
else:
    print(f"Request failed with status code: {response.status_code}")
    print(response.text)


Request failed with status code: 400
{
  "error": {
    "message": "This model's maximum context length is 128000 tokens. However, your messages resulted in 360285 tokens. Please reduce the length of the messages.",
    "type": "invalid_request_error",
    "param": "messages",
    "code": "context_length_exceeded"
  }
}
