In [3]:
import fitz  # PyMuPDF
from markdownify import markdownify as md
import os

def pdf_to_markdown(pdf_path, output_folder):
    """Convert a PDF into a Markdown file plus its embedded images.

    For each page, the extracted page text is passed through ``markdownify``
    and appended to the output, followed by one Markdown image link per
    image listed for that page. Image files are written into
    ``output_folder`` alongside the resulting ``output.md``, and the links
    reference them by bare file name.

    Args:
        pdf_path: Path of the PDF file to read.
        output_folder: Directory that receives ``output.md`` and the image
            files; it is created if it does not exist yet.
    """
    # Make sure the output folder exists (no-op when it is already there).
    os.makedirs(output_folder, exist_ok=True)

    # Collect fragments and join once at the end instead of repeated "+=".
    parts = []

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            # Extract the page text.
            parts.append(md(page.get_text()))

            # Extract the images referenced by this page.
            image_list = page.get_images(full=True)
            for img_index, img in enumerate(image_list, start=1):
                xref = img[0]
                base_image = doc.extract_image(xref)

                # extract_image returns the image in its stored format, so
                # use the reported extension rather than assuming PNG.
                image_ext = base_image["ext"]
                image_name = f"image_page{page_number}_{img_index}.{image_ext}"
                image_path = os.path.join(output_folder, image_name)

                # Save the raw image bytes.
                with open(image_path, "wb") as image_file:
                    image_file.write(base_image["image"])

                # Append the Markdown image link to the content.
                parts.append(f"![{image_name}]({image_name})\n")

    # Write the assembled Markdown to disk.
    markdown_file_path = os.path.join(output_folder, "output.md")
    with open(markdown_file_path, "w", encoding="utf-8") as md_file:
        md_file.write("".join(parts))

    print(f"Markdown content written to {markdown_file_path}")

# Example usage of pdf_to_markdown
pdf_path = "79.pdf"  # replace with the path to your PDF file
output_folder = "./output/"  # replace with the output folder you want
pdf_to_markdown(pdf_path, output_folder)


Markdown content written to ./output.md
