### 以下内容为在 Kaggle 平台 GPU P100 上运行代码的输出
### 使用 MinerU 将 PDF 文件转换为 Markdown（.md）文件
实际使用时，也可以通过 MinerU 的官方 API 或客户端软件进行批量转换

In [1]:
!pip install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple
!pip install uv -i https://mirrors.aliyun.com/pypi/simple

Looking in indexes: https://mirrors.aliyun.com/pypi/simple
Collecting pip
  Downloading https://mirrors.aliyun.com/pypi/packages/29/a2/d40fb2460e883eca5199c62cfc2463fd261f760556ae6290f88488c362c0/pip-25.1.1-py3-none-any.whl (1.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 22.2 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
Looking in indexes: https://mirrors.aliyun.com/pypi/simple
Collecting uv
  Downloading https://mirrors.aliyun.com/pypi/packages/4a/77/82755b46e9b0fa04682c04566fe0ae99bbba5b8525e061e78cbdf84e5666/uv-0.7.16-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.4/18.4 MB 5.6 MB/s eta 0:00:00
[?25hI

In [3]:
!uv pip install mineru[all] -i https://pypi.tuna.tsinghua.edu.cn/simple

[2mUsing Python 3.11.11 environment at: /usr[0m
[2mAudited [1m1 package[0m [2min 157ms[0m[0m


In [4]:
!mineru --version

2025-06-28 02:32:48.576700: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751077968.771392     127 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751077968.831880     127 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
mineru, version 2.0.6


In [None]:
import argparse
import os
import subprocess
from pathlib import Path
import shutil
from tqdm import tqdm

def convert_pdf_to_md(input_pdf, output_dir):
    """Convert a single PDF to Markdown by running the ``mineru`` CLI.

    Args:
        input_pdf: Path of the PDF to convert; stringified and passed to
            ``mineru -p``.
        output_dir: Directory passed to ``mineru -o``.

    Returns:
        True if the ``mineru`` process exits with status 0. False if it exits
        non-zero, cannot be started, or any other exception occurs; the error
        is printed and never propagated, so a batch loop can keep going.
    """
    try:
        # NOTE(review): --ocr-mode, --dpi and --parallel are meant as
        # speed-ups (fast OCR, lower render resolution, page-level
        # parallelism) -- confirm with `mineru --help` that the installed
        # version actually honours them.
        subprocess.run(
            [
                "mineru",
                "-p", str(input_pdf),
                "-o", output_dir,
                "--method", "auto",
                "--device", "cuda",
                "--ocr-mode", "fast",
                "--dpi", "150",
                "--parallel", "4",
            ],
            check=True,  # a non-zero exit becomes CalledProcessError
        )
    except Exception as err:
        print(f"❌ 转换失败: {input_pdf} - {err}")
        return False
    return True

def _prune_conversion_output(directory, skip_names=frozenset(),
                             keep_dir_names=("img", "images")):
    """Reduce *directory* to Markdown files and image folders.

    Entries whose name is in *skip_names* are left completely untouched.
    Directories named in *keep_dir_names* are kept as-is. Every other
    directory is pruned recursively and removed only if nothing worth keeping
    remains inside it. Every other non-``.md`` file is deleted.

    Returns:
        True if at least one entry remains in *directory* afterwards.
    """
    kept_any = False
    # Materialise the listing first: entries are deleted while we iterate.
    for entry in list(directory.iterdir()):
        if entry.name in skip_names:
            kept_any = True
            continue
        if entry.is_dir() and not entry.is_symlink():
            if entry.name in keep_dir_names or _prune_conversion_output(
                entry, keep_dir_names=keep_dir_names
            ):
                kept_any = True
            else:
                entry.rmdir()  # pruning left it empty
        elif entry.suffix == ".md":
            kept_any = True
        else:
            entry.unlink()
    return kept_any


def process_directory(input_dir, output_base_dir):
    """Convert every PDF under *input_dir*, mirroring its folder layout.

    For each ``*.pdf`` (case-insensitive) found recursively, the converter is
    run with ``output_base_dir / <relative parent folder>`` as its output
    directory. After a successful conversion the newly produced output is
    stripped down to Markdown files and image folders.

    Only entries that did NOT exist in the output directory before the
    conversion are cleaned up. The previous implementation deleted every
    non-Markdown file and every non-``img`` directory it found there, which
    destroyed source PDFs and sibling folders whenever the output tree
    overlapped the input tree, and removed freshly created sub-directories
    even when they contained the generated Markdown.

    Args:
        input_dir: Root directory that is searched recursively for PDFs.
        output_base_dir: Root directory under which results are written.

    Note:
        Output that already existed before a conversion (for example a result
        folder from an earlier run) is never pruned, so intermediate files
        regenerated inside it are left in place.
    """
    # Sorted so the processing order is reproducible across runs/filesystems.
    pdf_files = sorted(
        Path(root) / name
        for root, _, files in os.walk(input_dir)
        for name in files
        if name.lower().endswith(".pdf")
    )

    with tqdm(total=len(pdf_files), desc="转换PDF", unit="file") as pbar:
        for pdf_path in pdf_files:
            rel_path = pdf_path.relative_to(input_dir)
            # Keep the PDF's relative folder structure under the output root.
            output_dir = Path(output_base_dir) / rel_path.parent
            output_dir.mkdir(parents=True, exist_ok=True)

            # Snapshot what is already there so cleanup never touches it.
            preexisting = frozenset(entry.name for entry in output_dir.iterdir())

            pbar.set_postfix(file=str(rel_path)[:30] + "...")
            if convert_pdf_to_md(pdf_path, str(output_dir)):
                # NOTE(review): MinerU presumably writes into a nested
                # <pdf name>/<method>/ folder with an "images" sub-folder --
                # confirm against the MinerU output documentation. Pruning
                # recursively keeps the Markdown wherever it lands.
                _prune_conversion_output(output_dir, skip_names=preexisting)
                print(f"✓ 转换成功: {rel_path}")
            pbar.update(1)

# ====================== 参数设置部分 ======================
def main(input_dir=None, output_dir=None):
    """Resolve the input/output directories and start the batch conversion.

    Each directory is chosen with this precedence: the command-line option
    (``-i/--input``, ``-o/--output``), then the matching function argument,
    then the built-in Kaggle default path.

    Args:
        input_dir: Optional PDF input directory used when ``-i`` is not given.
        output_dir: Optional Markdown output directory used when ``-o`` is
            not given.
    """
    fallback_input = "/kaggle/input/economy-data/output_renamed"
    fallback_output = "/kaggle/working/output"

    cli = argparse.ArgumentParser(
        description="批量转换PDF为Markdown",
        allow_abbrev=False,  # options must be spelled out in full
    )
    cli.add_argument("-i", "--input", help="PDF输入目录")
    cli.add_argument("-o", "--output", help="Markdown输出目录")

    # parse_known_args() collects options this script does not define instead
    # of aborting, e.g. the -f argument a notebook kernel adds to sys.argv.
    known, ignored = cli.parse_known_args()

    final_input = known.input or input_dir or fallback_input
    final_output = known.output or output_dir or fallback_output

    print(f"忽略未识别参数: {ignored}")
    print(f"输入目录: {final_input}")
    print(f"输出目录: {final_output}")
    process_directory(final_input, final_output)

if __name__ == "__main__":
    # Example: set the directories directly in code (uncomment to use).
    # main(input_dir="/my/custom/input", output_dir="/my/custom/output")
    
    # Default entry point: with no arguments, main() takes the directories
    # from the command-line options or falls back to its built-in defaults.
    main()