In [None]:
# %pip install qwen-vl-utils
import os
from typing import List, Tuple, Optional, Dict
import logging
import cv2

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
import fitz
import concurrent.futures

from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import numpy as np
import fitz
import logging

from PIL import Image

from rapid_layout import RapidLayout, VisLayout


VLM_PATH = "/mnt/d/weights/Qwen2-VL-7B-Instruct-AWQ"

layout_engine = RapidLayout(conf_thres=0.5, model_type="pp_layout_cdla")

model = Qwen2VLForConditionalGeneration.from_pretrained(
    VLM_PATH,
    torch_dtype="auto",
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(VLM_PATH, trust_remote_code=True)

min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
processor = AutoProcessor.from_pretrained(
    VLM_PATH,
    min_pixels=min_pixels,
    max_pixels=max_pixels,
)

# This Default Prompt Using Chinese and could be changed to other languages.
DEFAULT_PROMPT = """使用markdown语法，将图片中识别到的文字转换为markdown格式输出。你必须做到：
1. 输出和使用识别到的图片的相同的语言，例如，识别到英语的字段，输出的内容必须是英语。
2. 不要解释和输出无关的文字，直接输出图片中的内容。例如，严禁输出 “以下是我根据图片内容生成的markdown文本：”这样的例子，而是应该直接输出markdown。
3. 内容不要包含在```markdown ```中、段落公式使用 $$ $$ 的形式、行内公式使用 $ $ 的形式、忽略掉长直线、忽略掉页码。
再次强调，不要解释和输出无关的文字，直接输出图片中的内容。
"""
DEFAULT_RECT_PROMPT = """图片中用带颜色的矩形框和名称(%s)标注出了一些区域。如果区域是表格或者图片，使用 ![]() 的形式插入到输出内容中，否则直接输出文字内容。
"""
DEFAULT_ROLE_PROMPT = """你是一个PDF文档解析器，使用markdown和latex语法输出图片的内容。
"""




In [None]:


def _parse_pdf_to_images(pdf_path: str, output_dir: str) -> List[Tuple[str, List[str]]]:
    image_infos = []
    pdf_document = fitz.open(pdf_path)
    for page_index, page in enumerate(pdf_document):
        logging.info(f"parse page: {page_index}")
        # 保存页面为图片
        pix = page.get_pixmap(matrix=fitz.Matrix(4, 4))
        pix = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        boxes, scores, class_names, elapse = layout_engine(pix)
        
        rect_images = []
        boxes_ = []
        scores_ = []
        class_names_ = []
        rect_index = 0
        for class_name, box, score in zip(class_names, boxes, scores):
            if class_name == "figure" or class_name == "table":
                rect_index += 1
                name = f"{page_index}_{rect_index}.png"
                sub_pix = pix.crop(box)
                sub_pix.save(os.path.join(output_dir, name))
                rect_images.append(name)

                boxes_.append(box)
                scores_.append(score)
                class_name = f"{page_index}_{rect_index}.png"
                class_names_.append(class_name)

        page_image = os.path.join(output_dir, f"{page_index}.png")
        pix = np.array(pix)
        pix = cv2.cvtColor(pix, cv2.COLOR_RGB2BGR)
        print(boxes_, scores_, class_names_)
        ploted_img = VisLayout.draw_detections(pix, boxes_, scores_, class_names_)
        if ploted_img is not None:
            cv2.imwrite(page_image, ploted_img)
        # ploted_img.save(page_image)
        image_infos.append((page_image, rect_images))
    pdf_document.close()
    return image_infos

def _gpt_parse_images(
    image_infos: List[Tuple[str, List[str]]],
    prompt_dict: Optional[Dict] = None,
    output_dir: str = "./",
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    # model: str = 'gpt-4o',
    verbose: bool = False,
    gpt_worker: int = 1,
    **args,
) -> str:
    """
    Parse images to markdown content.
    """

    if isinstance(prompt_dict, dict) and "prompt" in prompt_dict:
        prompt = prompt_dict["prompt"]
        logging.info("prompt is provided, using user prompt.")
    else:
        prompt = DEFAULT_PROMPT
        logging.info("prompt is not provided, using default prompt.")
    if isinstance(prompt_dict, dict) and "rect_prompt" in prompt_dict:
        rect_prompt = prompt_dict["rect_prompt"]
        logging.info("rect_prompt is provided, using user prompt.")
    else:
        rect_prompt = DEFAULT_RECT_PROMPT
        logging.info("rect_prompt is not provided, using default prompt.")
    if isinstance(prompt_dict, dict) and "role_prompt" in prompt_dict:
        role_prompt = prompt_dict["role_prompt"]
        logging.info("role_prompt is provided, using user prompt.")
    else:
        role_prompt = DEFAULT_ROLE_PROMPT
        logging.info("role_prompt is not provided, using default prompt.")

    def _process_page(index: int, image_info: Tuple[str, List[str]]) -> Tuple[int, str]:
        logging.info(f"gpt parse page: {index}")

        # agent = Agent(role=role_prompt, api_key=api_key, base_url=base_url, disable_python_run=True, model=model, **args)
        page_image, rect_images = image_info
        local_prompt = prompt
        local_prompt = role_prompt+local_prompt
        if rect_images:
            local_prompt = local_prompt % (rect_prompt % ", ".join(rect_images))
        else:
            local_prompt = local_prompt % ""
        # content = agent.run([local_prompt, {'image': page_image}], display=verbose)
        messages = [
            # {
            #     "role": "system",
            #     "content": DEFAULT_ROLE_PROMPT
            # },
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": page_image,
                    },
                    {"type": "text", "text": local_prompt},
                ],
            }
        ]

        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        print(text)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to("cuda")

        # Inference: Generation of the output
        generated_ids = model.generate(**inputs, max_new_tokens=2000, num_beams=1)
        generated_ids_trimmed = [
            out_ids[len(in_ids) :]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return index, output_text

    contents = [None] * len(image_infos)
    with concurrent.futures.ThreadPoolExecutor(max_workers=gpt_worker) as executor:
        futures = [
            executor.submit(_process_page, index, image_info)
            for index, image_info in enumerate(image_infos)
        ]
        for future in concurrent.futures.as_completed(futures):
            index, content = future.result()
            content = content[0]
            print(content)

            # 在某些情况下大模型还是会输出 ```markdown ```字符串
            if "```markdown" in content:
                content = content.replace("```markdown\n", "")
                last_backticks_pos = content.rfind("```")
                if last_backticks_pos != -1:
                    content = (
                        content[:last_backticks_pos] + content[last_backticks_pos + 3 :]
                    )

            contents[index] = content

    output_path = os.path.join(output_dir, "output.md")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(contents))

    return "\n\n".join(contents)


def parse_pdf(
    pdf_path: str,
    base_output_dir="../../data/gen",
    prompt: Optional[Dict] = None,
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    model: str = "gpt-4o",
    verbose: bool = False,
    gpt_worker: int = 1,
    **args,
) -> Tuple[str, List[str]]:
    """
    Parse a PDF file to a markdown file.
    """

    output_dir = os.path.join(base_output_dir, os.path.basename(pdf_path).split(".")[0])
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    image_infos = _parse_pdf_to_images(pdf_path, output_dir=output_dir)
    print(image_infos)
    content = _gpt_parse_images(
        image_infos=image_infos,
        output_dir=output_dir,
        prompt_dict=prompt,
        api_key=api_key,
        base_url=base_url,
        model=model,
        verbose=verbose,
        gpt_worker=gpt_worker,
        **args,
    )

    all_rect_images = []
    # remove all rect images
    if not verbose:
        for page_image, rect_images in image_infos:
            if os.path.exists(page_image):
                os.remove(page_image)
            all_rect_images.extend(rect_images)
    return content, all_rect_images




In [None]:

DEFAULT_PROMPT = """使用markdown语法，将图片中识别到的文字转换为markdown格式输出。你必须做到：
%s1. 不要解释和输出无关的文字，直接输出图片中的内容。例如，严禁输出 “以下是我根据图片内容生成的markdown文本：”这样的例子，而是应该直接输出markdown。
2. 内容不要包含在```markdown ```中、段落公式使用 $$ $$ 的形式、行内公式使用 $ $ 的形式、忽略掉长直线、忽略掉页码。
再次强调，不要解释和输出无关的文字，直接输出图片中的内容。
"""
DEFAULT_RECT_PROMPT = """图片中用带颜色的矩形框和名称(%s)标注出了一些区域。这些区域要使用 ![]('名称') 的形式插入到markdown内容中，这很重要。
"""
DEFAULT_ROLE_PROMPT = """你是一个PDF文档解析器，使用markdown和latex语法输出图片的内容。
"""


result = parse_pdf(
    pdf_path="../../data/生命四元素.pdf",
    base_output_dir="../../data/gen",
    verbose=False,
    gpt_worker=1,
)

## 使用小钢炮

In [None]:
import os
import re
from typing import List, Tuple, Optional, Dict
import logging
import cv2

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
import fitz
import concurrent.futures

from transformers import AutoModel, AutoTokenizer, AutoProcessor
import numpy as np

from PIL import Image

from rapid_layout import RapidLayout, VisLayout



VLM_PATH = "/mnt/d/weights/MiniCPM-V-2_6-int4"

layout_engine = RapidLayout(conf_thres=0.5, model_type="pp_layout_cdla")

model = AutoModel.from_pretrained(VLM_PATH, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(VLM_PATH, trust_remote_code=True)

min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
processor = AutoProcessor.from_pretrained(
    VLM_PATH,
    trust_remote_code=True,
    min_pixels=min_pixels,
    max_pixels=max_pixels,
)


In [29]:

def _gpt_parse_images(
    image_infos: List[Tuple[str, List[str]]],
    prompt_dict: Optional[Dict] = None,
    output_dir: str = "./",
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    # model: str = 'gpt-4o',
    verbose: bool = False,
    gpt_worker: int = 1,
    **args,
) -> str:
    """
    Parse images to markdown content.
    """

    if isinstance(prompt_dict, dict) and "prompt" in prompt_dict:
        prompt = prompt_dict["prompt"]
        logging.info("prompt is provided, using user prompt.")
    else:
        prompt = DEFAULT_PROMPT
        logging.info("prompt is not provided, using default prompt.")
    if isinstance(prompt_dict, dict) and "rect_prompt" in prompt_dict:
        rect_prompt = prompt_dict["rect_prompt"]
        logging.info("rect_prompt is provided, using user prompt.")
    else:
        rect_prompt = DEFAULT_RECT_PROMPT
        logging.info("rect_prompt is not provided, using default prompt.")
    if isinstance(prompt_dict, dict) and "role_prompt" in prompt_dict:
        role_prompt = prompt_dict["role_prompt"]
        logging.info("role_prompt is provided, using user prompt.")
    else:
        role_prompt = DEFAULT_ROLE_PROMPT
        logging.info("role_prompt is not provided, using default prompt.")

    def _process_page(index: int, image_info: Tuple[str, List[str]]) -> Tuple[int, str]:
        logging.info(f"gpt parse page: {index}")

        # agent = Agent(role=role_prompt, api_key=api_key, base_url=base_url, disable_python_run=True, model=model, **args)
        page_image, rect_images = image_info
        local_prompt = role_prompt + prompt
        if rect_images:
            local_prompt = local_prompt % (rect_prompt % ", ".join(rect_images))
        else:
            local_prompt = local_prompt % ""

        image = Image.open(page_image).convert('RGB')

        msgs = [{'role': 'user', 'content': [image, local_prompt]}]

        res = model.chat(
            image=None,
            msgs=msgs,
            tokenizer=tokenizer,
            processor=processor,
            # system_prompt=role_prompt,
            sampling=True,
            temperature=0.7,
            stream=True
        )

        generated_text = ""
        for new_text in res:
            generated_text += new_text
            print(new_text, flush=True, end='')

        return index, generated_text

    contents = [None] * len(image_infos)
    with concurrent.futures.ThreadPoolExecutor(max_workers=gpt_worker) as executor:
        futures = [
            executor.submit(_process_page, index, image_info)
            for index, image_info in enumerate(image_infos)
        ]
        for future in concurrent.futures.as_completed(futures):
            index, content = future.result()
            print(content)

            # 在某些情况下大模型还是会输出 ```markdown ```字符串
            if "```markdown" in content:
                content = content.replace("```markdown\n", "")
                last_backticks_pos = content.rfind("```")
                if last_backticks_pos != -1:
                    content = (
                        content[:last_backticks_pos] + content[last_backticks_pos + 3 :]
                    )

            contents[index] = content

    output_path = os.path.join(output_dir, "output.md")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(contents))

    return "\n\n".join(contents)



In [30]:
_gpt_parse_images([image_infos[0]], output_dir="../../data/gen1")

2024-09-10 18:34:33,468 - INFO - prompt is not provided, using default prompt.
2024-09-10 18:34:33,469 - INFO - rect_prompt is not provided, using default prompt.
2024-09-10 18:34:33,469 - INFO - role_prompt is not provided, using default prompt.
2024-09-10 18:34:33,470 - INFO - gpt parse page: 0


# Four Elements of Astrology, Psychology

![](0_1.png)

生命四元素  
占星与心理学  

史蒂芬·阿若优 著  
胡因梦 译  

占星心理学大师畅销全球的经典力作  

云南出版集团公司# Four Elements of Astrology, Psychology

![](0_1.png)

生命四元素  
占星与心理学  

史蒂芬·阿若优 著  
胡因梦 译  

占星心理学大师畅销全球的经典力作  

云南出版集团公司


'# Four Elements of Astrology, Psychology\n\n![](0_1.png)\n\n生命四元素  \n占星与心理学  \n\n史蒂芬·阿若优 著  \n胡因梦 译  \n\n占星心理学大师畅销全球的经典力作  \n\n云南出版集团公司'

In [31]:
DEFAULT_PROMPT = """使用markdown语法，将图片中识别到的文字转换为markdown格式输出。你必须做到：
%s1. 不要解释和输出无关的文字，直接输出图片中的内容。例如，严禁输出 “以下是我根据图片内容生成的markdown文本：”这样的例子，而是应该直接输出markdown。
2. 内容不要包含在```markdown ```中、段落公式使用 $$ $$ 的形式、行内公式使用 $ $ 的形式、忽略掉长直线、忽略掉页码。
再次强调，不要解释和输出无关的文字，直接输出图片中的内容。
"""
DEFAULT_RECT_PROMPT = """图片中用带颜色的矩形框和名称(%s)标注出了一些区域。这些区域要使用 ![]('名称') 的形式插入到markdown内容中，这很重要。
"""
DEFAULT_ROLE_PROMPT = """你是一个PDF文档解析器，使用markdown和latex语法输出图片的内容。
"""


result = parse_pdf(
    pdf_path="../../data/生命四元素.pdf",
    base_output_dir="../../data/gen",
    verbose=False,
    gpt_worker=1,
)

2024-09-10 18:37:05,925 - INFO - parse page: 0


[array([ 169.94206096,  191.13674955, 2555.84851203, 2867.47954901])] [0.7510790824890137] ['0_1.png']


2024-09-10 18:37:06,500 - INFO - parse page: 1
2024-09-10 18:37:06,747 - INFO - parse page: 2


[] [] []


2024-09-10 18:37:07,053 - INFO - parse page: 3


[] [] []


2024-09-10 18:37:07,348 - INFO - parse page: 4


[] [] []


2024-09-10 18:37:07,645 - INFO - parse page: 5


[] [] []


2024-09-10 18:37:07,946 - INFO - parse page: 6


[] [] []


2024-09-10 18:37:08,234 - INFO - parse page: 7


[] [] []


2024-09-10 18:37:08,541 - INFO - parse page: 8


[] [] []


2024-09-10 18:37:08,822 - INFO - parse page: 9


[] [] []


2024-09-10 18:37:09,099 - INFO - parse page: 10


[] [] []


2024-09-10 18:37:09,427 - INFO - parse page: 11


[array([ 401.52155752, 2951.1392122 , 2595.89004202, 3811.43398382])] [0.9714148044586182] ['10_1.png']


2024-09-10 18:37:09,860 - INFO - parse page: 12


[array([ 434.74053057,  589.79211234, 2621.14796733, 3543.77794534])] [0.9796044230461121] ['11_1.png']


2024-09-10 18:37:10,273 - INFO - parse page: 13


[array([ 371.23547482, 1823.66235113, 2577.95901314, 3852.0138674 ])] [0.9791556596755981] ['12_1.png']
[array([ 437.43804541,  566.85555348, 2639.36469945, 2296.20674073]), array([ 438.75370886, 2612.67350286, 2659.55175839, 3801.18270579])] [0.958814799785614, 0.9587647318840027] ['13_1.png', '13_2.png']


2024-09-10 18:37:10,725 - INFO - parse page: 14
2024-09-10 18:37:11,136 - INFO - parse page: 15


[array([ 378.10205936,  570.82575883, 2559.93304288, 3585.65486196])] [0.9778209328651428] ['14_1.png']


2024-09-10 18:37:11,458 - INFO - parse page: 16


[array([ 401.69990228,  585.05546272, 2582.1932818 , 2139.04589959])] [0.9434552192687988] ['15_1.png']


2024-09-10 18:37:11,731 - INFO - parse page: 17


[] [] []


2024-09-10 18:37:12,029 - INFO - parse page: 18


[] [] []


2024-09-10 18:37:12,314 - INFO - parse page: 19


[] [] []


2024-09-10 18:37:12,622 - INFO - parse page: 20


[] [] []


2024-09-10 18:37:12,894 - INFO - parse page: 21


[] [] []


2024-09-10 18:37:13,163 - INFO - parse page: 22


[] [] []


2024-09-10 18:37:13,442 - INFO - parse page: 23


[] [] []


2024-09-10 18:37:13,728 - INFO - parse page: 24


[] [] []
[] [] []


2024-09-10 18:37:14,052 - INFO - parse page: 25


[] [] []


2024-09-10 18:37:14,382 - INFO - parse page: 26
2024-09-10 18:37:14,680 - INFO - parse page: 27


[] [] []


2024-09-10 18:37:14,991 - INFO - parse page: 28


[] [] []


2024-09-10 18:37:15,263 - INFO - parse page: 29


[] [] []


2024-09-10 18:37:15,544 - INFO - parse page: 30


[] [] []


2024-09-10 18:37:15,817 - INFO - parse page: 31


[] [] []


2024-09-10 18:37:16,081 - INFO - parse page: 32


[] [] []


2024-09-10 18:37:16,343 - INFO - parse page: 33


[] [] []


2024-09-10 18:37:16,636 - INFO - parse page: 34


[] [] []


2024-09-10 18:37:16,950 - INFO - parse page: 35


[] [] []


2024-09-10 18:37:17,250 - INFO - parse page: 36


[] [] []


2024-09-10 18:37:17,557 - INFO - parse page: 37


[] [] []


2024-09-10 18:37:17,859 - INFO - parse page: 38


[] [] []


2024-09-10 18:37:18,176 - INFO - parse page: 39


[] [] []


2024-09-10 18:37:18,472 - INFO - parse page: 40


[] [] []
[] [] []


2024-09-10 18:37:18,812 - INFO - parse page: 41
2024-09-10 18:37:19,121 - INFO - parse page: 42


[] [] []


2024-09-10 18:37:19,348 - INFO - parse page: 43


[] [] []


2024-09-10 18:37:19,627 - INFO - parse page: 44


[] [] []


2024-09-10 18:37:19,928 - INFO - parse page: 45


[] [] []


2024-09-10 18:37:20,228 - INFO - parse page: 46


[] [] []


2024-09-10 18:37:20,523 - INFO - parse page: 47


[] [] []


2024-09-10 18:37:20,813 - INFO - parse page: 48


[] [] []
[] [] []


2024-09-10 18:37:21,145 - INFO - parse page: 49
2024-09-10 18:37:21,443 - INFO - parse page: 50


[] [] []


2024-09-10 18:37:21,730 - INFO - parse page: 51


[] [] []


2024-09-10 18:37:22,017 - INFO - parse page: 52


[] [] []


2024-09-10 18:37:22,328 - INFO - parse page: 53


[] [] []
[] [] []


2024-09-10 18:37:22,693 - INFO - parse page: 54


[] [] []


2024-09-10 18:37:23,033 - INFO - parse page: 55
2024-09-10 18:37:23,317 - INFO - parse page: 56


[] [] []


2024-09-10 18:37:23,604 - INFO - parse page: 57


[] [] []


2024-09-10 18:37:23,899 - INFO - parse page: 58


[] [] []
[] [] []
[('../../data/gen/生命四元素/0.png', ['0_1.png']), ('../../data/gen/生命四元素/1.png', []), ('../../data/gen/生命四元素/2.png', []), ('../../data/gen/生命四元素/3.png', []), ('../../data/gen/生命四元素/4.png', []), ('../../data/gen/生命四元素/5.png', []), ('../../data/gen/生命四元素/6.png', []), ('../../data/gen/生命四元素/7.png', []), ('../../data/gen/生命四元素/8.png', []), ('../../data/gen/生命四元素/9.png', []), ('../../data/gen/生命四元素/10.png', ['10_1.png']), ('../../data/gen/生命四元素/11.png', ['11_1.png']), ('../../data/gen/生命四元素/12.png', ['12_1.png']), ('../../data/gen/生命四元素/13.png', ['13_1.png', '13_2.png']), ('../../data/gen/生命四元素/14.png', ['14_1.png']), ('../../data/gen/生命四元素/15.png', ['15_1.png']), ('../../data/gen/生命四元素/16.png', []), ('../../data/gen/生命四元素/17.png', []), ('../../data/gen/生命四元素/18.png', []), ('../../data/gen/生命四元素/19.png', []), ('../../data/gen/生命四元素/20.png', []), ('../../data/gen/生命四元素/21.png', []), ('../../data/gen/生命四元素/22.png', []), ('../../data/gen/生命四元素/23.png', []), ('../../data/gen/生命四元素

TypeError: _gpt_parse_images() got an unexpected keyword argument 'api_key'