In [1]:
"""
由langchain中支持的loader实现加载文档。
"""

from langchain_pymupdf4llm import PyMuPDF4LLMLoader
from langchain_community.document_loaders.parsers import RapidOCRBlobParser
from langchain_community.document_loaders.parsers import LLMImageBlobParser

from langchain_openai import ChatOpenAI

from langchain_core.language_models import BaseChatModel
from langchain_core.documents import Document

import os
from pathlib import Path


class PymupdfTextLoader:
    """
    Load a PDF into langchain ``Document`` objects via ``PyMuPDF4LLMLoader``.

    Three loading methods are available, corresponding to different levels
    of detail:

    * ``'rule'`` - text extraction only;
    * ``'ocr'``  - also extracts images and parses them with
      ``RapidOCRBlobParser``;
    * ``'vlm'``  - also extracts images and parses them with an
      ``LLMImageBlobParser`` backed by a vision-language model.
    """

    def __init__(
        self,
        pdf_path: str | Path,
    ):
        """
        Args:
            pdf_path: Location of the PDF to load; stored as a ``Path``.
        """
        self.pdf_path = Path(pdf_path)
        # Populated by one of the set_*_loader methods (run() does this).
        self.loader = None

    def run(
        self,
        loading_method: str = 'rule',
    ) -> list[Document]:
        """
        Main entry point: configure the requested loader and load the PDF.

        Args:
            loading_method: How to load the PDF; one of 'rule', 'ocr', 'vlm'.

        Returns:
            The langchain ``Document`` objects produced by the loader.
            Per the author's note, the default settings yield a single
            markdown-formatted text object that is later handled by a
            node parser.

        Raises:
            ValueError: If ``loading_method`` is not a supported value.
        """
        setters = {
            'rule': self.set_rule_loader,
            'ocr': self.set_ocr_loader,
            'vlm': self.set_vlm_loader,
        }
        configure = setters.get(loading_method)
        if configure is None:
            # Without this guard self.loader would stay None and the failure
            # would surface as an opaque AttributeError on `None.load()`.
            raise ValueError(
                f"unknown loading_method {loading_method!r}; "
                f"expected one of {list(setters)}"
            )

        print(f"loading text pdf by {loading_method}")
        configure()
        documents: list[Document] = self.loader.load()
        return documents

    def _make_loader(self, images_parser=None):
        """
        Build a ``PyMuPDF4LLMLoader`` with the settings shared by all methods.

        Image extraction is only switched on when ``images_parser`` is given,
        so the text-only loader is created with exactly the same arguments
        as before.
        """
        options = dict(
            file_path=self.pdf_path,
            mode='single',
            table_strategy='lines',
        )
        if images_parser is not None:
            options.update(extract_images=True, images_parser=images_parser)
        return PyMuPDF4LLMLoader(**options)

    def set_rule_loader(self):
        """
        Configure a loader that only extracts the text of the document.
        """
        self.loader = self._make_loader()

    def set_ocr_loader(self):
        """
        Configure a loader that enhances recognition with OCR on images.
        """
        self.loader = self._make_loader(images_parser=RapidOCRBlobParser())

    def set_vlm_loader(self):
        """
        Configure a loader that enhances recognition with a VLM on images.
        """
        self.loader = self._make_loader(
            images_parser=LLMImageBlobParser(model=self._get_vlm()),
        )

    def _get_vlm(self) -> BaseChatModel:
        """
        Create the VLM used to recognise content in the PDF.
        Only used by set_vlm_loader.

        Returns:
            A ``ChatOpenAI`` client for the 'qwen-vl-max' model, configured
            from the DASHSCOPE_API_BASE_URL and DASHSCOPE_API_KEY
            environment variables.

        Raises:
            KeyError: If either environment variable is not set.
        """
        vlm = ChatOpenAI(
            model='qwen-vl-max',
            base_url=os.environ['DASHSCOPE_API_BASE_URL'],
            api_key=os.environ['DASHSCOPE_API_KEY'],
        )
        return vlm



In [2]:
# Wrap the sample PDF in the loader class defined above.
text_loader = PymupdfTextLoader(
    pdf_path=r"D:\dataset\risk_mas_t\original_pdf\1910.13461v1.pdf",
)

In [3]:
# Show the plain-string form of the sample PDF path after a round trip
# through pathlib.
os.fspath(Path(r"D:\dataset\risk_mas_t\original_pdf\1910.13461v1.pdf"))

'D:\\dataset\\risk_mas_t\\original_pdf\\1910.13461v1.pdf'

In [6]:
# Load the sample PDF with the VLM-backed image parser and display the result.
text_loader.run("vlm")

loading text pdf by vlm


[Document(metadata={'producer': 'pdfTeX-1.40.17', 'creator': 'LaTeX with hyperref package', 'creationdate': '2019-10-31T00:48:45+00:00', 'source': 'D:\\dataset\\risk_mas_t\\original_pdf\\1910.13461v1.pdf', 'file_path': 'D:\\dataset\\risk_mas_t\\original_pdf\\1910.13461v1.pdf', 'total_pages': 10, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2019-10-31T00:48:45+00:00', 'trapped': '', 'modDate': 'D:20191031004845Z', 'creationDate': 'D:20191031004845Z'}, page_content='## **BART: Denoising Sequence-to-Sequence Pre-training for Natural** **Language Generation, Translation, and Comprehension**\n### **Mike Lewis*, Yinhan Liu*, Naman Goyal*, Marjan Ghazvininejad,** **Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, Luke Zettlemoyer** Facebook AI { mikelewis,yinhanliu,naman } @fb.com\n\n### **Abstract**\n\nWe present BART, a denoising autoencoder\nfor pretraining sequence-to-sequence models.\nBART is trained by (1) corrupting text with an\narbitrary noi

In [5]:
# Cross-check: build the OCR-backed loader directly, bypassing
# PymupdfTextLoader, with the same options that set_ocr_loader passes.
# To try the VLM variant here instead, pass
# images_parser=LLMImageBlobParser(model=<a chat model>).
loader = PyMuPDF4LLMLoader(
    file_path=Path(r"D:\dataset\risk_mas_t\original_pdf\1910.13461v1.pdf"),
    mode='single',
    extract_images=True,
    images_parser=RapidOCRBlobParser(),
    table_strategy='lines',
)
loader.load()

[Document(metadata={'producer': 'pdfTeX-1.40.17', 'creator': 'LaTeX with hyperref package', 'creationdate': '2019-10-31T00:48:45+00:00', 'source': 'D:\\dataset\\risk_mas_t\\original_pdf\\1910.13461v1.pdf', 'file_path': 'D:\\dataset\\risk_mas_t\\original_pdf\\1910.13461v1.pdf', 'total_pages': 10, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2019-10-31T00:48:45+00:00', 'trapped': '', 'modDate': 'D:20191031004845Z', 'creationDate': 'D:20191031004845Z'}, page_content='## **BART: Denoising Sequence-to-Sequence Pre-training for Natural** **Language Generation, Translation, and Comprehension**\n### **Mike Lewis*, Yinhan Liu*, Naman Goyal*, Marjan Ghazvininejad,** **Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, Luke Zettlemoyer** Facebook AI { mikelewis,yinhanliu,naman } @fb.com\n\n### **Abstract**\n\nWe present BART, a denoising autoencoder\nfor pretraining sequence-to-sequence models.\nBART is trained by (1) corrupting text with an\narbitrary noi