### OpenParse

In [1]:
!pip install openparse[ml]

Collecting openparse[ml]
  Downloading openparse-0.5.7-py3-none-any.whl.metadata (8.6 kB)
Collecting PyMuPDF>=1.23.2 (from openparse[ml])
  Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting pypdf>=4.0.0 (from openparse[ml])
  Downloading pypdf-4.3.1-py3-none-any.whl.metadata (7.4 kB)
Collecting pdfminer.six>=20200401 (from openparse[ml])
  Downloading pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Collecting tiktoken>=0.3 (from openparse[ml])
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting openai>=1.0.0 (from openparse[ml])
  Downloading openai-1.40.3-py3-none-any.whl.metadata (22 kB)
Collecting httpx<1,>=0.23.0 (from openai>=1.0.0->openparse[ml])
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting jiter<1,>=0.4.0 (from openai>=1.0.0->openparse[ml])
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6

### PDF Parser

In [None]:
import openparse

# PDF to parse; "your.pdf" is presumably a placeholder -- point it at a real file.
basic_doc_path = "/content/your.pdf"

# Parser built with no arguments, then run over the file.
parser = openparse.DocumentParser()
parsed_basic_doc = parser.parse(basic_doc_path)

# Show each extracted node with the notebook's rich display.
for parsed_node in parsed_basic_doc.nodes:
    display(parsed_node)

In [None]:
# Re-open the same file and draw the parsed nodes' bounding boxes on it.
pdf = openparse.Pdf(basic_doc_path)
pdf.display_with_bboxes(parsed_basic_doc.nodes)

In [None]:
# Dump the parsed document into a plain, JSON-like structure for inspection
# (presumably a dict produced by the model's model_dump() -- confirm).
parsed_basic_doc.model_dump()

### TableData Parser

In [None]:
# PDF that contains tables; kept under /content like the basic example above.
doc_with_tables_path = "/content/your.pdf"

# Ask the parser to use the "table-transformers" algorithm for tables.
parser = openparse.DocumentParser(
    table_args={"parsing_algorithm": "table-transformers"}
)
parsed_doc2 = parser.parse(doc_with_tables_path)

# Show each extracted node with the notebook's rich display.
for node in parsed_doc2.nodes:
    display(node)

In [None]:
# Re-open the tables PDF and draw the parsed nodes' bounding boxes on it.
pdf = openparse.Pdf(doc_with_tables_path)
pdf.display_with_bboxes(parsed_doc2.nodes)

In [None]:
# Dump the table-parsed document into a plain structure for inspection.
parsed_doc2.model_dump()

### Metadata Parser

In [None]:
# PDF for this section; kept under /content like the basic example above.
meta_path = "/content/your.pdf"

# Parse with the "pymupdf" table algorithm.
parser = openparse.DocumentParser(table_args={"parsing_algorithm": "pymupdf"})
parsed = parser.parse(meta_path)

# Draw the parsed nodes' bounding boxes on top of the PDF.
doc = openparse.Pdf(file=meta_path)
doc.display_with_bboxes(parsed.nodes)

### Advanced Parser: Custom Pipeline for Combining Adjacent Tables

In [None]:
from openparse import processing, Node
from typing import List


class CustomCombineTables(processing.ProcessingStep):
    """
    Processing step that merges tables that are next to each other.

    Every run of consecutive nodes whose ``variant`` contains ``"table"`` is
    collapsed into a single node using ``+``. All other nodes are passed
    through unchanged and in their original order.
    """

    def process(self, nodes: List[Node]) -> List[Node]:
        """
        Merge runs of adjacent table nodes.

        Args:
            nodes: Nodes in document order.

        Returns:
            A new list in which each run of consecutive table nodes has been
            replaced by one combined node. No node is dropped or emitted
            twice; an empty input yields an empty list.
        """
        print("Combining concurrent tables")
        new_nodes: List[Node] = []
        # Track whether the previous *input* node was a table so the decision
        # never depends on what ``variant`` reports for an already merged node.
        previous_was_table = False
        for node in nodes:
            is_table = "table" in node.variant
            if is_table and previous_was_table:
                # Extend the run: fold this table into the last emitted node.
                new_nodes[-1] = new_nodes[-1] + node
            else:
                new_nodes.append(node)
            previous_was_table = is_table

        return new_nodes


# Start from the library's basic ingestion pipeline and append the
# table-combining step to it.
custom_pipeline = processing.BasicIngestionPipeline()
custom_pipeline.append_transform(CustomCombineTables())

# Parse with the "pymupdf" table algorithm and the customised pipeline.
parser = openparse.DocumentParser(
    table_args={"parsing_algorithm": "pymupdf"}, processing_pipeline=custom_pipeline
)
custom = parser.parse(meta_path)

# Draw the resulting nodes' bounding boxes on top of the PDF.
doc = openparse.Pdf(file=meta_path)
doc.display_with_bboxes(custom.nodes)

# Dump the parsed document (as the other sections do); `doc` is only the
# Pdf object used for rendering.
custom.model_dump()