In [15]:
import time
from difflib import SequenceMatcher
import os
import json
import logging
from pathlib import Path
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from docling.chunking import HybridChunker
from docling.datamodel.pipeline_options import PdfPipelineOptions
from llmsherpa.readers import LayoutPDFReader
from paperrag.document.document_loader import DocumentLoader
from paperrag.document.figure_parser import FigureExtractor
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.document_converter import DocumentConverter, PdfFormatOption
import nest_asyncio 

nest_asyncio.apply()

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

IMAGE_RESOLUTION_SCALE = 2.0

In [18]:
# Define parser functions
def docling_parser(document_path):
    """Docling parser implementation."""
    start_time = time.time()
    try:
        converter = DocumentConverter()
        doc = converter.convert(document_path).document
        # output = result.document.export_to_markdown()
        chunker = HybridChunker()
        chunk_iter = chunker.chunk(dl_doc=doc)
        output = chunker.iter
    except Exception as e:
        logger.error(f"Docling parser failed: {e}")
        output = ""
    elapsed_time = time.time() - start_time
    return output, elapsed_time

pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

def docling_images_parser(document_path):
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    start_time = time.time()
    conv_res = doc_converter.convert(document_path)

    chunker = HybridChunker()
    chunk_iter = chunker.chunk(dl_doc=conv_res.document)
    output = chunk_iter

    output_dir = Path("output_images")
    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem

    table_counter = 0
    picture_counter = 0
    for element, _level in conv_res.document.iterate_items():
        if isinstance(element, TableItem):
            table_counter += 1
            element_image_filename = output_dir / f"{doc_filename}-table-{table_counter}.png"
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")
        if isinstance(element, PictureItem):
            picture_counter += 1
            element_image_filename = output_dir / f"{doc_filename}-picture-{picture_counter}.png"
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

    output = conv_res.document.export_to_markdown()
    elapsed_time = time.time() - start_time
    return output, elapsed_time

def llmsherpa_parser(document_path):
    """LLMSherpa parser implementation."""
    llmsherpa_api_url = "http://localhost:5010/api/parseDocument?renderFormat=all"
    pdf_reader = LayoutPDFReader(llmsherpa_api_url)
    start_time = time.time()
    try:
        doc = pdf_reader.read_pdf(document_path)
        doc_json = doc.to_json()
    except Exception as e:
        logger.error(f"LLMSherpa parser failed: {e}")
        output = ""
    elapsed_time = time.time() - start_time
    return output, elapsed_time

async def custom_llmsherpa_parser(document_path): # also include pdf_figures
    """Custom LLMSherpa parser implementation."""
    start_time = time.time()
    #base_dir = os.path.abspath(os.path.dirname(__file__))
    base_dir = os.getcwd()
    output_dir = os.path.join(base_dir, "data", "figures")
    os.makedirs(output_dir, exist_ok=True)

    loader = DocumentLoader(pdf_path=document_path, output_dir=output_dir, async_mode=True)
    try:
        documents, stats = await loader.load_async()
        output = json.dumps([vars(d) for d in documents], indent=4)
    except Exception as e:
        logger.error(f"Custom LLMSherpa parser failed: {e}")
        output = ""
    elapsed_time = time.time() - start_time
    return output, elapsed_time

def pdffigures2_parser(document_path):
    """PDFFigures2 parser implementation."""
    # base_dir = os.path.abspath(os.path.dirname(__file__))
    base_dir = os.getcwd()
    output_dir = os.path.join(base_dir, "data", "figures")
    os.makedirs(output_dir, exist_ok=True)
    extractor = FigureExtractor(document_path, output_dir)
    start_time = time.time()
    try:
        figures = extractor.extract_figures()
        output = json.dumps([vars(fig) for fig in figures], indent=4)
    except Exception as e:
        logger.error(f"PDFFigures2 parser failed: {e}")
        output = ""
    elapsed_time = time.time() - start_time
    return output, elapsed_time



In [19]:
import nest_asyncio
import asyncio

nest_asyncio.apply()

def evaluate_parsers(parsers, document_path):
    """Evaluate parsers for speed and output."""
    results = {}

    for parser_name, parser_function in parsers.items():
        logger.info(f"Evaluating {parser_name}...")

        if asyncio.iscoroutinefunction(parser_function):
            # Handle async parsers
            output, elapsed_time = asyncio.run(parser_function(document_path))
        else:
            output, elapsed_time = parser_function(document_path)

        # Basic metrics (e.g., output length)
        output_length = len(output) if output else 0

        # Store results
        results[parser_name] = {
            "elapsed_time": elapsed_time,
            "output_length": output_length,
            "output": output
        }

    return results

In [20]:
document_path = "/Users/zehrakorkusuz/PaperRAG/data/2501.12691v1.pdf"

# Define parsers
parsers = {
    "Docling": docling_parser,
    "LLMSherpa": llmsherpa_parser,
    "Custom LLMSherpa": custom_llmsherpa_parser, #also include pdf_figures2
    #"PDFFigures2": pdffigures2_parser,
    "Docling Images": docling_images_parser
}

# Run evaluation
evaluation_results = evaluate_parsers(parsers, document_path)

# Print results: Number of tables extracted; number of images extracted; number of figures extracted; qualitative evaluation of the output
for parser_name, metrics in evaluation_results.items():
    logger.info(f"\nResults for {parser_name}:")
    logger.info(f"  Elapsed time: {metrics['elapsed_time']:.2f} seconds")
    logger.info(f"  Output length: {metrics['output_length']} characters")
    logger.info(f"  Sample output: {metrics['output']}")

2025-01-23 20:25:10,816 - __main__ - INFO - Evaluating Docling...
2025-01-23 20:25:10,934 - docling.document_converter - INFO - Going to convert document batch...
2025-01-23 20:25:11,113 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "GET /api/models/ds4sd/docling-models/revision/v2.1.0 HTTP/1.1" 200 1264
2025-01-23 20:25:11,156 - docling.utils.accelerator_utils - INFO - Accelerator device: 'mps'
2025-01-23 20:25:18,003 - docling.utils.accelerator_utils - INFO - Accelerator device: 'mps'
2025-01-23 20:25:20,665 - docling_ibm_models.layoutmodel.layout_predictor - DEBUG - LayoutPredictor settings: {'safe_tensors_file': '/Users/zehrakorkusuz/.cache/huggingface/hub/models--ds4sd--docling-models/snapshots/36bebf56681740529abd09f5473a93a69373fbf0/model_artifacts/layout/model.safetensors', 'device': 'mps', 'num_threads': 4, 'image_size': 640, 'threshold': 0.3}
2025-01-23 20:25:20,670 - docling.utils.accelerator_utils - INFO - Accelerator device: 'mps'
2025-01-23 20:25:21,373 - 

Total figures: 5, Total tables: 0, Total pages: 0, Total time: 0


2025-01-23 20:31:51,728 - docling.document_converter - INFO - Going to convert document batch...
2025-01-23 20:31:51,772 - urllib3.connectionpool - DEBUG - Resetting dropped connection: huggingface.co
2025-01-23 20:31:52,345 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "GET /api/models/ds4sd/docling-models/revision/v2.1.0 HTTP/1.1" 200 1264
2025-01-23 20:31:52,424 - docling.utils.accelerator_utils - INFO - Accelerator device: 'mps'
2025-01-23 20:32:00,207 - docling.utils.accelerator_utils - INFO - Accelerator device: 'mps'
2025-01-23 20:32:03,642 - docling_ibm_models.layoutmodel.layout_predictor - DEBUG - LayoutPredictor settings: {'safe_tensors_file': '/Users/zehrakorkusuz/.cache/huggingface/hub/models--ds4sd--docling-models/snapshots/36bebf56681740529abd09f5473a93a69373fbf0/model_artifacts/layout/model.safetensors', 'device': 'mps', 'num_threads': 4, 'image_size': 640, 'threshold': 0.3}
2025-01-23 20:32:03,654 - docling.utils.accelerator_utils - INFO - Accelerator de

Speed to process
Number of tables extracted
Number of figures extracted 
Figures metadata & text extracted 

File processing
Folder processing

In [21]:
evaluation_results

{'Docling': {'elapsed_time': 87.74744701385498,
  'output_length': 0,
  'output': ''},
 'LLMSherpa': {'elapsed_time': 164.5761911869049,
  'output_length': 0,
  'output': ''},
 'Custom LLMSherpa': {'elapsed_time': 148.26967215538025,
  'output_length': 235551,
  'output': '[\n    {\n        "id": null,\n        "metadata": {\n            "page_idx": 0,\n            "section_title": "Efficient treatment of heterogeneous malignant cell populations > Uzi Harush,1,2 Ravid Straussman3 and Baruch Barzel1,2,4 January 23, 2025",\n            "type": "chunk",\n            "source": "doc_0",\n            "id": "doc_0_p0_chunk_1"\n        },\n        "page_content": "1. Department of Mathematics, Bar-Ilan University, Ramat-Gan, Israel 52900.",\n        "type": "Document"\n    },\n    {\n        "id": null,\n        "metadata": {\n            "page_idx": 0,\n            "section_title": "Efficient treatment of heterogeneous malignant cell populations > Uzi Harush,1,2 Ravid Straussman3 and Baruch B