# Docling testes 001

## settings

In [1]:
import os
from docling.datamodel.base_models import InputFormat
from docling_core.types.doc import ImageRefMode
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, EasyOcrOptions, TesseractOcrOptions, OcrMacOptions
from docling.datamodel.settings import settings

#"/usr/share/tesseract-ocr/4.00/tessdata"
# Tesseract locates its language data through TESSDATA_PREFIX.
# A `!export ...` line runs in a throwaway subshell and never reaches the kernel
# process, so the variable is set from Python instead. To look up the directory on
# a Debian/Ubuntu machine: dpkg -L tesseract-ocr-eng | grep tessdata$
# Assigning through os.environ (rather than os.putenv) also keeps Python's own view
# of the environment in sync with the value exported to the process.
os.environ["TESSDATA_PREFIX"] = "/usr/share/tesseract-ocr/4.00/tessdata"
print(f"Set TESSDATA_PREFIX={os.environ['TESSDATA_PREFIX']}")
LANG = "por"  # OCR language code: por | eng

Set TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata


In [2]:
class DoclingAuxiliar:
    """Helper that lazily builds and caches a docling ``DocumentConverter``.

    The converter is created on first use with OCR and table-structure
    detection enabled for PDF input, then reused for every later conversion.

    NOTE(review): format options are registered for ``InputFormat.PDF`` only.
    Other input formats (e.g. images) are presumably converted with the
    converter's default settings -- confirm whether the OCR options below
    should apply to them as well.
    """

    # Scale used for generated page/picture images (1 corresponds to 72 DPI).
    IMAGE_RESOLUTION_SCALE = 2.0
    # Cached converter; stays None until get_doc_converter() builds it.
    doc_converter = None
    # OCR language override; None means "use the notebook-level LANG".
    lang = None

    def __init__(self, lang=None):
        """Create the helper without building the converter yet.

        Args:
            lang: Optional OCR language code (e.g. "por" or "eng"). When
                omitted, the module-level ``LANG`` is read at the moment the
                converter is built.
        """
        self.lang = lang

    def get_doc_converter(self):
        """Return the cached ``DocumentConverter``, building it on first call.

        Returns:
            The ``DocumentConverter`` instance stored in ``self.doc_converter``.
        """
        if self.doc_converter is not None:
            return self.doc_converter

        # Fall back to the notebook-level setting only when no explicit
        # language was given, so the class does not silently depend on a global.
        ocr_lang = LANG if self.lang is None else self.lang

        # Define pipeline options for PDF processing
        pipeline_options = PdfPipelineOptions(
            do_table_structure=True,  # Enable table structure detection
            do_ocr=True,  # Enable OCR
            # Full-page OCR with Tesseract in the selected language.
            # Alternatives previously tried:
            #   EasyOcrOptions(force_full_page_ocr=True, lang=["en"])
            #   OcrMacOptions(force_full_page_ocr=True, lang=['en-US'])
            ocr_options=TesseractOcrOptions(force_full_page_ocr=True, lang=[ocr_lang]),
            table_structure_options=dict(
                do_cell_matching=False,  # Use text cells predicted from table structure model
                mode=TableFormerMode.ACCURATE  # Use more accurate TableFormer model
            ),
            generate_page_images=True,  # Enable page image generation
            generate_picture_images=True,  # Enable picture image generation
            images_scale=self.IMAGE_RESOLUTION_SCALE,  # scale=1 corresponds to a standard 72 DPI image
        )

        # Initialize the DocumentConverter with the specified pipeline options
        self.doc_converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
            }
        )
        return self.doc_converter

    def convert_file(self, caminho_arquivo):
        """Convert one file and return the conversion result.

        Args:
            caminho_arquivo: Path of the file to convert; ``None`` is accepted
                and short-circuits to ``None``.

        Returns:
            Whatever ``DocumentConverter.convert`` returns, or ``None`` when no
            path was given or the conversion raised.
        """
        if caminho_arquivo is None:
            return None

        # Built outside the try block on purpose: a failure to construct the
        # converter is a setup error and should surface, not be swallowed.
        converter = self.get_doc_converter()

        try:
            return converter.convert(caminho_arquivo)
        except Exception as e:
            # Best-effort by design: report the failure and let the caller
            # carry on with a None result.
            print(f"Erro ao converter arquivo: {e}")
            return None

## test

In [None]:
# Shared helper instance used by the test cells; its converter is built on first use.
doclingAuxiliar = DoclingAuxiliar()

In [None]:
# Convert a sample PDF and print the result (None is printed if the conversion failed).
print(doclingAuxiliar.convert_file("../../../arquivos_analise/pdfs/LLMs.pdf"))

In [4]:
# Convert a PNG screenshot and print the result (None is printed if the conversion failed).
# NOTE(review): DoclingAuxiliar registers format options for InputFormat.PDF only, so this
# image is presumably converted with the converter's default settings -- confirm.
print(doclingAuxiliar.convert_file("../../../arquivos_analise/imagens/Captura de tela 2025-03-21 110017.png"))

input=InputDocument(file=PurePosixPath('Captura de tela 2025-03-21 110017.png'), document_hash='ac46265842190cfc1fa075d083bbe8eb3093d43a4ef3568fbe905ea0aaf3cbd9', valid=True, limits=DocumentLimits(max_num_pages=9223372036854775807, max_file_size=9223372036854775807, page_range=(1, 9223372036854775807)), format=<InputFormat.IMAGE: 'image'>, filesize=298862, page_count=1) status=<ConversionStatus.SUCCESS: 'success'> errors=[] pages=[Page(page_no=0, size=Size(width=2064.0, height=807.0), cells=[TextCell(index=0, rgba=ColorRGBA(r=0, g=0, b=0, a=255), rect=BoundingRectangle(r_x0=144.33333333333334, r_y0=26.333333333333332, r_x1=518.3333333333334, r_y1=26.333333333333332, r_x2=518.3333333333334, r_y2=0.0, r_x3=144.33333333333334, r_y3=0.0, coord_origin=<CoordOrigin.TOPLEFT: 'TOPLEFT'>), text='@xKGgoKCgoKCIQtAQh#KCgoKCgiIKQI', orig='@xKGgoKCgoKCIQtAQh#KCgoKCgiIKQI', text_direction=<TextDirection.LEFT_TO_RIGHT: 'left_to_right'>, confidence=0.688832851123289, from_ocr=True), TextCell(index=1, r

# SmolDocling-256M-preview

[SmolDocling-256M-preview](https://huggingface.co/ds4sd/SmolDocling-256M-preview)