<a href="https://colab.research.google.com/github/tejusn/doc-ai-exp/blob/main/Docling_Doc_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Docling DocumentExtractor: [Link to docs](https://docling-project.github.io/docling/examples/extraction/#defining-the-extractor)

In [2]:
%pip install -q docling[vlm]  # Install the Docling package with VLM support

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.5/164.5 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.1/15.1 MB[0m [31m87.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.7/42.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m83.0 MB/s[0m eta [36m0

In [3]:
from IPython import display
from pydantic import BaseModel, Field
from rich import print

In [4]:
# Sample document used by every extraction cell below: a Swiss QR-bill image.
file_path = "https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg"

# Build the <img> markup first, then leave the HTML object as the cell's last
# expression so the notebook renders the preview inline.
preview_markup = f"<img src='{file_path}' height='1000'>"
display.HTML(preview_markup)

## Define the extractor

In [5]:
from docling.datamodel.base_models import InputFormat
from docling.document_extractor import DocumentExtractor

# Shared extractor instance, restricted to image and PDF inputs.
extractor = DocumentExtractor(
    allowed_formats=[
        InputFormat.IMAGE,
        InputFormat.PDF,
    ],
)

## Using a string schema

In [7]:
# The template is given as a JSON-encoded string that maps each field name
# to the name of its type.
string_template = '{"bill_no": "string", "total": "float"}'

result = extractor.extract(source=file_path, template=string_template)
print(result.pages)

ERROR:docling.pipeline.extraction_vlm_pipeline:Error processing page 1: CUDA out of memory. Tried to allocate 8.27 GiB. GPU 0 has a total capacity of 14.74 GiB of which 1.65 GiB is free. Process 4487 has 13.09 GiB memory in use. Of the allocated memory 4.70 GiB is allocated by PyTorch, and 8.26 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


ConversionError: Extraction failed for: Swiss_QR-Bill_example.jpg with status: ConversionStatus.FAILURE

## Using a Dict schema

In [None]:
# Same two fields as the string schema, expressed as a plain Python dict.
dict_template = {"bill_no": "string", "total": "float"}

result = extractor.extract(source=file_path, template=dict_template)
print(result.pages)

## Using Pydantic model

In [None]:
from typing import Optional


# Pydantic model describing the fields to pull out of the document.
class Invoice(BaseModel):
    # Required field: examples are supplied, but there is no default value.
    bill_no: str = Field(examples=["A123", "5414"])

    # Field with both a default value and an example.
    total: float = Field(default=10, examples=[20])

    # Optional field; None unless a value is provided.
    tax_id: Optional[str] = Field(default=None, examples=["1234567890"])

In [None]:
# Pass the Invoice class itself (not an instance) as the template.
result = extractor.extract(source=file_path, template=Invoice)
print(result.pages)

### Or pass a model instance to override the field values

In [None]:
# Build an Invoice instance with explicit values for every field and use that
# instance, rather than the class, as the template.
invoice_template = Invoice(bill_no="41", total=100, tax_id="42")

result = extractor.extract(source=file_path, template=invoice_template)
print(result.pages)