## Setup 

In [18]:
import time
import json
import logging
from pathlib import Path
import warnings


# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and runtime warnings raised by
# any imported library; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [19]:
import yaml

In [20]:
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling_core.types.doc import ImageRefMode
from docling.datamodel.settings import settings

from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, EasyOcrOptions, TesseractOcrOptions, OcrMacOptions


In [21]:
# Module-level logger used by the conversion cell for timing and debug messages.
# NOTE(review): none of the visible cells configure a logging handler or level,
# so with Python's default settings the INFO/DEBUG records sent through this
# logger are presumably not displayed — confirm whether that is intended.
_log = logging.getLogger(__name__)

In [22]:
# One entry per processed document; each entry is that document's list of chunks.
docs_chucks = []


def chunking(result):
    """Chunk a converted document, record the chunks, and print a report.

    The document in ``result.document`` is split with a ``HybridChunker``
    whose tokenizer is loaded from ``EMBED_MODEL_ID``. The resulting list of
    chunks is appended to the module-level ``docs_chucks`` list, then every
    chunk is printed twice: its raw text and its contextualized text, each
    with a token count.

    Args:
        result: A conversion result exposing the converted document as
            ``result.document``.

    Returns:
        None. The chunks are made available through ``docs_chucks``.
    """
    from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
    from transformers import AutoTokenizer

    from docling.chunking import HybridChunker

    EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"

    hf_tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)
    tokenizer = HuggingFaceTokenizer(tokenizer=hf_tokenizer)
    chunker = HybridChunker(tokenizer=tokenizer, merge_peers=True)

    # Materialize the chunk iterator so the chunks can be stored and counted.
    chunks = list(chunker.chunk(result.document))
    docs_chucks.append(chunks)

    for index, chunk in enumerate(chunks):
        print(f"=== {index} ===")

        raw_token_count = tokenizer.count_tokens(chunk.text)
        print(f"chunk.text ({raw_token_count} tokens):\n{chunk.text!r}")

        contextualized = chunker.contextualize(chunk=chunk)
        contextualized_token_count = tokenizer.count_tokens(contextualized)
        print(
            f"chunker.contextualize(chunk) ({contextualized_token_count} tokens):\n{contextualized!r}"
        )

        print()

    print(f"The document has been divided into {len(chunks)} chunks.")

In [23]:
def _timed_results(results):
    """Yield ``(result, seconds)`` pairs; ``seconds`` is the time spent waiting for ``result``."""
    iterator = iter(results)
    while True:
        started = time.perf_counter()
        try:
            result = next(iterator)
        except StopIteration:
            return
        yield result, time.perf_counter() - started


def multiformatconversion():
    """Convert the configured input documents, chunk them, and export markdown.

    For every conversion result this function:

    1. logs how long the result took to produce,
    2. passes the result to ``chunking`` (which records and prints the chunks),
    3. writes ``<stem>.md`` and ``<stem>-with-images.md`` into the
       ``parsed-ard-docs`` directory, both using referenced images.

    PDF inputs use a pipeline with table-structure recognition (ACCURATE mode,
    cell matching disabled), Tesseract OCR forced on full pages with English,
    and page/picture image generation at ``IMAGE_RESOLUTION_SCALE``.

    Returns:
        None. Results are observable through the written files, the printed
        messages, and whatever ``chunking`` records.
    """
    input_paths = [
        Path("ARD Docs/data/pdf/aws.pdf"),
        # Path("ARD Docs/md/"),
        # Path("ARD Docs/data/html/"),
        # Path("ARD Docs/data/docx/"),
        # Path("ARD Docs/data/pptx/"),
        # Path("ARD Docs/data/images/"),
        # Path("ARD Docs/data/asciidoc/"),
        # Path("ARD Docs/data/csv/"),
        # Path("ARD Docs/data/xlsx/"),
    ]

    ## for defaults use:
    # doc_converter = DocumentConverter()

    ## to customize use:

    IMAGE_RESOLUTION_SCALE = 2.0

    # Define pipeline options for PDF processing
    pipeline_options = PdfPipelineOptions(
        do_table_structure=True,
        do_ocr=True,  # Enable OCR
        ocr_options=TesseractOcrOptions(force_full_page_ocr=True, lang=["eng"]),
        table_structure_options=dict(
            do_cell_matching=False,
            mode=TableFormerMode.ACCURATE,
        ),
        generate_page_images=True,
        generate_picture_images=True,
        images_scale=IMAGE_RESOLUTION_SCALE,
    )

    doc_converter = DocumentConverter(  # all of the below is optional, has internal defaults.
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE,
            InputFormat.DOCX,
            InputFormat.HTML,
            InputFormat.PPTX,
            InputFormat.ASCIIDOC,
            InputFormat.CSV,
            InputFormat.MD,
            InputFormat.XLSX,
        ],  # whitelist formats, non-matching files are ignored.
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            InputFormat.DOCX: WordFormatOption(
                pipeline_cls=SimplePipeline  # , backend=MsWordDocumentBackend
            ),
        },
    )

    # Enable the profiling to measure the time spent
    settings.debug.profile_pipeline_timings = True

    # Create the output directory once instead of on every iteration.
    out_path = Path("parsed-ard-docs")
    out_path.mkdir(parents=True, exist_ok=True)

    # convert_all() hands back an iterator, so the conversion work happens while
    # the results are consumed. Timing only the call itself (as was done before)
    # measures almost nothing; time the wait for each result instead.
    conv_results = doc_converter.convert_all(input_paths)
    for res, seconds in _timed_results(conv_results):
        _log.info(f"Document converted in {seconds:.2f} seconds.")

        chunking(res)

        _log.debug(res.document._export_to_indented_text(max_text_len=16))

        stem = res.input.file.stem

        # Export Docling document format to markdown. The encoding is explicit
        # so non-ASCII characters do not depend on the platform's default.
        (out_path / f"{stem}.md").write_text(
            res.document.export_to_markdown(image_mode=ImageRefMode.REFERENCED),
            encoding="utf-8",
        )
        res.document.save_as_markdown(
            out_path / f"{stem}-with-images.md",
            image_mode=ImageRefMode.REFERENCED,
        )

        # Report success only after the files have actually been written.
        print(
            f"Document {res.input.file.name} converted."
            f"\nSaved markdown output to: {out_path!s}"
        )

        # with (out_path / f"{res.input.file.stem}.json").open("w") as fp:
        #     fp.write(json.dumps(res.document.export_to_dict()))

        # with (out_path / f"{res.input.file.stem}.yaml").open("w") as fp:
        #     fp.write(yaml.safe_dump(res.document.export_to_dict()))

In [24]:
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from typing import List

# Connection to the LanceDB database stored at the relative path "lancedb/ard".
db = lancedb.connect("lancedb/ard")

# Embedding function from the "sentence-transformers" registry entry, run on CPU.
# The model name matches the tokenizer model used for chunking.
func = (
    get_registry()
    .get("sentence-transformers")
    .create(
        name="sentence-transformers/all-MiniLM-L6-v2",
        device="cpu",
    )
)

In [25]:
# Define a simplified metadata schema
class ChunkMetadata(LanceModel):
    """
    Metadata stored with each chunk: source filename, page numbers and title.

    Every field is nullable. ``embedding`` stores ``None`` for
    ``page_numbers`` when a chunk has no provenance pages and for ``title``
    when a chunk has no headings.

    NOTE(review): the original author states that the fields must be declared
    in alphabetical order and attributes this to the Pydantic implementation.
    That requirement is not verifiable from this file; confirm it against the
    LanceDB documentation before reordering or adding fields.
    """

    # Name of the file the chunk originated from.
    filename: str | None
    # Sorted, de-duplicated page numbers the chunk's items appear on.
    page_numbers: List[int] | None
    # First heading associated with the chunk, if any.
    title: str | None


# Define the main Schema
class Chunks(LanceModel):
    """Row schema of the chunk table: chunk text, its vector, and its metadata."""

    # Declared via the embedding function's SourceField(); presumably the text
    # that gets embedded — confirm against the LanceDB embedding docs.
    text: str = func.SourceField()
    # Vector column whose dimensionality comes from the embedding function's ndims().
    vector: Vector(func.ndims()) = func.VectorField()  # type: ignore
    # Nested metadata record (see ChunkMetadata).
    metadata: ChunkMetadata

In [26]:
def embedding(table_name, chunks):
    """Store ``chunks`` in a LanceDB table and dump the table to ``vectors.csv``.

    Each chunk becomes one row holding its text plus a metadata record with
    the source filename, the sorted unique page numbers of its items (or
    ``None`` when there are none), and its first heading (or ``None``).

    The table is created with ``mode="overwrite"``, so any existing table
    with the same name is replaced. ``vectors.csv`` is written to the current
    working directory and is likewise replaced on every call.

    Args:
        table_name: Name of the LanceDB table to (re)create.
        chunks: Iterable of chunk objects exposing ``text`` and ``meta``.

    Returns:
        None.
    """

    def _page_numbers(chunk):
        # Collect the distinct pages referenced by the chunk's items.
        pages = {
            prov.page_no
            for item in chunk.meta.doc_items
            for prov in item.prov
        }
        # An empty page list is stored as None rather than [].
        return sorted(pages) or None

    processed_chunks = []
    for chunk in chunks:
        row = {
            "text": chunk.text,
            "metadata": {
                "filename": chunk.meta.origin.filename,
                "page_numbers": _page_numbers(chunk),
                "title": chunk.meta.headings[0] if chunk.meta.headings else None,
            },
        }
        processed_chunks.append(row)

    # Create table - this will apply the embedding function automatically
    table = db.create_table(table_name, schema=Chunks, mode="overwrite")

    # Add data - the embedding function will be applied automatically
    table.add(processed_chunks)

    # Export the stored rows (including vectors) for inspection.
    table.to_pandas().to_csv("vectors.csv")


In [27]:
def main():
    """Run the conversion pipeline and embed every collected chunk into ``awsdb``.

    ``multiformatconversion`` fills the module-level ``docs_chucks`` list with
    one list of chunks per converted document. Those lists are flattened and
    embedded with a single ``embedding`` call.

    Returns:
        None.
    """
    # Drop chunks left over from a previous run of this cell so they are not
    # embedded again alongside the fresh ones.
    docs_chucks.clear()

    multiformatconversion()

    # embedding() recreates the table with mode="overwrite", so calling it
    # once per document would keep only the last document's chunks. Embed the
    # chunks of all documents together instead.
    all_chunks = [chunk for chucks in docs_chucks for chunk in chucks]

    # With nothing converted there is nothing to embed; skip the table rebuild.
    if all_chunks:
        embedding("awsdb", all_chunks)

In [28]:
# Entry point: run the whole pipeline when this module is executed as __main__.
if __name__ == "__main__":
    main()

Document aws.pdf converted.
Saved markdown output to: parsed-ard-docs


Token indices sequence length is longer than the specified maximum sequence length for this model (4329 > 512). Running this sequence through the model will result in indexing errors


=== 0 ===
chunk.text (107 tokens):
"Copyright © 2025 Amazon Web Services, Inc. and/or its affiliates. All rights reserved.\nAmazon's trademarks and trade dress may not be used in connection with any product or service that is not Amazon's, in any manner that is likely to cause contusion among customers, or in any manner that disparages or discredits Amazon. All other trademarks not owned by Amazon are the property of their respective owners, who may or may not be affiliated with, connected to, or sponsored by Amazon."
chunker.contextualize(chunk) (117 tokens):
"Overview of Amazon Web Services: AWS Whitepaper\nCopyright © 2025 Amazon Web Services, Inc. and/or its affiliates. All rights reserved.\nAmazon's trademarks and trade dress may not be used in connection with any product or service that is not Amazon's, in any manner that is likely to cause contusion among customers, or in any manner that disparages or discredits Amazon. All other trademarks not owned by Amazon are the property o