# Notebook to use RAGDocument and LlamaStackClient to store documents in PGVector

In [None]:
# Upgrade pip before installing the notebook's dependencies.
!pip install --upgrade pip
# boto3 is the S3 client that fetch_from_minio imports to talk to MinIO.
!pip install boto3 pandas
# Pinned Kubeflow Pipelines support packages.
!pip install docstring-parser==0.7.3 kfp-pipeline-spec==0.6.0 kfp-server-api==2.1.0 kubernetes==8.0.0 protobuf==4.21.1 requests-toolbelt==0.8.0
!pip install llama-stack
!pip install sentence-transformers
# Same llama-stack-client pin that the components list in packages_to_install.
!pip install llama-stack-client==0.1.9
!pip install huggingface_hub==0.14.1
!pip install numpy
!pip install pdfplumber
# NOTE(review): the chunking cell imports `docling` and `docling_core`, but
# neither is installed in this cell -- confirm the kernel image provides them.




### Fetch from MinIO

In [187]:
from kfp.dsl import component, InputPath, OutputPath
from kfp.v2 import compiler

@component(
    base_image="python:3.10",
    packages_to_install=["boto3", "pandas", "llama-stack", "httpx", "numpy", "psycopg2", "llama-stack-client==0.1.9"]
)
def fetch_from_minio(
    bucket_name: str,
    file_key: str,
    minio_endpoint: str,
    minio_access_key: str,
    minio_secret_key: str,
    output_file: OutputPath()
):
    """
    Download a single object from a MinIO (S3-compatible) bucket.

    Args:
        bucket_name (str): Name of the bucket that holds the object
        file_key (str): Key of the object inside the bucket
        minio_endpoint (str): Endpoint URL passed to the boto3 S3 client
        minio_access_key (str): Access key used to authenticate
        minio_secret_key (str): Secret key used to authenticate
        output_file (OutputPath): Local path the object is written to
    """
    # Imports live inside the function because the component body is what
    # gets executed in the component's container.
    import boto3
    import os

    s3 = boto3.client(
        "s3",
        endpoint_url=minio_endpoint,
        aws_access_key_id=minio_access_key,
        aws_secret_access_key=minio_secret_key
    )

    # os.path.dirname() returns "" for a bare file name, and os.makedirs("")
    # raises FileNotFoundError, so only create the parent when there is one.
    target_dir = os.path.dirname(output_file)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    s3.download_file(bucket_name, file_key, output_file)
    print(f"File downloaded to: {output_file}")


### Chunk and Store Embeddings in PGVector

In [None]:
from kfp import dsl
from kfp.dsl import component, Input, InputPath, Output, OutputPath
import os
import json
from llama_stack_client import LlamaStackClient
from llama_stack_client.types import Document as LlamaStackDocument

from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.types.doc.labels import DocItemLabel


@component(
    base_image="python:3.10",
    packages_to_install=[
        "llama-stack-client==0.1.9",
        "docling",
        "docling-core"
    ]
)
def chunk_embed_and_store(
    input_file: InputPath(),
    llama_stack_url: str = "http://localhost:8321",
    embedding_model: str = "all-MiniLM-L6-v2",
    embedding_dimension: int = 384,
    provider_id: str = "pgvector",
    vector_db_id: str = "my_document_db"
) -> int:
    """
    Process documents, convert them to chunks, and store in a vector database.

    Document conversion errors, vector DB registration errors and insertion
    errors are printed and swallowed so the step stays best-effort.

    Args:
        input_file (InputPath): Path to the document or a JSON file containing document paths
        llama_stack_url (str): URL for the Llama Stack API
        embedding_model (str): Model to use for embeddings
        embedding_dimension (int): Dimension size for embeddings
        provider_id (str): Provider ID for vector database
        vector_db_id (str): ID for the vector database

    Returns:
        int: Number of chunks inserted into the vector database; 0 when no
        chunk was prepared or when the insertion failed
    """
    # All imports are function-local: a KFP lightweight component runs this
    # function body on its own inside the container, where names imported at
    # notebook level are not available.
    import json
    import os

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
    from docling_core.types.doc.labels import DocItemLabel
    from llama_stack_client import LlamaStackClient
    from llama_stack_client.types import Document as LlamaStackDocument

    # === Step 1: Configure Llama Stack client ===
    client = LlamaStackClient(base_url=llama_stack_url)

    # === Step 2: Process document path ===
    # input_file is either a single document or a JSON file that lists
    # document paths (a bare list, or a dict with an 'input_file' or
    # 'documents' key). Anything else is treated as a single document.
    documents = []
    if os.path.isfile(input_file):
        documents = [input_file]
        if input_file.endswith('.json'):
            try:
                with open(input_file, 'r') as f:
                    file_data = json.load(f)
            except json.JSONDecodeError:
                # Not valid JSON after all: keep treating it as one document.
                file_data = None

            if isinstance(file_data, list):
                documents = file_data
            elif isinstance(file_data, dict):
                # 'input_file' takes precedence over 'documents'.
                for key in ('input_file', 'documents'):
                    if key in file_data:
                        documents = file_data[key]
                        break

            # A single path given as a string must not be iterated
            # character by character.
            if isinstance(documents, str):
                documents = [documents]
    else:
        print(f"Input file not found: {input_file}")

    # === Step 3: Convert, Chunk, and Prepare Documents ===
    # converter format option for the pictures on pdf to be generated as base64
    pipeline_options = PdfPipelineOptions()
    pipeline_options.generate_picture_images = True
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )
    chunker = HybridChunker()
    # Only chunks containing at least one item with these labels are kept.
    text_labels = (DocItemLabel.TEXT, DocItemLabel.PARAGRAPH)
    llama_documents = []

    for file_path in documents:
        print(f"Processing {file_path}...")
        try:
            docling_doc = converter.convert(source=file_path).document
            for chunk in chunker.chunk(docling_doc):
                if not any(item.label in text_labels for item in chunk.meta.doc_items):
                    continue
                # IDs are sequential across all processed files: doc-1, doc-2, ...
                llama_documents.append(
                    LlamaStackDocument(
                        document_id=f"doc-{len(llama_documents) + 1}",
                        content=chunk.text,
                        mime_type="text/plain",
                        metadata={"source": file_path},
                    )
                )
        except Exception as e:
            # Best effort: a document that fails to convert is skipped.
            print(f"Error processing {file_path}: {e}")

    total_chunks = len(llama_documents)
    print(f"Total valid documents prepared: {total_chunks}")

    # === Step 4: Create Vector DB ===
    try:
        client.vector_dbs.register(
            vector_db_id=vector_db_id,
            embedding_model=embedding_model,
            embedding_dimension=embedding_dimension,
            provider_id=provider_id,
        )
        print(f"Vector DB registered successfully: {vector_db_id}")
    except Exception as e:
        # Registration failure is reported but does not stop the insert attempt.
        print(f"Failed to register vector DB '{vector_db_id}': {e}")

    # === Step 5: Insert into Vector DB ===
    if not llama_documents:
        print("No documents to insert; skipping vector DB insertion.")
        return 0

    try:
        client.tool_runtime.rag_tool.insert(
            documents=llama_documents,
            vector_db_id=vector_db_id,
            chunk_size_in_tokens=512,
        )
    except Exception as e:
        print(f"Error inserting documents into RAG tool: {e}")
        return 0

    print("Documents successfully inserted into the vector DB.")
    print(f"Total chunks inserted into vectordb: {total_chunks}")
    # The component declares an int output, so a value must be returned.
    return total_chunks


### Pipeline

In [None]:
from kfp.dsl import pipeline
from kfp.v2 import compiler

@pipeline(name="pipeline-fetch-chunk-embed-store-vector-db")
def full_pipeline(
    bucket_name: str = "llama",
    file_key: str = "abc.pdf",
    minio_endpoint: str = "<mino-api-url>",
    minio_access_key: str = "<minio_username>",
    minio_secret_key: str = "<minio_password>",
    llama_stack_url: str = "<your-llama-stack-url>",
):
    """
    Fetch a document from MinIO, then chunk it and store it in the vector DB.

    Every argument is a pipeline parameter; the defaults are placeholders
    that must be replaced (or overridden at run submission) with real values.

    Args:
        bucket_name (str): Bucket that holds the source document
        file_key (str): Object key of the source document
        minio_endpoint (str): MinIO endpoint URL
        minio_access_key (str): MinIO access key
        minio_secret_key (str): MinIO secret key
        llama_stack_url (str): URL for the Llama Stack API
    """
    fetch_task = fetch_from_minio(
        bucket_name=bucket_name,
        file_key=file_key,
        minio_endpoint=minio_endpoint,
        minio_access_key=minio_access_key,
        minio_secret_key=minio_secret_key,
    )

    # Consuming the fetch output is what orders the second step after the first.
    chunk_embed_and_store(
        input_file=fetch_task.outputs["output_file"],
        llama_stack_url=llama_stack_url,
    )


In [None]:
# Compile the pipeline function into a YAML package file.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=full_pipeline,
    package_path="fetch_chunk_embed_pipeline.yaml",
)