# Notebook to use RAGDocument and LlamaStackClient to store documents in PGVector

In [None]:
!pip install --upgrade pip
!pip install boto3 pandas
!pip install docstring-parser==0.7.3 kfp-pipeline-spec==0.6.0 kfp-server-api==2.1.0 kubernetes==8.0.0 protobuf==4.21.1 requests-toolbelt==0.8.0
!pip install llama-stack
!pip install sentence-transformers
!pip install llama-stack-client==0.1.9
!pip install huggingface_hub==0.14.1
!pip install numpy
!pip install pdfplumber




### Fetch from MinIO

In [187]:
from kfp.dsl import component, InputPath, OutputPath
from kfp.v2 import compiler

@component(
    base_image="python:3.10",
    # The function body only imports boto3 (plus the standard library), so
    # that is the only package the component container has to install.
    packages_to_install=["boto3"],
)
def fetch_from_minio(
    bucket_name: str,
    file_key: str,
    minio_endpoint: str,
    minio_access_key: str,
    minio_secret_key: str,
    output_file: OutputPath()
):
    """Download one object from an S3-compatible (MinIO) bucket.

    Args:
        bucket_name: Name of the bucket that holds the object.
        file_key: Key of the object inside the bucket.
        minio_endpoint: Endpoint URL passed to the boto3 S3 client.
        minio_access_key: Access key used to authenticate.
        minio_secret_key: Secret key used to authenticate.
        output_file: Local path the object is written to; supplied by KFP
            as the component's output artifact path.
    """
    # Imports live inside the function because a KFP lightweight component
    # must be self-contained.
    import os

    import boto3

    s3 = boto3.client(
        "s3",
        endpoint_url=minio_endpoint,
        aws_access_key_id=minio_access_key,
        aws_secret_access_key=minio_secret_key,
    )

    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create the directory when there is one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    s3.download_file(bucket_name, file_key, output_file)
    print(f"File downloaded to: {output_file}")


### Chunk and Store Embeddings in PGVector

In [None]:
from kfp.dsl import component, InputPath, OutputPath
import json
import pandas as pd
import os
from llama_stack_client import LlamaStackClient
from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.apis.vector_io import Chunk

In [None]:
@component(
    base_image="python:3.10",
    # Only the third-party packages imported inside the function body are
    # installed; the body runs in its own container, separate from the notebook.
    packages_to_install=["pdfplumber", "llama-stack-client==0.1.9"],
)
def chunk_embed_and_store(
    input_file: InputPath(),
    llama_stack_url: str = "http://llamastack.llama-stack-rag-2.svc.cluster.local:8321",
    vector_db_id: str = "my_documents_db_1",
):
    """Extract the text of a PDF and insert it into a Llama Stack vector DB.

    The PDF text is extracted with pdfplumber, wrapped in a single
    ``RAGDocument`` and handed to the Llama Stack RAG tool, which is asked to
    split it into 512-token chunks for the ``pgvector`` provider.

    Args:
        input_file: Path of the PDF to ingest (KFP input artifact path).
        llama_stack_url: Base URL of the Llama Stack server.
        vector_db_id: Identifier of the vector database to register and
            insert into.

    Raises:
        ValueError: If no text could be extracted from the PDF.
    """
    # Imports live inside the function because a KFP lightweight component
    # must be self-contained.
    import os
    import uuid

    import pdfplumber
    from llama_stack_client import LlamaStackClient, RAGDocument

    print(f"Reading PDF from: {input_file}")

    with pdfplumber.open(input_file) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]

    # Join pages with a newline so the last word of one page is not fused
    # with the first word of the next; pages without text are skipped.
    full_text = "\n".join(text for text in page_texts if text)

    # Cleaning data: drop characters that cannot be encoded as UTF-8 and NUL
    # bytes.
    full_text = full_text.encode('utf-8', 'ignore').decode('utf-8')
    full_text = full_text.replace('\x00', '')

    if not full_text.strip():
        raise ValueError("No text could be extracted from the PDF.")

    # A UUID keeps document ids unique across runs; a 4-digit random number
    # collides quickly once more than a handful of documents are ingested.
    document = RAGDocument(
        document_id=f"pdf-{uuid.uuid4().hex}-1",
        content=full_text,
        # NOTE(review): the content is already-extracted plain text, so
        # "text/plain" may be the more accurate MIME type - confirm how the
        # server uses this field before changing it.
        mime_type="application/pdf",
        metadata={"source": "rag-pipeline", "filename": os.path.basename(input_file)},
    )

    client = LlamaStackClient(base_url=llama_stack_url)

    # NOTE(review): this registers the vector DB on every run; confirm that
    # re-registering an existing id is accepted by the server.
    response = client.vector_dbs.register(
        vector_db_id=vector_db_id,
        embedding_model="all-MiniLM-L6-v2",
        embedding_dimension=384,
        provider_id="pgvector",
    )
    print(f"Vector database registered: {response}")

    print("Inserting document using rag_tool...")
    client.tool_runtime.rag_tool.insert(
        documents=[document],
        vector_db_id=vector_db_id,
        chunk_size_in_tokens=512,
    )
    print("Document inserted successfully using RAGDocument.")


### Pipeline

In [None]:
from kfp.dsl import pipeline
from kfp.v2 import compiler

@pipeline(name="pipeline-fetch-chunk-embed-store-vector-db")
def full_pipeline(
    bucket_name: str = "llama",
    file_key: str = "abc.pdf",
    minio_endpoint: str = "<minio-api-url>",
    minio_access_key: str = "<minio_username>",
    minio_secret_key: str = "<minio_password>",
):
    """Fetch a PDF from MinIO, then extract its text and store it in the vector DB.

    All parameters default to the values that were previously hard-coded, so
    the pipeline still compiles and runs without arguments; the placeholder
    endpoint and credentials must be replaced (or overridden per run).

    Args:
        bucket_name: Bucket that holds the PDF.
        file_key: Object key of the PDF inside the bucket.
        minio_endpoint: MinIO endpoint URL.
        minio_access_key: MinIO access key.
        minio_secret_key: MinIO secret key.
    """
    # NOTE(review): credentials passed as plain values end up in the compiled
    # pipeline / run parameters; consider sourcing them from a secret instead.
    fetch_step = fetch_from_minio(
        bucket_name=bucket_name,
        file_key=file_key,
        minio_endpoint=minio_endpoint,
        minio_access_key=minio_access_key,
        minio_secret_key=minio_secret_key,
    )

    # Consuming the fetch step's output wires the dependency between the two
    # steps, so the ingest step runs after the download.
    chunk_embed_and_store(input_file=fetch_step.outputs["output_file"])


In [None]:
# Compile the pipeline function into the YAML package
# "fetch_chunk_embed_pipeline.yaml".
compiler.Compiler().compile(
    package_path="fetch_chunk_embed_pipeline.yaml",
    pipeline_func=full_pipeline,
)