In [34]:
### Simple RAG Demo

In [1]:
%pip install llama_stack pdfplumber

Collecting llama_stack
  Downloading llama_stack-0.2.12-py3-none-any.whl.metadata (17 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
Collecting fastapi<1.0,>=0.115.0 (from llama_stack)
  Downloading fastapi-0.116.1-py3-none-any.whl.metadata (28 kB)
Collecting fire (from llama_stack)
  Downloading fire-0.7.1-py3-none-any.whl.metadata (5.8 kB)
Collecting huggingface-hub (from llama_stack)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting llama-stack-client>=0.2.12 (from llama_stack)
  Downloading llama_stack_client-0.2.12-py3-none-any.whl.metadata (15 kB)
Collecting openai>=1.66 (from llama_stack)
  Downloading openai-1.106.1-py3-none-any.whl.metadata (29 kB)
Collecting python-dotenv (from llama_stack)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting python-jose (from llama_stack)
  Downloading python_jose-3.5.0-py2.py3-none-any.whl.metadata (5.5 kB)
Collecting pydantic>=2 (from lla

In [2]:
from llama_stack_client import RAGDocument, LlamaStackClient
# Cell 2: extract text from website via SmolDocling (with BeautifulSoup fallback), save to `raw_text`
import time
from bs4 import BeautifulSoup 


In [3]:
deployment_endpoint= "http://lsd-llama-milvus-service:8321"

In [5]:
client = LlamaStackClient(base_url=deployment_endpoint)
models= client.models.list()
print(client.models.list())

model_id = next(m.identifier for m in models if m.model_type == "llm")
embedding_model = next(m for m in models if m.model_type == "embedding")
embedding_model_id = embedding_model.identifier
embedding_dimension = embedding_model.metadata["embedding_dimension"]

vector_db_id = "my_milvus_db"
provider_id  = "milvus"

# ### Do this step only once 


# _ = client.vector_dbs.register(
# vector_db_id=vector_db_id,
# embedding_model=embedding_model_id,
# embedding_dimension=embedding_dimension,
# provider_id=provider_id,
# )
# print(f"Registered vector DB: {vector_db_id}")

INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/models "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/models "HTTP/1.1 200 OK"


[Model(identifier='granite', metadata={}, api_model_type='llm', provider_id='vllm-inference', type='model', provider_resource_id='granite', model_type='llm'), Model(identifier='granite-embedding-125m', metadata={'embedding_dimension': 768.0}, api_model_type='embedding', provider_id='sentence-transformers', type='model', provider_resource_id='ibm-granite/granite-embedding-125m-english', model_type='embedding')]


In [None]:
# provider_id = "sentence-transformers"   # from your providers output
# embedding_model_local_id = "nomic-embed-text-v1.5-768"   # local id you choose
# provider_model_id = "nomic-ai/nomic-embed-text-v1.5"     # HF repo id
# embedding_dimension = 768   # choose 768, 512, 256, 128, or 64 (nomic supports variable dims)

# try:
#     resp = client.models.register(
#         model_id=embedding_model_local_id,
#         provider_id=provider_id,
#         provider_model_id=provider_model_id,
#         model_type="embedding",
#         metadata={"description": "Nomic embed text v1.5 (768-d)", "embedding_dimension": float(embedding_dimension)}
#     )
#     print("Registered embedding model:", embedding_model_local_id, resp)
# except Exception as e:
#     print("Register embedding model error:", e)

# # verify it appears
# print("models after register:", [(m.identifier, m.model_type, getattr(m, "metadata", None)) for m in client.models.list()])


In [9]:
# Cell 2: extract text via SmolDocling (with pdfplumber fallback), save to `raw_text`
def extract_text_with_smol_and_save(pdf_path: str, smol_endpoint: str = SMOLDOCLING_ENDPOINT, out_txt: str = "raw_text.txt") -> str:
    """
    1) Try to POST the PDF to SmolDocling's parse endpoint (assumes POST <endpoint>/parse_pdf accepts multipart/form-data file).
    2) If that fails or returns no text, fallback to local extraction using pdfplumber.
    3) Save the final text to out_txt and also return it (and set raw_text).
    """
    # read bytes
    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()

    # 1) Try SmolDocling parse route
    extracted = ""
    try:
        files = {"file": (pdf_path, pdf_bytes, "application/pdf")}
        # attempt parse route first
        resp = requests.post(smol_endpoint.rstrip("/") + "/parse_pdf", files=files, timeout=300)
        resp.raise_for_status()
        j = resp.json()
        extracted = j.get("text") or j.get("content") or ""
        if extracted and extracted.strip():
            print("Extracted text from SmolDocling /parse_pdf")
    except Exception as e_parse:
        # try a generic predict route if parse_pdf doesn't exist
        try:
            payload = {"file_name": pdf_path}
            # some deployments might accept multipart on /predict or different shapes - try /predict with same files
            resp2 = requests.post(smol_endpoint.rstrip("/") + "/predict", files=files, timeout=300)
            resp2.raise_for_status()
            j2 = resp2.json()
            extracted = j2.get("text") or j2.get("content") or j2.get("generated_text") or ""
            if extracted and extracted.strip():
                print("Extracted text from SmolDocling /predict")
        except Exception as e_predict:
            print("SmolDocling endpoints failed (parse/predict). Falling back to local extraction.")
            # will fallback below

    # 2) Fallback to pdfplumber if Smol didn't return text
    if not extracted or not extracted.strip():
        try:
            with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
                pages = []
                for i, p in enumerate(pdf.pages):
                    pages.append(f"[page:{i+1}]\n" + (p.extract_text() or ""))
            extracted = "\n\n".join(pages)
            print("Extracted text using pdfplumber fallback.")
        except Exception as e_local:
            # if even fallback fails, keep an empty string and raise/log
            print("Local pdfplumber extraction also failed:", e_local)
            extracted = ""

    # 3) Save to file and return
    with open(out_txt, "w", encoding="utf-8") as out_f:
        out_f.write(extracted or "")

    # put into a global variable raw_text for convenience in notebook
    global raw_text
    raw_text = extracted or ""
    print(f"Saved extracted text to {out_txt}; raw_text length = {len(raw_text)} characters.")
    return raw_text




SmolDocling endpoints failed (parse/predict). Falling back to local extraction.
Extracted text using pdfplumber fallback.
Saved extracted text to raw_text.txt; raw_text length = 70894 characters.
--- preview (first 800 chars) ---
[page:1]
State Bank of India
Central Recruitment & Promotion Department
Corporate Centre, Mumbai
Phone: 022-22820427; e-mail: crpd@sbi.co.in
SBI HONOURED AS OVERALL WINNER UNDER “TOP PERFORMING BANK”
CATEGORY AT EA SE 7.0 CITATION CEREMONY
Page 1 of 11

[page:2]
State Bank of India
CENTRAL RECRUITMENT & PROMOTION DEPARTMENT
CORPORATE CENTRE, MUMBAI
(Phone: 022-2282 0427; E-mail: crpd@sbi.co.in)
RECRUITMENT OF JUNIOR ASSOCIATES (CUSTOMER SUPPORT & SALES)
(Advertisement No. CRPD/CR/2025-26/06)
ONLINE REGISTRATION OF APPLICATION AND PAYMENT OF FEES: 06.08.2025 TO 26.08.2025
Applications are invited from eligible Indian Citizens for appointment as Junior Associate (Customer Support & Sales) in clerical cadre in State Bank of India. Candidates
can apply for vacanci

In [None]:
# Execute extraction (will set raw_text)
raw_text = extract_text_with_smol_and_save(pdf_path)
# quick preview
print("--- preview (first 800 chars) ---")
print(raw_text[:800])

In [11]:
document = RAGDocument(
document_id="raw_text_001",
content=raw_text,
mime_type="text/plain",
metadata={"source": "SBI_Doc"},
)

In [12]:
client.tool_runtime.rag_tool.insert(
    documents=[document],
    vector_db_id=vector_db_id,
    chunk_size_in_tokens=100,
)
print("Raw text ingested successfully")

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/tool-runtime/rag-tool/insert "HTTP/1.1 200 OK"


Raw text ingested successfully


In [48]:
# Example RAG query for one-off lookups
query = "What is condition for uploading photographs for SBI RECRUITMENT OF JUNIOR ASSOCIATES (CUSTOMER SUPPORT & SALES 2025 ?"


ground_truth =""" Photograph Image: (4.5 cm x 3.5 cm)
• Photograph must be a recent passport style colour picture.
• Make sure that the picture is in colour, taken against a light-coloured, preferably white,
background.
• Look straight at the camera with a relaxed face
• If picture is taken on a sunny day, have the sun behind you, or place yourself in shade,
so that you are not squinting and there are no harsh shadows
• If you have to use flash, ensure there's no "red-eye"
• If you wear glasses make sure that there are no reflections and your eyes can be
clearly seen.
• Caps, hats and dark glasses are not acceptable. Religious headwear is allowed but it
must not cover your face.
• Dimensions 200 x 230 pixels (preferred)
• Size of file should be between 20 kb–50 kb
• Ensure that size of the scanned image is not more than 50kb. If the size of the file is
more than 50 kb, then adjust the settings of the scanner such as the DPI resolution,
no. of colours etc., during the process of scanning.
• Photo uploaded should be of appropriate size and clearly visible.
• It is advisable that candidate retains about 8 copies of the same photograph
which is uploaded at the time of online application as these would be needed for
further processes of this selection process."""

## LLM Response (With RAG)

In [52]:
# Query chunks
rag_result = client.tool_runtime.rag_tool.query(
    vector_db_ids=[vector_db_id],
    content=query,
)

# Build context
context = "\n\n".join([item.text for item in rag_result.content if hasattr(item, "text")])

# Ask the model
messages = [
    {"role": "system", "content": "Answer using only the provided CONTEXT."},
    {"role": "user", "content": f"CONTEXT:\n{context}\n\nQUESTION:\n{query}"}
]

resp = client.inference.chat_completion(messages=messages, model_id=model_id)

print(resp.completion_message.content)
rag_answer= resp.completion_message.content

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/tool-runtime/rag-tool/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/inference/chat-completion "HTTP/1.1 200 OK"


For the SBI RECRUITMENT OF JUNIOR ASSOCIATES (CUSTOMER SUPPORT & SALES 2025, the conditions for uploading photographs are as follows:

1. The photograph must be of a candidate's face, clearly visible with blue ink.
2. The photo should be in jpg/jpeg format and have dimensions of 240 x 240 pixels in 200 DPI (preferred for required quality, i.e., 3 cm x 3 cm).
3. If the photo is not uploaded at the specified place, admission for the examination will be rejected/denied.
4. The photo should be captured against a light, preferably coloured, and clicked.
5. The photo should not be a small size.
6. The photo should be of appropriate size and clearly visible.
7. Candidates are advised to retain about 8 copies of the same photograph.
8. The photograph will get auto-uploaded in the application form.
9. If the face in the photograph, signature, left thumb impression, or handwritten declaration is unclear or smudged, the candidate's application may be rejected.

The photograph must be uploaded dur

## LLM response (Without RAG)

In [53]:
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": f"\nQUESTION:\n{query}\n\nAnswer concisely:"}
]
resp = client.inference.chat_completion(messages=messages, model_id=model_id)

# Extract final answer text
answer = None
if hasattr(resp, "completion_message") and resp.completion_message:
    answer = resp.completion_message.content
elif hasattr(resp, "choices") and resp.choices:
    answer = resp.choices[0].message.content
else:
    answer = str(resp)

print("Base LLM Answer:\n", answer)

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/inference/chat-completion "HTTP/1.1 200 OK"


Base LLM Answer:
 For SBI Recruitment of Junior Associates (Customer Support & Sales 2025), photographs are not explicitly mentioned as a requirement in the official notification. However, it's advisable to check the latest updates on the State Bank of India (SBI) recruitment website or contact the recruitment department for the most accurate and current information regarding photograph upload conditions.


## Comparison between actual answer , RAG & Without RAG Generated Answer

In [54]:
print("\n=== Ground Truth ===\n")
print(ground_truth)

print("\n=== RAG Generated Answer ===\n")
print(rag_answer)

print("\n=== Base LLM Answer (without RAG) ===\n")
print(answer)


=== Ground Truth ===

 Photograph Image: (4.5 cm x 3.5 cm)
• Photograph must be a recent passport style colour picture.
• Make sure that the picture is in colour, taken against a light-coloured, preferably white,
background.
• Look straight at the camera with a relaxed face
• If picture is taken on a sunny day, have the sun behind you, or place yourself in shade,
so that you are not squinting and there are no harsh shadows
• If you have to use flash, ensure there's no "red-eye"
• If you wear glasses make sure that there are no reflections and your eyes can be
clearly seen.
• Caps, hats and dark glasses are not acceptable. Religious headwear is allowed but it
must not cover your face.
• Dimensions 200 x 230 pixels (preferred)
• Size of file should be between 20 kb–50 kb
• Ensure that size of the scanned image is not more than 50kb. If the size of the file is
more than 50 kb, then adjust the settings of the scanner such as the DPI resolution,
no. of colours etc., during the process of s

### RAG Agent - This is advanced RAG feature present in Llamastack

In [55]:
# 1) Run retrieval explicitly (same tool used by the agent)
rag_result = client.tool_runtime.rag_tool.query(
    vector_db_ids=[vector_db_id],
    content=query,
    query_config={
        "chunk_size_in_tokens": 512,
        "chunk_overlap_in_tokens": 100,
        "chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
    },
)

# 2) Extract the retrieved text chunks (simple join)
retrieved_texts = []
for item in getattr(rag_result, "content", []) or []:
    # item may be dict-like or object; handle both
    txt = item.get("text") if isinstance(item, dict) else getattr(item, "text", None)
    if not txt:
        continue
    # remove our template labels to keep only chunk text
    import re
    cleaned = re.sub(r"^Result\s*\d+\s*", "", txt, flags=re.I)
    cleaned = re.sub(r"Content:\s*", "", cleaned, flags=re.I)
    cleaned = re.sub(r"\nMetadata:.*$", "", cleaned, flags=re.S|re.I)
    cleaned = cleaned.strip()
    if cleaned:
        retrieved_texts.append(cleaned)

context = "\n\n".join(retrieved_texts).strip()

# 3) Pick an LLM model (first available LLM)
models = client.models.list()
model_id = next((m.identifier for m in models if getattr(m, "model_type", None) == "llm"), None)
if model_id is None:
    raise RuntimeError("No LLM model found in client.models.list()")

# 4) Call the inference API (chat completion) with the retrieved context + user question
messages = [
    {"role": "system", "content": "You are a helpful assistant. USE ONLY the provided CONTEXT to answer."},
    {"role": "user", "content": f"CONTEXT:\n{context}\n\nQUESTION:\n{query}\n\nAnswer concisely:"}
]

resp = client.inference.chat_completion(messages=messages, model_id=model_id)

# 5) Extract final answer text from the inference response and print it
final_text = None
if hasattr(resp, "completion_message") and getattr(resp, "completion_message") is not None:
    final_text = getattr(resp.completion_message, "content", None) or getattr(resp.completion_message, "text", None)
if not final_text and hasattr(resp, "choices") and resp.choices:
    c0 = resp.choices[0]
    if hasattr(c0, "message"):
        final_text = getattr(c0.message, "content", None) or getattr(c0.message, "text", None)
if not final_text:
    # fallback to string representation
    final_text = str(resp)

print(final_text.strip())


INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/tool-runtime/rag-tool/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/models "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/inference/chat-completion "HTTP/1.1 200 OK"


For SBI's Recruitment of Junior Associates (Customer Support & Sales 2025), the conditions for uploading photographs are:

1. The photo must be a clear, colored image of the candidate's face, taken against a light.
2. The photo should be of appropriate size, ideally 240 x 240 pixels in 200 DPI (3 cm x 3 cm).
3. The file type should be jpg/jpeg.
4. If the photo is not uploaded at the specified place, admission for the examination will be rejected.
5. The photograph will be auto-uploaded in the application form.
6. If the face in the photograph is unclear or smudged, the candidate's application may be rejected.
7. Candidates are advised to retain 8 copies of the same photograph.


### URL RAG

In [20]:

def extract_text_from_url_with_smol_and_save(
    url: str,
    smol_endpoint: str = SMOLDOCLING_ENDPOINT,
    out_txt: str = "raw_text.txt"
) -> str:
    """
    1) Try to POST the URL to SmolDocling's parse endpoint (assumes POST <endpoint>/parse accepts JSON {"url": <url>}).
    2) If that fails or returns no text, fallback to local extraction using requests + BeautifulSoup.
    3) Save the final text to out_txt and also return it (and set raw_text).
    """
    extracted = ""

    # 1) Try SmolDocling /parse endpoint for URL
    try:
        resp = requests.post(
            smol_endpoint.rstrip("/") + "/parse",
            json={"url": url},
            timeout=300,
        )
        resp.raise_for_status()
        j = resp.json()
        extracted = j.get("text") or j.get("content") or j.get("extracted_text") or ""
        if extracted and extracted.strip():
            print("Extracted text from SmolDocling /parse (URL)")
    except Exception as e_parse:
        try:
            # try /predict endpoint if /parse is not available
            resp2 = requests.post(
                smol_endpoint.rstrip("/") + "/predict",
                json={"url": url},
                timeout=300,
            )
            resp2.raise_for_status()
            j2 = resp2.json()
            extracted = (
                j2.get("text") or j2.get("content") or j2.get("generated_text") or ""
            )
            if extracted and extracted.strip():
                print("Extracted text from SmolDocling /predict (URL)")
        except Exception as e_predict:
            print("SmolDocling endpoints failed for URL (parse/predict). Falling back to BeautifulSoup.")

    # 2) Fallback to BeautifulSoup if Smol didn't return text
    if not extracted or not extracted.strip():
        try:
            r = requests.get(url, timeout=60)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, "html.parser")
            for s in soup(["script", "style", "noscript", "header", "footer", "nav"]):
                s.decompose()
            lines = [ln.strip() for ln in soup.get_text(separator="\n").splitlines() if ln.strip()]
            extracted = "\n\n".join(lines)
            print("Extracted text using BeautifulSoup fallback.")
        except Exception as e_local:
            print("Local BeautifulSoup extraction also failed:", e_local)
            extracted = ""

    # 3) Save to file and return
    with open(out_txt, "w", encoding="utf-8") as out_f:
        out_f.write(extracted or "")

    global raw_text
    raw_text = extracted or ""
    print(f"Saved extracted text to {out_txt}; raw_text length = {len(raw_text)} characters.")
    return raw_text




SmolDocling endpoints failed for URL (parse/predict). Falling back to BeautifulSoup.
Extracted text using BeautifulSoup fallback.
Saved extracted text to raw_text.txt; raw_text length = 28473 characters.
--- preview (first 800 chars) ---
ï»¿

YONO LITE SBI

YONO LITE SBI - Frequently Asked Questions

FAQ Features

ADD AND MANAGE BENEFICIARY

1. How do I add SBI beneficiary?

Please follow the below steps:

Go to 'Settings' >> 'Profile Management' >> 'Add/Manage Beneficiary'.

Give your profile password and click on 'Submit' button.

Click on 'Add' icon on the right hand corner.

Select 'State Bank Account' from dropdown.

Provide the account number and limit and click on 'Submit' button.

Confirm the details in the pre confirm screen.

Click on 'Submit' button.

Provide the OTP received in mobile and click on 'Submit' button.

2. How do I add other bank beneficiary?

Please follow the below steps:

Go to 'Settings' >> 'Profile Management' >> 'Add/Manage Beneficiary'.

Provide your prof

In [None]:
# Example execution
url = "https://mobilityretail.sbi/sbustaticweb/mobile/faq_features.html"
raw_text = extract_text_from_url_with_smol_and_save(url)
print("--- preview (first 800 chars) ---")
print(raw_text[:800])

In [21]:
document = RAGDocument(
document_id="raw_text_002",
content=raw_text,
mime_type="text/plain",
metadata={"source": "SBI_url"},
)

In [22]:
client.tool_runtime.rag_tool.insert(
    documents=[document],
    vector_db_id=vector_db_id,
    chunk_size_in_tokens=100,
)
print("Raw text ingested successfully")

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/tool-runtime/rag-tool/insert "HTTP/1.1 200 OK"


Raw text ingested successfully


In [42]:
query= "In YONO Lite SBI , How can customer do quick transfer using QR code?"

ground_truth= """A facility available in post login section of Yono Lite SBI application, whereby Beneficiary (any SBI customer) can create a QR code. by entering details (Name, A/c Number, IFS Code etc.). The generated QR code can be shared (by the beneficiary) with any SBI customer to send money by scanning the QR code using Yono Lite SBI app:

The remitter logs into Yono Lite SBI application
Select Quick Transfer and Send Money using QR Code.
Scans the QR code on the beneficiary mobile device using his/her smart phone camera.
Or reads the QR code received through various modes from which it can be shared.
The application decodes the QR code and auto populates the beneficiary details and provides information like Name, A/c No, IFSC, etc.


The remitter enters the “Amount” and “Remarks” and proceeds for payment. No prior registration of the beneficiary is required!


Note : Creation and Scanning of QR Code is currently available on Android devices only."""

## LLM Response (With RAG)

In [43]:
# 1) Run retrieval explicitly (same tool used by the agent)
rag_result = client.tool_runtime.rag_tool.query(
    vector_db_ids=[vector_db_id],
    content=query,
    query_config={
        "chunk_size_in_tokens": 512,
        "chunk_overlap_in_tokens": 100,
        "chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
    },
)

# 2) Extract the retrieved text chunks (simple join)
retrieved_texts = []
for item in getattr(rag_result, "content", []) or []:
    # item may be dict-like or object; handle both
    txt = item.get("text") if isinstance(item, dict) else getattr(item, "text", None)
    if not txt:
        continue
    # remove our template labels to keep only chunk text
    import re
    cleaned = re.sub(r"^Result\s*\d+\s*", "", txt, flags=re.I)
    cleaned = re.sub(r"Content:\s*", "", cleaned, flags=re.I)
    cleaned = re.sub(r"\nMetadata:.*$", "", cleaned, flags=re.S|re.I)
    cleaned = cleaned.strip()
    if cleaned:
        retrieved_texts.append(cleaned)

context = "\n\n".join(retrieved_texts).strip()

# 3) Pick an LLM model (first available LLM)
models = client.models.list()
model_id = next((m.identifier for m in models if getattr(m, "model_type", None) == "llm"), None)
if model_id is None:
    raise RuntimeError("No LLM model found in client.models.list()")

# 4) Call the inference API (chat completion) with the retrieved context + user question
messages = [
    {"role": "system", "content": "You are a helpful assistant. USE ONLY the provided CONTEXT to answer."},
    {"role": "user", "content": f"CONTEXT:\n{context}\n\nQUESTION:\n{query}\n\nAnswer concisely:"}
]

resp = client.inference.chat_completion(messages=messages, model_id=model_id)

# 5) Extract final answer text from the inference response and print it
final_text = None
if hasattr(resp, "completion_message") and getattr(resp, "completion_message") is not None:
    final_text = getattr(resp.completion_message, "content", None) or getattr(resp.completion_message, "text", None)
if not final_text and hasattr(resp, "choices") and resp.choices:
    c0 = resp.choices[0]
    if hasattr(c0, "message"):
        final_text = getattr(c0.message, "content", None) or getattr(c0.message, "text", None)
if not final_text:
    # fallback to string representation
    final_text = str(resp)

print(final_text.strip())

rag_answer= final_text.strip()

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/tool-runtime/rag-tool/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/models "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/inference/chat-completion "HTTP/1.1 200 OK"


In YONO Lite SBI, customers can perform a quick transfer using QR code by following these steps:

1. Log into the Yono Lite SBI application using your internet banking credentials.
2. Navigate to the 'Quick Transfer' option.
3. Select 'Send Money using Account Details'.
4. Enter the beneficiary's details manually, including Name, A/c Number, IFSC Code, etc.
5. Confirm the details and complete the transaction.

The remitter, upon receiving the QR code from the beneficiary, can then:

1. Log into the Yono Lite SBI application.
2. Select 'Quick Transfer' and 'Send Money using QR Code'.
3. Scan the QR code on the beneficiary's mobile device using their smartphone camera.
4. The application will decode the QR code and auto-populate the beneficiary's details.

Please note that this feature is currently available only on Android devices.


## LLM Response (Without RAG) 

In [44]:
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": f"\nQUESTION:\n{query}\n\nAnswer concisely:"}
]
resp = client.inference.chat_completion(messages=messages, model_id=model_id)

# Extract final answer text
answer = None
if hasattr(resp, "completion_message") and resp.completion_message:
    answer = resp.completion_message.content
elif hasattr(resp, "choices") and resp.choices:
    answer = resp.choices[0].message.content
else:
    answer = str(resp)

print("Base LLM Answer:\n", answer)

base_answer=answer


INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/inference/chat-completion "HTTP/1.1 200 OK"


Base LLM Answer:
 1. Open YONO Lite SBI app and select the "Transfer" option.
2. Choose "Quick Transfer" or "QR Code Transfer."
3. Enter the recipient's account number and the amount to transfer.
4. Scan the recipient's QR code using the app's camera or manually enter the account number.
5. Review the details and confirm the transfer.
6. The recipient will receive a notification with the transfer details.


## Comparison between Actual Answer , RAG Generated Answer, Without RAG generated answer 

In [46]:
print("\n=== Ground Truth ===\n")
print(ground_truth)

print("\n=== RAG Generated Answer ===\n")
print(rag_answer)

print("\n=== Base LLM Answer (without RAG) ===\n")
print(base_answer)



=== Ground Truth ===

A facility available in post login section of Yono Lite SBI application, whereby Beneficiary (any SBI customer) can create a QR code. by entering details (Name, A/c Number, IFS Code etc.). The generated QR code can be shared (by the beneficiary) with any SBI customer to send money by scanning the QR code using Yono Lite SBI app:

The remitter logs into Yono Lite SBI application
Select Quick Transfer and Send Money using QR Code.
Scans the QR code on the beneficiary mobile device using his/her smart phone camera.
Or reads the QR code received through various modes from which it can be shared.
The application decodes the QR code and auto populates the beneficiary details and provides information like Name, A/c No, IFSC, etc.


The remitter enters the “Amount” and “Remarks” and proceeds for payment. No prior registration of the beneficiary is required!


Note : Creation and Scanning of QR Code is currently available on Android devices only.

=== RAG Generated Answer

In [56]:
### Testing ....
#### Upload URL / pdf file , query & ground truth (actual answer)

# Master parser: handles URL (HTML or remote PDF) and local PDF.
# Requirements: pip install requests beautifulsoup4 pdfplumber

import os, io, time, requests
from bs4 import BeautifulSoup
import pdfplumber

# CONFIG: set your docling endpoint or None to skip docling attempts
SMOLDOCLING_ENDPOINT = "https://docling-dsdemo.apps.cluster-h97qh.h97qh.sandbox1475.opentlc.com/v1"
# Optional: set to False if your environment blocks SSL verification (dev only)
VERIFY_SSL = True

def _try_docling_parse_url(url: str, endpoint: str = SMOLDOCLING_ENDPOINT, timeout: int = 90) -> str:
    if not endpoint:
        return ""
    for path in ("/parse", "/predict", ""):
        try:
            resp = requests.post(endpoint.rstrip("/") + path, json={"url": url}, timeout=timeout, verify=VERIFY_SSL)
            resp.raise_for_status()
            j = resp.json()
            text = j.get("text") or j.get("content") or j.get("extracted_text") or j.get("generated_text") or ""
            if isinstance(text, list):
                text = "\n\n".join(text)
            if text and text.strip():
                return text
        except Exception:
            continue
    return ""

def _try_docling_upload_pdf_bytes(filename: str, pdf_bytes: bytes, endpoint: str = SMOLDOCLING_ENDPOINT, timeout: int = 180) -> str:
    if not endpoint:
        return ""
    files = {"file": (os.path.basename(filename), pdf_bytes, "application/pdf")}
    for path in ("/parse_pdf", "/parse", "/predict", ""):
        try:
            resp = requests.post(endpoint.rstrip("/") + path, files=files, timeout=timeout, verify=VERIFY_SSL)
            resp.raise_for_status()
            j = resp.json()
            text = j.get("text") or j.get("content") or j.get("extracted_text") or j.get("generated_text") or ""
            if isinstance(text, list):
                text = "\n\n".join(text)
            if text and text.strip():
                return text
        except Exception:
            continue
    return ""

def _pdfplumber_extract(pdf_bytes: bytes) -> str:
    try:
        with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
            pages = []
            for i, p in enumerate(pdf.pages):
                pages.append(f"[page:{i+1}]\n" + (p.extract_text() or ""))
        return "\n\n".join(pages)
    except Exception:
        return ""

def _beautifulsoup_extract(html_text: str) -> str:
    try:
        soup = BeautifulSoup(html_text, "html.parser")
        for s in soup(["script", "style", "noscript", "header", "footer", "nav", "aside"]):
            s.decompose()
        lines = [ln.strip() for ln in soup.get_text(separator="\n").splitlines() if ln.strip()]
        return "\n\n".join(lines)
    except Exception:
        return ""

def parse_input_doc(input_doc: str, smol_endpoint: str = SMOLDOCLING_ENDPOINT, out_txt: str = "raw_text.txt") -> str:
    """
    Parse an input document (URL or local PDF path).
    Returns extracted text (string) and saves to out_txt. Also sets global raw_text.
    """
    extracted = ""
    is_url = isinstance(input_doc, str) and input_doc.lower().startswith(("http://", "https://"))
    is_local_file = isinstance(input_doc, str) and os.path.exists(input_doc) and input_doc.lower().endswith(".pdf")

    # 1) If it's a URL: attempt Docling remote parse first (works for HTML or remote PDFs)
    if is_url:
        print(f"[parse] Input is a URL: {input_doc}")

        # Try Docling remote parse (best-first)
        if smol_endpoint:
            try:
                extracted = _try_docling_parse_url(input_doc, endpoint=smol_endpoint)
                if extracted:
                    print("[parse] extracted via SmolDocling (remote URL).")
            except Exception:
                extracted = ""

        # If docling returned nothing -> try fetching HTML then BeautifulSoup
        if not extracted:
            # If the URL looks like a PDF (endswith .pdf or content-type), try remote PDF docling/upload fallback
            if input_doc.lower().endswith(".pdf"):
                print("[parse] URL looks like a PDF; attempting remote PDF Docling or fallback.")
                try:
                    # try docling again with upload-style (some endpoints accept remote pdf via url param; we try parse_url first above)
                    extracted = _try_docling_parse_url(input_doc, endpoint=smol_endpoint)
                except Exception:
                    extracted = ""

            # If still nothing, try plain HTTP fetch + BeautifulSoup
            if not extracted:
                try:
                    headers = {"User-Agent": "Mozilla/5.0"}
                    r = requests.get(input_doc, headers=headers, timeout=60, verify=VERIFY_SSL)
                    r.raise_for_status()
                    html = r.text or ""
                    extracted = _beautifulsoup_extract(html)
                    if extracted:
                        print("[parse] extracted via BeautifulSoup fallback (URL).")
                except Exception as e:
                    print(f"[parse] BeautifulSoup/http fallback failed for URL: {e}")
                    extracted = ""

    # 2) If it's a local PDF file: try upload to Docling first, then pdfplumber fallback
    elif is_local_file:
        print(f"[parse] Input is a local PDF: {input_doc}")
        try:
            with open(input_doc, "rb") as f:
                pdf_bytes = f.read()
        except Exception as e:
            print("[parse] Failed to read local PDF:", e)
            pdf_bytes = None

        if pdf_bytes:
            # try docling upload parse
            if smol_endpoint:
                try:
                    extracted = _try_docling_upload_pdf_bytes(input_doc, pdf_bytes, endpoint=smol_endpoint)
                    if extracted:
                        print("[parse] extracted via SmolDocling (local PDF upload).")
                except Exception:
                    extracted = ""

            # fallback to pdfplumber
            if not extracted:
                try:
                    extracted = _pdfplumber_extract(pdf_bytes)
                    if extracted:
                        print("[parse] extracted via pdfplumber fallback (local PDF).")
                except Exception as e:
                    print("[parse] pdfplumber fallback failed:", e)
                    extracted = ""

    # 3) If it is a local non-pdf path or unknown: try to read as text
    else:
        # Might be a raw text input; return it as-is
        if os.path.exists(input_doc):
            try:
                with open(input_doc, "r", encoding="utf-8") as f:
                    extracted = f.read()
                print("[parse] Input is a local file (non-PDF), read as text.")
            except Exception as e:
                print("[parse] Failed to read local file:", e)
                extracted = ""
        else:
            # treat as inline text
            extracted = str(input_doc)
            print("[parse] Input treated as inline text.")

    # 4) persist and return
    try:
        with open(out_txt, "w", encoding="utf-8") as out_f:
            out_f.write(extracted or "")
    except Exception as e:
        print("[parse] Failed to write output file:", e)

    global raw_text
    raw_text = extracted or ""
    print(f"[parse] Saved text to {out_txt}; length = {len(raw_text)} characters.")
    return raw_text

# ---------------- Example usage ----------------
# local PDF: parse_input_doc("sample.pdf")
# remote PDF: parse_input_doc("https://example.com/some.pdf")
# web page: parse_input_doc("https://www.onlinesbi.sbi/sbf_retail.html")
# inline text: parse_input_doc("This is a short note.")

In [58]:
raw_text= parse_input_doc("https://www.onlinesbi.sbi/sbf_retail.html")
raw_text[0:800]

[parse] Input is a URL: https://www.onlinesbi.sbi/sbf_retail.html
[parse] extracted via BeautifulSoup fallback (URL).
[parse] Saved text to raw_text.txt; length = 222 characters.


'This question is for testing whether you are a human visitor and to prevent automated spam submission.\n\nAudio is not supported in your browser.\n\nWhat code is in the image?\n\nsubmit\n\nYour support ID is:  3273872632185551290.'

In [None]:
document = RAGDocument(
document_id="raw_text_10",
content=raw_text,
mime_type="text/plain",
metadata={"source": "SBI_url"},
)

client.tool_runtime.rag_tool.insert(
    documents=[document],
    vector_db_id=vector_db_id,
    chunk_size_in_tokens=100,
)
print("Raw text ingested successfully")


# 1) Run retrieval explicitly (same tool used by the agent)
rag_result = client.tool_runtime.rag_tool.query(
    vector_db_ids=[vector_db_id],
    content=query,
    query_config={
        "chunk_size_in_tokens": 512,
        "chunk_overlap_in_tokens": 100,
        "chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
    },
)

# 2) Extract the retrieved text chunks (simple join)
retrieved_texts = []
for item in getattr(rag_result, "content", []) or []:
    # item may be dict-like or object; handle both
    txt = item.get("text") if isinstance(item, dict) else getattr(item, "text", None)
    if not txt:
        continue
    # remove our template labels to keep only chunk text
    import re
    cleaned = re.sub(r"^Result\s*\d+\s*", "", txt, flags=re.I)
    cleaned = re.sub(r"Content:\s*", "", cleaned, flags=re.I)
    cleaned = re.sub(r"\nMetadata:.*$", "", cleaned, flags=re.S|re.I)
    cleaned = cleaned.strip()
    if cleaned:
        retrieved_texts.append(cleaned)

context = "\n\n".join(retrieved_texts).strip()

# 3) Pick an LLM model (first available LLM)
models = client.models.list()
model_id = next((m.identifier for m in models if getattr(m, "model_type", None) == "llm"), None)
if model_id is None:
    raise RuntimeError("No LLM model found in client.models.list()")

# 4) Call the inference API (chat completion) with the retrieved context + user question
messages = [
    {"role": "system", "content": "You are a helpful assistant. USE ONLY the provided CONTEXT to answer."},
    {"role": "user", "content": f"CONTEXT:\n{context}\n\nQUESTION:\n{query}\n\nAnswer concisely:"}
]

resp = client.inference.chat_completion(messages=messages, model_id=model_id)

# 5) Extract final answer text from the inference response and print it
final_text = None
if hasattr(resp, "completion_message") and getattr(resp, "completion_message") is not None:
    final_text = getattr(resp.completion_message, "content", None) or getattr(resp.completion_message, "text", None)
if not final_text and hasattr(resp, "choices") and resp.choices:
    c0 = resp.choices[0]
    if hasattr(c0, "message"):
        final_text = getattr(c0.message, "content", None) or getattr(c0.message, "text", None)
if not final_text:
    # fallback to string representation
    final_text = str(resp)

print(final_text.strip())

rag_answer= final_text.strip()


messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": f"\nQUESTION:\n{query}\n\nAnswer concisely:"}
]
resp = client.inference.chat_completion(messages=messages, model_id=model_id)

# Extract final answer text
answer = None
if hasattr(resp, "completion_message") and resp.completion_message:
    answer = resp.completion_message.content
elif hasattr(resp, "choices") and resp.choices:
    answer = resp.choices[0].message.content
else:
    answer = str(resp)

print("Base LLM Answer:\n", answer)

base_answer=answer

# Testing (In case if you want to test the code go to below cell)

In [62]:
# Single cell: two main functions - ingest_document(...) and answer_query_with_eval(...)
# Requirements (install if missing): pip install requests beautifulsoup4 pdfplumber sklearn llama-stack-client

import os, io, time, uuid, re, requests
from typing import List, Tuple, Dict, Any, Optional
from bs4 import BeautifulSoup
import pdfplumber
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from llama_stack_client import RAGDocument

# ---------- CONFIG (edit for your environment) ----------
SMOLDOCLING_ENDPOINT = globals().get("SMOLDOCLING_ENDPOINT", None)  # set to your docling URL or None
VECTOR_DB_ID = globals().get("vector_db_id", globals().get("VECTOR_DB_ID", None))
CLIENT = globals().get("client", None)  # must be LlamaStackClient already instantiated
VERIFY_SSL = True  # set False only in dev where TLS issues exist
# Ingestion/retrieval defaults tuned for good behavior:
INGEST_CHUNK_SIZE = 512
INGEST_CHUNK_OVERLAP = 50
RETRIEVE_CHUNK_SIZE = 512
RETRIEVE_CHUNK_OVERLAP = 50
RETRIEVE_TOP_K = 6
INGEST_BATCH_DELAY = 0.5
# --------------------------------------------------------

# ------------------ Helpers ------------------
def _try_docling_parse_url(url: str, endpoint: Optional[str], timeout: int = 90) -> str:
    if not endpoint:
        return ""
    for path in ("/parse", "/predict", ""):
        try:
            resp = requests.post(endpoint.rstrip("/") + path, json={"url": url}, timeout=timeout, verify=VERIFY_SSL)
            resp.raise_for_status()
            j = resp.json()
            text = j.get("text") or j.get("content") or j.get("extracted_text") or j.get("generated_text") or ""
            if isinstance(text, list):
                text = "\n\n".join(text)
            if text and text.strip():
                return text
        except Exception:
            continue
    return ""

def _try_docling_upload_pdf_bytes(filename: str, pdf_bytes: bytes, endpoint: Optional[str], timeout: int = 180) -> str:
    if not endpoint:
        return ""
    files = {"file": (os.path.basename(filename), pdf_bytes, "application/pdf")}
    for path in ("/parse_pdf", "/parse", "/predict", ""):
        try:
            resp = requests.post(endpoint.rstrip("/") + path, files=files, timeout=timeout, verify=VERIFY_SSL)
            resp.raise_for_status()
            j = resp.json()
            text = j.get("text") or j.get("content") or j.get("extracted_text") or j.get("generated_text") or ""
            if isinstance(text, list):
                text = "\n\n".join(text)
            if text and text.strip():
                return text
        except Exception:
            continue
    return ""

def _pdfplumber_extract_from_bytes(pdf_bytes: bytes) -> str:
    try:
        with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
            pages = []
            for i, p in enumerate(pdf.pages):
                pages.append(f"[page:{i+1}]\n" + (p.extract_text() or ""))
        return "\n\n".join(pages)
    except Exception:
        return ""

def _beautifulsoup_extract(html_text: str) -> str:
    try:
        soup = BeautifulSoup(html_text, "html.parser")
        for s in soup(["script", "style", "noscript", "header", "footer", "nav", "aside"]):
            s.decompose()
        lines = [ln.strip() for ln in soup.get_text(separator="\n").splitlines() if ln.strip()]
        return "\n\n".join(lines)
    except Exception:
        return ""

def _tfidf_cosine(a: str, b: str) -> float:
    try:
        vect = TfidfVectorizer().fit([a or "", b or ""])
        tfidf = vect.transform([a or "", b or ""])
        sim = cosine_similarity(tfidf[0], tfidf[1])[0][0]
        return float(sim)
    except Exception:
        return 0.0

def _choose_llm_model_id(client) -> str:
    models = client.models.list()
    model_id = next((m.identifier for m in models if getattr(m, "model_type", None) == "llm"), None)
    if model_id is None and models:
        model_id = models[0].identifier
    return model_id

def _clean_rag_template_text(raw_text: str) -> str:
    # remove "Result {i} Content: ... Metadata: ..." wrappers used by chunk_template
    cleaned = re.sub(r"^Result\s*\d+\s*", "", raw_text, flags=re.I|re.M)
    cleaned = re.sub(r"Content:\s*", "", cleaned, flags=re.I)
    cleaned = re.sub(r"\nMetadata:.*$", "", cleaned, flags=re.S|re.I)
    return cleaned.strip()

# --------------- Main function 1: ingest_document ----------------
def ingest_document(
    input_doc: str,
    client = CLIENT,
    vector_db_id: str = VECTOR_DB_ID,
    smol_endpoint: Optional[str] = SMOLDOCLING_ENDPOINT,
    chunk_size_in_tokens: int = INGEST_CHUNK_SIZE,
    chunk_overlap_in_tokens: int = INGEST_CHUNK_OVERLAP,
    out_txt: str = "raw_text.txt",
) -> Dict[str, Any]:
    """
    Parse input_doc (URL / local PDF / inline text), create a RAGDocument with metadata,
    insert into vector DB via client.tool_runtime.rag_tool.insert(), and return {document_id, insert_response, source_type}.
    """
    if client is None:
        raise RuntimeError("client (LlamaStackClient) is not provided. Set 'client' variable.")

    # detect type
    is_url = isinstance(input_doc, str) and input_doc.lower().startswith(("http://", "https://"))
    is_local_pdf = isinstance(input_doc, str) and os.path.exists(input_doc) and input_doc.lower().endswith(".pdf")

    # parse text (Docling-first)
    extracted = ""
    parser_used = None

    # URL handling (remote PDF or webpage)
    if is_url:
        # Remote PDF?
        if input_doc.lower().endswith(".pdf"):
            # fetch bytes then try docling upload or local extraction
            try:
                r = requests.get(input_doc, timeout=60, verify=VERIFY_SSL)
                r.raise_for_status()
                pdf_bytes = r.content
            except Exception as e:
                pdf_bytes = None
            if pdf_bytes:
                # try docling upload (remote pdf)
                parsed = _try_docling_upload_pdf_bytes(input_doc, pdf_bytes, endpoint=smol_endpoint)
                if parsed:
                    extracted = parsed; parser_used = "docling_remote_pdf"
                else:
                    # fallback to pdfplumber
                    extracted = _pdfplumber_extract_from_bytes(pdf_bytes)
                    parser_used = "pdfplumber_remote_pdf" if extracted else None
        else:
            # try docling parse(url)
            parsed = _try_docling_parse_url(input_doc, endpoint=smol_endpoint)
            if parsed:
                extracted = parsed; parser_used = "docling_url"
            else:
                # fallback to BeautifulSoup
                try:
                    r = requests.get(input_doc, timeout=60, verify=VERIFY_SSL)
                    r.raise_for_status()
                    extracted = _beautifulsoup_extract(r.text)
                    parser_used = "bs4_url" if extracted else None
                except Exception:
                    extracted = ""

    # local PDF handling
    elif is_local_pdf:
        try:
            with open(input_doc, "rb") as f:
                pdf_bytes = f.read()
        except Exception:
            pdf_bytes = None
        if pdf_bytes:
            parsed = _try_docling_upload_pdf_bytes(input_doc, pdf_bytes, endpoint=smol_endpoint)
            if parsed:
                extracted = parsed; parser_used = "docling_local_pdf"
            else:
                extracted = _pdfplumber_extract_from_bytes(pdf_bytes)
                parser_used = "pdfplumber_local_pdf" if extracted else None

    # otherwise treat as inline text (or path to text)
    else:
        if isinstance(input_doc, str) and os.path.exists(input_doc):
            try:
                with open(input_doc, "r", encoding="utf-8") as f:
                    extracted = f.read()
                parser_used = "local_text_file"
            except Exception:
                extracted = str(input_doc); parser_used = "inline_text_fallback"
        else:
            extracted = str(input_doc or ""); parser_used = "inline_text"

    # save extracted text
    try:
        with open(out_txt, "w", encoding="utf-8") as o:
            o.write(extracted or "")
    except Exception:
        pass

    # prepare metadata
    document_id = f"doc-{uuid.uuid4().hex[:8]}"
    if is_url:
        source_type = "pdf_url" if input_doc.lower().endswith(".pdf") else "url"
    elif is_local_pdf:
        source_type = "pdf_local"
    else:
        source_type = "text_inline"

    metadata = {
        "source": input_doc,
        "source_type": source_type,
        "parser": parser_used,
        "document_id": document_id,
    }

    # Create RAGDocument object (explicit text content) - server will chunk & embed
    rag_doc = RAGDocument(
        document_id=document_id,
        content={"type": "text", "text": extracted or ""},
        mime_type="text/plain",
        metadata=metadata,
    )

    # Insert into vector DB
    try:
        resp = client.tool_runtime.rag_tool.insert(
            documents=[rag_doc],
            vector_db_id=vector_db_id,
            chunk_size_in_tokens=chunk_size_in_tokens,
        )
        time.sleep(INGEST_BATCH_DELAY)
    except Exception as e:
        # bubble up but include context
        raise RuntimeError(f"Insert failed: {e}")

    return {"document_id": document_id, "insert_response": resp, "source_type": source_type, "parser_used": parser_used}

# --------------- Main function 2: answer_query_with_eval ----------------
def answer_query_with_eval(
    query: str,
    ground_truth: str,
    document_id: str,
    client = CLIENT,
    vector_db_id: str = VECTOR_DB_ID,
    retrieve_top_k: int = RETRIEVE_TOP_K,
    retrieve_chunk_size: int = RETRIEVE_CHUNK_SIZE,
    retrieve_chunk_overlap: int = RETRIEVE_CHUNK_OVERLAP,
    llm_model_id: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Given a user query, ground_truth, and a single previously-ingested document_id (in vector DB),
    perform:
      - retrieval from vector DB (rag_tool.query)
      - RAG answer (LLM with retrieved context)
      - Base LLM answer (no retrieval)
      - Basic eval metrics (TF-IDF cosine to ground_truth, lengths, grounding proxy)
    Returns dict containing rag_answer, base_answer, metrics, retrieved_chunks, document_id.
    """
    if client is None:
        raise RuntimeError("client is not provided.")

    # 1) retrieval
    rag_result = client.tool_runtime.rag_tool.query(
        vector_db_ids=[vector_db_id],
        content=query,
        query_config={
            "chunk_size_in_tokens": retrieve_chunk_size,
            "chunk_overlap_in_tokens": retrieve_chunk_overlap,
            "chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
            "top_k": retrieve_top_k,
        },
    )

    # extract cleaned retrieved chunks
    retrieved_chunks = []
    for item in getattr(rag_result, "content", []) or []:
        raw_txt = item.get("text") if isinstance(item, dict) else getattr(item, "text", None)
        if not raw_txt:
            continue
        cleaned = _clean_rag_template_text(raw_txt)
        if cleaned:
            retrieved_chunks.append(cleaned)

    # Build context for RAG LLM (label sources)
    context_parts = []
    for i, chunk in enumerate(retrieved_chunks[:retrieve_top_k], start=1):
        # try to extract metadata document id from chunk object if available in raw event
        # simpler: just mark SRC_i
        context_parts.append(f"[SRC_{i}]\n{chunk}")
    context_text = "\n\n---\n\n".join(context_parts).strip()

    # 2) pick LLM model
    model_id = llm_model_id or _choose_llm_model_id(client)
    if not model_id:
        raise RuntimeError("No LLM model available via client.models.list()")

    # 3) RAG answer - instruct to use only context
    if context_text:
        messages_rag = [
            {"role": "system", "content": "You are a helpful assistant. USE ONLY the provided CONTEXT to answer the question. When using context, append inline citations like [SRC_1]. If the answer is not present in the context, say 'I don't know.'"},
            {"role": "user", "content": f"CONTEXT:\n{context_text}\n\nQUESTION:\n{query}\n\nAnswer concisely and include inline citations where used."}
        ]
    else:
        messages_rag = [
            {"role": "system", "content": "You are a helpful assistant. No retrieved context was found."},
            {"role": "user", "content": f"QUESTION:\n{query}\n\nAnswer concisely:"}
        ]

    resp_rag = client.inference.chat_completion(messages=messages_rag, model_id=model_id)
    # robust extraction
    rag_answer = None
    if hasattr(resp_rag, "completion_message") and getattr(resp_rag, "completion_message") is not None:
        rag_answer = getattr(resp_rag.completion_message, "content", None) or getattr(resp_rag.completion_message, "text", None)
    elif hasattr(resp_rag, "choices") and resp_rag.choices:
        c0 = resp_rag.choices[0]
        rag_answer = getattr(c0.message, "content", None) or getattr(c0.message, "text", None)
    else:
        rag_answer = str(resp_rag)
    rag_answer = (rag_answer or "").strip()

    # 4) Base LLM answer (no context)
    messages_base = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"QUESTION:\n{query}\n\nAnswer concisely:"}
    ]
    resp_base = client.inference.chat_completion(messages=messages_base, model_id=model_id)
    base_answer = None
    if hasattr(resp_base, "completion_message") and getattr(resp_base, "completion_message") is not None:
        base_answer = getattr(resp_base.completion_message, "content", None) or getattr(resp_base.completion_message, "text", None)
    elif hasattr(resp_base, "choices") and resp_base.choices:
        c0 = resp_base.choices[0]
        base_answer = getattr(c0.message, "content", None) or getattr(c0.message, "text", None)
    else:
        base_answer = str(resp_base)
    base_answer = (base_answer or "").strip()

    # 5) Evaluation metrics
    sim_rag_gt = _tfidf_cosine(rag_answer, ground_truth)
    sim_base_gt = _tfidf_cosine(base_answer, ground_truth)
    # grounding proxy: fraction of sentences in rag_answer that contain a 4-word substring from any retrieved chunk
    def grounding_fraction(answer_text: str, chunks: List[str]) -> float:
        if not answer_text.strip() or not chunks:
            return 0.0
        sents = re.split(r"[.?!]\s+", answer_text)
        matched = 0
        for sent in sents:
            sent_low = sent.lower()
            found = False
            # check n-grams of length 4 from each chunk
            for ch in chunks:
                tokens = ch.split()
                for i in range(max(0, len(tokens)-3)):
                    phrase = " ".join(tokens[i:i+4]).lower()
                    if phrase and phrase in sent_low:
                        found = True
                        break
                if found:
                    break
            if found:
                matched += 1
        return matched / max(1, len(sents))

    grounding_score = grounding_fraction(rag_answer, retrieved_chunks)

    metrics = {
        "sim_rag_vs_ground_truth_tfidf": round(sim_rag_gt, 4),
        "sim_base_vs_ground_truth_tfidf": round(sim_base_gt, 4),
        "length_rag_words": len(rag_answer.split()),
        "length_base_words": len(base_answer.split()),
        "retrieved_chunk_count": len(retrieved_chunks),
        "grounding_fraction": round(grounding_score, 4),
    }

    return {
        "document_id": document_id,
        "retrieved_chunks": retrieved_chunks,
        "rag_answer": rag_answer,
        "base_answer": base_answer,
        "metrics": metrics,
    }


### Testing for pdf

In [None]:
# Enter your pdf and test it 

pdf_name= "RAG_demo_doc.pdf"

### Type in your query

query="What are the conditions for uploading photographs?"

### Type your ground truth (Actual Answer)

ground_truth =""" Photograph Image: (4.5 cm x 3.5 cm)
• Photograph must be a recent passport style colour picture.
• Make sure that the picture is in colour, taken against a light-coloured, preferably white,
background.
• Look straight at the camera with a relaxed face
• If picture is taken on a sunny day, have the sun behind you, or place yourself in shade,
so that you are not squinting and there are no harsh shadows
• If you have to use flash, ensure there's no "red-eye"
• If you wear glasses make sure that there are no reflections and your eyes can be
clearly seen.
• Caps, hats and dark glasses are not acceptable. Religious headwear is allowed but it
must not cover your face.
• Dimensions 200 x 230 pixels (preferred)
• Size of file should be between 20 kb–50 kb
• Ensure that size of the scanned image is not more than 50kb. If the size of the file is
more than 50 kb, then adjust the settings of the scanner such as the DPI resolution,
no. of colours etc., during the process of scanning.
• Photo uploaded should be of appropriate size and clearly visible.
• It is advisable that candidate retains about 8 copies of the same photograph
which is uploaded at the time of online application as these would be needed for
further processes of this selection process."""



In [63]:
# 1) Ingest a document (URL or local PDF or inline text)
res_ingest = ingest_document(pdf_name, client=client, vector_db_id=vector_db_id)
print("Inserted document_id:", res_ingest["document_id"])

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/tool-runtime/rag-tool/insert "HTTP/1.1 200 OK"


Inserted document_id: doc-fccfbdf5


In [65]:
#2) Ask query and evaluate
out = answer_query_with_eval(
    query=query,
    ground_truth=ground_truth,
    document_id=res_ingest["document_id"],
    client=client,
    vector_db_id=vector_db_id,
)

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/tool-runtime/rag-tool/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/models "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/inference/chat-completion "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/inference/chat-completion "HTTP/1.1 200 OK"


In [66]:
print(out["metrics"])
print("RAG answer:", out["rag_answer"])
print("Base answer:", out["base_answer"])

{'sim_rag_vs_ground_truth_tfidf': 0.4943, 'sim_base_vs_ground_truth_tfidf': 0.3421, 'length_rag_words': 223, 'length_base_words': 107, 'retrieved_chunk_count': 8, 'grounding_fraction': 0.3158}
RAG answer: To upload photographs for the application process, candidates must adhere to several conditions:

1. **Size and Format:** The photograph should be a recent passport-style color picture, with a minimum size of 200 x 230 pixels (preferred) [SRC_5](5). The file type should be JPG or JPEG [SRC_6](6).

2. **Background and Lighting:** The photograph should be taken against a light-colored, preferably white, background [SRC_4](4). Ensure adequate lighting, and avoid using flash if possible, to prevent "red-eye" [SRC_4](4).

3. **Face Clarity:** The candidate's face should be clearly visible, with no harsh shadows or glasses reflections [SRC_4](4). Candidates should look straight at the webcam or mobile phone [SRC_4](4).

4. **File Size:** The file size should be between 20 kb–50 kb [SRC_5](5

## Testing For URL 

In [None]:
### input url
url= "https://mobilityretail.sbi/sbustaticweb/mobile/faq_features.html"

### question
query="Can I view a list of all cheques which I have stopped?"

### Actual answer
ground_truth ="""Yes. Please follow the following steps to check the list of stopped cheques.

Login to Yono Lite SBI.
Click on 'Requests' >> 'Cheque Book' >> 'Stop/Revoke Cheque'
Select the radio button 'View Recent'.
Select the account number from the dropdown.
The app will show all the cheques that you have stopped with their reference number and start cheque numbers.

To see in detail, please select any of the entry and it will show the details."""



In [67]:
# ---------------- Example usage ----------------
# 1) Ingest a document (URL or local PDF or inline text)
res_ingest = ingest_document(url, client=client, vector_db_id=vector_db_id)
print("Inserted document_id:", res_ingest["document_id"])


INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/tool-runtime/rag-tool/insert "HTTP/1.1 200 OK"


Inserted document_id: doc-0bf1edc5


In [69]:
#2) Ask query and evaluate
out = answer_query_with_eval(
    query=query,
    ground_truth=ground_truth,
    document_id=res_ingest["document_id"],
    client=client,
    vector_db_id=vector_db_id,
)


INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/tool-runtime/rag-tool/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/models "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/inference/chat-completion "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/inference/chat-completion "HTTP/1.1 200 OK"


In [70]:
print(out["metrics"])
print("RAG answer:", out["rag_answer"])
print("Base answer:", out["base_answer"])

{'sim_rag_vs_ground_truth_tfidf': 0.6661, 'sim_base_vs_ground_truth_tfidf': 0.178, 'length_rag_words': 93, 'length_base_words': 37, 'retrieved_chunk_count': 8, 'grounding_fraction': 0.4286}
RAG answer: Yes, you can view a list of all cheques that you have stopped. To do this, follow these steps:

1. Login to Yono Lite SBI [SRC_4].
2. Click on 'Requests' >> 'Cheque Book' >> 'Stop/Revoke Cheque' [SRC_4].
3. Select the radio button 'View Recent' [SRC_6].
4. Select the account number from the dropdown [SRC_6].
5. The app will display all the cheques you have stopped with their reference number and start cheque numbers [SRC_6].
6. To see details of any specific cheque, select it [SRC_6].

This information is provided in [SRC_4] and [SRC_6].
Base answer: Yes, you can typically view a list of stopped cheques in your bank account through online banking, mobile app, or by contacting customer service. The process may vary by bank, so refer to your bank's specific instructions.


In [None]:
## Rough Work

In [None]:
from llama_stack_client import RAGDocument, LlamaStackClient
deployment_endpoint= "http://lsd-llama-milvus-service:8321"
client = LlamaStackClient(base_url=deployment_endpoint)
models= client.models.list()
print(client.models.list())
model_id = next(m.identifier for m in models if m.model_type == "llm")
embedding_model = next(m for m in models if m.model_type == "embedding")
embedding_model_id = embedding_model.identifier
embedding_dimension = embedding_model.metadata["embedding_dimension"]
print(client.vector_dbs.list()) # lists available connectors

vector_db_id = "my_milvus_db"
provider_id  = "milvus"

# Cell 1: dependencies & config (edit endpoint and file path)
import io
import requests
import pdfplumber

# EDIT: put your SmolDocling base URL here (as you provided earlier)
SMOLDOCLING_ENDPOINT = "https://docling-dsdemo.apps.cluster-h97qh.h97qh.sandbox1475.opentlc.com/v1"

# Path to the PDF you want to extract
pdf_path = "RAG_demo_doc.pdf"   # <- change to your pdf file path

# Cell 2: extract text via SmolDocling (with pdfplumber fallback), save to `raw_text`
def extract_text_with_smol_and_save(pdf_path: str, smol_endpoint: str = SMOLDOCLING_ENDPOINT, out_txt: str = "raw_text.txt") -> str:
    """
    1) Try to POST the PDF to SmolDocling's parse endpoint (assumes POST <endpoint>/parse_pdf accepts multipart/form-data file).
    2) If that fails or returns no text, fallback to local extraction using pdfplumber.
    3) Save the final text to out_txt and also return it (and set raw_text).
    """
    # read bytes
    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()

    # 1) Try SmolDocling parse route
    extracted = ""
    try:
        files = {"file": (pdf_path, pdf_bytes, "application/pdf")}
        # attempt parse route first
        resp = requests.post(smol_endpoint.rstrip("/") + "/parse_pdf", files=files, timeout=300)
        resp.raise_for_status()
        j = resp.json()
        extracted = j.get("text") or j.get("content") or ""
        if extracted and extracted.strip():
            print("Extracted text from SmolDocling /parse_pdf")
    except Exception as e_parse:
        # try a generic predict route if parse_pdf doesn't exist
        try:
            payload = {"file_name": pdf_path}
            # some deployments might accept multipart on /predict or different shapes - try /predict with same files
            resp2 = requests.post(smol_endpoint.rstrip("/") + "/predict", files=files, timeout=300)
            resp2.raise_for_status()
            j2 = resp2.json()
            extracted = j2.get("text") or j2.get("content") or j2.get("generated_text") or ""
            if extracted and extracted.strip():
                print("Extracted text from SmolDocling /predict")
        except Exception as e_predict:
            print("SmolDocling endpoints failed (parse/predict). Falling back to local extraction.")
            # will fallback below

    # 2) Fallback to pdfplumber if Smol didn't return text
    if not extracted or not extracted.strip():
        try:
            with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
                pages = []
                for i, p in enumerate(pdf.pages):
                    pages.append(f"[page:{i+1}]\n" + (p.extract_text() or ""))
            extracted = "\n\n".join(pages)
            print("Extracted text using pdfplumber fallback.")
        except Exception as e_local:
            # if even fallback fails, keep an empty string and raise/log
            print("Local pdfplumber extraction also failed:", e_local)
            extracted = ""

    # 3) Save to file and return
    with open(out_txt, "w", encoding="utf-8") as out_f:
        out_f.write(extracted or "")

    # put into a global variable raw_text for convenience in notebook
    global raw_text
    raw_text = extracted or ""
    print(f"Saved extracted text to {out_txt}; raw_text length = {len(raw_text)} characters.")
    return raw_text

# Execute extraction (will set raw_text)
raw_text = extract_text_with_smol_and_save(pdf_path)
# quick preview
print("--- preview (first 800 chars) ---")
print(raw_text[:800])


### Do this step only once 


# _ = client.vector_dbs.register(
# vector_db_id=vector_db_id,
# embedding_model=embedding_model_id,
# embedding_dimension=embedding_dimension,
# provider_id=provider_id,
# )
# print(f"Registered vector DB: {vector_db_id}")


document = RAGDocument(
document_id="raw_text_001",
content=raw_text,
mime_type="text/plain",
metadata={"source": "SBI_Doc"},
)

client.tool_runtime.rag_tool.insert(
    documents=[document],
    vector_db_id=vector_db_id,
    chunk_size_in_tokens=100,
)
print("Raw text ingested successfully")

# Example RAG query for one-off lookups
query = "What is condition for uploading photographs ?"

# 1) Run retrieval explicitly (same tool used by the agent)
rag_result = client.tool_runtime.rag_tool.query(
    vector_db_ids=[vector_db_id],
    content=query,
    query_config={
        "chunk_size_in_tokens": 512,
        "chunk_overlap_in_tokens": 0,
        "chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
    },
)

# 2) Extract the retrieved text chunks (simple join)
retrieved_texts = []
for item in getattr(rag_result, "content", []) or []:
    # item may be dict-like or object; handle both
    txt = item.get("text") if isinstance(item, dict) else getattr(item, "text", None)
    if not txt:
        continue
    # remove our template labels to keep only chunk text
    import re
    cleaned = re.sub(r"^Result\s*\d+\s*", "", txt, flags=re.I)
    cleaned = re.sub(r"Content:\s*", "", cleaned, flags=re.I)
    cleaned = re.sub(r"\nMetadata:.*$", "", cleaned, flags=re.S|re.I)
    cleaned = cleaned.strip()
    if cleaned:
        retrieved_texts.append(cleaned)

context = "\n\n".join(retrieved_texts).strip()

# 3) Pick an LLM model (first available LLM)
models = client.models.list()
model_id = next((m.identifier for m in models if getattr(m, "model_type", None) == "llm"), None)
if model_id is None:
    raise RuntimeError("No LLM model found in client.models.list()")

# 4) Call the inference API (chat completion) with the retrieved context + user question
messages = [
    {"role": "system", "content": "You are a helpful assistant. USE ONLY the provided CONTEXT to answer."},
    {"role": "user", "content": f"CONTEXT:\n{context}\n\nQUESTION:\n{query}\n\nAnswer concisely:"}
]

resp = client.inference.chat_completion(messages=messages, model_id=model_id)

# 5) Extract final answer text from the inference response and print it
final_text = None
if hasattr(resp, "completion_message") and getattr(resp, "completion_message") is not None:
    final_text = getattr(resp.completion_message, "content", None) or getattr(resp.completion_message, "text", None)
if not final_text and hasattr(resp, "choices") and resp.choices:
    c0 = resp.choices[0]
    if hasattr(c0, "message"):
        final_text = getattr(c0.message, "content", None) or getattr(c0.message, "text", None)
if not final_text:
    # fallback to string representation
    final_text = str(resp)

print(final_text.strip())
