In [1]:
import os
from dotenv import load_dotenv

from urllib.parse import quote_plus

# Load the database settings from the project's .envrc and fail fast, with a
# readable message, when the file or any required variable is missing.
env_loaded = load_dotenv('../.envrc')
assert env_loaded, 'Failed to load .envrc'

DB_HOST = os.getenv('DB_HOST')
assert DB_HOST is not None, 'DB_HOST is not set'
DB_PORT = os.getenv('DB_PORT')
assert DB_PORT is not None, 'DB_PORT is not set'
DB_USER = os.getenv('DB_USER')
assert DB_USER is not None, 'DB_USER is not set'
DB_PASSWORD = os.getenv('DB_PASSWORD')
assert DB_PASSWORD is not None, 'DB_PASSWORD is not set'
DB_NAME = os.getenv('DB_NAME')
assert DB_NAME is not None, 'DB_NAME is not set'

# Percent-encode the credentials: characters such as '@', ':' or '/' in the
# user name or password would otherwise be read as URL delimiters.
# DB_USER / DB_PASSWORD themselves stay raw for callers that pass them separately.
DB_URL = (
    f'postgresql+asyncpg://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}'
    f'@{DB_HOST}:{DB_PORT}/{DB_NAME}'
)



In [2]:

def env_value(name, cast=str):
    """Read environment variable ``name`` and convert it with ``cast``.

    ``os.getenv`` always returns text, so numeric settings (top-k counts,
    similarity cutoff) must be converted before they are handed to components
    that expect ``int`` / ``float``.

    Args:
        name: Name of the environment variable to read.
        cast: Callable applied to the raw string, e.g. ``int`` or ``float``.
            Defaults to ``str`` (value returned unchanged).

    Returns:
        The converted value, or ``None`` when the variable is not set
        (the same result ``os.getenv`` gives for a missing variable).

    Raises:
        Whatever ``cast`` raises for malformed text (``ValueError`` for
        ``int`` / ``float``).
    """
    raw = os.getenv(name)
    return None if raw is None else cast(raw)


OLLAMA_API_BASE = env_value('OLLAMA_API_BASE')

# The environment key has no `_NAME` suffix, unlike the Python constant.
LITELLM_LLM_RERANKER_MODEL_NAME = env_value('LITELLM_LLM_RERANKER_MODEL')
LLM_RERANKER_TOP_N = env_value('LLM_RERANKER_TOP_N', int)
LLM_RERANKER_CHOICE_BATCH_SIZE = env_value('LLM_RERANKER_CHOICE_BATCH_SIZE', int)

LITELLM_RESPONSE_SYNTHESIZER_MODEL = env_value('LITELLM_RESPONSE_SYNTHESIZER_MODEL')

SIMILARITY_TOP_K = env_value('SIMILARITY_TOP_K', int)
SIMILARITY_CUTOFF = env_value('SIMILARITY_CUTOFF', float)

JINA_RERANKER_MODEL = env_value('JINA_RERANKER_MODEL')
JINA_RERANKER_TOP_N = env_value('JINA_RERANKER_TOP_N', int)

LITELLM_CHAT_ENGINE_LLM_MODEL_NAME = env_value('LITELLM_CHAT_ENGINE_LLM_MODEL_NAME')

SENTENCE_TRANSFORMER_RERANKER_MODEL = env_value('SENTENCE_TRANSFORMER_RERANKER_MODEL')
SENTENCE_TRANSFORMER_RERANKER_TOP_N = env_value('SENTENCE_TRANSFORMER_RERANKER_TOP_N', int)

# Echo the effective configuration, in the order the settings are declared above.
for _setting in (
    'OLLAMA_API_BASE',
    'LITELLM_LLM_RERANKER_MODEL_NAME',
    'LLM_RERANKER_TOP_N',
    'LLM_RERANKER_CHOICE_BATCH_SIZE',
    'LITELLM_RESPONSE_SYNTHESIZER_MODEL',
    'SIMILARITY_TOP_K',
    'SIMILARITY_CUTOFF',
    'JINA_RERANKER_MODEL',
    'JINA_RERANKER_TOP_N',
    'LITELLM_CHAT_ENGINE_LLM_MODEL_NAME',
    'SENTENCE_TRANSFORMER_RERANKER_MODEL',
    'SENTENCE_TRANSFORMER_RERANKER_TOP_N',
):
    print(f'{_setting}: {globals()[_setting]}')

OLLAMA_API_BASE: http://ollama:11434
LITELLM_LLM_RERANKER_MODEL_NAME: ollama_chat/llama3.2:3b
LLM_RERANKER_TOP_N: 4
LLM_RERANKER_CHOICE_BATCH_SIZE: 5
LITELLM_RESPONSE_SYNTHESIZER_MODEL: ollama_chat/llama3.2:1b
SIMILARITY_TOP_K: 6
SIMILARITY_CUTOFF: 0.8
JINA_RERANKER_MODEL: jina-reranker-v2-base-multilingual
JINA_RERANKER_TOP_N: 2
LITELLM_CHAT_ENGINE_LLM_MODEL_NAME: ollama_chat/llama3.2:1b
SENTENCE_TRANSFORMER_RERANKER_MODEL: cross-encoder/stsb-distilroberta-base
SENTENCE_TRANSFORMER_RERANKER_TOP_N: 4


#### Observability with Arize Phoenix
- Check latency at each step
- Check input and output


In [4]:
import phoenix as px
import llama_index.core

def launch_phoenix():
    """Launch the Phoenix app and register the "arize_phoenix" global handler with LlamaIndex."""
    px.launch_app()
    llama_index.core.set_global_handler("arize_phoenix")

def close_phoenix():
    """Close the Phoenix app via ``px.close_app()``."""
    px.close_app()


In [5]:
launch_phoenix() # observability tool into the chat engine e.g. retrieval, reranking, response generation etc

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


#### Load Embedding model

In [3]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import re

# Hugging Face model id; also reused further down to derive the vector table name.
embed_model_name = "intfloat/multilingual-e5-large"
embed_model = HuggingFaceEmbedding(model_name=embed_model_name)
# Passed as `embed_dim` when connecting to the vector store.
# NOTE(review): presumably the output size of the model above — re-check if the model changes.
embedding_model_dimensions = 1024

#### Connect to vector store

In [4]:
from llama_index.vector_stores.postgres import PGVectorStore
from llama_index.core import VectorStoreIndex

# Table name = fixed prefix + embedding model name, with every character
# outside [a-zA-Z0-9-] replaced by '-'.
table_prefix = 'budget_2025-'
model_name_clean = re.sub(r'[^a-zA-Z0-9\-]', '-', embed_model_name)
table_name = table_prefix + model_name_clean
print(f'table_name: {table_name}')

# perform_setup=False: presumably skips schema/table creation, i.e. the table
# is expected to exist already — confirm against the ingestion notebook.
vector_store = PGVectorStore.from_params(
    table_name=table_name,
    embed_dim=embedding_model_dimensions,
    perform_setup=False,
    host=DB_HOST,
    port=DB_PORT,
    database=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
)

# Index view over the store, using the embedding model loaded above.
vsi = VectorStoreIndex.from_vector_store(vector_store=vector_store, embed_model=embed_model)

table_name: budget_2025-intfloat-multilingual-e5-large


### Components

In [8]:
# question = "as an undergraduate, what benefits do i get?"
# Chinese test query, roughly: "I am a university student; does the government offer any subsidies?"
question = "我是一个大学生 政府有什么补贴吗？"

#### Retriever

In [5]:
# Retriever over the index; the number of nodes fetched per query comes from SIMILARITY_TOP_K.
retriever = vsi.as_retriever(similarity_top_k=SIMILARITY_TOP_K)

In [6]:
# Sanity check: the index holds the embedding model configured in this notebook.
# NOTE(review): `_embed_model` is a private attribute and may change between llama-index versions.
assert vsi._embed_model.model_name == embed_model_name

In [11]:
# Retrieve nodes for the question and show the first one returned (score, then text).
nodes = retriever.retrieve(question)
first_node = nodes[0]
print(first_node.score)
print(first_node.text)


0.8467587919975063
## 12月

现金补助 $\longrightarrow 100$ 元至600元
[定心与接助配套]


#### Postprocessors

##### Similarity Cutoff Postprocessor

In [7]:
# similarity postprocessor
from llama_index.core.postprocessor import SimilarityPostprocessor
# Postprocessor configured with the SIMILARITY_CUTOFF threshold from the environment config.
similarity_postprocessor = SimilarityPostprocessor(similarity_cutoff=SIMILARITY_CUTOFF)


In [13]:
# Testing it out
similarity_postprocessor_nodes = similarity_postprocessor.postprocess_nodes(nodes)
print(len(similarity_postprocessor_nodes))
# The cutoff may reject every node, so only show the first survivor when
# there is one instead of failing with an IndexError.
if similarity_postprocessor_nodes:
    print(similarity_postprocessor_nodes[0].text)
else:
    print('No nodes passed the similarity cutoff')


12
## 12月

现金补助 $\longrightarrow 100$ 元至600元
[定心与接助配套]


##### Sentence Transformer Reranker

In [13]:
# sentence transformer reranker postprocessor
from llama_index.core.postprocessor import SentenceTransformerRerank
# Reranker model and the number of nodes to keep both come from the environment config.
sentence_transformer_reranker = SentenceTransformerRerank(
    top_n=SENTENCE_TRANSFORMER_RERANKER_TOP_N,
    model=SENTENCE_TRANSFORMER_RERANKER_MODEL,
)

config.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

In [18]:
# Testing out: rerank the retrieved nodes against the question, then show the first result
reranked_nodes = sentence_transformer_reranker.postprocess_nodes(
    nodes,
    query_str=question,
)
first_reranked = reranked_nodes[0]
print(first_reranked.text)

## 支持劳动力转型

- 推出新的技能创前程劳动力发展津贴，包括为重新设计工作的企业提供高达 $70 \%$ 的资助
- 经重新设计的技能创前程企业补助将从2026年下半年起，给予符合条件的企业 1 万元，让它们抵消推行合格劳动力转型计划的自付开销
- 为全国职工总会企业培训委员会补助金额外拨款 2 亿元，以协助更多企业转型和提升员工技能


##### Jina Reranker

In [8]:
from llama_index.postprocessor.jinaai_rerank import JinaRerank

# The Jina reranker needs an API key; stop here if it is not configured.
JINA_API_KEY = os.getenv('JINA_API_KEY')
assert JINA_API_KEY is not None

jina_reranker = JinaRerank(
    api_key=JINA_API_KEY,
    model=JINA_RERANKER_MODEL,
    top_n=JINA_RERANKER_TOP_N,
)

# Testing
# reranked_nodes = jina_reranker.postprocess_nodes(nodes, query_str=question)
# len(reranked_nodes)


##### LLM Reranker
Not used because it is too slow, even though the nodes it returns are quite relevant and would be good input for the LLM when generating the response.

In [40]:
# LLM Reranker
from llama_index.core.postprocessor import LLMRerank
from llama_index.llms.litellm import LiteLLM


# Reranker settings are hard-coded in this cell rather than taken from the
# LLM_RERANKER_* / LITELLM_LLM_RERANKER_MODEL_NAME values read from the environment.
llm_reranker_top_n = 4
choice_batch_size = 10
llm_reranker_model_name = "ollama_chat/llama3.2:3b"

llm_reranker_model = LiteLLM(llm_reranker_model_name)

llm_reranker = LLMRerank(
    llm=llm_reranker_model,
    top_n=llm_reranker_top_n,
    choice_batch_size=choice_batch_size,
)


In [None]:
# Testing
reranked_nodes_llm_reranker = llm_reranker.postprocess_nodes(similarity_postprocessor_nodes, query_str=question)
print(len(reranked_nodes_llm_reranker))
# The reranker may return no nodes (e.g. when its input is empty), so guard
# the first-element access instead of failing with an IndexError.
if reranked_nodes_llm_reranker:
    print(reranked_nodes_llm_reranker[0].text)
else:
    print('LLM reranker returned no nodes')


#### Response Synthesizer

In [9]:
from llama_index.core.response_synthesizers import get_response_synthesizer
from llama_index.core.response_synthesizers.type import ResponseMode
from llama_index.llms.litellm import LiteLLM

# LLM that turns the selected nodes into the final answer.
response_synthesizer_llm = LiteLLM(LITELLM_RESPONSE_SYNTHESIZER_MODEL)

response_synthesizer = get_response_synthesizer(
    response_mode=ResponseMode.COMPACT,
    llm=response_synthesizer_llm,
)

In [21]:
# testing out
response_synthesizer_response = await response_synthesizer.asynthesize(query=question, nodes=reranked_nodes)
print(response_synthesizer_response)

政府为年轻人提供的补贴包括现金补助和教育储蓄户头填补。


#### Chat Engine

In [10]:
from llama_index.llms.litellm import LiteLLM

# Chat LLM; print the context window its metadata reports.
llm = LiteLLM(LITELLM_CHAT_ENGINE_LLM_MODEL_NAME)
print(f'{LITELLM_CHAT_ENGINE_LLM_MODEL_NAME} context window: {llm.metadata.context_window}')

ollama_chat/llama3.2:1b context window: 2048


In [10]:
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.storage.chat_store.postgres import PostgresChatStore

# Conversation thread whose messages are loaded and continued.
thread_id = 'thread_6'

# Postgres-backed chat store, reached through the same database URL as the rest of the notebook.
chat_store = PostgresChatStore.from_uri(DB_URL)

# Messages already stored for this thread.
chat_history = chat_store.get_messages(thread_id)

memory = ChatMemoryBuffer.from_defaults(
    llm=llm,
    chat_store=chat_store,
    chat_store_key=thread_id,
    chat_history=chat_history,
)

print(f'chat history: {chat_history}')

chat history: [ChatMessage(role=<MessageRole.USER: 'user'>, additional_kwargs={}, blocks=[TextBlock(block_type='text', text='tell me more about SG culture pass')]), ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, additional_kwargs={}, blocks=[TextBlock(block_type='text', text='The SG Culture Pass is a program that offers cultural experiences and discounts to Singaporeans aged 18 and above, providing access to various institutions, events, and attractions.')]), ChatMessage(role=<MessageRole.USER: 'user'>, additional_kwargs={}, blocks=[TextBlock(block_type='text', text='What are the key beenfits for undergraduate students? Are there housing subsidies provided for first time home buyers?')]), ChatMessage(role=<MessageRole.USER: 'user'>, additional_kwargs={}, blocks=[TextBlock(block_type='text', text='What are the key beenfits for undergraduate students? Are there housing subsidies provided for first time home buyers?')]), ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, addit

In [11]:
from llama_index.core.chat_engine.types import ChatMode

chat_engine_llm = LiteLLM(LITELLM_CHAT_ENGINE_LLM_MODEL_NAME)

# Active postprocessing chain: similarity cutoff first, then the Jina reranker.
# Alternative chain: [similarity_postprocessor, sentence_transformer_reranker]
chat_postprocessors = [similarity_postprocessor, jina_reranker]

chat_engine = vsi.as_chat_engine(
    llm=chat_engine_llm,
    chat_mode=ChatMode.BEST,
    streaming=True,
    similarity_top_k=SIMILARITY_TOP_K,
    node_postprocessors=chat_postprocessors,
    response_synthesizer=response_synthesizer,
    # memory=memory,  # left out for now: it does not work well with the chat engine, i.e. the llm pays more attention to the memory than the context
)

In [12]:
type(chat_engine)

llama_index.core.agent.react.base.ReActAgent

In [13]:
chat_engine_llm._get_model_name()

'ollama_chat/llama3.2:3b'

In [34]:
from llama_index.core.chat_engine.types import StreamingAgentChatResponse

# question = "tell me more about SG culture pass"
question = "What are the key beenfits for undergraduate students? Are there housing subsidies provided for first time home buyers?"

# Stream the reply and print each piece as soon as it arrives.
response: StreamingAgentChatResponse = chat_engine.stream_chat(question)
for chunk in response.response_gen:
    print(chunk, end="")

 Undergraduate students in Singapore can benefit from a $\$ 500$ top-up to their Edusave account or Post-Secondary Education Account (PSEA) this year, which can help with education expenses. For first-time homebuyers, particularly young married couples and parents with young children, there are subsidies available through the Fresh Start scheme, which allows them to buy shorter-lease subsidised flats.

---

### Agent with Tools
Tools:
- Search from knowledge base (QueryEngineTool)
- Search from web (FunctionTool)

##### Query Engine Tool

In [81]:
response_synthesizer._llm

LiteLLM(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7f10ba05ddc0>, system_prompt=None, messages_to_prompt=<function messages_to_prompt at 0x7f118598a340>, completion_to_prompt=<function default_completion_to_prompt at 0x7f1185751260>, output_parser=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>, query_wrapper_prompt=None, model='ollama_chat/llama3.2:3b', temperature=0.1, max_tokens=None, additional_kwargs={}, max_retries=10)

In [33]:
from llama_index.core.tools import QueryEngineTool

# Query engine over the index. It reuses the synthesizer's own LLM
# (read from the private `_llm` attribute) so both use the same model.
kb_postprocessors = [similarity_postprocessor, sentence_transformer_reranker]
query_engine = vsi.as_query_engine(
    llm=response_synthesizer._llm,
    response_synthesizer=response_synthesizer,
    node_postprocessors=kb_postprocessors,
    similarity_top_k=SIMILARITY_TOP_K,
)

# Expose the query engine to agents as a named tool.
search_knowledge_base_tool = QueryEngineTool.from_defaults(
    query_engine,
    description="Search information from the knowledge base",
    name='search_knowledge_base',
)

#### Web Search Tool
Fall back to web search in case the answer cannot be retrieved from the knowledge base?

In [35]:
# API key for the Tavily client used by the web-search cells; stop here if it is missing.
TAVILY_API_KEY = os.getenv('TAVILY_API_KEY')
assert TAVILY_API_KEY is not None

In [None]:
from llama_index.core.tools import FunctionTool
from tavily import AsyncTavilyClient

async def search_web(query: str) -> str:
    """Useful for using the web to answer questions."""
    # A new Tavily client is built on every call.
    tavily = AsyncTavilyClient(api_key=TAVILY_API_KEY)
    results = await tavily.search(query)
    # Hand the raw search result back as its string form.
    return str(results)


# Wrap the async search function as a tool.
tool = FunctionTool.from_defaults(
    search_web,
    description="Useful for using the web to answer questions",
    name='search_web',
    # async_fn=...,  # optional: a separate async implementation
)

In [None]:
from tavily import AsyncTavilyClient

# Notebook-level Tavily client for the ad-hoc searches in the next cells (search_web builds its own per call).
client = AsyncTavilyClient(api_key=TAVILY_API_KEY)

In [40]:
# Ad-hoc web search restricted to the Singapore Budget site.
query = "how much CDC vouchers can I get?"

# NOTE(review): the entry is a full URL with a path; confirm that Tavily's `include_domains` accepts this form and not only bare domains.
include_domains = ['https://www.mof.gov.sg/singaporebudget']

search_res = await client.search(query=query, include_domains=include_domains)

In [43]:
search_res

{'query': 'how much CDC vouchers can I get?',
 'follow_up_questions': None,
 'answer': None,
 'images': [],
 'results': [{'title': 'Budget | Support For Singaporeans',
   'url': 'https://www.mof.gov.sg/singaporebudget/budget-2025-highlights/support-for-singaporeans',
   'content': 'Budget Speech Budget Resources Budget 2025 Highlights About Budget Budget 2025 Highlights CDC Vouchers [New]  $500 SG60 ActiveSG Credit Top-Up [New]   $100 SG60 Vouchers [New] $600 or $800 Child LifeSG Credits or Edusave Account / Post-Secondary Education Account Top-up [New] $500 MediSave [GSTV] $150 to $450 Large Family LifeSG Credits [New] SG Culture Pass [New]   $100 Personal Income Tax Rebate for Year of Assessment (YA) 2025 [New]   Up to $200 CDC Vouchers [New]  $300 Support for You and Your Households Singapore Budget 2025 is part of the Ministry of Finance, Singapore. Singapore Budget 2025 Budget Speech Budget Statement Budget Resources Budget 2025 Highlights Support For You And Your Household Suppor

In [46]:
print(search_res['results'][0]['content'])


Budget Speech Budget Resources Budget 2025 Highlights About Budget Budget 2025 Highlights CDC Vouchers [New]  $500 SG60 ActiveSG Credit Top-Up [New]   $100 SG60 Vouchers [New] $600 or $800 Child LifeSG Credits or Edusave Account / Post-Secondary Education Account Top-up [New] $500 MediSave [GSTV] $150 to $450 Large Family LifeSG Credits [New] SG Culture Pass [New]   $100 Personal Income Tax Rebate for Year of Assessment (YA) 2025 [New]   Up to $200 CDC Vouchers [New]  $300 Support for You and Your Households Singapore Budget 2025 is part of the Ministry of Finance, Singapore. Singapore Budget 2025 Budget Speech Budget Statement Budget Resources Budget 2025 Highlights Support For You And Your Household Support For Vulnerable Families And Persons With Disabilities About Budget Budget Archives


#### Agent

In [59]:
from llama_index.core.agent.workflow import ReActAgent
from llama_index.core.memory import ChatMemoryBuffer

func_calling_llm = LiteLLM("ollama_chat/llama3.2:3b")

# In-process memory buffer; it is not passed to the agent constructor in this cell.
memory = ChatMemoryBuffer.from_defaults(token_limit=40000)

# Only the knowledge-base tool is given to the agent.
tools = [search_knowledge_base_tool]
agent = ReActAgent(
    tools=tools,
    llm=func_calling_llm,
    name="Budget 2025 RAG Agent",
    description="An agent that tells you information about the budget",
)



In [70]:
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.storage.chat_store.postgres import PostgresChatStore

# Thread whose history backs the agent's memory.
thread_id = 'thread_2'

chat_store = PostgresChatStore.from_uri(DB_URL)

# Memory buffer bound to that thread in the Postgres chat store.
memory = ChatMemoryBuffer.from_defaults(
    chat_store_key=thread_id,
    chat_store=chat_store,
)

In [71]:
# Start the agent on the question; `handler` is what the streaming cell iterates over.
# chat_history is read from the same `memory` object that is also passed to the run.
chat_history = memory.get_all()
question = 'as an undergraduate student, what benefits can i get?'
handler = agent.run(question, memory=memory, chat_history=chat_history)

In [72]:
from llama_index.core.agent.workflow import AgentStream, AgentOutput

events = []
async for event in handler.stream_events():
    events.append(event)
    if isinstance(event, AgentStream):
        print(event.delta, end="", flush=True)

    elif isinstance(event, AgentOutput):
       print(f'AgentOutput: {event.response}')  # the current full response
    #    print(event.tool_calls)  # the selected tool calls, if any
    #    print(event.raw)  # the raw llm api response

Thought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: search_knowledge_base
Action Input: {"input": {"title": "Benefits for undergraduate students", "type": "string"}}AgentOutput: assistant: Thought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: search_knowledge_base
Action Input: {"input": {"title": "Benefits for undergraduate students", "type": "string"}}
Thought: The current language of the user is still English. It seems that I need to provide a valid string as input for the tool.
Action: search_knowledge_base
Action Input: {'input': 'Benefits for undergraduate students'}AgentOutput: assistant: Thought: The current language of the user is still English. It seems that I need to provide a valid string as input for the tool.
Action: search_knowledge_base
Action Input: {'input': 'Benefits for undergraduate students'}
Thought: The current language of the user is sti

Using the chat engine may work better than the agent.

### Corrective RAG Workflow 
- (retrieve_context) A question comes in; retrieve the context and set it in state, optionally with some postprocessing, e.g. reranking
- (generate_answer) Answer question from context
- (grade_answer) Grade answer
- (refine_answer) Refine answer if needed
- (provide_answer) Provide answer to user

State to maintain:
- Question
- Context
- Answer
- Grade
- Refined Answer



In [9]:
from llama_index.core.workflow import (
    Event,
    StartEvent,
    StopEvent,
    Workflow,
    step,
)
from llama_index.core.schema import NodeWithScore
from llama_index.core.base.response.schema import RESPONSE_TYPE

In [115]:
from typing import Annotated
from pydantic import BaseModel, Field

# Start Event
class QuestionEvent(StartEvent):
    """Start event carrying the user's question and how many nodes to retrieve for it (default 20)."""
    question: Annotated[str, "Single question"]
    similarity_top_k: Annotated[int, "Number of nodes to retrieve"] = 20


class NodesRetrievedEvent(Event):
    """Event carrying the scored nodes produced by retrieval."""
    retrieved_nodes: Annotated[list[NodeWithScore], "Retrieved nodes"]

class NodesRerankedEvent(Event):
    """Event carrying the scored nodes after reranking."""
    reranked_nodes: Annotated[list[NodeWithScore], "Reranked nodes"]

# class AnswerGeneratedFromContextEvent(Event):
class AnswerGeneratedFromContextEvent(StopEvent):
    """Answer generated from the context, together with the question and context it was based on.

    Currently subclasses StopEvent; the plain-Event variant is kept as a comment above.
    """
    question: Annotated[str, "Question"]
    context: Annotated[str, "Context"]
    answer: Annotated[str, "Answer generated from context"]

class GraderOutput(BaseModel):
    # Structured verdict expected from the grader LLM.
    # (Kept as `#` comments: a class docstring would become part of the
    # generated schema description.)
    is_grounded: bool = Field(description="Whether the answer is grounded in the context")
    # Inclusive bounds: the description promises a value "between 0.00 and 1.00",
    # and exclusive gt/lt bounds would reject exactly 0.0 or 1.0 as invalid.
    confidence: float = Field(
        ge=0.0, le=1.0,
        description="Confidence value between 0.00 and 1.00 of how grounded the answer is obtained from the context.",
    )
    confidence_explanation: str = Field(..., description="Explanation for the confidence score")

class AnswerGradedEvent(Event):
    """Event carrying the grader's verdict plus the question, context and answer that were graded."""
    grader_output: Annotated[GraderOutput, 'Output object from grading the answer with respect to the context']
    question: Annotated[str, "Question"]
    context: Annotated[str, "Context"]
    answer: Annotated[str, "Answer"]
    
# Stop Event
class AnswerGeneratedEvent(StopEvent):
    """Stop event carrying the final answer returned to the user."""
    answer: Annotated[str, "Answer to user's question. If the answer is grounded in the context, then the answer will be the generated answer from context. Otherwise, the answer will be a fallback answer."]


In [116]:
from llama_index.core.prompts import PromptTemplate
# Grading prompt. Placeholders filled by `.format(...)`:
#   {question_str} - the user's question
#   {context_str}  - the context the answer should be grounded in
#   {answer_str}   - the generated answer being graded
DEFAULT_RELEVANCY_GROUNDING_PROMPT_TEMPLATE = PromptTemplate(
    template="""As a grader, your task is to evaluate the grounding of a generated answer in the context provided with respect to the user's question.

    <question-start>:
    \"\"\"
    {question_str}
    \"\"\"
    <question-end>

    <context-start>:
    \"\"\"
    {context_str}
    \"\"\"
    <context-end>

    <answer-start>:
    \"\"\"
    {answer_str}
    \"\"\"
    <answer-end>

    Evaluation Criteria:
    - Consider whether the answer answers the question.
    - Consider whether the answer can be inferred from the context.

    """
)


In [124]:
from llama_index.core.prompts import PromptTemplate

# Answer-generation prompt. Placeholders: {context_str} (retrieved context) and {query_str} (user question).
DEFAULT_ANSWER_GENERATION_PROMPT_TEMPLATE = PromptTemplate(
    template="""Your task is to answer the user's question about Singapore government budget statement based on the context provided. Be detailed and objective in your answer.

    Context: \"\"\"
    {context_str}
    \"\"\"

    User Question: \"\"\"
    {query_str}
    \"\"\"

    """
)

In [None]:
import guardrails as gd
from guardrails import Guard
from llama_index.core.llms import ChatMessage
from llama_index.core.llms import ChatResponse
from llama_index.core.response_synthesizers import get_response_synthesizer
from llama_index.core.response_synthesizers.type import ResponseMode
from llama_index.core.schema import MetadataMode
from llama_index.core.workflow import Context
from llama_index.llms.litellm import LiteLLM
from llama_index.llms.ollama import Ollama
from llama_index.llms.openai import OpenAI
from llama_index.output_parsers.guardrails import GuardrailsOutputParser

# Define the prompt
# Template handed to the Guard built from GraderOutput. The `${...}` tokens are
# Guardrails template variables, not Python format fields.
# NOTE(review): "Query string here." looks like a placeholder left over from an example — confirm it is intended.
guard_structured_prompt = """
Query string here.

${gr.xml_prefix_prompt}

${output_schema}

${gr.json_suffix_prompt_v2_wo_none}
"""

class QAWorkflow(Workflow):
    """Question-answering workflow: retrieve nodes, rerank them, then answer from them.

    Steps are chained by the event types they accept and return:
    QuestionEvent -> NodesRetrievedEvent -> NodesRerankedEvent
    -> AnswerGeneratedFromContextEvent.

    The grading steps at the bottom of the class are commented out, so the
    workflow currently finishes once an answer has been generated.
    """

    # LLM used to synthesize the answer.
    llm = LiteLLM("ollama_chat/llama3.2:3b")
    # Index over the existing vector store, using the notebook's `embed_model`.
    vsi: VectorStoreIndex = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=embed_model
    )
    # Reranker applied to the retrieved nodes.
    postprocessor = JinaRerank(
        top_n=20, model="jina-reranker-v1-base-en", api_key=JINA_API_KEY
    )

    # Grader plumbing, referenced only by the disabled grade_answer step below.
    grader_output_guard = gd.Guard.from_pydantic(output_class=GraderOutput, prompt=guard_structured_prompt)

    grader_output_parser = GuardrailsOutputParser(grader_output_guard)
    grader_llm = Ollama(model="llama3.2:3b", request_timeout=60.0, output_parser=grader_output_parser)

    @step
    async def retrieve_nodes(self, ctx: Context, ev: QuestionEvent) -> NodesRetrievedEvent:
        """Store the question in the workflow context and retrieve candidate nodes for it.

        Args:
            ctx: Workflow context; receives the question under the key "question".
            ev: Start event with the question and `similarity_top_k`.

        Returns:
            NodesRetrievedEvent holding the retrieved nodes.
        """
        question = ev.question
        # set question in global state
        await ctx.set("question", question)

        # Use this workflow's own index, not the notebook-level global.
        retriever = self.vsi.as_retriever(similarity_top_k=ev.similarity_top_k)
        nodes = await retriever.aretrieve(question)

        return NodesRetrievedEvent(retrieved_nodes=nodes)

    @step
    async def rerank_nodes(self, ctx: Context, ev: NodesRetrievedEvent) -> NodesRerankedEvent:
        """Rerank the retrieved nodes against the stored question.

        Args:
            ctx: Workflow context; the question is read from the key "question".
            ev: Event holding the retrieved nodes.

        Returns:
            NodesRerankedEvent holding the reranked nodes.
        """
        nodes = ev.retrieved_nodes
        question = await ctx.get("question")

        # The reranker is a class attribute, so it must be reached through `self`;
        # a bare `postprocessor` only resolves if a global of that name happens to exist.
        reranked_nodes = self.postprocessor.postprocess_nodes(nodes, query_str=question)
        return NodesRerankedEvent(reranked_nodes=reranked_nodes)

    @step
    async def generate_answer_from_context(self, ctx: Context, ev: NodesRerankedEvent) -> AnswerGeneratedFromContextEvent:
        """Synthesize an answer to the stored question from the reranked nodes.

        Args:
            ctx: Workflow context; the question is read from the key "question".
            ev: Event holding the reranked nodes.

        Returns:
            AnswerGeneratedFromContextEvent with the answer text, the question and
            the node contents joined into one context string.
        """
        nodes = ev.reranked_nodes
        question = await ctx.get("question")
        print(f'in generate_answer_from_context, question: {question}')

        # Build the context string that is carried on the returned event.
        context_str = "\n\n".join([node.get_content(MetadataMode.LLM) for node in nodes])
        # await ctx.set("context_str", context_str)

        response_synthesizer = get_response_synthesizer(llm=self.llm, response_mode=ResponseMode.COMPACT)

        response = await response_synthesizer.asynthesize(query=question, nodes=nodes)
        print(f'in generate_answer_from_context, response: {response}')

        return AnswerGeneratedFromContextEvent(answer=response.response, question=question, context=context_str)

    # NOTE(review): the grading steps below are disabled. When re-enabling,
    # note that `GraderOutput.model_validate` is given the LLM's raw string
    # reply, while it expects a dict or a GraderOutput instance.
    # @step
    # async def grade_answer(self, ctx: Context, ev: AnswerGeneratedFromContextEvent) -> AnswerGradedEvent:
    #     question, answer, context = ev.question, ev.answer, ev.context
    #     print(f'in grade_answer, question: {question}')
    #     print(f'in grade_answer, answer: {answer}')

    #     grading_template = DEFAULT_RELEVANCY_GROUNDING_PROMPT_TEMPLATE.format(question_str=question, context_str=context, answer_str=answer)

    #     grading_template_with_guard = self.grader_output_parser.format(grading_template)

    #     grader_output_response: ChatResponse = await self.grader_llm.achat([ChatMessage(role='user', content=grading_template_with_guard)])

    #     grader_output_response_str = grader_output_response.message.content
    #     grader_output = GraderOutput.model_validate(grader_output_response_str)

    #     return AnswerGradedEvent(grader_output=grader_output, question=question, context=context, answer=answer)

    # @step
    # async def generate_answer(self, ctx: Context, ev: AnswerGradedEvent) -> AnswerGeneratedEvent:
    #     # check if the answer is grounded in the context
    #     answer = 'Sorry, I don\'t have enough information to answer that question.'
    #     if ev.grader_output.is_grounded:
    #         answer = ev.answer

    #     return AnswerGeneratedEvent(answer=answer)


NameError: name 'JinaRerank' is not defined

In [135]:
# Run the workflow once on a sample question (timeout=120, verbose step logging) and print the result.
w = QAWorkflow(timeout=120,verbose=True)
result = await w.run(start_event=QuestionEvent(question="What do i gain if i am an undergraduate student?"))
print(result)


Running step retrieve_nodes
Step retrieve_nodes produced event NodesRetrievedEvent
Running step rerank_nodes
Step rerank_nodes produced event NodesRerankedEvent
Running step generate_answer_from_context
in generate_answer_from_context, question: What do i gain if i am an undergraduate student?
in generate_answer_from_context, response: Based on the new context of changing circumstances, I'll rewrite the answer:

As a Singaporean undergraduate student, navigating the current economic landscape can be challenging. However, with the government's emphasis on technology and innovation, enterprise ecosystem, and infrastructure investments, there are opportunities to enhance your skills and knowledge.

You can develop in-demand expertise through training programs and workshops that drive efficiency and innovation in various sectors, such as those related to technology and sustainability. Additionally, connecting with like-minded individuals and entrepreneurs who share your vision for creating

WorkflowRuntimeError: Error in step 'grade_answer': 1 validation error for GraderOutput
  Input should be a valid dictionary or instance of GraderOutput [type=model_type, input_value='```json\n{\n  "is_ground...in the context"\n}\n```', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/model_type

#### Guard

In [13]:
# Model id passed as `model=` to the guard calls in the following cells.
ollama_model_name = 'ollama_chat/llama3.2:3b'

In [None]:
from guardrails import Guard


# Smoke test: a Guard created with no arguments, called on a simple question.
guard = Guard()

result = guard(
    model=ollama_model_name,
    messages=[
        {"role":"user", "content":"How many moons does Jupiter have?"},
    ],
)

print(f"{result.validated_output}")

In [80]:
class GraderOutput(BaseModel):
    # Structured verdict expected from the grader LLM. This re-declares the
    # model for the Guard experiments in this section.
    # (Kept as `#` comments: a class docstring would become part of the
    # generated schema description.)
    is_grounded: bool = Field(description="Whether the answer is grounded in the context")
    # Inclusive bounds: the description promises a value "between 0-1", and
    # exclusive gt/lt bounds would reject exactly 0.0 or 1.0 as invalid.
    confidence: float = Field(
        ge=0.0, le=1.0,
        description="Confidence value between 0-1 of how grounded the answer is obtained from the context.",
    )
    confidence_explanation: str = Field(..., description="Explanation for the confidence score")

In [81]:
# Rebind `guard` to one built from the GraderOutput model.
guard = Guard.for_pydantic(GraderOutput)

In [82]:
# Hand-written question / context / answer triple used to exercise the grading prompt.
question = 'What benefits do i get if i am an undergraduate student?'
context = 'Singapore government provides various benefits to undergraduate students. For example, they can apply for the Singaporean government scholarship to study in Singapore. They can also apply for the Singaporean government loan to study in Singapore. They can also apply for the Singaporean government grant to study in Singapore.'
# Alternative answer that is unrelated to the context:
# answer = 'The capital of France is Paris.'
answer = 'Students can apply for the Singaporean government scholarship, take a loan or apply for a government grant to study in Singapore.'


# Fill the three placeholders of the grading template.
prompt = DEFAULT_RELEVANCY_GROUNDING_PROMPT_TEMPLATE.format(question_str=question, context_str=context, answer_str=answer)

In [83]:
# Chat transcript sent to the guard: a generic system turn, then the grading prompt as the user turn.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt},
]

In [84]:
# Guardrails suffix that asks the model for JSON output.
json_suffix = """

${gr.complete_json_suffix_v3}
"""
# Append only once so re-running this cell does not stack several suffixes.
if not prompt.endswith(json_suffix):
    prompt += json_suffix

# `messages` was built from the prompt before the suffix existed. Strings are
# immutable, so the user turn must be refreshed or the suffix is never sent.
for message in messages:
    if message["role"] == "user":
        message["content"] = prompt

response = guard(
    model=ollama_model_name,
    messages=messages,
    # prompt_params={"chat_history": chat_history},
)



In [87]:
response.validated_output

{'is_grounded': True,
 'confidence': 0.7,
 'confidence_explanation': 'Partially. The context provides information about various government benefits available to undergraduate students, but it does not explicitly mention a scholarship, loan, or grant.'}

In [88]:
# str() of the validated output gives its Python repr (single quotes, `True`), as the cell output shows — not JSON text.
response_str = str(response.validated_output)
response_str

"{'is_grounded': True, 'confidence': 0.7, 'confidence_explanation': 'Partially. The context provides information about various government benefits available to undergraduate students, but it does not explicitly mention a scholarship, loan, or grant.'}"

In [None]:
# Function-calling variant of the guarded call.
# NOTE(review): this rebinds the notebook-level `tools` name (also used for the agent's tool list) to an empty list.
tools = [] # an open ai compatible list of tools

# NOTE(review): `chat_history` is whatever an earlier cell left in scope, and the grading
# prompt built above has no chat_history placeholder — confirm `prompt_params` is needed.
response = guard(
    model=ollama_model_name,
    messages=messages,
    prompt_params={"chat_history": chat_history},
    tools=guard.json_function_calling_tool(tools),
    tool_choice="required",
)