### Evaluation
- Response Evaluation
- Retriever Evaluation

In [1]:
import os
from dotenv import load_dotenv

from urllib.parse import quote_plus

# Load the settings stored in ../.envrc into the process environment.
env_loaded = load_dotenv('../.envrc')
assert env_loaded, 'Failed to load .envrc'

# Every connection setting is required: fail fast and name the missing variable.
DB_HOST = os.getenv('DB_HOST')
assert DB_HOST is not None, 'DB_HOST is not set'
DB_PORT = os.getenv('DB_PORT')
assert DB_PORT is not None, 'DB_PORT is not set'
DB_USER = os.getenv('DB_USER')
assert DB_USER is not None, 'DB_USER is not set'
DB_PASSWORD = os.getenv('DB_PASSWORD')
assert DB_PASSWORD is not None, 'DB_PASSWORD is not set'
DB_NAME = os.getenv('DB_NAME')
assert DB_NAME is not None, 'DB_NAME is not set'

# URL-encode the credentials so characters such as '@', ':' or '/' in the
# user name or password cannot break the structure of the connection URL.
DB_URL = (
    f'postgresql+asyncpg://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}'
    f'@{DB_HOST}:{DB_PORT}/{DB_NAME}'
)



In [17]:
import os

def read_env(name, cast=str):
    """Return environment variable `name` converted with `cast`.

    Args:
        name: Name of the environment variable to read.
        cast: Callable applied to the raw string value (for example `int`
            or `float`). Defaults to `str`.

    Returns:
        The converted value, or None when the variable is not set.

    Raises:
        ValueError: If the variable is set but `cast` cannot parse it.
    """
    raw = os.getenv(name)
    return None if raw is None else cast(raw)


# --- Text settings: kept as the raw strings from the environment. ---
OLLAMA_API_BASE = os.getenv('OLLAMA_API_BASE')
print(f'OLLAMA_API_BASE: {OLLAMA_API_BASE}')

# Note: the environment variable name has no `_NAME` suffix, unlike the Python name.
LITELLM_LLM_RERANKER_MODEL_NAME = os.getenv('LITELLM_LLM_RERANKER_MODEL')
print(f'LITELLM_LLM_RERANKER_MODEL_NAME: {LITELLM_LLM_RERANKER_MODEL_NAME}')

# --- Numeric settings: os.getenv only returns strings, so convert them here
# --- once instead of relying on every consumer to coerce them.
LLM_RERANKER_TOP_N = read_env('LLM_RERANKER_TOP_N', int)
print(f'LLM_RERANKER_TOP_N: {LLM_RERANKER_TOP_N}')

LLM_RERANKER_CHOICE_BATCH_SIZE = read_env('LLM_RERANKER_CHOICE_BATCH_SIZE', int)
print(f'LLM_RERANKER_CHOICE_BATCH_SIZE: {LLM_RERANKER_CHOICE_BATCH_SIZE}')

LITELLM_RESPONSE_SYNTHESIZER_MODEL = os.getenv('LITELLM_RESPONSE_SYNTHESIZER_MODEL')
print(f'LITELLM_RESPONSE_SYNTHESIZER_MODEL: {LITELLM_RESPONSE_SYNTHESIZER_MODEL}')

SIMILARITY_TOP_K = read_env('SIMILARITY_TOP_K', int)
print(f'SIMILARITY_TOP_K: {SIMILARITY_TOP_K}')

SIMILARITY_CUTOFF = read_env('SIMILARITY_CUTOFF', float)
print(f'SIMILARITY_CUTOFF: {SIMILARITY_CUTOFF}')

LITELLM_CHAT_ENGINE_LLM_MODEL_NAME = os.getenv('LITELLM_CHAT_ENGINE_LLM_MODEL_NAME')
print(f'LITELLM_CHAT_ENGINE_LLM_MODEL_NAME: {LITELLM_CHAT_ENGINE_LLM_MODEL_NAME}')

SENTENCE_TRANSFORMER_RERANKER_MODEL = os.getenv('SENTENCE_TRANSFORMER_RERANKER_MODEL')
print(f'SENTENCE_TRANSFORMER_RERANKER_MODEL: {SENTENCE_TRANSFORMER_RERANKER_MODEL}')

SENTENCE_TRANSFORMER_RERANKER_TOP_N = read_env('SENTENCE_TRANSFORMER_RERANKER_TOP_N', int)
print(f'SENTENCE_TRANSFORMER_RERANKER_TOP_N: {SENTENCE_TRANSFORMER_RERANKER_TOP_N}')

OLLAMA_API_BASE: http://localhost:11434
LITELLM_LLM_RERANKER_MODEL_NAME: ollama_chat/llama3.2:3b
LLM_RERANKER_TOP_N: 4
LLM_RERANKER_CHOICE_BATCH_SIZE: 5
LITELLM_RESPONSE_SYNTHESIZER_MODEL: ollama_chat/llama3.2:3b
SIMILARITY_TOP_K: 12
SIMILARITY_CUTOFF: 0.7
LITELLM_CHAT_ENGINE_LLM_MODEL_NAME: ollama_chat/llama3.2:3b
SENTENCE_TRANSFORMER_RERANKER_MODEL: cross-encoder/stsb-distilroberta-base
SENTENCE_TRANSFORMER_RERANKER_TOP_N: 4


In [2]:
import phoenix as px
import llama_index.core

def launch_phoenix() -> None:
    """Launch the Phoenix app and hook llama_index up to it.

    Calls `px.launch_app()` and then registers "arize_phoenix" as the
    llama_index global handler.
    """
    px.launch_app()
    llama_index.core.set_global_handler("arize_phoenix")

def close_phoenix() -> None:
    """Close the running Phoenix app via `px.close_app()`.

    The llama_index global handler set by `launch_phoenix` is not touched here.
    """
    px.close_app()


#### Load Chat Engine

In [3]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Hugging Face embedding model; its name is also used to derive the vector table name.
embed_model_name = "intfloat/multilingual-e5-large"
embed_model = HuggingFaceEmbedding(model_name=embed_model_name)
# Passed to PGVectorStore as `embed_dim`.
# NOTE(review): presumably the output size of the model above — confirm it matches.
embedding_model_dimensions = 1024

In [4]:
from llama_index.vector_stores.postgres import PGVectorStore
from llama_index.core import VectorStoreIndex
import re

# Table name = fixed prefix + embedding model name, with every character
# outside [a-zA-Z0-9-] replaced by '-'.
table_prefix = 'budget_2025-'
model_name_clean = re.sub(r'[^a-zA-Z0-9\-]', '-', embed_model_name)
table_name = table_prefix + model_name_clean
print(f'table_name: {table_name}')

# Connect to the Postgres vector table using the DB_* settings.
# NOTE(review): perform_setup=False presumably means the table must already exist — confirm.
vector_store = PGVectorStore.from_params(
    host=DB_HOST,
    port=DB_PORT,
    database=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
    table_name=table_name,
    perform_setup=False,
    embed_dim=embedding_model_dimensions,
)

# Wrap the vector store in an index that uses the embedding model defined earlier.
vsi = VectorStoreIndex.from_vector_store(vector_store=vector_store, embed_model=embed_model)

table_name: budget_2025-intfloat-multilingual-e5-large


In [18]:
# Similarity postprocessor configured with the SIMILARITY_CUTOFF setting.
# NOTE(review): the setting originates from the environment; verify it reaches
# this call as a number (os.getenv yields strings unless converted).
from llama_index.core.postprocessor import SimilarityPostprocessor
similarity_postprocessor = SimilarityPostprocessor(similarity_cutoff=SIMILARITY_CUTOFF)


In [19]:
# Sentence-transformer reranker postprocessor; keeps SENTENCE_TRANSFORMER_RERANKER_TOP_N nodes.
from llama_index.core.postprocessor import SentenceTransformerRerank
# NOTE(review): the model name is hard-coded here, while SENTENCE_TRANSFORMER_RERANKER_MODEL
# is read from the environment in the settings cell and not used in this part of the
# notebook — confirm which of the two is intended.
sentence_transformer_reranker_model = 'cross-encoder/mmarco-mMiniLMv2-L12-H384-v1'

sentence_transformer_reranker = SentenceTransformerRerank(
    model=sentence_transformer_reranker_model, top_n=SENTENCE_TRANSFORMER_RERANKER_TOP_N
)



In [20]:
from llama_index.core.response_synthesizers import get_response_synthesizer
from llama_index.core.response_synthesizers.type import ResponseMode
from llama_index.llms.litellm import LiteLLM

# Take the synthesizer model from the settings cell instead of repeating the
# literal; the previous hard-coded value stays as the fallback when it is unset.
response_synthesizer_llm_model = LITELLM_RESPONSE_SYNTHESIZER_MODEL or "ollama_chat/llama3.2:3b"
response_synthesizer_llm = LiteLLM(response_synthesizer_llm_model)

# COMPACT response mode, driven by the LLM created above.
response_synthesizer = get_response_synthesizer(
    llm=response_synthesizer_llm,
    response_mode=ResponseMode.COMPACT,
)

In [21]:
from llama_index.llms.litellm import LiteLLM

# Instantiate the model through LiteLLM and print the context window its metadata reports.
llm_model_name = "ollama_chat/llama3.2:3b"
llm = LiteLLM(llm_model_name)
print(f'{llm_model_name} context window: {llm.metadata.context_window}')

ollama_chat/llama3.2:3b context window: 2048


In [22]:
from llama_index.core.chat_engine.types import ChatMode

# chat engine settings
# Number of nodes to retrieve. Taken from the SIMILARITY_TOP_K setting and passed
# through int() so a raw environment string is accepted as well; 12 is the
# previous hard-coded default, used when the setting is missing.
similarity_top_k = int(SIMILARITY_TOP_K) if SIMILARITY_TOP_K is not None else 12

# Model name comes from the settings cell; the old literal is kept as the fallback.
chat_engine_llm_model = LITELLM_CHAT_ENGINE_LLM_MODEL_NAME or "ollama_chat/llama3.2:3b"
chat_engine_llm = LiteLLM(chat_engine_llm_model)

# Retrieved nodes pass through the similarity cutoff first, then the
# sentence-transformer reranker, before the response synthesizer sees them.
chat_engine = vsi.as_chat_engine(
    chat_mode=ChatMode.BEST,
    llm=chat_engine_llm,
    similarity_top_k=similarity_top_k,
    node_postprocessors=[similarity_postprocessor, sentence_transformer_reranker],
    response_synthesizer=response_synthesizer,
    streaming=True,
)

In [24]:
from llama_index.core.chat_engine.types import StreamingAgentChatResponse

# Smoke test: ask one question and print the answer as it streams in.
question = "Are there benefits for having babies in SG60?"

response: StreamingAgentChatResponse = chat_engine.stream_chat(question)
# Print each streamed piece without a newline so the answer reads as continuous text.
for token in response.response_gen:
    print(token, end="")

 The benefits of having babies in Singapore under the SG60 package include a one-off $500 in LifeSG credits for each child, as well as additional grants and credits for families with three or more children. Specifically, these include:
- A $5,000 increase in the Child Development Account First Step Grant for each third and subsequent child born from today onwards
- A new $5,000 Large Family MediSave Grant disbursed into the mother's MediSave account for each third and subsequent child born from today onwards
- $1,000 in LifeSG credits disbursed annually to families for each of their third and subsequent children during the years that the child turns one to six.

In [None]:
# NOTE(review): this cell repeats the earlier LiteLLM context-window check and
# rebinds `llm_model_name` / `llm` to the same values; consider removing it.
llm_model_name = "ollama_chat/llama3.2:3b"
llm = LiteLLM(llm_model_name)
print(f'{llm_model_name} context window: {llm.metadata.context_window}')

ollama_chat/llama3.2:3b context window: 2048


In [11]:
# Start the Phoenix app and register the llama_index handler (see launch_phoenix).
launch_phoenix()

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


##### Question Generation

In [None]:
# Set your own OpenAI API key here: the evaluator below uses an OpenAI model as the judge.
#os.environ['OPENAI_API_KEY'] = ''

In [36]:
import nest_asyncio

# Patch asyncio via nest_asyncio before the async evaluation calls below.
# NOTE(review): presumably needed because the notebook kernel already runs an event loop — confirm.
nest_asyncio.apply()

In [30]:
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import FaithfulnessEvaluator
from llama_index.core.chat_engine.types import ChatMessage

# Create the judge LLM (temperature 0.0).
# NOTE(review): this rebinds `llm`, which earlier cells assigned a LiteLLM instance.
llm = OpenAI(model="gpt-4o-mini", temperature=0.0)

# Faithfulness evaluator that uses the judge LLM above.
evaluator = FaithfulnessEvaluator(llm=llm)

In [31]:
from llama_index.core.schema import MetadataMode
from typing import Sequence

# Stand-alone retriever over the same index, using the same top-k setting as the chat engine.
retriever = vsi.as_retriever(similarity_top_k=SIMILARITY_TOP_K)

def get_context(question: str) -> Sequence[str]:
    """Return the context strings retrieved for `question`.

    Runs the module-level `retriever`, then `similarity_postprocessor`, then
    `sentence_transformer_reranker` (which receives the question as its query),
    and renders each surviving node with `MetadataMode.LLM`.

    Args:
        question: The query text to retrieve context for.

    Returns:
        One content string per node, in the order the reranker returned them.
    """
    retrieved = retriever.retrieve(question)
    above_cutoff = similarity_postprocessor.postprocess_nodes(retrieved)
    reranked = sentence_transformer_reranker.postprocess_nodes(above_cutoff, query_str=question)
    return [item.get_content(MetadataMode.LLM) for item in reranked]


In [None]:
# query index
question = 'What are the benefits of having babies in SG60?'

response = await chat_engine.achat(question)
eval_result = await evaluator.evaluate_response(query=question, response=response)


In [44]:
# Show the evaluation result (query, contexts, response, passing flag, feedback, score).
eval_result

EvaluationResult(query='What are the benefits of having babies in SG60?', contexts=[], response='The benefits of having babies in SG60 include receiving $500 in LifeSG credits for each child, as well as additional grants and credits for families with three or more children.', passing=False, feedback='Empty Response', score=0.0, pairwise_source=None, invalid_result=False, invalid_reason=None)

In [39]:
# Pull the spans recorded by the active Phoenix session into a DataFrame.
# NOTE(review): assumes a session is active (launch_phoenix() was run) — confirm active_session() cannot be None here.
spans_df = px.active_session().get_spans_dataframe()


  df_attributes = pd.DataFrame.from_records(


In [41]:
# Keep only the spans that recorded retrieved documents, then preview the
# input query next to the documents for the first few of them.
docs_col = "attributes.retrieval.documents"
spans_with_docs_df = spans_df.loc[spans_df[docs_col].notna()]
spans_with_docs_df.loc[:, ["attributes.input.value", docs_col]].head()

Unnamed: 0_level_0,attributes.input.value,attributes.retrieval.documents
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1
d2d30fa7521a1bff,SG culture pass,"[{'document.score': 0.8397232795, 'document.co..."
8c56dee6160366dc,Benefits of having babies in SG60,"[{'document.score': 0.8403207254, 'document.co..."
6eb476bb6db286a6,What are the benefits of having babies in SG60?,"[{'document.score': 0.8343092401000001, 'docum..."
a2993a8a330b7fdb,What are the benefits of having babies in SG60?,"[{'document.score': 0.8343092401000001, 'docum..."
8c428e7a246907eb,SG60 baby benefits,"[{'document.score': 0.8438623827, 'document.co..."


In [46]:
# Reset Phoenix: close the running app, then launch it again.
# launch_phoenix() also registers the llama_index global handler again on each call.
close_phoenix()
launch_phoenix()

Attempting to instrument while already instrumented


🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [47]:
import json
# Load the question set from ../data/questions.json.
# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying on
# the platform's default text encoding.
with open('../data/questions.json', 'r', encoding='utf-8') as f:
    questions = json.load(f)

len(questions)

100

In [48]:
# Run the first loaded question through the chat engine (non-streaming) and display the full response object.
chat_engine.chat(questions[0]['question'])

AgentChatResponse(response="The main theme of the FY2025 Budget is to build on previous budgets while addressing current challenges and planning for a stronger future, with a focus on advancing Singapore's growth, supporting its citizens, and fostering a united society.", sources=[ToolOutput(content="The FY2025 Budget is a Budget for all Singaporeans, tackling immediate challenges while laying the groundwork for a stronger, more resilient tomorrow. It is shaped together with all Singaporeans to advance Forward Singapore efforts and empower each citizen to contribute towards their shared future. The main theme of the budget can be summarized as building a better tomorrow for all Singaporeans by addressing various challenges and planning ahead to secure the nation's future.", tool_name='query_engine_tool', raw_input={'input': 'FY2025 Budget main theme'}, raw_output=Response(response="The FY2025 Budget is a Budget for all Singaporeans, tackling immediate challenges while laying the ground