### Evaluation
- Retriever Evaluation
    - NCDG Score
    - Precision at 2
    - Hit Rate
- Response Evaluation
    - QE
    - Hallucination
    - Response Time

In [1]:
import os
from dotenv import load_dotenv

env_loaded = load_dotenv('../.envrc')
assert env_loaded, 'Failed to load .envrc'

DB_HOST = os.getenv('DB_HOST')
assert DB_HOST is not None
DB_PORT = os.getenv('DB_PORT')
assert DB_PORT is not None
DB_USER = os.getenv('DB_USER')
assert DB_USER is not None
DB_PASSWORD = os.getenv('DB_PASSWORD')
assert DB_PASSWORD is not None
DB_NAME = os.getenv('DB_NAME')
assert DB_NAME is not None

DB_URL = f'postgresql+asyncpg://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'



In [2]:
import os

OLLAMA_API_BASE = os.getenv(
    'OLLAMA_API_BASE',
)
print(f'OLLAMA_API_BASE: {OLLAMA_API_BASE}')

LITELLM_LLM_RERANKER_MODEL_NAME = os.getenv(
    'LITELLM_LLM_RERANKER_MODEL'
)
print(f'LITELLM_LLM_RERANKER_MODEL_NAME: {LITELLM_LLM_RERANKER_MODEL_NAME}')

LLM_RERANKER_TOP_N = os.getenv(
    'LLM_RERANKER_TOP_N',
)
print(f'LLM_RERANKER_TOP_N: {LLM_RERANKER_TOP_N}')

LLM_RERANKER_CHOICE_BATCH_SIZE = os.getenv(
    'LLM_RERANKER_CHOICE_BATCH_SIZE',
)
print(f'LLM_RERANKER_CHOICE_BATCH_SIZE: {LLM_RERANKER_CHOICE_BATCH_SIZE}')

LITELLM_RESPONSE_SYNTHESIZER_MODEL = os.getenv(
    'LITELLM_RESPONSE_SYNTHESIZER_MODEL',
)
print(f'LITELLM_RESPONSE_SYNTHESIZER_MODEL: {LITELLM_RESPONSE_SYNTHESIZER_MODEL}')

SIMILARITY_TOP_K = os.getenv(
    'SIMILARITY_TOP_K',
)
print(f'SIMILARITY_TOP_K: {SIMILARITY_TOP_K}')

SIMILARITY_CUTOFF = os.getenv(
    'SIMILARITY_CUTOFF',
)
print(f'SIMILARITY_CUTOFF: {SIMILARITY_CUTOFF}')

LITELLM_CHAT_ENGINE_LLM_MODEL_NAME = os.getenv(
    'LITELLM_CHAT_ENGINE_LLM_MODEL_NAME',
)
print(f'LITELLM_CHAT_ENGINE_LLM_MODEL_NAME: {LITELLM_CHAT_ENGINE_LLM_MODEL_NAME}')

SENTENCE_TRANSFORMER_RERANKER_MODEL = os.getenv(
    'SENTENCE_TRANSFORMER_RERANKER_MODEL',
)
print(f'SENTENCE_TRANSFORMER_RERANKER_MODEL: {SENTENCE_TRANSFORMER_RERANKER_MODEL}')

# SENTENCE_TRANSFORMER_RERANKER_TOP_N = os.getenv(
#     'SENTENCE_TRANSFORMER_RERANKER_TOP_N',
# )
# print(f'SENTENCE_TRANSFORMER_RERANKER_TOP_N: {SENTENCE_TRANSFORMER_RERANKER_TOP_N}')

JINA_RERANKER_TOP_N = os.getenv(
    'JINA_RERANKER_TOP_N',
)
print(f'JINA_RERANKER_TOP_N: {JINA_RERANKER_TOP_N}')

JINA_RERANKER_MODEL = os.getenv(
    'JINA_RERANKER_MODEL',
)
print(f'JINA_RERANKER_MODEL: {JINA_RERANKER_MODEL}')

JINA_API_KEY = os.getenv(
    'JINA_API_KEY',
)
assert JINA_API_KEY is not None



OLLAMA_API_BASE: http://localhost:11434
LITELLM_LLM_RERANKER_MODEL_NAME: ollama_chat/llama3.2:3b
LLM_RERANKER_TOP_N: 4
LLM_RERANKER_CHOICE_BATCH_SIZE: 5
LITELLM_RESPONSE_SYNTHESIZER_MODEL: ollama_chat/llama3.2:3b
SIMILARITY_TOP_K: 12
SIMILARITY_CUTOFF: 0.7
LITELLM_CHAT_ENGINE_LLM_MODEL_NAME: ollama_chat/llama3.2:3b
SENTENCE_TRANSFORMER_RERANKER_MODEL: cross-encoder/stsb-distilroberta-base
JINA_RERANKER_TOP_N: 4
JINA_RERANKER_MODEL: jina-reranker-v2-base-multilingual


In [3]:
import phoenix as px
import llama_index.core

def launch_phoenix():
    if not px.active_session():
        px.launch_app()
        llama_index.core.set_global_handler("arize_phoenix")

def close_phoenix():
    if px.active_session():
        px.close_app()


In [4]:
# The nest_asyncio module enables the nesting of asynchronous functions within an already running async loop.
# This is necessary because Jupyter notebooks inherently operate in an asynchronous loop.
# By applying nest_asyncio, we can run additional async functions within this existing loop without conflicts.
import nest_asyncio

nest_asyncio.apply()


#### Load Query Engine
For both retrieval and response evaluation, use query engine instead of chat engine as chat engine llm may answer the question based on memory of previous questions and skip retrieval.

We want to do retrieval on every question to evaluate retrieval and whether the answer generated can be found from the retrieved chunks.

In [5]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model_name = "intfloat/multilingual-e5-large"
embed_model = HuggingFaceEmbedding(model_name=embed_model_name)
embedding_model_dimensions = 1024

In [6]:
from llama_index.vector_stores.postgres import PGVectorStore
from llama_index.core import VectorStoreIndex
import re

table_prefix = 'budget_2025-'
model_name_clean = re.sub(r'[^a-zA-Z0-9\-]', '-', embed_model_name)
table_name = f'{table_prefix}{model_name_clean}'
print(f'table_name: {table_name}')

vector_store = PGVectorStore.from_params(
            host=DB_HOST,
            port=DB_PORT,
            database=DB_NAME,
            user=DB_USER,
            password=DB_PASSWORD,
            table_name=table_name,
            perform_setup=False,
            embed_dim=embedding_model_dimensions,
        )

vsi = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=embed_model
)

table_name: budget_2025-intfloat-multilingual-e5-large


In [15]:
# Assembling the chat engine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.postprocessor.jinaai_rerank import JinaRerank
from llama_index.core.response_synthesizers import get_response_synthesizer
from llama_index.core.response_synthesizers.type import ResponseMode
from llama_index.llms.litellm import LiteLLM
from llama_index.core.chat_engine.types import ChatMode

# similarity postprocessor
similarity_postprocessor = SimilarityPostprocessor(similarity_cutoff=SIMILARITY_CUTOFF)

# reranker
jina_reranker = JinaRerank(
    top_n=JINA_RERANKER_TOP_N, model=JINA_RERANKER_MODEL, api_key=JINA_API_KEY
)

# response synthesizer
response_synthesizer_llm = LiteLLM(LITELLM_RESPONSE_SYNTHESIZER_MODEL, api_base=OLLAMA_API_BASE)
response_synthesizer = get_response_synthesizer(llm=response_synthesizer_llm, response_mode=ResponseMode.COMPACT)

llm = LiteLLM(LITELLM_CHAT_ENGINE_LLM_MODEL_NAME, api_base=OLLAMA_API_BASE)
print(f'{LITELLM_CHAT_ENGINE_LLM_MODEL_NAME} context window: {llm.metadata.context_window}')

query_engine = vsi.as_query_engine(
    llm=llm,
    similarity_top_k=SIMILARITY_TOP_K,
    node_postprocessors=[similarity_postprocessor, jina_reranker],
    response_synthesizer=response_synthesizer
)


ollama_chat/llama3.2:3b context window: 2048


In [16]:
question = "what benefits can i expect to receive in April?"
# testing out the query engine
print(f'Asking question to query engine: {question}')
response = query_engine.query(question)
print(response)


Asking question to query engine: what benefits can i expect to receive in April?
You can expect a water and electricity bill rebate of $110 to $190, a housing and services charge rebate of 0.5 months or 1 month, and a climate voucher of up to $400 in April.


##### Question Generation
Generate synthetic questions based on retrieved chunks. As not every chunk may be able to create a question, use a LLM to generate question only if it thinks a question can be generated from the chunk


- Create a structured output object
- Retrieve some chunks from the vector store
- Filter for questions that the LLM thinks can be generated from the chunk

In [22]:
# Getting all nodes/chunks from vector store to evaluate from
nodes = vsi._vector_store.get_nodes(node_ids=[])
print(f'There are {len(nodes)} nodes in the vector store')

There are 309 nodes in the vector store


In [23]:
from llama_index.core.schema import MetadataMode
import random
random_node = random.choice(nodes)
print(random_node.get_content(MetadataMode.EMBED))

header_path: /

# G. Rallying as One United People 

145. Singaporeans across different generations and from all walks of life have played a vital role in getting us to SG60.
146.


Start phoenix to observe the retriever engine

In [13]:
launch_phoenix()

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


### LLM for question generation and evaluation
- OpenAI API because:
    - support multilingual questions
    - support structured output e.g. for question generation
    - strong model for performing evaluation (large context window)


In [17]:

from getpass import getpass
if not (OPENAI_API_KEY := os.getenv("OPENAI_API_KEY")):
    OPENAI_API_KEY = getpass("🔑 Enter your OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [18]:
from pydantic import BaseModel, Field

class QuestionFromChunk(BaseModel):
    can_generate_question: bool = Field(description="Whether a question can be generated from the context")
    explanation: str = Field(description="Explanation for why a question can or cannot be generated from the context")
    questions_generated: list[str] = Field(..., description="List of questions generated from the context if question(s) can be generated from the context")

In [36]:
generate_questions_template = """\
Given the context information and not prior knowledge, your task is to generate questions based on the context. \
The questions should be diverse in nature across the context. Restrict the questions to the context information provided.
Questions may or may not be able to be generated from the chunk. Be very selective when generating questions.
Questions do not need to be generated if the context is like a header or a table of contents, or summary phrase of a context.
In the questions generated, do not include the header path or 'according to the context / document'. Only generate natural questions that people will ask from the context.
Generate the question in the same language as the context.
Generate the explanation in English

Context:
\"\"\"\
${context}
\"\"\"

Output the response in JSON format
"""

In [37]:
from guardrails import Guard

guard = Guard.from_pydantic(QuestionFromChunk)

In [39]:
# Testing out on sample node
import random
from pprint import pprint

random_node = random.choice(nodes) # nodes[297] is a node in indian language
print(f'random node text embedded content: \n {random_node.get_content(MetadataMode.EMBED)}', end='\n---------------------------------------------\n')

tools = [] # an open ai compatible list of tools

response = guard(
    model="gpt-4o",
    instructions="You are a helpful assistant to generate questions from the context to be used for a quiz/examination.",
    prompt=generate_questions_template,
    prompt_params={"context": random_node.get_content(MetadataMode.EMBED)},
    tools=guard.json_function_calling_tool(tools),
    tool_choice="required",
)

pprint(response.validated_output)


random node text embedded content: 
 header_path: /

# குடும்பங்களுக்கு உகந்த சிங்கப்பூரை உருவாக்குதல் 

- 12 வயதுக்கும் குறைந்த சிங்கப்பூர்ப் பிள்ளைகள் ஒவ்வொருவருக்கும் தவா \$500 - பிள்ளைகளுக்கான LifeSG சிறப்புத்தொகை
- 13 முதல் 20 வயதுடைய சிங்கப்பூர் மாணவர்கள் ஒவ்வொருவருக்கும் தவா \$500 - எடுசேவ் கல்லிச் சேமிப்புக் கணக்கு அல்லது உயர்நிலைப்பள்ளிக்குப் பிந்தைய கல்லிக் கணக்கில் நிரப்புத்தொகை
- பிள்ளைப் பராமரிப்புச் செலவைக் குறைக்க அரசாங்க ஆதரவுபெற்ற பாலர் பள்ளிகளில் மாதாந்தர முழுநாள் பிள்ளைப் பராமரிப்புக் கட்டண உச்சவரம்பைக் குறைத்தல்
- [புதிது] பெரிய குடும்பங்களுக்கான ஆதரவுத் திட்டம்
- 18 பிப்ரவரி 2025 முதல் பிறக்கும் மூன்றாவது, அடுத்தடுத்த பிள்ளைகள் ஒவ்வொருவருக்கும், பிள்ளை மேம்பாட்டுக் கணக்கின் தொடக்க மானியமாக \$5,000 அதிகரிப்பு
- 18 பிப்ரவரி 2025 முதல் பிறக்கும் மூன்றாவது, அடுத்தடுத்த பிள்ளைகள் ஒவ்வொருவருக்கும் தாயாரின் மெடிசேவ் கணக்கில் பெரிய குடும்பங்களுக்கான மெடிசேவ் மானியம் - \$5,000
- மூன்றாவது, அடுத்தடுத்த பிள்ளைகள்
![img-3.jpeg](img-3.jpeg)

ஓவ்வொருவரும் 1 முதல் 6 வயது உள்ள கால

In [40]:
import concurrent.futures
from functools import partial

def generate_questions_for_node(node, model="gpt-4o", template=generate_questions_template, tools=[]):
    response = guard(
        model=model,
        instructions="You are a helpful assistant.",
        prompt=template,
        prompt_params={"context": node.get_content(MetadataMode.EMBED)},
        tools=guard.json_function_calling_tool(tools),
        tool_choice="required",
    )
    return response.validated_output

# Process nodes concurrently with a maximum of workers
def generate_questions(nodes, max_workers=5, batch_size=None):
    questions_generated = []
    
    # If batch_size is specified, process in batches
    if batch_size:
        for i in range(0, len(nodes), batch_size):
            batch = nodes[i:i+batch_size]
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                results = list(executor.map(generate_questions_for_node, batch))
                questions_generated.extend(results)
    else:
        # Process all at once with max_workers
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = list(executor.map(generate_questions_for_node, nodes))
            questions_generated.extend(results)
    
    return questions_generated

# Use the function
questions_generated = generate_questions(
    nodes=nodes,
    max_workers=5,  
    batch_size=10   # set to None to process all at once
)

In [55]:
questions_generated[:5]

[None,
 {'can_generate_question': False,
  'explanation': "The context provided is a header or title, specifically for a section titled 'Budget Debate Round Up Speech' on a Singapore Government Agency Website. It does not contain any substantive content or details from which questions can be generated. Headers or titles typically do not provide enough information to form meaningful questions, as they are meant to introduce or categorize content rather than convey detailed information.",
  'questions_generated': []},
 {'can_generate_question': False,
  'explanation': 'The context provided is a header and a table of contents, which does not contain substantive information to generate meaningful questions. It merely indicates the presence of an overview section without any details.',
  'questions_generated': []},
 {'can_generate_question': False,
  'explanation': 'The context provided is a table of contents or a list of sections from a document. It does not contain detailed information or

In [48]:
# create the context_questions df
context_questions = {'context': [], 'question': []}
for i, node in enumerate(nodes):
    if questions_generated[i] is not None:
        for question in questions_generated[i]['questions_generated']:
            context = node.get_content(MetadataMode.EMBED) # this was the context passed to LLM to generate question(s) from
            context_questions['context'].append(context)
            context_questions['question'].append(question)
    



In [52]:
import pandas as pd
questions_with_document_chunk_df = pd.DataFrame(context_questions)
print(f'There are {len(questions_with_document_chunk_df)} questions generated from the nodes')
questions_with_document_chunk_df.head(10)

There are 1068 questions generated from the nodes


Unnamed: 0,context,question
0,header_path: /\n\nAnd so these are issues we w...,What are the means testing criteria that need ...
1,header_path: /\n\nAnd so these are issues we w...,"Which groups did Ms Jean See, Ms Yeo Wan Ling,..."
2,header_path: /\n\nAnd so these are issues we w...,Who championed for the arts and what was the r...
3,header_path: /\n\nAnd so these are issues we w...,What mental health issues were advocated for b...
4,header_path: /\n\nAnd so these are issues we w...,What are some of the key points raised by the ...
5,header_path: /A Singapore Government Agency We...,How is the government supporting businesses an...
6,header_path: /A Singapore Government Agency We...,What measures are being taken to help Singapor...
7,header_path: /A Singapore Government Agency We...,Is the government being overly conservative in...
8,header_path: /\n\n49. We keep an open mind and...,What are some examples of participatory platfo...
9,header_path: /\n\n49. We keep an open mind and...,Why are participatory platforms considered res...


In [63]:
type(questions_with_document_chunk_df.iloc[0])


pandas.core.series.Series

In [54]:
# Checking retrieval
i = random.randint(0, len(questions_with_document_chunk_df))
print('Context:')
print(questions_with_document_chunk_df.iloc[i]['context'])
print('------------')
print('Question generated:')
print(questions_with_document_chunk_df.iloc[i]['question'])

Context:
header_path: /为国人提供援助 /

## 7月

建国60周年 $\qquad$ 600元或800元
邻里购物券 [新]
保健储蓄 $\qquad$ 500 元
育儿SG生活助手补助券或
教育储蓄户头/中学后延续教育
户头填补 [新] $\qquad$ 500 元
水电费回扣 $\qquad$ 110元至190元
组屋杂费回扣 $\qquad$ 0.5 个月或 1 个月
------------
Question generated:
What are the new benefits introduced in July?


### Retrieval Evaluation


In [58]:
# Restart phoenix to observe the retrieval for each question
close_phoenix()
launch_phoenix()



🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [66]:
questions_with_document_chunk_df[:10]

Unnamed: 0,context,question
0,header_path: /\n\nAnd so these are issues we w...,What are the means testing criteria that need ...
1,header_path: /\n\nAnd so these are issues we w...,"Which groups did Ms Jean See, Ms Yeo Wan Ling,..."
2,header_path: /\n\nAnd so these are issues we w...,Who championed for the arts and what was the r...
3,header_path: /\n\nAnd so these are issues we w...,What mental health issues were advocated for b...
4,header_path: /\n\nAnd so these are issues we w...,What are some of the key points raised by the ...
5,header_path: /A Singapore Government Agency We...,How is the government supporting businesses an...
6,header_path: /A Singapore Government Agency We...,What measures are being taken to help Singapor...
7,header_path: /A Singapore Government Agency We...,Is the government being overly conservative in...
8,header_path: /\n\n49. We keep an open mind and...,What are some examples of participatory platfo...
9,header_path: /\n\n49. We keep an open mind and...,Why are participatory platforms considered res...


In [None]:
# loop over the questions and generate the answers
for _, row in questions_with_document_chunk_df.iterrows():
    question = row["question"]
    response = query_engine.query(question)
    print(f"Question: {question}\nAnswer: {response.response}\n")

Question: What groups did Ms Jean See, Ms Yeo Wan Ling, and Mr Gan Thiam Poh highlight as needing more support?
Answer: The Tripartite Workgroup should focus on providing targeted support to freelance and agency workers, as well as working mothers. These groups face unique challenges in their professional development, and addressing these specific needs can help make the SkillsFuture system more inclusive and effective.

Question: Who championed for the arts and who advocated for mental health issues?
Answer: Ms Jean See championed for freelance and agency workers, while Ms Yeo Wan Ling advocated for working mothers, and Mr Gan Thiam Poh also supported these groups.

Question: What are some of the key points raised by the members?
Answer: The key points raised by the members of the Tripartite Workgroup on Senior Employment include improving the employability of seniors, increasing job availability that suits their needs, enhancing support for seniors who wish to continue working, and p

In [None]:
answers = []
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    results = list(executor.map(answer_question, questions_with_document_chunk_df))
    answers.extend(results)

In [67]:
import concurrent.futures
import time
from tqdm import tqdm
from llama_index.core.query_engine import RetrieverQueryEngine

def answer_question(row: pd.Series, query_engine: RetrieverQueryEngine):
    question = row["question"]
    response = query_engine.query(question)
    return {
        "question": question,
        "answer": response.response
    }

def process_questions_in_parallel(df: pd.DataFrame, query_engine: RetrieverQueryEngine, max_workers=5, batch_size=10):
    results = []
    
    # Process in batches
    for i in tqdm(range(0, len(df), batch_size)):
        batch_df = df.iloc[i:i+batch_size]
        
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Create a partial function with chat_engine already provided
            fn = lambda row: answer_question(row, query_engine)
            
            # Submit all tasks
            future_to_row = {
                executor.submit(fn, row): idx 
                for idx, row in batch_df.iterrows()
            }
            
            # Collect results as they complete
            batch_results = []
            for future in concurrent.futures.as_completed(future_to_row):
                result = future.result()
                batch_results.append(result)
            
            results.extend(batch_results)
        
        # Optional: add a small delay between batches if concerned about rate limits
        # time.sleep(0.5)
    
    return results

# Execute the parallel processing
qa_results = process_questions_in_parallel(
    df=questions_with_document_chunk_df[:100], # test on 100 questions for now
    query_engine=query_engine,
    max_workers=10,  
    batch_size=20 
)

# Print results
for result in qa_results:
    print(f"Question: {result['question']}\nAnswer: {result['answer']}\n")

100%|██████████| 5/5 [13:46<00:00, 165.31s/it]

Question: What are the means testing criteria that need fine-tuning?
Answer: Means testing criteria related to incomes, such as per capita household income, family income (with data limitations), lifelong earnings income, and wealth measures.

Question: What mental health issues were advocated for by Dr Wan Rizal and Ms Rachel Ong?
Answer: Mental health issues.

Question: What are some examples of participatory platforms mentioned in the context?
Answer: Citizen panels, youth panels, and alliances for action are some examples of participatory platforms mentioned in the context.

Question: Is the government being overly conservative in its fiscal projections and plans?
Answer: The government maintains that it is practicing responsible and prudent budgeting, ensuring that public finances remain healthy year after year. It spends within its means, raises revenues to meet new demands, and ensures that there are sufficient funds available rather than risking falling behind. The government a




In [68]:
from phoenix.session.evaluation import get_retrieved_documents

retrieved_documents_df = get_retrieved_documents(px.active_session())
retrieved_documents_df

Unnamed: 0_level_0,Unnamed: 1_level_0,context.trace_id,input,reference,document_score
context.span_id,document_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ec428a2cc306bb78,0,9e6aeb27418f8ec78359c7e0414e7988,What mental health issues were advocated for b...,And so these are issues we will continue to fi...,0.853866
ec428a2cc306bb78,1,9e6aeb27418f8ec78359c7e0414e7988,What mental health issues were advocated for b...,# Ensuring No One Is Left Behind \n\n46. This ...,0.789440
ec428a2cc306bb78,2,9e6aeb27418f8ec78359c7e0414e7988,What mental health issues were advocated for b...,"5. First, how do we navigate a very uncertain ...",0.787053
ec428a2cc306bb78,3,9e6aeb27418f8ec78359c7e0414e7988,What mental health issues were advocated for b...,Members also had suggestions covering differen...,0.774909
ec428a2cc306bb78,4,9e6aeb27418f8ec78359c7e0414e7988,What mental health issues were advocated for b...,We are looking into these post-18 pathways for...,0.773560
...,...,...,...,...,...
37195d2ffb780dd5,7,5d10fa5e0ef83607136d017b5a15edf5,How does Singapore's social support system hel...,## Empowering Persons with Disabilities\n\n![i...,0.832206
37195d2ffb780dd5,8,5d10fa5e0ef83607136d017b5a15edf5,How does Singapore's social support system hel...,# CONTENT \n\nSupport for Singaporeans\nDisbur...,0.826288
37195d2ffb780dd5,9,5d10fa5e0ef83607136d017b5a15edf5,How does Singapore's social support system hel...,# F. Nurturing a Caring and Inclusive Society ...,0.826135
37195d2ffb780dd5,10,5d10fa5e0ef83607136d017b5a15edf5,How does Singapore's social support system hel...,158. Our Self-Help Groups play an important ro...,0.824476


In [69]:
from phoenix.evals import RelevanceEvaluator, run_evals
from phoenix.evals import OpenAIModel


relevance_evaluator = RelevanceEvaluator(OpenAIModel(model="gpt-4o-mini"))

retrieved_documents_relevance_df = run_evals(
    evaluators=[relevance_evaluator],
    dataframe=retrieved_documents_df,
    provide_explanation=True,
    concurrency=20,
)[0]

run_evals |          | 0/1200 (0.0%) | ⏳ 00:00<? | ?it/s

In [70]:
retrieved_documents_relevance_df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,label,score,explanation
context.span_id,document_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ec428a2cc306bb78,0,relevant,1,The reference text explicitly states that Dr W...
ec428a2cc306bb78,1,unrelated,0,The reference text discusses various social su...
ec428a2cc306bb78,2,unrelated,0,The reference text discusses navigating a glob...
ec428a2cc306bb78,3,unrelated,0,The reference text discusses suggestions relat...
ec428a2cc306bb78,4,unrelated,0,The reference text discusses support for perso...
ec428a2cc306bb78,5,unrelated,0,The reference text titled 'Budget Debate Round...
ec428a2cc306bb78,6,unrelated,0,The reference text discusses job-related anxie...
ec428a2cc306bb78,7,unrelated,0,The reference text discusses the social suppor...
ec428a2cc306bb78,8,unrelated,0,The reference text discusses the importance of...
ec428a2cc306bb78,9,unrelated,0,The reference text discusses the experiences o...


In [71]:
documents_with_relevance_df = pd.concat(
    [retrieved_documents_df, retrieved_documents_relevance_df.add_prefix("eval_")], axis=1
)
documents_with_relevance_df

Unnamed: 0_level_0,Unnamed: 1_level_0,context.trace_id,input,reference,document_score,eval_label,eval_score,eval_explanation
context.span_id,document_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ec428a2cc306bb78,0,9e6aeb27418f8ec78359c7e0414e7988,What mental health issues were advocated for b...,And so these are issues we will continue to fi...,0.853866,relevant,1,The reference text explicitly states that Dr W...
ec428a2cc306bb78,1,9e6aeb27418f8ec78359c7e0414e7988,What mental health issues were advocated for b...,# Ensuring No One Is Left Behind \n\n46. This ...,0.789440,unrelated,0,The reference text discusses various social su...
ec428a2cc306bb78,2,9e6aeb27418f8ec78359c7e0414e7988,What mental health issues were advocated for b...,"5. First, how do we navigate a very uncertain ...",0.787053,unrelated,0,The reference text discusses navigating a glob...
ec428a2cc306bb78,3,9e6aeb27418f8ec78359c7e0414e7988,What mental health issues were advocated for b...,Members also had suggestions covering differen...,0.774909,unrelated,0,The reference text discusses suggestions relat...
ec428a2cc306bb78,4,9e6aeb27418f8ec78359c7e0414e7988,What mental health issues were advocated for b...,We are looking into these post-18 pathways for...,0.773560,unrelated,0,The reference text discusses support for perso...
...,...,...,...,...,...,...,...,...
37195d2ffb780dd5,7,5d10fa5e0ef83607136d017b5a15edf5,How does Singapore's social support system hel...,## Empowering Persons with Disabilities\n\n![i...,0.832206,relevant,1,The reference text discusses various initiativ...
37195d2ffb780dd5,8,5d10fa5e0ef83607136d017b5a15edf5,How does Singapore's social support system hel...,# CONTENT \n\nSupport for Singaporeans\nDisbur...,0.826288,unrelated,0,The reference text lists various sections and ...
37195d2ffb780dd5,9,5d10fa5e0ef83607136d017b5a15edf5,How does Singapore's social support system hel...,# F. Nurturing a Caring and Inclusive Society ...,0.826135,relevant,1,The reference text discusses the Forward Singa...
37195d2ffb780dd5,10,5d10fa5e0ef83607136d017b5a15edf5,How does Singapore's social support system hel...,158. Our Self-Help Groups play an important ro...,0.824476,relevant,1,The reference text discusses the role of Self-...


#### NCDG Score

The NDCG measures how well document scores match the relevance score (from the LLM)
NDCG (Normalized Discounted Cumulative Gain) at k=2 measures the quality of document rankings by:
- Looking at the top 2 ranked documents for each query/context
- Comparing the system's ranking (document_score) with human relevance judgments (eval_score)
- Giving higher weight to relevant documents that appear higher in the ranking
- Normalizing the score between 0 and 1

Interpretation
- NDCG@2 = 1.0: Perfect ranking - the retrieval system perfectly ordered documents according to relevance judgments
- NDCG@2 = 0.0: Worst possible ranking - the retrieval system ranked documents completely opposite to relevance judgments
- Higher values: Better alignment between retrieval system rankings and relevance judgments


In [83]:
import numpy as np
from sklearn.metrics import ndcg_score


def _compute_ndcg(df: pd.DataFrame, k: int):
    """Compute NDCG@k in the presence of missing values"""
    n = max(2, len(df))
    eval_scores = np.zeros(n)
    doc_scores = np.zeros(n)
    eval_scores[: len(df)] = df.eval_score
    doc_scores[: len(df)] = df.document_score
    try:
        return ndcg_score([eval_scores], [doc_scores], k=k)
    except ValueError:
        return np.nan


ndcg_at_2 = pd.DataFrame(
    {"score": documents_with_relevance_df.groupby("context.span_id").apply(_compute_ndcg, k=2)}
)

In [85]:
ndcg_at_2.head()

Unnamed: 0_level_0,score
context.span_id,Unnamed: 1_level_1
0379c18156942c3b,1.0
0897cb529c743efc,1.0
092cfd51e1dca482,0.386853
0d9b9914bdbf76e2,0.613147
0ddc5364513b672e,1.0


Precision at 2: Calculate relevance score of top 2 retrieved nodes

In [86]:
"""
precision_at_2: Calculate relevance score of top 2 retrieved nodes
"""
precision_at_2 = pd.DataFrame(
    {
        "score": documents_with_relevance_df.groupby("context.span_id").apply(
            lambda x: x.eval_score[:2].sum(skipna=False) / 2
        )
    }
)

Hit: Check if there's at least one relevant node in the top 2 retrieved nodes

In [87]:
"""
hit: Check if there's at least one relevant node in the top 2 retrieved nodes
"""
hit = pd.DataFrame(
    {
        "hit": documents_with_relevance_df
        .groupby("context.span_id") # Group the retrieved nodes by the question
        .apply(
            lambda x: x.eval_score[:2].sum(skipna=False) > 0 # check if theres at least one relevant nodein the top 2 retrieved nodes
        )
    }
)

In [88]:
retrievals_df = px.active_session().get_spans_dataframe("span_kind == 'RETRIEVER'")
rag_evaluation_dataframe = pd.concat(
    [
        retrievals_df["attributes.input.value"],
        ndcg_at_2.add_prefix("ncdg@2_"),
        precision_at_2.add_prefix("precision@2_"),
        hit,
    ],
    axis=1,
)
rag_evaluation_dataframe

  df_attributes = pd.DataFrame.from_records(


Unnamed: 0_level_0,attributes.input.value,ncdg@2_score,precision@2_score,hit
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
82f8859e11a9a75b,"{""query_bundle"": {""query_str"": ""What mental he...",,,
ec428a2cc306bb78,What mental health issues were advocated for b...,1.000000,0.5,True
cb657fd48d16f8f5,"{""query_bundle"": {""query_str"": ""How is the gov...",,,
84771d9b33fc33f3,How is the government supporting businesses an...,0.386853,0.5,True
611c91b83dd1479a,"{""query_bundle"": {""query_str"": ""What measures ...",,,
...,...,...,...,...
d651dc3c0243e7f8,What adjustments are made to account for house...,1.000000,1.0,True
6d7031df23a27126,"{""query_bundle"": {""query_str"": ""What are the b...",,,
8937934cc2358a92,What are the basic needs that the social suppo...,1.000000,1.0,True
0493fe5ec98c8c33,"{""query_bundle"": {""query_str"": ""How does Singa...",,,


In [90]:
results = rag_evaluation_dataframe.mean(numeric_only=True)
results['hit_rate'] = rag_evaluation_dataframe['hit'].mean()
results

ncdg@2_score         0.775728
precision@2_score    0.645000
hit_rate             0.970000
dtype: float64

In [92]:
# save df to csv
rag_evaluation_dataframe.to_csv('rag_evaluation_dataframe.csv', index=False)


#### Interpretation of Retrieval Evaluation Results

Over 100 questions,

1. **NDCG@2 Score (~0.78)**
   - The system's ranking closely aligns with LLM's relevance judgments

2. **Precision@2 Score (varies between 0.5 and 1.0)**:
   - Shows the proportion of retrieved documents in the top 2 that are relevant
   - Scores of 1.0 indicate all top 2 retrieved documents were relevant
   - Scores of 0.5 indicate only one of the top 2 retrieved documents was relevant

3. **Hit Rate (appears high based on "True" values)**:
   - The proportion of queries where at least one relevant document was retrieved


Overall Assessment

NDCG@2 scores suggest that relevant documents are being retrieved and being ranked appropriately.

The variation in precision@2 scores (between 0.5 and 1.0) indicates that while the retrieval consistently retrieves at least one relevant document in the top 2 positions, it sometimes includes a non-relevant document as well.

Hit rate 0.97 is high, suggesting that of the 97% of questions, the retrieval system retrieved at least one relevant document in its top 2 retrieved nodes.

### Response evaluation

In [91]:
from phoenix.session.evaluation import get_qa_with_reference

qa_with_reference_df = get_qa_with_reference(px.active_session())
qa_with_reference_df

Unnamed: 0_level_0,input,output,reference
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9071eb489cdf7c7e,What are the means testing criteria that need ...,"Means testing criteria related to incomes, suc...",Members also had suggestions covering differen...
7481cddb0a3ba153,What mental health issues were advocated for b...,Mental health issues.,And so these are issues we will continue to fi...
ec2b478421e45014,What are some examples of participatory platfo...,"Citizen panels, youth panels, and alliances fo...",There have also been calls for us to make bold...
55ec24e05ac691cf,Is the government being overly conservative in...,The government maintains that it is practicing...,D. Are We Overly Conservative In Our Fiscal Pr...
2740159222ad0c3d,What are some of the key points raised by the ...,Members have raised several key points during ...,Members also had suggestions covering differen...
...,...,...,...
043f4354ea10aca7,How are income growth rates for Finland and Ja...,Income growth rates for Finland and Japan are ...,"30. As we have repeatedly emphasised, the more..."
1023b79fb044a428,What adjustments are made to account for house...,"For Finland and Japan, household income growth...","30. As we have repeatedly emphasised, the more..."
98c38667cdd8bda2,What are the basic needs that the social suppo...,The social support system in Singapore aims to...,"Sir, we are taking concrete steps to strengthe..."
aaac563c796a4043,How does Singapore's social support system hel...,Singapore's social support system provides mor...,"Sir, we are taking concrete steps to strengthe..."


In [93]:
# save df to csv
qa_with_reference_df.to_csv('qa_with_reference_df.csv', index=False)


In [94]:
from phoenix.evals import (
    HallucinationEvaluator,
    OpenAIModel,
    QAEvaluator,
    run_evals,
)

qa_evaluator = QAEvaluator(OpenAIModel(model="gpt-4o-mini"))
hallucination_evaluator = HallucinationEvaluator(OpenAIModel(model="gpt-4o-mini"))

qa_correctness_eval_df, hallucination_eval_df = run_evals(
    evaluators=[qa_evaluator, hallucination_evaluator],
    dataframe=qa_with_reference_df,
    provide_explanation=True,
    concurrency=20,
)

run_evals |          | 0/200 (0.0%) | ⏳ 00:00<? | ?it/s

In [95]:
qa_correctness_eval_df.head()


Unnamed: 0_level_0,label,score,explanation
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9071eb489cdf7c7e,correct,1,"To determine if the answer is correct, we need..."
7481cddb0a3ba153,incorrect,0,The question asks specifically about the menta...
ec2b478421e45014,correct,1,"To determine if the answer is correct, we firs..."
55ec24e05ac691cf,correct,1,The answer provided accurately reflects the go...
2740159222ad0c3d,correct,1,"To determine if the answer is correct, we need..."


In [96]:
qa_correctness_eval_df.mean(numeric_only=True)

score    0.9
dtype: float64

In [97]:
hallucination_eval_df.head()


Unnamed: 0_level_0,label,score,explanation
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9071eb489cdf7c7e,factual,0,To determine if the answer is factual or hallu...
7481cddb0a3ba153,factual,0,To determine if the answer is factual or hallu...
ec2b478421e45014,factual,0,To determine if the answer is factual or hallu...
55ec24e05ac691cf,factual,0,To determine if the answer is factual or hallu...
2740159222ad0c3d,factual,0,To determine if the answer is factual or hallu...


In [98]:
hallucination_eval_df.mean(numeric_only=True)

score    0.05
dtype: float64

QA Correctness score of 0.90 and a Hallucinations score 0.05 signifies that the generated answers are correct ~90% of the time and that the responses contain hallucinations 5% of the time - there is room for improvement. 

This could be due to the retrieval strategy e.g. chunking strategy or the LLM itself e.g. small context window. 
