## Set up an LLM as a Virtual Judge

In [10]:
import json

from ollama import chat
import pandas as pd
from pydantic import BaseModel, Field

In [2]:
def generate_virtual_judge_prompt(question: str, answer_true: str, answer_rag: str) -> str:
    """Build the evaluation prompt given to the LLM judge.

    The prompt shows the question, the reference answer and the generated
    answer, walks the judge through four analysis steps, and defines the 1-5
    rubric for factual accuracy, completeness, relevance, hallucination and
    the overall score.

    Args:
        question: The question that was asked.
        answer_true: The reference answer the generated answer is compared to.
        answer_rag: The answer generated by the RAG system.

    Returns:
        The prompt text with the three arguments interpolated verbatim.
    """
    return f"""You are an expert evaluator assessing the quality of answers generated by 
a RAG (Retrieval-Augmented Generation) system. Your task is to compare a generated answer 
against a reference answer and provide a detailed evaluation.

Question: 
{question}

Reference Answer: 
{answer_true}

Generated Answer: 
{answer_rag}

Please evaluate the generated answer through the following steps:

Step 1: Analyze the factual accuracy by comparing specific claims in the generated answer against 
the reference answer. List which facts are correct and which (if any) are incorrect or misrepresented.

Step 2: Identify any important information from the reference answer that is missing in the 
generated answer.

Step 3: Determine if the generated answer contains any hallucinated information (claims 
not supported by the reference answer).

Step 4: Assess how directly the generated answer addresses the original question.

Based on your analysis, provide scores on a scale of 1-5 for each dimension:

Factual Accuracy (1-5):
1: Contains multiple factual errors
3: Contains minor inaccuracies
5: All facts are completely accurate

Completeness (1-5):
1: Missing most key information
3: Contains core information but omits some details
5: Covers all important information from the reference

Relevance (1-5):
1: Barely addresses the question
3: Addresses the main question but with tangential information
5: Directly and specifically answers the question

Hallucination (1-5):
1: Contains significant made-up information
3: Contains minor details not in the reference
5: No hallucinated content whatsoever

Finally, provide an Overall Score (1-5) that reflects the answer's overall quality, 
and a brief justification for your evaluation.

Overall Score (1-5):
Justification:
"""

In [3]:
class VirtualJudgeResponse(BaseModel):
    """Structured verdict produced by the LLM judge.

    Each score is an integer on the 1-5 scale described in the field
    descriptions. The bounds are declared on the fields (``ge=1, le=5``) so
    that they are part of ``model_json_schema()`` and so that an out-of-range
    score fails validation instead of silently passing through.
    """

    factual_accuracy: int = Field(
        ge=1,
        le=5,
        description="Factual Accuracy (1-5)",
    )
    completeness: int = Field(
        ge=1,
        le=5,
        description="Completeness (1-5)",
    )
    relevance: int = Field(
        ge=1,
        le=5,
        description="Relevance (1-5)",
    )
    hallucination: int = Field(
        ge=1,
        le=5,
        description="Hallucination (1-5)",
    )
    overall: int = Field(
        ge=1,
        le=5,
        description="Overall Score (1-5)",
    )
    # Free-text explanation of the scores above.
    justification: str = Field(
        description="Justification for the evaluation",
    )

In [30]:
def judge_answer(
    question: str,
    answer_true: str,
    answer_rag: str,
    model: str = 'llama3.1:latest',
    temperature: float = 0.0,
) -> VirtualJudgeResponse:
    """Score a generated answer against a reference answer with an LLM judge.

    Args:
        question: The question that was asked.
        answer_true: The reference answer.
        answer_rag: The answer generated by the RAG system.
        model: Name of the Ollama model used as the judge.
        temperature: Sampling temperature for the judge. Defaults to 0 so that
            judging the same inputs twice is as repeatable as possible.

    Returns:
        The judge's reply parsed and validated as a ``VirtualJudgeResponse``.

    Raises:
        pydantic.ValidationError: If the reply does not validate against
            ``VirtualJudgeResponse``.
    """
    prompt = generate_virtual_judge_prompt(question, answer_true, answer_rag)

    response = chat(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
        # Constrain the reply to the JSON schema of the response model.
        format=VirtualJudgeResponse.model_json_schema(),
        options={'temperature': temperature},
    )
    return VirtualJudgeResponse.model_validate_json(response.message.content)

### Eval test

In [4]:
# Render the judge prompt for a trivial example so the filled-in template can be inspected.
input_ = generate_virtual_judge_prompt(
    question="What is the capital of France?",
    answer_true="The capital of France is Paris.",
    answer_rag="Paris is the capital of France."
)
print(input_)

You are an expert evaluator assessing the quality of answers generated by 
a RAG (Retrieval-Augmented Generation) system. Your task is to compare a generated answer 
against a reference answer and provide a detailed evaluation.

Question: 
What is the capital of France?

Reference Answer: 
The capital of France is Paris.

Generated Answer: 
Paris is the capital of France.

Please evaluate the generated answer through the following steps:

Step 1: Analyze the factual accuracy by comparing specific claims in the generated answer against 
the reference answer. List which facts are correct and which (if any) are incorrect or misrepresented.

Step 2: Identify any important information from the reference answer that is missing in the 
generated answer.

Step 3: Determine if the generated answer contains any hallucinated information (claims 
not supported by the reference answer).

Step 4: Assess how directly the generated answer addresses the original question.

Based on your analysis, provi

In [32]:
# Smoke test of the judge with its default model: the generated answer is a
# rewording of the reference answer.
virtual_judge_response = judge_answer(
    question="What is the capital of France?",
    answer_true="The capital of France is Paris.",
    answer_rag="Paris is the capital of France."
)

In [33]:
# Pretty-print the validated judge response as indented JSON.
print(json.dumps(virtual_judge_response.model_dump(), indent=4))

{
    "factual_accuracy": 5,
    "completeness": 4,
    "relevance": 5,
    "hallucination": 5,
    "overall": 4,
    "justification": "The generated answer is factually accurate, covering all the information required by the reference answer. It may lack a specific detail but does not contain any hallucinated content or significant inaccuracies. The overall quality is good but could be improved to achieve perfection."
}


## Generate answers from RAG system
### Load data

In [11]:
def load_and_prepare_eval_data(chunk_data_fp: str, eval_data_fp: str) -> pd.DataFrame:
    """Load the evaluation QA pairs and join them with their source chunks.

    Args:
        chunk_data_fp: Path to the parquet file with the chunk data. It must
            provide the columns ``id``, ``doc_id``, ``text``, ``cluster_tsne``
            and ``metadata``.
        eval_data_fp: Path to the JSON-lines file with the evaluation data. It
            must provide the columns ``qa_pairs`` and ``metadata`` (both
            expected to hold dicts once ``qa_pairs`` is exploded) and, after
            expansion, an ``id`` column to join on.

    Returns:
        A frame with one row per question-answer pair: the expanded QA and
        metadata fields followed by the selected chunk columns, merged on
        ``id`` with the default merge behaviour.
    """

    def _spread_dict_column(frame: pd.DataFrame, column: str) -> pd.DataFrame:
        # Swap `column` for one column per key, appended on the right.
        expanded = frame[column].apply(pd.Series)
        return pd.concat([frame.drop(columns=[column]), expanded], axis=1)

    chunk_columns = ['id', 'doc_id', 'text', 'cluster_tsne', 'metadata']
    chunks = pd.read_parquet(chunk_data_fp)

    # One row per question-answer pair instead of one row per input line.
    per_pair = pd.read_json(eval_data_fp, lines=True).explode('qa_pairs')
    per_pair = _spread_dict_column(per_pair, 'qa_pairs')
    # The eval-side metadata is expanded and dropped before the merge, so the
    # `metadata` column in the result is the one coming from the chunk data.
    per_pair = _spread_dict_column(per_pair, 'metadata')

    return per_pair.merge(chunks[chunk_columns], on='id')

In [12]:
# Input files, given relative to the notebook's working directory.
chunk_filepath = './data/eval_sampled.parquet'
eval_filepath = './data/qa_pairs_gemma.jsonl'
# One row per question-answer pair, merged with the chunk it belongs to.
df_eval = load_and_prepare_eval_data(chunk_filepath, eval_filepath)
df_eval.head()

Unnamed: 0,idx,id,question,answer,question_type,difficulty,required_context,reasoning,q_a_quality,doc_id,text,cluster_tsne,metadata
0,783,871e39f3-ad80-413d-9353-93b39da8adf5,What is the primary function of a data connect...,A data connector (or 'Reader') ingests data fr...,factual,easy,basic understanding of terminology,This question directly asks for the role of a ...,good,aa4c9403-c960-442a-aca3-31ad8ae64f6e,## Concept\n\nA data connector (aka `Reader`) ...,0,"{'_node_content': '{""id_"": ""871e39f3-ad80-413d..."
1,783,871e39f3-ad80-413d-9353-93b39da8adf5,"According to the document, what are the typica...","After data ingestion, you can build an Index, ...",inferential,medium,understanding of workflow,This question requires the user to understand ...,good,aa4c9403-c960-442a-aca3-31ad8ae64f6e,## Concept\n\nA data connector (aka `Reader`) ...,0,"{'_node_content': '{""id_"": ""871e39f3-ad80-413d..."
2,783,871e39f3-ad80-413d-9353-93b39da8adf5,Explain the overall purpose of using data conn...,The purpose is to take data from various sourc...,analytical,hard,understanding of overall system architecture &...,This requires the user to synthesize the infor...,good,aa4c9403-c960-442a-aca3-31ad8ae64f6e,## Concept\n\nA data connector (aka `Reader`) ...,0,"{'_node_content': '{""id_"": ""871e39f3-ad80-413d..."
3,851,45e0ab38-6280-4862-be9f-b57ce7f96492,What is the primary subject matter described i...,Relation-Based Node Parsers,factual,easy,entire chunk,This question tests basic recall - directly as...,good,5f858553-f1ec-4828-88df-b6dce5754a75,## Relation-Based Node Parsers,0,"{'_node_content': '{""id_"": ""45e0ab38-6280-4862..."
4,851,45e0ab38-6280-4862-be9f-b57ce7f96492,"Based on the title, what kind of parsing are t...",Parsing that involves relationships between no...,inferential,medium,title only,This requires inference – the user has to unde...,good,5f858553-f1ec-4828-88df-b6dce5754a75,## Relation-Based Node Parsers,0,"{'_node_content': '{""id_"": ""45e0ab38-6280-4862..."


### Set up RAG

In [None]:
from llama_index.core import Settings, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.vector_stores.lancedb import LanceDBVectorStore

In [14]:
# Embedding model handed to the vector index below.
# NOTE(review): the query instruction is presumably the prefix recommended for this
# BGE model — confirm against the model card.
BGE_MODEL_NAME = "BAAI/bge-small-en-v1.5"
BGE_SMALL_QUERY_INSTRUCTION = "Represent this sentence for searching relevant passages:"

# NOTE(review): `device="cuda"` and the absolute `cache_folder` tie this cell to one
# machine; consider making both configurable.
model = HuggingFaceEmbedding(
    model_name=BGE_MODEL_NAME,
    query_instruction=BGE_SMALL_QUERY_INSTRUCTION,
    device="cuda",
    cache_folder="/home/zak/git/local_rag_course/local_rag/models",
)

In [19]:
# Open the LanceDB vector store at the given URI and wrap it in an index that uses
# `model` as its embedding model.
# NOTE(review): the URI is an absolute path on one machine, and mode="overwrite" looks
# risky for a store that is only queried in this notebook — confirm it cannot replace
# the existing data.
vec_store = LanceDBVectorStore(
    uri="/home/zak/git/local_rag_course/local_rag/data/lancedb", 
    mode="overwrite", 
    query_type="vector", 
    refine_factor=30, 
    nprobes=100,
)
index = VectorStoreIndex.from_vector_store(vec_store, embed_model=model)

In [20]:
# LLM used to generate the RAG answers; assigned to `Settings.llm` so it is
# presumably picked up as the default by the query engine created below.
# NOTE(review): temperature=0.75 means answers are sampled, so generated answers (and
# their judge scores) will presumably vary between runs — confirm this is intended.
llm = Ollama(
    model='llama3.1:latest', 
    request_timeout=120.0,
    temperature=0.75,
)
Settings.llm = llm

# Query engine over the index, configured with similarity_top_k=5.
query_engine = index.as_query_engine(
    similarity_top_k=5,
)

### Example

In [22]:
# Take the question and reference answer of the first evaluation row as a worked example.
q, a_true = df_eval.iloc[0][['question', 'answer']]

In [25]:
# Show the example question and its reference answer.
q, a_true

("What is the primary function of a data connector (or 'Reader') within this system?",
 "A data connector (or 'Reader') ingests data from different data sources and data formats into a simple `Document` representation (text and simple metadata).")

In [28]:
# Run the example question through the RAG query engine and keep the response text.
rag_response = query_engine.query(q)
a_rag = rag_response.response

In [29]:
# Show the generated answer.
a_rag

'A data connector (often called a `Reader`) ingests data from different data sources and data formats into `Documents` and `Nodes`.'

In [31]:
# Judge the generated answer against the reference answer and pretty-print the scores.
virtual_judge_response = judge_answer(
    question=q,
    answer_true=a_true,
    answer_rag=a_rag,
    model='llama3.1:latest'
)
print(json.dumps(virtual_judge_response.model_dump(), indent=4))

{
    "factual_accuracy": 4,
    "completeness": 3,
    "relevance": 5,
    "hallucination": 5,
    "overall": 4,
    "justification": "The generated answer is almost entirely accurate and complete, directly addressing the question. However, it misses some minor details from the reference."
}


## RAG answer quality analysis