In [2]:
import numpy as np
import json
from datetime import datetime
from typing import List, Dict
from tqdm import tqdm
import pandas as pd
import seaborn as sns
from pathlib import Path
from datetime import datetime
import json
from typing import List, Dict
import matplotlib.pyplot as plt
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    Settings,
    PromptTemplate
)
from llama_index.readers.file import PDFReader, DocxReader
from llama_index.readers.file.tabular import PandasExcelReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.evaluation import generate_question_context_pairs, SemanticSimilarityEvaluator, FaithfulnessEvaluator

from llm_loader import load_llm
import time
from sklearn.metrics.pairwise import cosine_similarity

from helper import inspect_chunks

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


In [25]:
import nest_asyncio
nest_asyncio.apply()

## Create QnA Dataset

In [4]:
def setup_evaluation_directories(base_path: str = "./Document/evaluation_datasets") -> dict:
    """Create and return evaluation directory structure"""
    paths = {
        "base": Path(base_path),
        "llm": Path(base_path) / "llm_generated",
        "manual": Path(base_path) / "manual",
        "results": Path(base_path) / "results",
        "debug": Path(base_path) / "debug",
    }
    
    for path in paths.values():
        path.mkdir(parents=True, exist_ok=True)
    
    return paths

In [5]:
paths = setup_evaluation_directories()

In [6]:
def load_documents(directory: str = "./Document"):
    """Load documents using LlamaIndex readers"""
    file_readers = {
        ".pdf": PDFReader(),
        ".docx": DocxReader(),
        ".xlsx": PandasExcelReader()
    }
    
    reader = SimpleDirectoryReader(
        input_dir=directory,
        file_extractor=file_readers,
        filename_as_id=True
    )
    
    documents = reader.load_data()
    print(f"Loaded {len(documents)} documents")
    return documents

In [7]:
documents = load_documents()

Loaded 4 documents


In [8]:
# Initialize LLM
llm = load_llm()
Settings.llm = llm

Settings.embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    cache_folder="./embedding_cache"
)


llama_init_from_model: n_ctx_per_seq (8192) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h80           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_b

In [9]:
def create_document_chunks(documents, chunk_size:int = 512, chunk_overlap: int = 50, num_questions_per_chunk:int = 3):
    """Create chunks of a document"""
    # Create node parser for chunking
    node_parser = SentenceSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    nodes = node_parser.get_nodes_from_documents(documents)
    print(f"Total documents: {len(documents)}")
    print(f"Total chunks: {len(nodes)}")
    print(f"Estimated questions to generate: {len(nodes) * num_questions_per_chunk}")

    # You can also see chunk distribution per document
    doc_chunks = {}
    for node in nodes:
        doc_id = node.metadata.get('file_name', 'unknown')
        doc_chunks[doc_id] = doc_chunks.get(doc_id, 0) + 1

    print("\nChunks per document:")
    for doc_id, count in doc_chunks.items():
        print(f"- {doc_id}: {count} chunks")

    return nodes, doc_chunks

In [10]:
def create_llm_qa_pairs(nodes, num_questions_per_chunk:int = 3, base_output_path: str = "./Document/evaluation_datasets"):
    
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = Path(base_output_path) / f"evaluation_dataset_{timestamp}.json"

    # Generate questions with proper prompt
    question_prompt = PromptTemplate("""
    Given the context information and not prior knowledge, generate only questions based on the below query.

    Generate {num_questions} specific questions from this text. 
    Questions should be complete, answerable questions about the content:

    Text: {context}

    Questions:
    """)

    qc_pairs = generate_question_context_pairs(
        nodes=nodes,
        llm=Settings.llm,
        num_questions_per_chunk=3,
        qa_generate_prompt_tmpl=question_prompt
    )
    qa_dataset = dict(qc_pairs)

    queries = qa_dataset.get("queries", {})
    corpus = qa_dataset.get("corpus", {})
    relevant_docs = qa_dataset.get("relevant_docs", {})


    dataset = {
        "examples": [
            {
                "query": question,
                "reference_contexts": [corpus[doc_id] for doc_id in relevant_docs[query_id]],
                "query_by": {
                    "model_name": "llama-2",
                    "type": "ai",
                    "timestamp": timestamp
                },
                "metadata": {
                    "generated_query_id": query_id
                }
            }
            for query_id, question in queries.items()
        ],
        "dataset_info": {
            "total_chunks": len(nodes),
            "questions_per_chunk": num_questions_per_chunk,
            "creation_timestamp": timestamp,
        }
        }

    
    # Save dataset with timestamp
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(dataset, f, indent=2)
    
    print(f"Created dataset with {len(dataset['examples'])} examples")
    print(f"Saved to: {output_path}")

    return dataset


def validate_dataset(dataset: dict, require_answer: bool = False) -> bool:
    """Validate dataset format and content.
    
    Args:
        dataset (dict): The evaluation dataset.
        require_answer (bool): Whether a reference answer is required.
        
    Returns:
        bool: True if validation passes, False otherwise.
    """
    # Check if the main keys exist
    for key in ['examples', 'dataset_info']:
        if key not in dataset:
            print(f"Dataset is missing key: {key}")
            return False

    for example in dataset['examples']:
        # Check question format
        query = example.get('query', '')
        if not isinstance(query, str) or len(query.split()) < 4:
            print(f"Invalid question: {query}")
            return False
            
        # Check context format: must be a non-empty list of strings
        contexts = example.get('reference_contexts', [])
        if not isinstance(contexts, list) or not contexts or \
           not all(isinstance(c, str) and c.strip() for c in contexts):
            print(f"Invalid or empty context for question: {query}")
            return False
            
        # Check answer if required
        if require_answer:
            answer = example.get('reference_answer', '')
            if not answer or not isinstance(answer, str) or not answer.strip():
                print(f"Missing or invalid answer for question: {query}")
                return False
    
    return True



In [11]:
nodes, doc_chunks = create_document_chunks(documents)
inspect_chunk_df = inspect_chunks(nodes, 200)

Total documents: 4
Total chunks: 14
Estimated questions to generate: 42

Chunks per document:
- AIBots - FAQs.docx: 5 chunks
- Aibot - Datalab.docx: 5 chunks
- Synthetic Data Dictionary.xlsx: 1 chunks
- Synthetic Standard Operating Procedures for Research Data Laboratory.docx: 3 chunks
Total Documents (unique sources): 4
Total Chunks: 14

Chunks per Document:
Source
AIBots - FAQs.docx                                                           5
Aibot - Datalab.docx                                                         5
Synthetic Standard Operating Procedures for Research Data Laboratory.docx    3
Synthetic Data Dictionary.xlsx                                               1
Name: count, dtype: int64

Chunk Details:


In [12]:
inspect_chunk_df.head()

Unnamed: 0,Chunk #,Text Preview,Length,Source,Start Idx,End Idx
0,1,"What are the Data Lab's operating hours?\n\nThe MOM Data Lab is open from Tuesdays to Thursdays, 9.30 AM to 5.30 PM and is closed from 12 to 2 PM. The lab may also be closed on days when no admins are a...",1952,AIBots - FAQs.docx,,
1,2,"I have not received a confirmation for my lab booking, can I still head down?\n\nIf you did not receive any confirmation, please email us first. There may be cases where there are multiple requests for ...",1924,AIBots - FAQs.docx,,
2,3,What kinds of data are available in the Data Lab?\n\nThe Data lab houses various datasets and data products/dashboards. Please refer to this our Software and Data Products page for more info.\n\n \n\nFor ge...,2140,AIBots - FAQs.docx,,
3,4,How long does it take to get my data?\n\nOur service level agreement is to process your extraction request within 7 working days. However. please note that complex and/or sensitive requests may be subje...,1290,AIBots - FAQs.docx,,
4,5,What do I do when I cannot access the MRSD_External folder?\n\nFOR EXTERNAL USERS\n1. Ensure that Symantec Endpoint Protection is on by right clicking on the small arrow on the bottom right corner of the...,1121,AIBots - FAQs.docx,,


In [None]:
eval_dataset = create_llm_qa_pairs(
    nodes=documents,
    num_questions_per_chunk=3,
    base_output_path=paths['llm']
)

if not validate_dataset(eval_dataset):
    print("Dataset validation failed - regenerating questions required")

In [13]:

custom_prompt = PromptTemplate(
    """\
Rewrite the user's follow-up question as a standalone question.

1. Include all relevant past context.
2. Keep it natural and grammatically correct.
3. If already standalone, return it unchanged.

<Chat History>
{chat_history}

<User's Follow-Up Question>
{question}

<Rewritten Standalone Question>
"""
)


response_prompt = PromptTemplate(
    """\
You are an AI assistant providing structured responses.

### **Instructions:**
- Answer clearly and concisely.
- Summarize retrieved context to avoid duplication.
- Summarize the key facts efficiently.
- If the context lacks enough details, say: "I don’t have enough information."
- Format responses in natural sentences.

<Retrieved Context>
{context}

<User's Query>
{question}

### **AI Response:**
"""
)

# Read DataLab Inputs

In [14]:
file_name = 'Evaluation Scores.csv'

In [15]:
def load_manual_questions(excel_path: str)-> Dict:
    """Load manual questions from Excel and convert to dataset format"""
    
    human_truth_df = pd.read_csv(excel_path)[['Category', 'Question', 'Answer']]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Convert Excel data to dataset format
    manual_dataset = {
        "examples": [
            {
                "category": row["Category"],
                "question": row['Question'],  # Will be filled during evaluation
                "reference_answer": row["Answer"],
                "query_by": {
                    "model_name": "human",
                    "type": "expert",
                    "timestamp": timestamp
                },
                "metadata": {
                    "source": f"{file_name}.xlsx",
                    "doc_id": f"manual_{idx}"
                }
            }
            for idx, row in human_truth_df.iterrows()
        ],
        "dataset_info": {
            "creation_timestamp": timestamp,
            "source": "manual",
            "total_questions": len(human_truth_df)
        }
    }
    
    return manual_dataset

In [16]:
manual_dataset_df = load_manual_questions(str(paths['manual'])+'/' + file_name)
manual_dataset_df['examples'][0]

{'category': 'Data Dictionaries',
 'question': 'What do the data fields and values mean?',
 'reference_answer': 'Please refer to the code table.',
 'query_by': {'model_name': 'human',
  'type': 'expert',
  'timestamp': '20250324_170335'},
 'metadata': {'source': 'Evaluation Scores.csv.xlsx', 'doc_id': 'manual_0'}}

## RUN EVALUATION

In [17]:
def create_debug_dataframe(evaluator) -> pd.DataFrame:
    """Create DataFrame for debugging evaluation results"""
    debug_data = []
    
    for result in evaluator.results:
        # Get reference context and answer
        ref_context = evaluator.rag_dataset.reference_contexts[result['query_idx']]
        ref_answer = evaluator.rag_dataset.reference_answers[result['query_idx']]
        
        debug_data.append({
            'Question': result['query'],
            'Reference Answer': ref_answer,
            'Generated Answer': result['generated_answer'],
            'Reference Context': ref_context,
            'Retrieved Chunks': [node.node.text for node in result.get('retrieved_contexts', [])],
            'Retrieval Time': f"{result['retrieval_time']:.3f}s",
            'Accuracy': result['retrieval_metrics']['accuracy'],
            'Semantic Score': result['semantic_score']
        })
    
    df = pd.DataFrame(debug_data)
    return df

In [18]:
class RagDataset:
    def __init__(self, data):
        """Initialize with JSON data
        
        Args:
            data (dict): Dictionary containing examples with queries and contexts
        """
        self.data = data
        self.examples = self.data.get("examples", [])
    
    @property 
    def queries(self):
        """Get list of all queries"""
        return [example["query"] for example in self.examples]
    
    @property
    def reference_contexts(self):
        """Get list of reference contexts for each example"""
        # Handle both string and dict contexts
        contexts = []
        for example in self.examples:
            context = example.get("reference_contexts", [])
            # If context is dictionary, get values
            if isinstance(context[0], dict):
                contexts.append(list(context[0].values())[0])
            else:
                contexts.append(context[0])
        return contexts
    
    @property
    def reference_answers(self):
        """Get list of reference answers"""
        return [example.get("reference_answer", "") for example in self.examples]
    
    def get_example(self, idx):
        """Get complete example at index"""
        if idx >= len(self.examples):
            raise IndexError(f"Index {idx} out of range for dataset with {len(self.examples)} examples")
        return self.examples[idx]


In [19]:
class EvaluationMetrics:
    """Class for computing and aggregating retrieval metrics."""
    def __init__(self, rag_dataset):
        self.rag_dataset = rag_dataset
        self.results = []
        self.retrieval_times: List[float] = []
        self.retrieval_accuracies: List[float] = []
        self.semantic_scores: List[float] = []
        self.faithfulness_scores: List[float] = []
        self.mrr_scores: List[float] = []
        self.ndcg_scores: List[float] = []
        self.precision_at_k: Dict[int, List[float]] = {1: [], 3: [], 5: []}
        self.hit_rates: List[float] = []
        # Use the global embedding model from Settings
        self.embedding_model = Settings.embed_model

    def compute_retrieval_metrics(self, retrieved_contexts: List, query_idx: int) -> Dict:
        """
        Compute retrieval metrics for a single query.
        """
        reference_contexts = self.rag_dataset.reference_contexts[query_idx]
        retrieved_texts = [node.node.text for node in retrieved_contexts]
        
        # Convert texts into sets of sentences (this method can be refined)
        retrieved_set = set(' '.join(retrieved_texts).split('.'))
        reference_set = set(' '.join(reference_contexts).split('.'))
        
        correct = len(retrieved_set.intersection(reference_set))
        precision = correct / len(retrieved_set) if retrieved_set else 0
        recall = correct / len(reference_set) if reference_set else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        mrr = self._calculate_mrr(retrieved_texts, reference_contexts)
        ndcg = self._calculate_ndcg(retrieved_texts, reference_contexts)
        hit = 1.0 if correct > 0 else 0.0
        self.hit_rates.append(hit)
        
        p_at_k = {}
        for k in self.precision_at_k.keys():
            p = self._calculate_precision_at_k(retrieved_texts, reference_contexts, k)
            self.precision_at_k[k].append(p)
            p_at_k[f"p@{k}"] = p
        
        self.mrr_scores.append(mrr)
        self.ndcg_scores.append(ndcg)
        
        return {
            **p_at_k,
            "mrr": mrr,
            "ndcg": ndcg,
            "f1": f1,
            "precision": precision,
            "recall": recall,
            "hit_rate": hit,
            "correct_retrievals": correct,
            "total_retrieved": len(retrieved_set),
            "total_reference": len(reference_set)
        }
    
    def _calculate_mrr(self, retrieved, reference) -> float:
        for i, doc in enumerate(retrieved, 1):
            if doc in reference:
                return 1.0 / i
        return 0.0
    
    def _calculate_ndcg(self, retrieved, reference, k=None) -> float:
        if k is None:
            k = len(retrieved)
        relevance = [1 if doc in reference else 0 for doc in retrieved[:k]]
        ideal = sorted(relevance, reverse=True)
        dcg = sum(rel / np.log2(i + 2) for i, rel in enumerate(relevance))
        idcg = sum(rel / np.log2(i + 2) for i, rel in enumerate(ideal))
        return dcg / idcg if idcg > 0 else 0.0

    def _calculate_precision_at_k(self, retrieved, reference, k: int) -> float:
        retrieved_k = retrieved[:k]
        relevant_k = sum(1 for doc in retrieved_k if doc in reference)
        return relevant_k / k if k > 0 else 0.0

    def aggregate_summary(self) -> Dict:
        summary = {
            "avg_mrr": np.mean(self.mrr_scores),
            "avg_ndcg": np.mean(self.ndcg_scores),
            "avg_hit_rate": np.mean(self.hit_rates),
            "avg_p@1": np.mean(self.precision_at_k[1]),
            "avg_p@3": np.mean(self.precision_at_k[3]),
            "avg_p@5": np.mean(self.precision_at_k[5]),
            "timestamp": datetime.now().isoformat()
        }
        return summary

    def plot_metrics(self):
        plt.style.use("seaborn-v0_8")
        fig = plt.figure(figsize=(20, 10))
        gs = fig.add_gridspec(2, 3)
        
        ax1 = fig.add_subplot(gs[0, 0])
        sns.histplot(self.hit_rates, kde=True, ax=ax1)
        ax1.set_title("Hit Rate Distribution")
        
        ax2 = fig.add_subplot(gs[0, 1])
        data = np.column_stack([self.mrr_scores, self.ndcg_scores])
        df_metrics = pd.DataFrame(data, columns=["MRR", "NDCG"])
        sns.boxplot(data=df_metrics, ax=ax2)
        ax2.set_title("MRR & NDCG Distribution")
        
        ax3 = fig.add_subplot(gs[0, 2])
        p_at_k_data = pd.DataFrame({f"P@{k}": scores for k, scores in self.precision_at_k.items()})
        sns.boxplot(data=p_at_k_data, ax=ax3)
        ax3.set_title("Precision@K Distribution")
        
        plt.tight_layout()
        plt.show()

In [20]:
def run_evaluation(rag_dataset, query_engine, llm=None):
    """
    Run evaluation on the provided dataset.
    
    For LLM-generated datasets (use_manual=False), the pipeline:
      - Uses a query engine to generate answers.
      - Evaluates retrieval using FaithfulnessEvaluator (with reference contexts) 
        and semantic similarity (if a reference answer is provided).
    
    For expert (human) datasets (use_manual=True), the pipeline:
      - Uses the query engine to generate answers.
      - Skips retrieval/faithfulness evaluation.
      - Evaluates the generated answer only via SemanticSimilarityEvaluator against
        the human-provided reference answer.
    
    Returns:
        detailed_results: A list of evaluation results for each query.
        (Optionally, you could also return aggregated metrics.)
    """

    evaluator = EvaluationMetrics(rag_dataset)
    detailed_results = []
    
    # Instantiate standardized evaluators from LlamaIndex
    semantic_evaluator = SemanticSimilarityEvaluator(
        embed_model=Settings.embed_model,
        similarity_threshold=0.8
    )
    faithfulness_evaluator = FaithfulnessEvaluator(
        llm=Settings.llm
    )
    
    for idx, query in enumerate(tqdm(rag_dataset.queries)):
        start_time = time.time()
        retrieved_contexts = query_engine.retrieve(query)
        retrieval_time = time.time() - start_time
        evaluator.retrieval_times.append(retrieval_time)
        
        generated_answer = ""
        if llm:
            try:
                generated_answer = str(query_engine.query(query))
            except Exception as e:
                print(f"Error generating answer for query '{query}': {e}")
        
        retrieval_metrics = evaluator.compute_retrieval_metrics(retrieved_contexts, idx)
        evaluator.retrieval_accuracies.append(retrieval_metrics["f1"])
        
        # Evaluate semantic similarity using standardized evaluator (if reference answer is available)
        semantic_score = 0.0
        if generated_answer and rag_dataset.reference_answers[idx]:
            sem_result = semantic_evaluator.evaluate_response(
                query="",
                response=generated_answer,
                reference=rag_dataset.reference_answers[idx]
            )
            semantic_score = sem_result.score if hasattr(sem_result, "score") else 0.0
            evaluator.semantic_scores.append(semantic_score)
        
        # Evaluate faithfulness using standardized evaluator (using retrieved contexts as supporting evidence)
        faithfulness_score = 0.0
        if generated_answer and retrieved_contexts:
            # Combine the retrieved texts into a single string
            context_text = " ".join([ctx.node.text for ctx in retrieved_contexts])
            faith_result = faithfulness_evaluator.evaluate_response(
                query="",
                response=generated_answer,
                contexts=[context_text]
            )
            faithfulness_score = faith_result.score if hasattr(faith_result, "score") else 0.0
            evaluator.faithfulness_scores.append(faithfulness_score)
        
        result = {
            "query_idx": idx,
            "query": query,
            "retrieval_time": retrieval_time,
            "retrieval_metrics": retrieval_metrics,
            "generated_answer": generated_answer,
            "semantic_score": semantic_score,
            "faithfulness_score": faithfulness_score,
            "timestamp": datetime.now().isoformat()
        }
        detailed_results.append(result)
        evaluator.results.append(result)
    
    return evaluator, detailed_results

In [21]:
dataset_path=str(paths['llm'] / "evaluation_dataset_20250321_154239.json")
documents_input=documents
use_manual=False
response_prompt=response_prompt
manual_path: str = None
results_path: Path = None

In [22]:
# Load dataset: either LLM-generated or expert-generated.
if use_manual and manual_path:
    eval_dataset = load_manual_questions(manual_path)
else:
    with open(dataset_path, 'r') as f:
        eval_dataset = json.load(f)

# Wrap dataset in your RagDataset (which provides properties for queries, reference answers, etc.)
rag_dataset = RagDataset(eval_dataset)

# Build the retrieval components (only used if retrieval evaluation is desired)
index = VectorStoreIndex.from_documents(documents_input, show_progress=True)
query_engine = index.as_query_engine(
    response_mode="compact",
    response_prompt=response_prompt,
    similarity_top_k=3,
    max_tokens=300,
    streaming=False
)

Parsing nodes:   0%|          | 0/4 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/8 [00:00<?, ?it/s]

In [23]:
semantic_evaluator = SemanticSimilarityEvaluator(
    embed_model=Settings.embed_model,
    similarity_threshold=0.8
)

# For LLM-generated dataset, instantiate the FaithfulnessEvaluator.
faithfulness_evaluator = None
if not use_manual:
    faithfulness_evaluator = FaithfulnessEvaluator(
        llm=Settings.llm
    )

In [26]:
detailed_results = []
for idx, query in enumerate(tqdm(rag_dataset.queries)):
    start_time = time.time()
    try:
        response_obj = query_engine.query(query)
    except Exception as e:
        print(f"Error generating answer for query '{query}': {e}")
        response_obj = None
    retrieval_time = time.time() - start_time
    
    semantic_score = 0.0
    faithfulness_score = None
    
    # Evaluate semantic similarity using the standardized evaluator.
    if response_obj and rag_dataset.reference_answers[idx]:
        sem_result = semantic_evaluator.evaluate_response(
            query="",
            response=response_obj,
            reference=rag_dataset.reference_answers[idx]
        )
        semantic_score = sem_result.score if hasattr(sem_result, "score") else 0.0
    
    # For LLM-generated datasets, evaluate faithfulness using the response's source_nodes.
    if not use_manual and response_obj and faithfulness_evaluator:
        faith_result = faithfulness_evaluator.evaluate_response(
            query="",
            response=response_obj
        )
        faithfulness_score = faith_result.score if hasattr(faith_result, "score") else 0.0
    
    result_entry = {
        "query_idx": idx,
        "query": query,
        "generated_answer": response_obj.response if response_obj else "",
        "retrieval_time": retrieval_time,
        "semantic_score": semantic_score,
        "faithfulness_score": faithfulness_score,
        "timestamp": datetime.now().isoformat()
    }
    detailed_results.append(result_entry)

if results_path:
    results_path.parent.mkdir(parents=True, exist_ok=True)
    with open(results_path, 'w') as f:
        json.dump(detailed_results, f, indent=2)
    print(f"Results saved to: {results_path}")


100%|██████████| 12/12 [1:37:38<00:00, 488.24s/it]  


In [28]:
detailed_results

[{'query_idx': 0,
  'query': 'What is the purpose of the context provided?',
  'generated_answer': '\nThe context provided is a set of guidelines for managing and safeguarding the Research Data Laboratory, a secure facility designed for conducting exploratory analysis on sensitive data. The document outlines the access control framework, laboratory security protocols, data management framework, incident management, and compliance and enforcement procedures for the Research Data Laboratory.',
  'retrieval_time': 8.802693128585815,
  'semantic_score': 0.0,
  'faithfulness_score': 0.0,
  'timestamp': '2025-03-24T17:10:21.649104'},
 {'query_idx': 1,
  'query': '2. Who is the author of the text?',
  'generated_answer': '\nThe text does not contain any information about the author. It appears to be a set of guidelines or procedures for managing and safeguarding a research data laboratory.',
  'retrieval_time': 15.972921133041382,
  'semantic_score': 0.0,
  'faithfulness_score': 0.0,
  'times

In [None]:
detailed_results = []
for idx, query in enumerate(tqdm(rag_dataset.queries)):
    start_time = time.time()
    retrieved_contexts = query_engine.retrieve(query)
    retrieval_time = time.time() - start_time
    
    # Use the response object directly
    try:
        response_obj = query_engine.query(query)
    except Exception as e:
        print(f"Error generating answer for query '{query}': {e}")
        response_obj = None
    
    # Evaluate semantic similarity using standardized evaluator.
    semantic_score = 0.0
    if response_obj and rag_

In [None]:
def run_evaluation_pipeline(
    dataset_path: str,
    documents,
    response_prompt: PromptTemplate,
    use_manual: bool = False,
    manual_path: str = None,
    results_path: Path = None
) -> tuple:
    """
    Run evaluation on the provided dataset.

    For an LLM-generated dataset (use_manual=False), the evaluation uses both
    retrieval metrics (and faithfulness evaluation) and semantic similarity.
    
    For an expert (human) dataset (use_manual=True), only the generated answer's 
    semantic similarity to the reference answer is evaluated.

    Returns:
        evaluator: (optional) An object with aggregated metrics.
        detailed_results: List of evaluation results per query.
    """
    # Load dataset: either LLM-generated or expert-generated.
    if use_manual and manual_path:
        eval_dataset = load_manual_questions(manual_path)
    else:
        with open(dataset_path, 'r') as f:
            eval_dataset = json.load(f)

    # Wrap dataset in your RagDataset (which provides properties for queries, reference answers, etc.)
    rag_dataset = RagDataset(eval_dataset)

    # Build the retrieval components (only used if retrieval evaluation is desired)
    index = VectorStoreIndex.from_documents(documents_input, show_progress=True)
    query_engine = index.as_query_engine(
        response_mode="compact",
        response_prompt=response_prompt,
        similarity_top_k=3,
        max_tokens=300,
        streaming=False
    )

    semantic_evaluator = SemanticSimilarityEvaluator(
    embed_model=Settings.embed_model,
    similarity_threshold=0.8
)

    # For LLM-generated dataset, instantiate the FaithfulnessEvaluator.
    faithfulness_evaluator = None
    if not use_manual:
        faithfulness_evaluator = FaithfulnessEvaluator(
            llm=Settings.llm
        )
    
    detailed_results = []
    for idx, query in enumerate(tqdm(rag_dataset.queries)):
        start_time = time.time()
        try:
            response_obj = query_engine.query(query)
        except Exception as e:
            print(f"Error generating answer for query '{query}': {e}")
            response_obj = None
        retrieval_time = time.time() - start_time
        
        semantic_score = 0.0
        faithfulness_score = None
        
        # Evaluate semantic similarity using the standardized evaluator.
        if response_obj and rag_dataset.reference_answers[idx]:
            sem_result = semantic_evaluator.evaluate_response(
                query="",
                response=response_obj,
                reference=rag_dataset.reference_answers[idx]
            )
            semantic_score = sem_result.score if hasattr(sem_result, "score") else 0.0
        
        # For LLM-generated datasets, evaluate faithfulness using the response's source_nodes.
        if not use_manual and response_obj and faithfulness_evaluator:
            faith_result = faithfulness_evaluator.evaluate_response(
                query="",
                response=response_obj
            )
            faithfulness_score = faith_result.score if hasattr(faith_result, "score") else 0.0
        
        result_entry = {
            "query_idx": idx,
            "query": query,
            "generated_answer": response_obj.response if response_obj else "",
            "retrieval_time": retrieval_time,
            "semantic_score": semantic_score,
            "faithfulness_score": faithfulness_score,
            "timestamp": datetime.now().isoformat()
        }
        detailed_results.append(result_entry)

    if results_path:
        results_path.parent.mkdir(parents=True, exist_ok=True)
        with open(results_path, 'w') as f:
            json.dump(detailed_results, f, indent=2)
        print(f"Results saved to: {results_path}")




In [25]:


evaluator_llm, dataset_llm = run_evaluation_pipeline(
    dataset_path=str(paths['llm'] / "evaluation_dataset_20250321_154239.json"),
    documents=documents,
    use_manual=False,
    response_prompt=response_prompt
)

Parsing nodes:   0%|          | 0/4 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/12 [00:10<?, ?it/s]


AttributeError: 'str' object has no attribute 'response'

In [None]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
evaluator_llm.save_results(paths["results"] / f"evaluation_results_{timestamp}.json")

In [None]:
debug_df = create_debug_dataframe(evaluator_llm)

In [None]:
debug_df.to_csv(str(paths['debug'] / f"evaluation_debug_{timestamp}.csv"), index=False)