In [1]:
import numpy as np
import json
from datetime import datetime
from typing import List, Dict
from tqdm import tqdm
import pandas as pd
import seaborn as sns
from pathlib import Path
from datetime import datetime
import json
from typing import List, Dict
import matplotlib.pyplot as plt
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    Settings,
    PromptTemplate
)
from llama_index.readers.file import PDFReader, DocxReader
from llama_index.readers.file.tabular import PandasExcelReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.evaluation import generate_question_context_pairs

from llm_loader import load_llm
import time
from sklearn.metrics.pairwise import cosine_similarity




## Create QnA Dataset

In [30]:
def setup_evaluation_directories(base_path: str = "./Document/evaluation_datasets") -> dict:
    """Create and return evaluation directory structure"""
    paths = {
        "base": Path(base_path),
        "llm": Path(base_path) / "llm_generated",
        "manual": Path(base_path) / "manual",
        "results": Path(base_path) / "results",
        "debug": Path(base_path) / "debug",
    }
    
    for path in paths.values():
        path.mkdir(parents=True, exist_ok=True)
    
    return paths

In [31]:
paths = setup_evaluation_directories()

In [9]:
def load_documents(directory: str = "./Document"):
    """Load documents using LlamaIndex readers"""
    file_readers = {
        ".pdf": PDFReader(),
        ".docx": DocxReader(),
        ".xlsx": PandasExcelReader()
    }
    
    reader = SimpleDirectoryReader(
        input_dir=directory,
        file_extractor=file_readers,
        filename_as_id=True
    )
    
    documents = reader.load_data()
    print(f"Loaded {len(documents)} documents")
    return documents

In [None]:
def create_evaluation_dataset(
    documents,
    num_questions_per_chunk: int = 3,
    base_output_path: str = "./Document/evaluation_datasets"
):
    """Create evaluation dataset using LlamaIndex's generate_question_context_pairs"""

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = Path(base_output_path) / f"evaluation_dataset_{timestamp}.json"
    
    # Create node parser for chunking
    node_parser = SentenceSplitter(
        chunk_size=512,
        chunk_overlap=50
    )
    nodes = node_parser.get_nodes_from_documents(documents)
    print(f"Total documents: {len(documents)}")
    print(f"Total chunks: {len(nodes)}")
    print(f"Estimated questions to generate: {len(nodes) * num_questions_per_chunk}")

    # You can also see chunk distribution per document
    doc_chunks = {}
    for node in nodes:
        doc_id = node.metadata.get('file_name', 'unknown')
        doc_chunks[doc_id] = doc_chunks.get(doc_id, 0) + 1

    print("\nChunks per document:")
    for doc_id, count in doc_chunks.items():
        print(f"- {doc_id}: {count} chunks")

    # Generate questions with proper prompt
    question_prompt = PromptTemplate("""
    Generate {num_questions} specific questions from this text. 
    Questions should be complete, answerable questions about the content:
    
    Text: {context}
    
    Questions:
    """)

    # Generate question-context pairs
    print("Generating question-context pairs...")
    qc_pairs = generate_question_context_pairs(
        nodes=nodes,
        llm=Settings.llm,
        num_questions_per_chunk=num_questions_per_chunk,
        prompt_template=question_prompt
    )
    
    # Format into dataset structure
    dataset = {
        "examples": [
            {
                "query": pair[0],
                "reference_contexts": [pair[1]],  # Keeping original context handling
                "reference_answer": pair[2] if len(pair) > 2 else "",
                "query_by": {
                    "model_name": "llama-2",
                    "type": "ai",
                    "timestamp": timestamp
                },
                "metadata": {
                    "source": getattr(pair[1], 'metadata', {}).get('file_name', ''),
                    "doc_id": getattr(pair[1], 'node_id', '')
                }
            }
            for pair in qc_pairs
        ],
        "dataset_info": {
            "total_documents": len(documents),
            "total_chunks": len(nodes),
            "questions_per_chunk": num_questions_per_chunk,
            "creation_timestamp": timestamp,
            "chunk_distribution": doc_chunks
        }
    }
    
    # Save dataset with timestamp
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(dataset, f, indent=2)
    
    print(f"Created dataset with {len(dataset['examples'])} examples")
    print(f"Saved to: {output_path}")
    return dataset

In [13]:

custom_prompt = PromptTemplate(
    """\
Rewrite the user's follow-up question as a standalone question.

1. Include all relevant past context.
2. Keep it natural and grammatically correct.
3. If already standalone, return it unchanged.

<Chat History>
{chat_history}

<User's Follow-Up Question>
{question}

<Rewritten Standalone Question>
"""
)


response_prompt = PromptTemplate(
    """\
You are an AI assistant providing structured responses.

### **Instructions:**
- Answer clearly and concisely.
- Summarize retrieved context to avoid duplication.
- Summarize the key facts efficiently.
- If the context lacks enough details, say: "I don’t have enough information."
- Format responses in natural sentences.

<Retrieved Context>
{context}

<User's Query>
{question}

### **AI Response:**
"""
)

In [11]:
# Initialize LLM
llm = load_llm()
Settings.llm = llm

Settings.embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    cache_folder="./embedding_cache"
)
documents = load_documents()

llama_init_from_model: n_ctx_per_seq (3904) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h80           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_b

Loaded 4 documents


In [None]:



eval_dataset = create_evaluation_dataset(
    documents=documents,
    num_questions_per_chunk=3,
    base_output_path=paths['llm']
)

Loaded 4 documents
Total documents: 4
Total chunks: 14
Estimated questions to generate: 42

Chunks per document:
- AIBots - FAQs.docx: 5 chunks
- Aibot - Datalab.docx: 5 chunks
- Synthetic Data Dictionary.xlsx: 1 chunks
- Synthetic Standard Operating Procedures for Research Data Laboratory.docx: 3 chunks
Generating question-context pairs...


100%|██████████| 14/14 [01:43<00:00,  7.37s/it]

Created dataset with 4 examples
Saved to: Document/evaluation_datasets/llm_generated/evaluation_dataset_20250320_153607.json





# Read DataLab Inputs

In [None]:
file_name = 'Evaluation Scores.csv'

Unnamed: 0,Category,Question,Answer
0,Data Dictionaries,What do the data fields and values mean?,Please refer to the code table.
1,Lab Availability,When does the data lab have available terminal...,Please check the resource calendar at the MOM ...
2,Software Availability,What software is available on the Data Lab ter...,"Tableau, Microsoft Office, Python, R, Stata"
3,Data Availability,What is the latest year and month of data avai...,Please refer to the Software and Data Products...
4,Tableau/Code Generation,How do I create my own groups in Tableau?,Select the Dimension to Group:\n\nIn the Data ...


In [None]:
def load_manual_questions(excel_path: str)-> Dict:
    """Load manual questions from Excel and convert to dataset format"""
    
    human_truth_df = pd.read_csv(excel_path)[['Category', 'Question', 'Answer']]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Convert Excel data to dataset format
    manual_dataset = {
        "examples": [
            {
                "category": row["Category"],
                "question": row['Question'],  # Will be filled during evaluation
                "reference_answer": row["Answer"],
                "query_by": {
                    "model_name": "human",
                    "type": "expert",
                    "timestamp": timestamp
                },
                "metadata": {
                    "source": f"{file_name}.xlsx",
                    "doc_id": f"manual_{idx}"
                }
            }
            for idx, row in human_truth_df.iterrows()
        ],
        "dataset_info": {
            "creation_timestamp": timestamp,
            "source": "manual",
            "total_questions": len(human_truth_df)
        }
    }
    
    return manual_dataset

In [56]:
manual_dataset_df = load_manual_questions(str(paths['manual'])+'/' + file_name)
manual_dataset_df['examples'][0]

{'category': 'Data Dictionaries',
 'question': 'What do the data fields and values mean?',
 'reference_answer': 'Please refer to the code table.',
 'query_by': {'model_name': 'human',
  'type': 'expert',
  'timestamp': '20250320_162642'},
 'metadata': {'source': 'Evaluation Scores.csv.xlsx', 'doc_id': 'manual_0'}}

## RUN EVALUATION

In [25]:
def create_debug_dataframe(evaluator) -> pd.DataFrame:
    """Create DataFrame for debugging evaluation results"""
    debug_data = []
    
    for result in evaluator.results:
        # Get reference context and answer
        ref_context = evaluator.rag_dataset.reference_contexts[result['query_idx']]
        ref_answer = evaluator.rag_dataset.reference_answers[result['query_idx']]
        
        debug_data.append({
            'Question': result['query'],
            'Reference Answer': ref_answer,
            'Generated Answer': result['generated_answer'],
            'Reference Context': ref_context,
            'Retrieved Chunks': [node.node.text for node in result.get('retrieved_contexts', [])],
            'Retrieval Time': f"{result['retrieval_time']:.3f}s",
            'Accuracy': result['retrieval_metrics']['accuracy'],
            'Semantic Score': result['semantic_score']
        })
    
    df = pd.DataFrame(debug_data)
    return df

In [2]:
class RagDataset:
    def __init__(self, data):
        """Initialize with JSON data
        
        Args:
            data (dict): Dictionary containing examples with queries and contexts
        """
        self.data = data
        self.examples = self.data.get("examples", [])
    
    @property 
    def queries(self):
        """Get list of all queries"""
        return [example["query"] for example in self.examples]
    
    @property
    def reference_contexts(self):
        """Get list of reference contexts for each example"""
        # Handle both string and dict contexts
        contexts = []
        for example in self.examples:
            context = example.get("reference_contexts", [])
            # If context is dictionary, get values
            if isinstance(context[0], dict):
                contexts.append(list(context[0].values())[0])
            else:
                contexts.append(context[0])
        return contexts
    
    @property
    def reference_answers(self):
        """Get list of reference answers"""
        return [example.get("reference_answer", "") for example in self.examples]
    
    def get_example(self, idx):
        """Get complete example at index"""
        if idx >= len(self.examples):
            raise IndexError(f"Index {idx} out of range for dataset with {len(self.examples)} examples")
        return self.examples[idx]


In [None]:


class EvaluationMetrics:
    def __init__(self, rag_dataset):
        """Initialize with RAG dataset and metrics storage"""
        self.rag_dataset = rag_dataset
        self.results = []
        self.retrieval_times: List[float] = []
        self.retrieval_accuracies: List[float] = []
        self.semantic_scores: List[float] = []
        self.mrr_scores: List[float] = []
        self.ndcg_scores: List[float] = []
        self.precision_at_k: Dict[int, List[float]] = {1: [], 3: [], 5: []}
        self.map_scores: List[float] = []
        # use global embedding model
        self.embedding_model = Settings.embed_model

    def evaluate_retrieval_accuracy(self, retrieved_contexts: List[str], query_idx: int)-> Dict:
        """
        Evaluate accuracy of retrieved contexts against reference contexts
        
        Args:
            retrieved_contexts: List of retrieved text chunks
            query_idx: Index of query in dataset
            
        Returns:
            Dictionary containing accuracy metrics
        """
        reference_contexts = self.rag_dataset.reference_contexts[query_idx]

        retrieved_texts = [node.node.text for node in retrieved_contexts]
        
        # Convert contexts to sets of sentences for comparison
        retrieved_set = set(' '.join(retrieved_texts).split('.'))
        reference_set = set(' '.join(reference_contexts).split('.'))
        
        # Calculate metrics
        correct_retrievals = len(retrieved_set.intersection(reference_set))
        precision = correct_retrievals / len(retrieved_set) if retrieved_set else 0
        recall = correct_retrievals / len(reference_set) if reference_set else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        mrr = self._calculate_mrr(retrieved_texts, reference_contexts)
        ndcg = self._calculate_ndcg(retrieved_texts, reference_contexts)
        map_score = self._calculate_map(retrieved_texts, reference_contexts)
        # Calculate P@k for different k values
        precision_k = {}
        for k in self.precision_at_k.keys():
            p_at_k = self._calculate_precision_at_k(retrieved_texts, reference_contexts, k)
            self.precision_at_k[k].append(p_at_k)
            precision_k[f"p@{k}"] = p_at_k
        
        # Store scores
        self.mrr_scores.append(mrr)
        self.ndcg_scores.append(ndcg)
        self.map_scores.append(map_score)
        
        return {
            **precision_k,
            "mrr": mrr,
            "ndcg": ndcg,
            "map": map_score,
            "accuracy": f1,  # Using F1 score as accuracy metric
            "precision": precision,
            "recall": recall,
            "correct_retrievals": correct_retrievals,
            "total_retrieved": len(retrieved_set),
            "total_reference": len(reference_set)
        }
    
    def _calculate_mrr(self, retrieved, reference) -> float:
        """Calculate Mean Reciprocal Rank"""
        for i, doc in enumerate(retrieved, 1):
            if doc in reference:
                return 1.0 / i
        return 0.0
    
    def _calculate_ndcg(self, retrieved, reference, k=None) -> float:
        """Calculate NDCG"""
        if k is None:
            k = len(retrieved)
        
        relevance = [1 if doc in reference else 0 for doc in retrieved[:k]]
        ideal = sorted(relevance, reverse=True)
        
        dcg = sum(rel / np.log2(i + 2) for i, rel in enumerate(relevance))
        idcg = sum(rel / np.log2(i + 2) for i, rel in enumerate(ideal))
        
        return dcg / idcg if idcg > 0 else 0.0

    def _calculate_precision_at_k(self, retrieved, reference, k: int) -> float:
        """Calculate Precision@K"""
        retrieved_k = retrieved[:k]
        relevant_k = sum(1 for doc in retrieved_k if doc in reference)
        return relevant_k / k if k > 0 else 0.0

    def _calculate_map(self, retrieved, reference) -> float:
        """Calculate Mean Average Precision"""
        if not reference:
            return 0.0
        
        precisions = []
        relevant_found = 0
        
        for i, doc in enumerate(retrieved, 1):
            if doc in reference:
                relevant_found += 1
                precision_at_i = relevant_found / i
                precisions.append(precision_at_i)
        
        return sum(precisions) / len(reference) if precisions else 0.0
        
    def evaluate_semantic_quality(self, generated_answer: str, query_idx: int) -> Dict:
        """
        Evaluate semantic similarity between generated and reference answers
        
        Args:
            generated_answer: Generated answer to evaluate
            query_idx: Index of query in dataset
            
        Returns:
            Dictionary containing semantic quality metrics
        """
        reference_answer = self.rag_dataset.reference_answers[query_idx]
        
        # Get embeddings using LlamaIndex's embedding model
        gen_embedding = self.embedding_model.get_text_embedding(generated_answer)
        ref_embedding = self.embedding_model.get_text_embedding(reference_answer)
        
        # Calculate cosine similarity
        similarity = cosine_similarity(
            np.array(gen_embedding).reshape(1, -1),
            np.array(ref_embedding).reshape(1, -1)
        )[0][0]
        
        return {
            "semantic_similarity": similarity,
            "generated_length": len(generated_answer.split()),
            "reference_length": len(reference_answer.split())
        }
    
    def evaluate_all_queries(self, query_engine, llm=None):
        """
        Evaluate all queries in the dataset
        
        Args:
            query_engine: RAG query engine for retrieving contexts
            llm: Language model for generating answers (optional)
        """
        print(f"Evaluating {len(self.rag_dataset.queries)} queries...")
        
        for idx, query in enumerate(tqdm(self.rag_dataset.queries)):
            # Measure retrieval time and get contexts
            start_time = time.time()
            retrieved_contexts = query_engine.retrieve(query)
            retrieval_time = time.time() - start_time
            self.retrieval_times.append(retrieval_time)
            
            # Generate answer if LLM provided
            generated_answer = ""
            if llm:
                generated_answer = self._generate_answer(query_engine, query)
            
            # Evaluate retrieval accuracy
            retrieval_metrics = self.evaluate_retrieval_accuracy(
                retrieved_contexts,
                idx
            )
            self.retrieval_accuracies.append(retrieval_metrics['accuracy'])
            
            # Evaluate semantic quality if answer generated
            semantic_score = 0.0
            if generated_answer:
                semantic_metrics = self.evaluate_semantic_quality(
                    generated_answer,
                    idx
                )
                semantic_score = semantic_metrics['semantic_similarity']
                self.semantic_scores.append(semantic_score)
            
            # Store complete results
            result = {
                "query_idx": idx,
                "query": query,
                "retrieval_time": retrieval_time,
                "retrieval_metrics": retrieval_metrics,
                "generated_answer": generated_answer,
                "semantic_score": semantic_score,
                "timestamp": datetime.now().isoformat()
            }
            self.results.append(result)
            
    def get_summary_metrics(self):
        """Get summary of all evaluation metrics"""
        summary = {
            "total_queries": len(self.results),
            "avg_retrieval_time": np.mean(self.retrieval_times),
            "avg_retrieval_accuracy": np.mean(self.retrieval_accuracies),
            "avg_semantic_score": np.mean(self.semantic_scores) if self.semantic_scores else 0.0,
            "avg_mrr": np.mean(self.mrr_scores),
            "avg_ndcg": np.mean(self.ndcg_scores),
            "avg_map": np.mean(self.map_scores),
            "timestamp": datetime.now().isoformat()
        }
        
        # Add average P@k scores
        for k in self.precision_at_k.keys():
            summary[f"avg_p@{k}"] = np.mean(self.precision_at_k[k])
        
        return summary
    
    def plot_results(self):
        """Enhanced visualization with ranking metrics"""
        plt.style.use('seaborn-v0_8')
        fig = plt.figure(figsize=(20, 10))
        
        # Create grid for subplots
        gs = fig.add_gridspec(2, 3)
        
        # Plot 1: Retrieval Times
        ax1 = fig.add_subplot(gs[0, 0])
        sns.histplot(self.retrieval_times, kde=True, ax=ax1)
        ax1.set_title('Retrieval Time Distribution')
        ax1.set_xlabel('Time (seconds)')
        
        # Plot 2: Accuracy Metrics
        ax2 = fig.add_subplot(gs[0, 1])
        accuracy_data = pd.DataFrame({
            'F1': self.retrieval_accuracies,
            'MRR': self.mrr_scores,
            'NDCG': self.ndcg_scores,
            'MAP': self.map_scores
        })
        sns.boxplot(data=accuracy_data, ax=ax2)
        ax2.set_title('Ranking Metrics Distribution')
        ax2.set_ylabel('Score')
        
        # Plot 3: P@K Values
        ax3 = fig.add_subplot(gs[0, 2])
        p_at_k_data = pd.DataFrame({f'P@{k}': scores 
                                   for k, scores in self.precision_at_k.items()})
        sns.boxplot(data=p_at_k_data, ax=ax3)
        ax3.set_title('Precision@K Distribution')
        ax3.set_ylabel('Score')

                # Plot 4: Semantic Scores
        ax4 = fig.add_subplot(gs[1, 0])
        if self.semantic_scores:
            sns.histplot(self.semantic_scores, kde=True, ax=ax4)
            ax4.set_title('Semantic Score Distribution')
            ax4.set_xlabel('Semantic Score')
        
        # Plot 5: Metrics Correlation
        ax5 = fig.add_subplot(gs[1, 1:])
        metrics = np.column_stack([
            self.retrieval_accuracies,
            self.mrr_scores,
            self.ndcg_scores,
            self.map_scores
        ])
        sns.heatmap(
            np.corrcoef(metrics.T),
            annot=True,
            xticklabels=['F1', 'MRR', 'NDCG', 'MAP'],
            yticklabels=['F1', 'MRR', 'NDCG', 'MAP'],
            ax=ax5
        )
        ax5.set_title('Metrics Correlation')
        
        plt.tight_layout()
        plt.show()
    
    def save_results(self, filepath):
        """Save evaluation results with source information"""
        output = {
            "summary_metrics": self.get_summary_metrics(),
            "detailed_results": self.results,
            "metadata": {
                "timestamp": datetime.now().isoformat(),
                "total_queries": len(self.results),
                "sources": {
                    "llm": len([r for r in self.results if r.get("source") == "llm"]),
                    "manual": len([r for r in self.results if r.get("source") == "manual"])
                }
            }
        }
        with open(filepath, 'w') as f:
            json.dump(output, f, indent=2)

    def _generate_answer(self, query_engine, query):
        """Helper to generate answer using Query Engine"""
        try:
            # Use the query engine directly since it already has the prompt setup
            response = query_engine.query(query)
            return str(response)
        except Exception as e:
            print(f"Error generating answer: {e}")
            return ""

In [22]:
def run_evaluation_pipeline(
    dataset_path: str,
    documents,
    response_prompt: PromptTemplate,
    use_manual: bool = False,
    manual_path: str = None,
    results_path: Path = None
) -> tuple:
    """Run evaluation and optionally save results"""
    # load llm generated QnA
    with open(dataset_path, 'r') as f:
        llm_dataset = json.load(f)
    # check for manual datasets
    if use_manual and manual_path:
        manual_dataset = load_manual_questions(manual_path)

        # Combine datasets
        combined_dataset = {
            "examples": llm_dataset["examples"] + manual_dataset["examples"],
            "dataset_info": {
                "total_questions": len(llm_dataset["examples"]) + len(manual_dataset["examples"]),
                "sources": ['llm', 'manual'],
                "creation_timestamp": datetime.now().strftime("%Y%m%d_%H%M%S")
            }
        }
        eval_dataset = combined_dataset
    else:
        eval_dataset = llm_dataset 
        
    
    # Setup RAG components
    rag_dataset = RagDataset(eval_dataset)
    index = VectorStoreIndex.from_documents(documents, show_progress=True)
    query_engine = index.as_query_engine(
        response_mode="compact",
        response_prompt=response_prompt,
        similarity_top_k=3,
        max_tokens=300,
        streaming=False
    )
    
    # Run evaluation
    evaluator = EvaluationMetrics(rag_dataset)
    evaluator.evaluate_all_queries(query_engine, llm=Settings.llm)
    
    # Analyze results by source
    if use_manual:
        llm_results = [r for r in evaluator.results 
                      if eval_dataset["examples"][r["query_idx"]]["query_by"]["type"] == "ai"]
        manual_results = [r for r in evaluator.results 
                         if eval_dataset["examples"][r["query_idx"]]["query_by"]["type"] == "expert"]
        
        print("\nResults by Source:")
        print(f"LLM Questions ({len(llm_results)})")
        print(f"Manual Questions ({len(manual_results)})")
    
    return evaluator, eval_dataset

In [23]:


evaluator_llm, dataset_llm = run_evaluation_pipeline(
    dataset_path=str(paths['llm'] / "evaluation_dataset_20250320_153607.json"),
    documents=documents,
    response_prompt=response_prompt
)

Parsing nodes:   0%|          | 0/4 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/8 [00:00<?, ?it/s]

Evaluating 4 queries...


100%|██████████| 4/4 [01:38<00:00, 24.62s/it]


In [None]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
evaluator_llm.save_results(paths["results"] / f"evaluation_results_{timestamp}.json")

In [26]:
debug_df = create_debug_dataframe(evaluator_llm)

In [32]:
debug_df.to_csv(str(paths['debug'] / f"evaluation_debug_{timestamp}.csv"), index=False)