# 🚀 Comprehensive RAG Pipeline Evaluation

This notebook is the final tool for evaluating our advanced RAG pipeline.

**Workflow:**
1.  Loads the high-performance `BAAI/bge-large-en-v1.5` model and re-ranker.
2.  Loops through a formal list of evaluation questions from a CSV file.
3.  For each question, it generates an answer and displays it with its sources.
4.  It then prompts for a **manual quality score (1-5)** and **qualitative comments**.
5.  Finally, it saves all results, including the manual scores, to a CSV file.

In [1]:
import sys
import os

# Go two levels up from the notebook to the project root
project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))

# Join the path to 'src'
src_path = os.path.join(project_root, "src")

# Add 'src' to Python path
if src_path not in sys.path:
    sys.path.append(src_path)

# Confirm it's added
print("src path added:", src_path)

src path added: c:\Users\ABC\Desktop\10Acadamy\week_6\Intelligent-Complaint-Analysis-for-Financial-Services\src


In [2]:
import yaml
import pandas as pd
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import pipeline
from typing import Dict, List, Tuple
from RAG_pipeline_eval import RAGPipeline 

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# --- THE FIX IS HERE ---
# 1. Import your prompt template from your module
from prompts import PROMPT_TEMPLATE

# 2. Update the config to use the imported template
config = {
    "embedding": { "model_name": "BAAI/bge-base-en-v1.5" },
    "reranker": { "model_name": "cross-encoder/ms-marco-MiniLM-L-6-v2", "k": 5 },
    "llm": { "model_name": "google/flan-t5-base", "max_new_tokens": 256 },
    "retrieval": { "k": 25 },
    "prompt": {
        "template": PROMPT_TEMPLATE # Use the imported variable here
    },
    "data": {
        "index_path": "data/vector_store/index_bge_large_300_20.faiss",
        "meta_path": "data/vector_store/meta_bge_large_300_20.csv"
    }
}

In [None]:
# 1. Initialize the pipeline
rag_system = RAGPipeline(config=config)

# 2. Load the evaluation questions from your CSV file
# Using a corrected, more standard path
eval_df = pd.read_csv("../evaluation/evaluation_dataset.csv")
questions = eval_df["question"].tolist()
print(f"\nLoaded {len(questions)} evaluation questions.")

# 3. Loop through questions, generate answers, and collect manual feedback
evaluation_results = []

for q in questions:
    print("\n" + "="*80)
    print(f"🔍 Evaluating Question: {q}")
    print("="*80)
    
    # UPDATED: Unpack three values (answer, sources, sentiment)
    answer, sources, sentiment = rag_system.query(q)
    
    print(f"\n🧠 Generated Answer:\n{answer}\n")

    # NEW: Display the sentiment summary
    print("--- Sentiment of Sources ---")
    print(f"📊 {sentiment}")
    print("----------------------------\n")

    print("--- Top Sources Used ---")
    for i, source in enumerate(sources, 1):
        print(f"Source {i}: {source['chunk_text'][:200]}...")
    print("-" * 26)

    # Manual input for scoring
    while True:
        try:
            quality_score = int(input("💯 Enter quality score (1–5): "))
            if 1 <= quality_score <= 5:
                break
            else:
                print("❌ Please enter a number between 1 and 5.")
        except ValueError:
            print("❌ Invalid input. Please enter a number.")
    
    comments = input("📝 Enter your comments/analysis: ")

    # UPDATED: Add the sentiment summary to the results
    evaluation_results.append({
        "Question": q,
        "Generated Answer": answer,
        "Sentiment of Sources": sentiment, # New column
        "Retrieved Sources": " | ".join([s['chunk_text'] for s in sources]),
        "Manual Quality Score (1-5)": quality_score,
        "Comments/Analysis": comments
    })

# 4. Save the results to a CSV file
results_df = pd.DataFrame(evaluation_results)
results_df.to_csv("../evaluation/manual_evaluation_results.csv", index=False)

print("\n\n✅ Evaluation complete! Results saved to 'evaluation/manual_evaluation_results.csv'")

  eval_df = pd.read_csv("..\..\csv_files\evaluation_dataset.csv")


--- Initializing RAG Pipeline ---


RuntimeError: Error in __cdecl faiss::FileIOReader::FileIOReader(const char *) at D:\a\faiss-wheels\faiss-wheels\faiss\faiss\impl\io.cpp:68: Error: 'f' failed: could not open data/vector_store/index_bge_large_300_20.faiss for reading: No such file or directory

In [None]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall

print("\n--- Starting Automated RAGAs Evaluation ---")

# 1. Load the manual evaluation results you just created
manual_results_df = pd.read_csv("../../csv_file/manual_evaluation_results.csv")

# 2. Format the data for RAGAs (RAGAs needs the sources as a list of strings)
# Note: We are splitting the saved string of sources back into a list
ragas_data = {
    "question": manual_results_df["Question"].tolist(),
    "answer": manual_results_df["Generated Answer"].tolist(),
    "contexts": [s.split(' | ') for s in manual_results_df["Retrieved Sources"]],
}
ragas_dataset = Dataset.from_dict(ragas_data)

# 3. Run the RAGAs evaluation
result = evaluate(
    ragas_dataset,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
)

# 4. Combine your manual scores with the automated RAGAs scores
ragas_df = result.to_pandas()
comprehensive_df = pd.concat([manual_results_df, ragas_df.drop(columns=['question', 'answer', 'contexts'])], axis=1)

# 5. Save the final comprehensive report
comprehensive_df.to_csv("evaluation/comprehensive_evaluation_results.csv", index=False)

print("\n✅ Comprehensive evaluation complete!")
print("Final results with both manual and RAGAs scores saved to 'evaluation/comprehensive_evaluation_results.csv'")

display(comprehensive_df)