# TCS RAG System Evaluation with LLM Scoring

This notebook evaluates our TCS Annual Report RAG system, using GPT-4.1 as an independent evaluator.
The LLM scores the answers from both the Original and the Query Expansion approaches on a binary 0/1 scale and gives a rationale for each score.

## Evaluation Process:
1. Connect to existing ChromaDB vector database
2. Load existing evaluation results CSV
3. For each question, retrieve the 8 most relevant chunks from the TCS report
4. Show GPT-4.1 the question, both answers, and the 8 retrieved chunks
5. Get binary scores (0/1) and explanations for each answer
6. Update CSV with LLM evaluation results
7. Analyze patterns and compare approaches

In [None]:
# Step 1: Setup & Dependencies
import pandas as pd
import os
from dotenv import load_dotenv
from openai import OpenAI
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import time
from datetime import datetime

# Load settings such as OPENAI_API_KEY via python-dotenv before anything reads them.
load_dotenv()

# Locations of the persisted vector store and of the results file to annotate.
CHROMA_DB_PATH = "./chroma_db"
COLLECTION_NAME = "tcs_annual_report_2024"
CSV_PATH = "tcs_rag_evaluation_20250927_214934.csv"

# Shared OpenAI client used by the evaluation cells further down.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# One banner summarising the configuration that the rest of the notebook uses.
print(
    "✅ Dependencies loaded successfully!",
    f"🗃️  ChromaDB Path: {CHROMA_DB_PATH}",
    f"📊 CSV Path: {CSV_PATH}",
    f"📚 Collection: {COLLECTION_NAME}",
    sep="\n",
)

In [None]:
# Step 2: Connect to ChromaDB Vector Database
def connect_to_chromadb(chroma_path, collection_name):
    """
    Open an already-populated ChromaDB collection and smoke-test it.

    Args:
        chroma_path: Directory of the persistent ChromaDB store.
        collection_name: Name of the collection to open.

    Returns:
        The collection handle, after its size has been printed and a
        two-result sample query has run successfully.

    Raises:
        Exception: Whatever the lookup, count or sample query raised; a hint
            is printed first and the exception is then re-raised unchanged.
    """
    print(f"🗃️  Connecting to ChromaDB at {chroma_path}...")

    persistent_client = chromadb.PersistentClient(path=chroma_path)

    # Default sentence-transformer embedder (the source notes it matches the main notebook).
    embedder = SentenceTransformerEmbeddingFunction()

    try:
        collection = persistent_client.get_collection(
            collection_name,
            embedding_function=embedder,
        )

        chunk_total = collection.count()
        print(f"✅ Connected to existing collection: {collection_name}")
        print(f"📊 Total document chunks: {chunk_total}")

        # A tiny probe query proves that embedding + retrieval work end to end.
        probe = collection.query(query_texts=["TCS revenue"], n_results=2)
        probe_hits = len(probe['documents'][0])
        print(f"🧪 Test query successful - retrieved {probe_hits} chunks")
    except Exception as err:
        print(f"❌ Failed to connect to ChromaDB collection: {err}")
        print("💡 Make sure you've run the main RAG notebook first to create the collection")
        raise
    else:
        return collection

# Open the persisted collection once; the evaluation cells reuse this handle.
chroma_collection = connect_to_chromadb(CHROMA_DB_PATH, COLLECTION_NAME)

print("\n📋 Vector Database Ready:")
print("   Collection: {}".format(COLLECTION_NAME))
print("   Chunks available: {:,}".format(chroma_collection.count()))
print("   Ready for 8-chunk evaluation! 🚀")

In [None]:
# Step 3: Load CSV and Prepare for Evaluation
def load_and_prepare_csv(csv_path):
    """
    Load the evaluation CSV and make sure the LLM-evaluation columns exist.

    Missing score/rationale columns are added (filled with None). Rationale
    columns that pandas inferred as numeric -- which happens when the CSV
    already contains them but every cell is empty -- are cast to ``object``
    so that text rationales can be written into them later.

    Args:
        csv_path: Path of the CSV file holding questions and RAG answers.
            The file must contain a ``difficulty`` column.

    Returns:
        pandas.DataFrame: The loaded data including the four ``llm_*`` columns.
    """
    print(f"📊 Loading evaluation data from {csv_path}...")

    df = pd.read_csv(csv_path)

    score_columns = ['llm_original_score', 'llm_expansion_score']
    rationale_columns = ['llm_original_rationale', 'llm_expansion_rationale']

    # Add new columns for LLM evaluation if they don't exist
    for col in score_columns + rationale_columns:
        if col not in df.columns:
            df[col] = None
            print(f"   ➕ Added column: {col}")
        else:
            print(f"   ✅ Column exists: {col}")

    # An all-empty column is read back as float64; assigning a string into a
    # float column is deprecated in pandas 2.x and is slated to become an error.
    for col in rationale_columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].astype(object)

    print(f"\n📋 Dataset Summary:")
    print(f"   Total questions: {len(df)}")
    print(f"   Difficulty breakdown:")
    for difficulty, count in df['difficulty'].value_counts().items():
        print(f"     {difficulty}: {count} questions")

    # A row counts as evaluated once its original-answer score is filled in.
    evaluated_count = df['llm_original_score'].notna().sum()
    remaining_count = len(df) - evaluated_count

    print(f"\n🔍 Evaluation Status:")
    print(f"   Already evaluated: {evaluated_count}")
    print(f"   Remaining: {remaining_count}")

    return df

# Read the results file into the shared dataframe used by every later cell.
df = load_and_prepare_csv(CSV_PATH)

# Preview at most three questions so the loaded data can be sanity-checked.
print("\n📝 Sample Questions:")
for i in range(len(df.index[:3])):
    row = df.iloc[i]
    print("   {}. [{}] {}".format(i + 1, row['difficulty'], row['question']))

In [None]:
# Step 4: LLM Evaluation Function with 8-Chunk Retrieval
def _parse_evaluation_response(content):
    """Extract the two scores and two rationales from the evaluator's reply.

    The parser is deliberately forgiving: labels are matched
    case-insensitively, leading markdown decoration (``**``, ``-``, ``#``)
    is ignored, scores such as ``[1]`` or ``1 (correct)`` are accepted, and
    lines following a rationale label are appended to that rationale.
    Fields that cannot be found or parsed are left as ``None``.
    """
    import re  # local import: keeps this cell independent of the setup cell

    labelled_fields = (
        ('original answer score', 'original_score'),
        ('original rationale', 'original_rationale'),
        ('expansion answer score', 'expansion_score'),
        ('expansion rationale', 'expansion_rationale'),
    )
    parsed = {key: None for _, key in labelled_fields}

    # A lone 0 or 1, optionally wrapped in punctuation; rejects "10" and "0.5".
    score_pattern = re.compile(r'^\W*([01])(?!\d|\.\d)')
    open_rationale = None  # rationale field currently collecting continuation lines

    for raw_line in content.splitlines():
        line = raw_line.strip()
        bare = line.lstrip('*#_>-•– \t')
        lowered = bare.lower()
        key = next(
            (field for label, field in labelled_fields if lowered.startswith(label)),
            None,
        )

        if key is None or ':' not in bare:
            # Not a labelled line: treat it as the continuation of a rationale.
            if open_rationale is not None and line:
                parsed[open_rationale] = f"{parsed[open_rationale]} {line}".strip()
            continue

        value = bare.split(':', 1)[1].strip().strip('*_').strip()
        if key.endswith('_score'):
            match = score_pattern.match(value)
            if match:
                parsed[key] = int(match.group(1))
            open_rationale = None
        else:
            parsed[key] = value
            open_rationale = key

    return parsed


def evaluate_single_question(question, original_answer, expansion_answer, chroma_collection, n_results=8):
    """
    Evaluate both answers for a single question using GPT-4.1.
    Retrieves the most relevant chunks from ChromaDB for context.

    Args:
        question: The question being asked
        original_answer: Answer from original RAG approach
        expansion_answer: Answer from query expansion approach
        chroma_collection: ChromaDB collection with TCS report chunks
        n_results: Number of chunks to retrieve as grading context (default 8)

    Returns:
        dict: Keys ``original_score`` / ``expansion_score`` (0, 1 or None when
        the score could not be obtained), ``original_rationale`` /
        ``expansion_rationale`` (text, None, or ``"Error: ..."`` when the LLM
        call failed), ``retrieved_chunks`` (number of chunks actually
        retrieved) and ``context_preview`` (first 200 characters of context).
    """

    # Retrieve the most relevant chunks for this question
    retrieved_count = 0
    try:
        retrieval_results = chroma_collection.query(
            query_texts=[question],
            n_results=n_results
        )

        context_chunks = retrieval_results['documents'][0]
        retrieved_count = len(context_chunks)

        if not context_chunks:
            print(f"⚠️  No relevant chunks found for question: {question}")
            # Placeholder text for the prompt only; it is not counted as a chunk.
            context_chunks = ["No relevant context found in TCS report."]

        # Combine chunks into context
        context = "\n\n--- Chunk ---\n\n".join(context_chunks)

    except Exception as e:
        # Retrieval problems are not fatal: the answers are still graded, just without context.
        print(f"❌ Error retrieving chunks: {str(e)}")
        context = "Error retrieving context from TCS report."
        retrieved_count = 0

    evaluation_prompt = f"""You are an expert evaluator with access to relevant excerpts from the TCS Annual Report.

Question: {question}

Answer A (Original RAG): {original_answer}

Answer B (Query Expansion RAG): {expansion_answer}

Relevant TCS Annual Report Context ({n_results} most relevant chunks):
{context}

For each answer, provide:
1. Score (1 = factually correct and well-supported by the provided context, 0 = incorrect/inaccurate/unsupported)
2. Brief rationale explaining the score based on the context above

Format your response exactly as follows:
Original Answer Score: [0 or 1]
Original Rationale: [explanation]
Expansion Answer Score: [0 or 1]
Expansion Rationale: [explanation]"""

    try:
        response = client.responses.create(
            model="gpt-4.1",
            input=evaluation_prompt
        )

        content = response.output_text if hasattr(response, 'output_text') else str(response)
        parsed = _parse_evaluation_response(content)

    except Exception as e:
        print(f"❌ Error during evaluation: {str(e)}")
        return {
            'original_score': None,
            'original_rationale': f"Error: {str(e)}",
            'expansion_score': None,
            'expansion_rationale': f"Error: {str(e)}",
            'retrieved_chunks': 0,
            'context_preview': "Error occurred"
        }

    if parsed['original_score'] is None or parsed['expansion_score'] is None:
        print("⚠️  Could not parse a 0/1 score for every answer from the evaluator response")

    results = dict(parsed)
    results['retrieved_chunks'] = retrieved_count
    results['context_preview'] = context[:200] + "..." if len(context) > 200 else context
    return results

# Summary banner for the evaluation helper defined in this cell.
print(
    "\n".join(
        [
            "✅ LLM evaluation function ready!",
            "🔧 Function will:",
            "   • Retrieve 8 most relevant chunks for each question",
            "   • Send question + both answers + 8 chunks to GPT-4.1",
            "   • Get 0/1 scores and rationales for each answer",
            "   • Show which chunks were used for evaluation",
        ]
    )
)

In [None]:
# Step 5: Question-by-Question Evaluation
def process_single_question(df, question_idx, chroma_collection, save_progress=True):
    """
    Process a single question and update the dataframe in place.

    The row is read AND written by position, so the same row is updated even
    when the dataframe does not use a default 0..n-1 index. A result is only
    recorded when both scores were obtained; otherwise nothing is written and
    the question stays pending so it can be retried.

    Args:
        df: DataFrame containing questions and answers
        question_idx: Zero-based position of the question to process
        chroma_collection: ChromaDB collection for chunk retrieval
        save_progress: Whether to save CSV after evaluation

    Returns:
        bool: True if the question was already evaluated or was evaluated
        successfully, False if the evaluation failed.
    """
    row = df.iloc[question_idx]

    # Check if already evaluated
    if pd.notna(row['llm_original_score']):
        print(f"⏭️  Question {question_idx + 1} already evaluated, skipping...")
        return True

    print(f"\n🔍 Evaluating Question {question_idx + 1}/{len(df)}")
    print(f"📋 Difficulty: {row['difficulty']}")
    print(f"❓ Question: {row['question']}")
    print("-" * 60)

    # Get evaluation results
    start_time = time.time()
    results = evaluate_single_question(
        row['question'],
        row['original_answer'],
        row['expansion_answer'],
        chroma_collection
    )
    evaluation_time = time.time() - start_time

    original_score = results.get('original_score')
    expansion_score = results.get('expansion_score')

    # Persisting a half-parsed result would mark the row as done and hide the gap.
    if original_score is None or expansion_score is None:
        print(f"❌ Evaluation failed for question {question_idx + 1}")
        return False

    # A rationale may be missing even when the scores parsed; store empty text instead of None.
    original_rationale = results.get('original_rationale') or ""
    expansion_rationale = results.get('expansion_rationale') or ""

    updates = {
        'llm_original_score': original_score,
        'llm_expansion_score': expansion_score,
        'llm_original_rationale': original_rationale,
        'llm_expansion_rationale': expansion_rationale,
    }
    for column, value in updates.items():
        # An all-empty text column comes back from CSV as float64; writing a
        # string into it is deprecated in pandas 2.x and slated to become an error.
        if isinstance(value, str) and pd.api.types.is_numeric_dtype(df[column]):
            df[column] = df[column].astype(object)
        df.iloc[question_idx, df.columns.get_loc(column)] = value

    # Display results
    print(f"\n📊 Evaluation Results (took {evaluation_time:.1f}s):")
    print(f"   📚 Retrieved {results.get('retrieved_chunks', 'Unknown')} chunks for context")
    print(f"   🔵 Original Answer Score: {original_score}")
    print(f"   🔵 Original Rationale: {original_rationale[:100]}{'...' if len(original_rationale) > 100 else ''}")
    print(f"   🟠 Expansion Answer Score: {expansion_score}")
    print(f"   🟠 Expansion Rationale: {expansion_rationale[:100]}{'...' if len(expansion_rationale) > 100 else ''}")

    # Save progress
    if save_progress:
        df.to_csv(CSV_PATH, index=False)
        print(f"💾 Progress saved to {CSV_PATH}")

    return True

# Summary banner for the per-question processing helper defined in this cell.
print(
    "\n".join(
        [
            "✅ Question processing function ready!",
            "🔧 Function will:",
            "   • Skip already-evaluated questions",
            "   • Retrieve 8 relevant chunks per question",
            "   • Show progress and results for each question",
            "   • Save progress after each successful evaluation",
            "   • Handle errors gracefully",
        ]
    )
)

In [None]:
# Process Questions One by One
# Each run of this cell handles exactly one pending question, so re-running it
# walks through the dataset; rows that already have a score are never redone.

print("🚀 Starting LLM Evaluation Process")
print("=" * 50)
print(
    "💡 Run this cell to process the next unevaluated question",
    "🔄 Re-run to continue with subsequent questions",
    "⏭️  Already-evaluated questions will be skipped automatically",
    "",
    sep="\n",
)

# Rows whose original-answer score is still empty are the pending ones.
unevaluated_mask = df['llm_original_score'].isna()
unevaluated_indices = df.loc[unevaluated_mask].index.tolist()

if not unevaluated_indices:
    print("🎉 All questions have already been evaluated!")
    print("📊 Ready for analysis and summary")
else:
    next_idx = unevaluated_indices[0]
    print(f"📋 Processing next question: {next_idx + 1}/{len(df)}")
    print(f"📊 Remaining questions: {len(unevaluated_indices)}")

    # Evaluate the first pending question only.
    success = process_single_question(df, next_idx, chroma_collection)

    if not success:
        print(f"❌ Failed to evaluate question {next_idx + 1}")
        print("🔄 You can try re-running this cell to retry")
    else:
        remaining = len(unevaluated_indices) - 1
        print(f"\n✅ Question {next_idx + 1} completed successfully!")
        print(f"📈 Progress: {len(df) - remaining}/{len(df)} questions evaluated")

        if remaining > 0:
            print(f"🔄 Re-run this cell to evaluate the next question ({remaining} remaining)")
        else:
            print("🎉 All questions have been evaluated!")

In [None]:
# Step 6: Analysis and Summary
def analyze_evaluation_results(df):
    """
    Analyze the LLM evaluation results and print a summary.

    Scores are coerced to numbers first, so the analysis works both on freshly
    written values and on values read back from the CSV. Rows without an
    original-answer score are treated as not yet evaluated. Rows lacking an
    expansion score are kept in the totals but left out of the expansion
    accuracy and of the head-to-head comparison.

    Args:
        df: DataFrame with ``difficulty``, ``question``, ``llm_original_score``
            and ``llm_expansion_score`` columns (``question_id`` is optional).

    Returns:
        pandas.DataFrame | None: Copy of the evaluated rows with numeric score
        columns, or None when no question has been evaluated yet.
    """
    print("📊 LLM EVALUATION ANALYSIS")
    print("=" * 50)

    # Coerce to numbers: scores may be stored as objects (None/int) or floats.
    original_scores = pd.to_numeric(df['llm_original_score'], errors='coerce')
    expansion_scores = pd.to_numeric(df['llm_expansion_score'], errors='coerce')

    # Filter for evaluated questions only
    evaluated_mask = original_scores.notna()
    evaluated_df = df[evaluated_mask].copy()

    if evaluated_df.empty:
        print("❌ No questions have been evaluated yet.")
        return None

    evaluated_df['llm_original_score'] = original_scores[evaluated_mask].to_numpy()
    evaluated_df['llm_expansion_score'] = expansion_scores[evaluated_mask].to_numpy()

    print(f"📋 Total Evaluated Questions: {len(evaluated_df)}/{len(df)}")
    print()

    # Overall accuracy (mean/sum skip rows whose expansion score is missing)
    original_accuracy = evaluated_df['llm_original_score'].mean()
    expansion_accuracy = evaluated_df['llm_expansion_score'].mean()
    original_correct = int(evaluated_df['llm_original_score'].sum())
    expansion_correct = int(evaluated_df['llm_expansion_score'].sum())
    expansion_scored = int(evaluated_df['llm_expansion_score'].notna().sum())

    print("🎯 Overall Accuracy:")
    print(f"   🔵 Original RAG: {original_accuracy:.1%} ({original_correct}/{len(evaluated_df)})")
    print(f"   🟠 Query Expansion: {expansion_accuracy:.1%} ({expansion_correct}/{expansion_scored})")

    improvement = expansion_accuracy - original_accuracy
    if improvement > 0:
        print(f"   ✅ Query Expansion is {improvement:.1%} better")
    elif improvement < 0:
        print(f"   📉 Query Expansion is {abs(improvement):.1%} worse")
    else:
        print("   🟰 Both approaches perform equally")

    print()

    # Accuracy by difficulty: the usual levels first, then any other label in the data
    print("📈 Accuracy by Difficulty Level:")
    known_levels = ['Easy', 'Medium', 'Hard']
    extra_levels = [
        level for level in evaluated_df['difficulty'].dropna().unique()
        if level not in known_levels
    ]
    for difficulty in known_levels + extra_levels:
        subset = evaluated_df[evaluated_df['difficulty'] == difficulty]
        if len(subset) > 0:
            orig_acc = subset['llm_original_score'].mean()
            exp_acc = subset['llm_expansion_score'].mean()
            print(f"   {difficulty:6}: Original {orig_acc:.1%} | Expansion {exp_acc:.1%} ({len(subset)} questions)")

    print()

    # Questions where approaches differ; NaN never equals anything, so rows
    # without both scores must be excluded before comparing.
    both_scored = evaluated_df.dropna(subset=['llm_original_score', 'llm_expansion_score'])
    different_scores = both_scored[
        both_scored['llm_original_score'] != both_scored['llm_expansion_score']
    ]
    print(f"🔄 Questions with Different Scores: {len(different_scores)}/{len(evaluated_df)}")

    if len(different_scores) > 0:
        print("\n📋 Questions where approaches differed:")
        for idx, row in different_scores.iterrows():
            orig_score = int(row['llm_original_score'])
            exp_score = int(row['llm_expansion_score'])
            winner = "Expansion" if exp_score > orig_score else "Original"
            # Fall back to the row label when the CSV has no question_id column.
            question_label = row.get('question_id', idx)
            question_text = str(row['question'])
            print(f"   Q{question_label}: Original={orig_score}, Expansion={exp_score} → {winner} wins")
            print(f"      {question_text[:80]}{'...' if len(question_text) > 80 else ''}")

    print(f"\n💾 Updated results saved to: {CSV_PATH}")
    print("📊 CSV now contains LLM scores and rationales for analysis")

    return evaluated_df

# Run the analysis on the shared dataframe. analysis_df receives the copy of the
# evaluated rows returned by the function, or None when no row has a score yet.
analysis_df = analyze_evaluation_results(df)