# TCS RAG System Evaluation Questions

This notebook contains our evaluation question set for testing the TCS Annual Report RAG system.
We have 10 carefully designed questions across three difficulty levels to test different aspects of our RAG pipeline.

## Question Distribution:
- **Easy (3 questions)**: Direct fact extraction from the report
- **Medium (4 questions)**: Analysis and synthesis requiring multiple chunks
- **Hard (3 questions)**: Complex reasoning and multi-hop information retrieval

In [None]:
# Setup & Dependencies for ChromaDB connection
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# Where the persisted vector store lives and which collection to evaluate
CHROMA_DB_PATH = "./chroma_db"
COLLECTION_NAME = "tcs_annual_report_2024"

# Echo the active configuration so the notebook output records what was used
print(
    "✅ Dependencies loaded successfully!",
    f"🗃️  ChromaDB Path: {CHROMA_DB_PATH}",
    f"📚 Collection: {COLLECTION_NAME}",
    sep="\n",
)

In [None]:
# Connect to ChromaDB Vector Database
def connect_to_chromadb(chroma_path, collection_name):
    """
    Open an already-populated ChromaDB collection and return it.

    Args:
        chroma_path: Filesystem path handed to ``chromadb.PersistentClient``.
        collection_name: Name of the collection to look up.

    Returns:
        The collection object returned by ``get_collection``.

    Raises:
        Exception: Whatever the lookup or the count call raised; a hint is
            printed first and the original exception is re-raised unchanged.
    """
    print(f"🗃️  Connecting to ChromaDB at {chroma_path}...")

    persistent_client = chromadb.PersistentClient(path=chroma_path)

    # Default sentence-transformer embedder; per the original note this is
    # meant to match the one used by the main notebook.
    embedder = SentenceTransformerEmbeddingFunction()

    try:
        collection = persistent_client.get_collection(
            collection_name,
            embedding_function=embedder,
        )

        # Counting the chunks doubles as a sanity check that the handle works
        chunk_total = collection.count()
        print(
            f"✅ Connected to existing collection: {collection_name}",
            f"📊 Total document chunks: {chunk_total:,}",
            sep="\n",
        )
        return collection

    except Exception as failure:
        print(f"❌ Failed to connect to ChromaDB collection: {failure!s}")
        print("💡 Make sure you've run the main RAG notebook first to create the collection")
        raise

# Open the collection once; later cells reuse this handle
chroma_collection = connect_to_chromadb(
    chroma_path=CHROMA_DB_PATH,
    collection_name=COLLECTION_NAME,
)

print("\n🚀 ChromaDB Ready for Evaluation!")

In [None]:
# TCS RAG Evaluation Questions
# Each row is (id, difficulty, question, expected_answer_hint); the rows are
# expanded into dicts so downstream cells can keep using key-based access.
evaluation_questions = [
    {
        "id": number,
        "difficulty": level,
        "question": prompt,
        "expected_answer_hint": hint,
    }
    for number, level, prompt, hint in (
        # Easy - direct fact extraction
        (
            1,
            "Easy",
            "What was TCS's total revenue in FY 2025?",
            "US $30 billion or ₹255,324 crore",
        ),
        (
            2,
            "Easy",
            "How many associates does TCS have globally?",
            "607,979 associates",
        ),
        (
            3,
            "Easy",
            "What was the total dividend per share for FY 2025?",
            "₹126 per share",
        ),
        # Medium - analysis and synthesis
        (
            4,
            "Medium",
            "What are TCS's main business segments or industry verticals?",
            "Banking Financial Services, Manufacturing, Communications Media, Life Sciences Healthcare Energy, etc.",
        ),
        (
            5,
            "Medium",
            "What was TCS's operating margin in FY 2025 and how does it compare to the previous year?",
            "24.3% in FY 2025, slight decline from FY 2024",
        ),
        (
            6,
            "Medium",
            "What are the key AI/GenAI initiatives mentioned in the report?",
            "TCS WisdomNext platform, AI agents, drug discovery solutions, etc.",
        ),
        (
            7,
            "Medium",
            "What major partnerships or client wins are highlighted in the report?",
            "Air New Zealand, Xerox, Primark, DNB, etc.",
        ),
        # Hard - complex analysis and synthesis
        (
            8,
            "Hard",
            "How is TCS positioning itself for the AI transformation, and what specific investments are they making?",
            "Largest AI-trained workforce, TCS WisdomNext platform, AI data centers, human+AI model, etc.",
        ),
        (
            9,
            "Hard",
            "What are the key challenges TCS faced in FY 2025 and how did they address them?",
            "Geopolitical disruption, supply chain impacts - addressed through client partnerships, technology transformation",
        ),
        (
            10,
            "Hard",
            "Based on the Chairman's letter, what are the four key progressions TCS plans for the future?",
            "1) AI agents pool, 2) Human+AI delivery, 3) AI data center investments, 4) Industry partnerships",
        ),
    )
]

print("📋 TCS RAG EVALUATION QUESTIONS READY")
print("=" * 50)
print(f"Total questions: {len(evaluation_questions)}")

# One count line per difficulty tier, in the same order as the question list
for level in ("Easy", "Medium", "Hard"):
    matching = sum(1 for item in evaluation_questions if item['difficulty'] == level)
    print(f"{level}: {matching}")
print()

# Dump every question with its expected-answer hint for a quick visual review
for item in evaluation_questions:
    print(
        f"Q{item['id']} [{item['difficulty']}]: {item['question']}",
        f"    Expected: {item['expected_answer_hint']}",
        "",
        sep="\n",
    )

In [None]:
# Smoke test: run one question through the vector store before the full run
sample_question = evaluation_questions[0]['question']
print("🧪 Testing retrieval with sample question:")
print(f"Question: {sample_question}")
print()

# Ask ChromaDB for the three closest chunks to the sample question
results = chroma_collection.query(query_texts=[sample_question], n_results=3)
retrieved_chunks = results['documents'][0]

if not retrieved_chunks:
    print("❌ No relevant chunks found")
else:
    print(f"✅ Successfully retrieved {len(retrieved_chunks)} relevant chunks:")
    for position, chunk_text in enumerate(retrieved_chunks, start=1):
        print(f"\nChunk {position}:")
        # Show at most 150 characters so the cell output stays readable
        preview = chunk_text if len(chunk_text) <= 150 else chunk_text[:150] + "..."
        print(preview)

print("\n🎯 Ready to test RAG system with these evaluation questions!")

In [None]:
# Setup Environment and Import RAG Functions
import os
from dotenv import load_dotenv
from openai import OpenAI
import time
import pandas as pd

# Load environment variables, then build the OpenAI client from OPENAI_API_KEY
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

print("✅ Environment setup complete!", "🔌 OpenAI client initialized", sep="\n")

# The RAG pipelines are imported from the tcs_rag module; an ImportError here
# is reported with a hint and then re-raised so the notebook stops.
try:
    from tcs_rag import basic_rag, query_expansion_rag, multiple_queries_rag
    from sentence_transformers import CrossEncoder

    # multiple_queries_rag takes a cross-encoder as its fourth argument
    print("🤖 Loading cross-encoder model...")
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

    print(
        "✅ Successfully imported RAG functions from tcs_rag.py!",
        "📋 Functions available:",
        "   • basic_rag(question, chroma_collection, client)",
        "   • query_expansion_rag(question, chroma_collection, client)",
        "   • multiple_queries_rag(question, chroma_collection, client, cross_encoder)",
        sep="\n",
    )

except ImportError as import_failure:
    print(f"❌ Failed to import RAG functions: {import_failure}")
    print("💡 Make sure tcs_rag.py is in the same directory")
    raise

print("\n🎯 Ready for RAG evaluation!")

In [None]:
# Remove old complex import cell - now using clean tcs_rag.py import
# NOTE(review): this cell only prints a notice and defines nothing; it looks
# like a leftover placeholder for a deleted import cell - confirm it can go.
print("🗑️  Old import method removed - now using clean tcs_rag.py imports!")

In [None]:
# Single-question dry run: push question 1 through all three RAG variants
print("🧪 SINGLE QUESTION EVALUATION TEST")
print("=" * 50)

# First entry of the question set (the first "Easy" question)
test_question = evaluation_questions[0]
question = test_question['question']
expected = test_question['expected_answer_hint']
difficulty = test_question['difficulty']

print(
    f"Question ID: {test_question['id']}",
    f"Difficulty: {difficulty}",
    f"Question: {question}",
    f"Expected: {expected}",
    "",
    sep="\n",
)

# Each entry pairs a display label with a one-argument callable, so the loop
# below does not need to know which extra arguments a given method takes.
methods = [
    ("Basic RAG", lambda text: basic_rag(text, chroma_collection, client)),
    ("Query Expansion", lambda text: query_expansion_rag(text, chroma_collection, client)),
    ("Multiple Queries", lambda text: multiple_queries_rag(text, chroma_collection, client, cross_encoder)),
]

results = []

for method_name, method_func in methods:
    print(f"🔄 Running {method_name}...")

    # The RAG helpers return a dict; 'answer' and 'runtime' are read from it
    outcome = method_func(question)
    record = {
        'Method': method_name,
        'Answer': outcome['answer'],
        'Time_seconds': outcome['runtime'],
    }
    results.append(record)

    print(f"✅ {method_name} completed in {outcome['runtime']}s")
    print()

print("📋 All three methods completed! Now running judge evaluation...")

In [None]:
# Judge LLM Evaluation
def _parse_judge_response(content):
    """
    Extract ``(score, rationale)`` from the judge model's raw text reply.

    The parser tolerates common formatting drift around the requested
    ``Score: [0 or 1]`` / ``Rationale: ...`` layout: markdown emphasis
    (``**Score:** 1``), brackets (``Score: [1]``), different letter case and
    trailing text (``Score: 1/1``). Values other than a lone 0 or 1 (for
    example ``10`` or ``0.5``) are rejected.

    Args:
        content: Full text returned by the judge.

    Returns:
        Tuple ``(score, rationale)``. ``score`` is 0, 1, or ``None`` when no
        usable score line is present. ``rationale`` is everything after the
        ``Rationale:`` label (may span several lines), or ``""`` if absent.
    """
    import re

    flags = re.IGNORECASE | re.MULTILINE

    score = None
    score_match = re.search(
        r"^[\s*_#>-]*Score[\s*_]*:[\s*_\[]*([01])(?!\.?\d)",
        content,
        flags=flags,
    )
    if score_match:
        score = int(score_match.group(1))

    rationale = ""
    # DOTALL keeps a multi-line explanation intact instead of cutting it at
    # the first newline.
    rationale_match = re.search(
        r"^[\s*_#>-]*Rationale[\s*_]*:[\s*_]*(.*)",
        content,
        flags=flags | re.DOTALL,
    )
    if rationale_match:
        rationale = rationale_match.group(1).strip()

    return score, rationale


def judge_answer(question, expected_answer, actual_answer):
    """
    Use GPT-4.1 as judge to evaluate an answer.

    Args:
        question: The evaluation question that was asked.
        expected_answer: Reference hint describing the correct answer.
        actual_answer: The answer produced by the RAG method under test.

    Returns:
        Tuple ``(score, rationale)``. ``score`` is 1 (correct), 0 (incorrect)
        or ``None`` when the judge's reply contains no usable score or the
        API call failed. On failure ``rationale`` is ``"Error: <message>"``.
        This function never raises for API errors; callers inspect ``score``.
    """
    judge_prompt = f"""You are evaluating a RAG system answer about the TCS Annual Report.

Question: {question}
Expected Answer: {expected_answer}
Actual Answer: {actual_answer}

Evaluate this answer:
- Score 1 if factually correct and aligns with expected answer
- Score 0 if incorrect, inaccurate, or doesn't answer the question

Format your response exactly as:
Score: [0 or 1]
Rationale: [brief explanation]"""

    try:
        response = client.responses.create(
            model="gpt-4.1",
            input=judge_prompt
        )

        # Fall back to the repr when the response object has no output_text
        content = response.output_text if hasattr(response, 'output_text') else str(response)

        return _parse_judge_response(content)

    except Exception as e:
        # Deliberate best-effort: one failed judge call must not abort a
        # whole evaluation run, so report it through the return value.
        return None, f"Error: {str(e)}"

# Score every answer from the single-question run and attach the verdict
print("🤖 Judge LLM Evaluation")
print("-" * 30)

for result in results:
    method_name = result['Method']
    answer = result['Answer']

    print(f"\nEvaluating {method_name}...")

    score, rationale = judge_answer(question, expected, answer)

    # Mutating the dict in place updates the entry stored in `results`
    result.update(Score=score, Rationale=rationale)

    print(f"Score: {score}", f"Rationale: {rationale}", sep="\n")

print("\n✅ Judge evaluation complete!")

In [None]:
# Tabulate the single-question run and print a short report
print("📊 EVALUATION RESULTS")
print("=" * 60)

df_results = pd.DataFrame(results)

print(f"Question: {question}", f"Expected: {expected}", "", sep="\n")

# Compact view: one row per method, without the long answer text
summary_df = df_results[['Method', 'Score', 'Time_seconds']].copy()
print("📋 Summary:")
print(summary_df.to_string(index=False))
print()

print("📝 Detailed Results:")
print()

divider = "=" * 50
for _, row in df_results.iterrows():
    print(
        divider,
        f"Method: {row['Method']}",
        f"Score: {row['Score']} | Time: {row['Time_seconds']}s",
        "\nAnswer:",
        row['Answer'],
        "\nJudge Rationale:",
        row['Rationale'],
        "",
        sep="\n",
    )

# Headline numbers for this one question
correct_total = df_results['Score'].sum()
print("\n🏆 RESULTS SUMMARY:")
print(f"Correct answers: {correct_total}/{len(df_results)}")
print(f"Average time: {df_results['Time_seconds'].mean():.1f}s")
if correct_total > 0:
    best_methods = df_results.loc[df_results['Score'] == 1, 'Method'].tolist()
    print(f"Best performing methods: {', '.join(best_methods)}")

print("\n🎯 Single question test complete! Ready to scale to all 10 questions.")

In [None]:
# Full Evaluation: every question in evaluation_questions across all RAG methods
# Totals are derived from the data so the progress output stays correct if
# questions or methods are added or removed.
question_count = len(evaluation_questions)
print(f"🚀 FULL EVALUATION: ALL {question_count} QUESTIONS")
print("=" * 60)

# One flat record per (question, method) pair; turned into a DataFrame later
all_results = []

# Each entry pairs a display label with a one-argument callable
methods = [
    ("Basic RAG", lambda q: basic_rag(q, chroma_collection, client)),
    ("Query Expansion", lambda q: query_expansion_rag(q, chroma_collection, client)),
    ("Multiple Queries", lambda q: multiple_queries_rag(q, chroma_collection, client, cross_encoder))
]

for q_num, test_question in enumerate(evaluation_questions, 1):
    question = test_question['question']
    expected = test_question['expected_answer_hint']
    difficulty = test_question['difficulty']
    question_id = test_question['id']

    print(f"\n{'='*70}")
    print(f"QUESTION {q_num}/{question_count} [ID: {question_id}] - {difficulty.upper()}")
    print(f"{'='*70}")
    print(f"Q: {question}")
    print(f"Expected: {expected}")
    print()

    # Test each RAG method for this question
    for method_name, method_func in methods:
        print(f"🔄 Running {method_name}...")

        # Fields shared by the success and the error record; the key order
        # here fixes the column order of the exported CSV.
        record = {
            'Question_ID': question_id,
            'Question_Number': q_num,
            'Difficulty': difficulty,
            'Question': question,
            'Expected_Answer': expected,
            'Method': method_name,
        }

        try:
            # The RAG helpers return a dict with 'answer' and 'runtime'
            result = method_func(question)
            answer = result['answer']
            runtime = result['runtime']

            print(f"✅ {method_name} completed in {runtime}s")

            # Judge evaluation
            # NOTE(review): judge_answer returns score=None when the judge
            # call fails, whereas a RAG failure below is stored as 0 -
            # confirm this asymmetry is intended for the accuracy stats.
            print(f"🤖 Judging {method_name}...")
            score, rationale = judge_answer(question, expected, answer)

            record.update({
                'Answer': answer,
                'Time_seconds': runtime,
                'Score': score,
                'Judge_Rationale': rationale,
            })
            all_results.append(record)

            print(f"📊 Score: {score} | {rationale[:100]}...")

        except Exception as e:
            # Keep going: a failing method is recorded as an incorrect
            # answer so the run still yields one row per pair.
            print(f"❌ Error with {method_name}: {str(e)}")
            record.update({
                'Answer': f"ERROR: {str(e)}",
                'Time_seconds': 0,
                'Score': 0,
                'Judge_Rationale': f"Error occurred: {str(e)}",
            })
            all_results.append(record)

    print(f"✅ Question {q_num} complete!")

print(f"\n🎉 ALL EVALUATIONS COMPLETE!")
print(f"Total evaluations: {len(all_results)}")
print(
    f"Expected: {question_count * len(methods)} "
    f"({question_count} questions × {len(methods)} methods)"
)

In [None]:
# Create Comprehensive Results DataFrame and Export to CSV
print("📊 CREATING COMPREHENSIVE RESULTS DATAFRAME")
print("=" * 50)

# One row per (question, method) evaluation collected by the full run
df_comprehensive = pd.DataFrame(all_results)

# Display basic info
print(f"📋 Total evaluations: {len(df_comprehensive)}")
print(f"📋 Questions evaluated: {df_comprehensive['Question_ID'].nunique()}")
print(f"📋 Methods tested: {df_comprehensive['Method'].nunique()}")
print()

# Show column info
print("📋 DataFrame columns:")
for col in df_comprehensive.columns:
    print(f"   • {col}")
print()

# Quick summary stats
print("📊 QUICK SUMMARY:")
print(f"Overall success rate: {df_comprehensive['Score'].mean():.1%}")
print(f"Average response time: {df_comprehensive['Time_seconds'].mean():.1f}s")
print()

# Method performance summary: mean/sum/count of Score plus mean runtime
method_summary = df_comprehensive.groupby('Method').agg({
    'Score': ['mean', 'sum', 'count'],
    'Time_seconds': 'mean'
}).round(3)

# Flatten the two-level column index produced by the multi-aggregation
method_summary.columns = ['Success_Rate', 'Correct_Answers', 'Total_Questions', 'Avg_Time_seconds']
print("📋 Method Performance Summary:")
print(method_summary)
print()

# Difficulty analysis
difficulty_summary = df_comprehensive.groupby('Difficulty').agg({
    'Score': ['mean', 'sum', 'count']
}).round(3)
difficulty_summary.columns = ['Success_Rate', 'Correct_Answers', 'Total_Attempts']
print("📋 Performance by Difficulty:")
print(difficulty_summary)
print()

# Generate timestamp for unique filename
from datetime import datetime
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_filename = f"tcs_rag_evaluation_{timestamp}.csv"

# Export to CSV
df_comprehensive.to_csv(csv_filename, index=False)
print(f"💾 Results exported to: {csv_filename}")

# Display first few rows to verify
# A single backslash makes "\n" a real newline; the doubled "\\n" printed
# the two characters backslash + n in the output.
print("\n📋 Sample of exported data:")
print(df_comprehensive[['Question_ID', 'Method', 'Score', 'Time_seconds']].head(10))

print("\n✅ Comprehensive evaluation complete!")
print(f"📁 Full results saved to: {csv_filename}")
print(f"📊 {len(df_comprehensive)} total evaluations across {len(evaluation_questions)} questions and {len(methods)} methods")

In [None]:
# 🏆 FINAL SCORECARD: ALL METHODS ACROSS ALL QUESTIONS
print("🏆 FINAL SCORECARD: TCS RAG EVALUATION RESULTS")
print("=" * 60)

# Overall Performance Summary
print("📊 OVERALL PERFORMANCE SUMMARY")
print("-" * 40)

# Per-method totals: correct answers, attempts, accuracy and mean runtime
final_summary = df_comprehensive.groupby('Method').agg({
    'Score': ['sum', 'count', 'mean'],
    'Time_seconds': 'mean'
}).round(3)

# Flatten the two-level column index produced by the multi-aggregation
final_summary.columns = ['Correct_Answers', 'Total_Questions', 'Accuracy_Rate', 'Avg_Time_seconds']
final_summary['Accuracy_Percentage'] = (final_summary['Accuracy_Rate'] * 100).round(1)

print(final_summary[['Correct_Answers', 'Total_Questions', 'Accuracy_Percentage', 'Avg_Time_seconds']])
print()

# Performance by Difficulty Level
print("📈 PERFORMANCE BY DIFFICULTY LEVEL")
print("-" * 40)

difficulty_breakdown = df_comprehensive.pivot_table(
    index='Method',
    columns='Difficulty',
    values='Score',
    aggfunc=['sum', 'count', 'mean']
)

# Show correct answers out of total for each difficulty
for difficulty in ['Easy', 'Medium', 'Hard']:
    if difficulty in difficulty_breakdown['sum'].columns:
        print(f"\n{difficulty} Questions:")
        for method in difficulty_breakdown.index:
            correct = difficulty_breakdown['sum'][difficulty].loc[method]
            total = difficulty_breakdown['count'][difficulty].loc[method]
            percentage = (correct/total * 100) if total > 0 else 0
            print(f"  {method}: {correct}/{total} ({percentage:.1f}%)")

print()

# Winner Analysis
print("🥇 WINNER ANALYSIS")
print("-" * 40)

# Overall winner: the row with the highest accuracy rate
best_overall = final_summary.loc[final_summary['Accuracy_Rate'].idxmax()]
print(f"🏆 Overall Winner: {best_overall.name}")
print(f"   Score: {best_overall['Correct_Answers']:.0f}/{best_overall['Total_Questions']:.0f} ({best_overall['Accuracy_Percentage']:.1f}%)")
print(f"   Avg Time: {best_overall['Avg_Time_seconds']:.1f}s")

# Fastest method: the row with the lowest mean runtime
fastest_method = final_summary.loc[final_summary['Avg_Time_seconds'].idxmin()]
print(f"\n⚡ Fastest Method: {fastest_method.name}")
print(f"   Avg Time: {fastest_method['Avg_Time_seconds']:.1f}s")
print(f"   Accuracy: {fastest_method['Accuracy_Percentage']:.1f}%")

print()

# Key Insights
print("💡 KEY INSIGHTS")
print("-" * 40)

total_questions = len(evaluation_questions)
total_methods = len(methods)

print(f"• Evaluated {total_methods} RAG methods across {total_questions} questions")
print(f"• Overall success rate: {df_comprehensive['Score'].mean():.1%}")
print(f"• Best performing method: {best_overall.name} ({best_overall['Accuracy_Percentage']:.1f}% accuracy)")

# Only describe a trade-off when two different methods win the two
# categories; otherwise the "X is fastest but X is most accurate" sentence
# would contradict itself.
if fastest_method.name == best_overall.name:
    print(f"• Speed vs Accuracy: {best_overall.name} is both the fastest and the most accurate method")
else:
    print(f"• Speed vs Accuracy trade-off: {fastest_method.name} is fastest but {best_overall.name} is most accurate")

# Check if any method got perfect score
perfect_methods = final_summary[final_summary['Accuracy_Rate'] == 1.0]
if not perfect_methods.empty:
    print(f"• Perfect score achieved by: {', '.join(perfect_methods.index)}")

# Difficulty insights
easy_avg = df_comprehensive[df_comprehensive['Difficulty'] == 'Easy']['Score'].mean()
medium_avg = df_comprehensive[df_comprehensive['Difficulty'] == 'Medium']['Score'].mean()
hard_avg = df_comprehensive[df_comprehensive['Difficulty'] == 'Hard']['Score'].mean()

# Use ">" only when the measured accuracies really are strictly decreasing;
# otherwise list the tiers with a neutral separator instead of asserting an
# ordering the data does not support.
tier_separator = " > " if easy_avg > medium_avg > hard_avg else " | "
tier_labels = [
    f"Easy ({easy_avg:.1%})",
    f"Medium ({medium_avg:.1%})",
    f"Hard ({hard_avg:.1%})",
]
print(f"• Difficulty progression: {tier_separator.join(tier_labels)}")

print("\n✅ EVALUATION COMPLETE!")
print(f"📁 Detailed results exported to: {csv_filename}")
print("🎯 Use these insights to improve your RAG system!")