# TCS Annual Report RAG System

Learn how embeddings work by building a simple question-answering system over the TCS Annual Report.

This notebook follows a step-by-step approach to understand how embeddings work in retrieval-augmented generation (RAG).

In [None]:
# Step 1: Helper function for readable text display
def word_wrap(text, width=80):
    """
    Re-flow text so that each output line is at most `width` characters.

    The text is split on any whitespace (so existing line breaks are
    discarded) and words are packed greedily onto lines joined by single
    spaces. A word longer than `width` is never broken; it is placed on a
    line of its own.

    Returns the wrapped lines joined with newline characters.
    """
    wrapped = []
    pending = []        # words collected for the line currently being built
    pending_chars = 0   # total characters in `pending`, not counting spaces

    for token in text.split():
        # Length of the line if `token` were appended: the characters so far,
        # the new word, and one separating space per word already on the line.
        projected = pending_chars + len(token) + len(pending)

        if projected <= width:
            pending.append(token)
            pending_chars += len(token)
            continue

        if not pending:
            # A single word wider than the limit: emit it unbroken.
            wrapped.append(token)
            pending_chars = 0
            continue

        # Line is full: flush it and start the next one with this word.
        wrapped.append(' '.join(pending))
        pending = [token]
        pending_chars = len(token)

    if pending:
        wrapped.append(' '.join(pending))

    return '\n'.join(wrapped)

# Smoke test: wrap a sentence that is longer than the default 80-column width
test_text = (
    "This is a very long sentence that we will use to test our word wrapping "
    "function to make sure it works correctly and makes text readable."
)
print(word_wrap(test_text))

In [None]:
# Step 2: PDF Reading - Extract text from TCS Annual Report
from pypdf import PdfReader

# Load the PDF and extract the text of every page
reader = PdfReader("TCS_Annual_Report.pdf")

# Keep only pages that yield text. `or ""` is defensive: it tolerates a page
# whose extraction result is None instead of crashing on .strip().
pdf_texts = [
    page_text
    for page_text in ((p.extract_text() or "").strip() for p in reader.pages)
    if page_text
]

# Fail early with a readable message rather than an IndexError on pdf_texts[0]
assert pdf_texts, (
    "No extractable text found in TCS_Annual_Report.pdf "
    "(is it a scanned / image-only PDF?)"
)

print(f"Total pages with content: {len(pdf_texts)}")
print("\nFirst page content:")
print("=" * 50)
print(word_wrap(pdf_texts[0]))

In [None]:
# Step 3: Character Chunking - Split into 1000-character chunks with overlap
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Character splitter with a 50-character overlap between neighbouring chunks.
# Separators are listed from coarsest (paragraph break) to finest (any character).
character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=50,  # overlap preserves context across chunk boundaries
)

# Join all pages into one document and split it into character chunks
character_split_texts = character_splitter.split_text('\n\n'.join(pdf_texts))

# Guard the sample display: a short document may yield fewer than 11 chunks,
# in which case indexing [10] directly would raise IndexError.
assert character_split_texts, "Character splitter produced no chunks - is pdf_texts empty?"
sample_index = min(10, len(character_split_texts) - 1)

print(f"Sample chunk (index {sample_index}):")
print("=" * 40)
print(word_wrap(character_split_texts[sample_index]))
print(f"\nTotal character chunks: {len(character_split_texts)}")
print(f"First chunk length: {len(character_split_texts[0])} characters")
print(f"Last chunk length: {len(character_split_texts[-1])} characters")

In [None]:
# Step 4: Token Chunking - Further split into 256-token chunks with overlap
from langchain.text_splitter import SentenceTransformersTokenTextSplitter

# Token splitter: at most 256 tokens per chunk with a 20-token overlap
token_splitter = SentenceTransformersTokenTextSplitter(
    chunk_overlap=20,  # overlap preserves context across chunk boundaries
    tokens_per_chunk=256,
)

# Split each character chunk further into token chunks
token_split_texts = []
for text in character_split_texts:
    token_split_texts.extend(token_splitter.split_text(text))

# Guard the sample display and the ratio below: with no chunks, [10] would
# raise IndexError and the ratio would divide by zero.
assert token_split_texts, "Token splitter produced no chunks - is character_split_texts empty?"
sample_index = min(10, len(token_split_texts) - 1)

print(f"Sample token chunk (index {sample_index}):")
print("=" * 40)
print(word_wrap(token_split_texts[sample_index]))
print(f"\nTotal token chunks: {len(token_split_texts)}")

# Compare the two chunking stages
print(f"Character chunks: {len(character_split_texts)}")
print(f"Token chunks: {len(token_split_texts)}")
print(f"Ratio (token/char chunks): {len(token_split_texts)/len(character_split_texts):.1f}")

In [None]:
# Step 5: Embedding Generation - Convert text chunks to numerical vectors
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# numpy is only needed here to measure the length of the sample vector
import numpy as np

# Create embedding function (uses the default sentence-transformers model)
embedding_function = SentenceTransformerEmbeddingFunction()

# Embed a single chunk to see what an embedding looks like. The index is
# clamped so a short document (fewer than 11 chunks) does not raise IndexError.
assert token_split_texts, "No token chunks available to embed"
sample_index = min(10, len(token_split_texts) - 1)
sample_embedding = embedding_function([token_split_texts[sample_index]])

print("Sample embedding (first 10 values):")
print(sample_embedding[0][:10])
print(f"\nEmbedding dimensions: {len(sample_embedding[0])}")
print(f"Data type: {type(sample_embedding[0][0])}")

# Quick check - the Euclidean norm should be ~1.0 if the model emits
# unit-length vectors (NOTE(review): depends on the model; confirm)
magnitude = np.linalg.norm(sample_embedding[0])
print(f"Vector magnitude (should be ~1.0): {magnitude:.3f}")

In [12]:
# Step 6: ChromaDB Setup - Create collection and store all document chunks
# Use persistent storage in the repo directory
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# get_or_create_collection replaces the previous try / bare-except around
# get_collection, which hid every failure (including KeyboardInterrupt) behind
# a "create" attempt.
chroma_collection = chroma_client.get_or_create_collection(
    name="tcs_annual_report_2024",
    embedding_function=embedding_function,
)

# Decide by content, not by existence: a collection left empty by an
# interrupted earlier run must still be populated.
existing_count = chroma_collection.count()
skip_adding = existing_count > 0

if skip_adding:
    print("📁 Using existing collection from disk")
    if existing_count != len(token_split_texts):
        # The persisted data does not match this run's chunking; do not
        # silently pretend it does.
        print(
            f"⚠️ Collection holds {existing_count} documents but this run produced "
            f"{len(token_split_texts)} chunks - delete ./chroma_db to rebuild it."
        )
else:
    print("📁 Collection is new or empty - populating it")

    # Create IDs for each chunk (simple sequential numbering)
    ids = [str(i) for i in range(len(token_split_texts))]

    # Add all chunks to the collection (this generates embeddings for all chunks)
    print(f"Adding {len(token_split_texts)} chunks to ChromaDB...")
    chroma_collection.add(ids=ids, documents=token_split_texts)

# Verify the collection
count = chroma_collection.count()
print("✅ Collection ready!")
print(f"Total documents in collection: {count}")
print(f"Collection name: {chroma_collection.name}")
print("Storage location: ./chroma_db/")

📁 Using existing collection from disk
✅ Collection ready!
Total documents in collection: 1324
Collection name: tcs_annual_report_2024
Storage location: ./chroma_db/


In [13]:
# Step 7: Environment Setup & Retrieval Testing
import os
from dotenv import load_dotenv
from openai import OpenAI

# Read variables from a local .env file into the process environment
load_dotenv()

# Build the OpenAI client from the OPENAI_API_KEY environment variable
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def test_retrieval(query, n_results=3):
    """
    Print the chunks ChromaDB returns for `query` and hand them back.

    Queries the module-level `chroma_collection` for the `n_results` nearest
    chunks, prints each one word-wrapped, and returns the list of chunk
    texts (an empty list when nothing was retrieved).
    """
    print(f"🔍 Query: '{query}'")
    print("=" * 60)

    # A single query was sent, so the matches for it sit at position 0
    response = chroma_collection.query(query_texts=[query], n_results=n_results)
    matches = response['documents'][0]

    if not matches:
        print("❌ No relevant chunks found!")
        return []

    print(f"📄 Found {len(matches)} relevant chunks:")
    print()

    # Show every retrieved chunk with a 1-based label
    for position, chunk_text in enumerate(matches, start=1):
        print(f"--- Chunk {position} ---")
        print(word_wrap(chunk_text))
        print()

    return matches

# Run one sample query through the retrieval helper
print("🧪 Testing semantic search retrieval:", end="\n\n")
chunks = test_retrieval("What is TCS's revenue?")

🧪 Testing semantic search retrieval:

🔍 Query: 'What is TCS's revenue?'
📄 Found 3 relevant chunks:

--- Chunk 1 ---
( in fy 2021 ) and settlement ( in fy 2024 ) of legal claim tcs has consistently
grown its earnings per share ( eps ), achieving a cagr of 9. 3 % over the past
five financial years. this steady increase highlights the company ’ s growing
earnings and its commitment to delivering long - term value to shareholders.
earnings per share

--- Chunk 2 ---
capabilities in independent analyst reports. tcs has been ranked as a leader in
analyst competitive surveys across multiple firms covering areas such as ai,
genai, analytics, data and automation. 1includes multiple investors in group
meetings growth 12. 0 % $ 20. 2 $ 22. 7 fy 2021 fy 2025 fy 2021 fy 2025 228 298
130 64 101 48 us $ 100mn + us $ 50mn + us $ 20mn + 148 182 fy 2024 fy 2025 tcs '
global innovation network includes 11 pace ports and studios, fostering
collaboration on cutting - edge solutions through cxo discussions,

In [None]:
# Step 8: Approach 1 - Basic RAG Implementation
import time

def basic_rag(question, n_results=5, model="gpt-4.1"):
    """
    Basic RAG: retrieve the most similar chunks and let the LLM answer from them.

    Args:
        question: Natural-language question about the TCS Annual Report.
        n_results: Number of chunks to retrieve from ChromaDB (default 5).
        model: OpenAI model name used to generate the answer (default "gpt-4.1").

    Returns:
        {"answer": str, "runtime": float} where runtime is the elapsed time in
        seconds rounded to 2 decimals. OpenAI failures are not raised; they are
        reported in "answer" as "Error calling OpenAI API: ...".
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    def elapsed():
        return round(time.perf_counter() - start_time, 2)

    # Retrieve the most relevant chunks (one query, so results sit at index 0)
    results = chroma_collection.query(query_texts=[question], n_results=n_results)
    documents = results['documents'][0]

    if not documents:
        return {
            "answer": "No relevant information found in TCS report.",
            "runtime": elapsed()
        }

    # Combine chunks into context
    context = "\n\n".join(documents)

    # Generate the answer; API errors are reported in the result, not raised
    try:
        response = client.responses.create(
            model=model,
            input=f"""Based on the following excerpts from the TCS Annual Report, please answer this question: {question}

Context from TCS Annual Report:
{context}

Please provide a clear, accurate answer based only on the information provided above. If the context doesn't contain enough information to fully answer the question, please say so."""
        )

        answer = response.output_text if hasattr(response, 'output_text') else str(response)

        return {
            "answer": answer.strip(),
            "runtime": elapsed()
        }

    except Exception as e:
        return {
            "answer": f"Error calling OpenAI API: {str(e)}",
            "runtime": elapsed()
        }

# Confirm the definition ran and recap the function's contract
print(
    "✅ Basic RAG function implemented!",
    "📋 Function signature: basic_rag(question)",
    "📤 Returns: {'answer': str, 'runtime': float}",
    sep="\n",
)

In [None]:
# Step 9: Test Basic RAG Function
# Run the standard question through basic_rag and show runtime and answer

test_question = "What are TCS's main business segments?"
print(f"🧪 Testing Basic RAG with: '{test_question}'", "=" * 60, sep="\n")

result = basic_rag(test_question)

print(
    "📊 Result:",
    f"⏱️  Runtime: {result['runtime']}s",
    "💡 Answer:",
    "-" * 40,
    word_wrap(result['answer']),
    "",
    "✅ Basic RAG test complete!",
    sep="\n",
)

In [None]:
# Approaches 2 & 3
# With Basic RAG tested and working, the remaining approaches are:
# - Approach 2: Query Expansion (HyDE) - implemented in this cell
# - Approach 3: Multiple Queries + Cross-Encoder - implemented in Step 12

# Step 10: Approach 2 - Query Expansion (HyDE) Implementation

def query_expansion_rag(question, n_results=5, model="gpt-4.1"):
    """
    Query Expansion RAG: retrieve with the question plus an LLM-written
    hypothetical answer, then answer from the retrieved chunks only.

    Args:
        question: Natural-language question about the TCS Annual Report.
        n_results: Number of chunks to retrieve from ChromaDB (default 5).
        model: OpenAI model name used for both LLM calls (default "gpt-4.1").

    Returns:
        {"answer": str, "runtime": float, "hypothetical_answer": str}.
        runtime is elapsed seconds rounded to 2 decimals. hypothetical_answer
        is "" when its generation failed; a warning is issued and retrieval
        falls back to the plain question. Failures of the final OpenAI call
        are reported in "answer" rather than raised.
    """
    import warnings  # local import: only needed for the fallback warning

    # perf_counter is monotonic, so the duration is immune to clock adjustments
    start_time = time.perf_counter()

    def elapsed():
        return round(time.perf_counter() - start_time, 2)

    # Step 1: Generate hypothetical answer
    try:
        hyp_response = client.responses.create(
            model=model,
            input=f"""You are a helpful expert financial research assistant. Provide an example answer to the given question, that might be found in a document like an annual report.

Question: {question}

Generate a realistic, detailed answer that would typically appear in an annual report:"""
        )

        hypothetical_answer = hyp_response.output_text if hasattr(hyp_response, 'output_text') else str(hyp_response)
        hypothetical_answer = hypothetical_answer.strip()

    except Exception as e:
        # Best-effort step: keep going with the original question, but make the
        # degradation visible instead of silently behaving like basic RAG.
        warnings.warn(f"Hypothetical answer generation failed, using the question alone: {e}")
        hypothetical_answer = ""

    # Step 2: Create expanded query (original + hypothetical)
    expanded_query = f"{question} {hypothetical_answer}" if hypothetical_answer else question

    # Step 3: Retrieve chunks using the expanded query
    results = chroma_collection.query(query_texts=[expanded_query], n_results=n_results)
    documents = results['documents'][0]

    if not documents:
        return {
            "answer": "No relevant information found in TCS report.",
            "runtime": elapsed(),
            "hypothetical_answer": hypothetical_answer
        }

    # Step 4: Generate final answer using ONLY retrieved context (not hypothetical)
    context = "\n\n".join(documents)

    try:
        response = client.responses.create(
            model=model,
            input=f"""Based on the following excerpts from the TCS Annual Report, please answer this question: {question}

Context from TCS Annual Report:
{context}

Please provide a clear, accurate answer based only on the information provided above. If the context doesn't contain enough information to fully answer the question, please say so."""
        )

        answer = response.output_text if hasattr(response, 'output_text') else str(response)

        return {
            "answer": answer.strip(),
            "runtime": elapsed(),
            "hypothetical_answer": hypothetical_answer
        }

    except Exception as e:
        return {
            "answer": f"Error calling OpenAI API: {str(e)}",
            "runtime": elapsed(),
            "hypothetical_answer": hypothetical_answer
        }

# Confirm the definition ran and recap the function's contract
print(
    "✅ Query Expansion RAG function implemented!",
    "📋 Function signature: query_expansion_rag(question)",
    "📤 Returns: {'answer': str, 'runtime': float, 'hypothetical_answer': str}",
    sep="\n",
)

In [None]:
# Step 11: Test Query Expansion RAG Function
# Same question as the Basic RAG test so the two answers can be compared

test_question = "What are TCS's main business segments?"
print(f"🧪 Testing Query Expansion RAG with: '{test_question}'", "=" * 60, sep="\n")

result = query_expansion_rag(test_question)

print(
    "📊 Result:",
    f"⏱️  Runtime: {result['runtime']}s",
    "🤖 Hypothetical Answer:",
    "-" * 40,
    word_wrap(result['hypothetical_answer']),
    "",
    "💡 Final Answer:",
    "-" * 40,
    word_wrap(result['answer']),
    "",
    "✅ Query Expansion RAG test complete!",
    sep="\n",
)

In [None]:
# Step 12: Approach 3 - Multiple Queries + Cross-Encoder Implementation

# Import cross-encoder for re-ranking
from sentence_transformers import CrossEncoder
import numpy as np

# Initialize the cross-encoder (same model as L4-student.md). It is created
# once at module level so multiple_queries_rag() can reuse it on every call
# to score (question, chunk) pairs when re-ranking retrieved chunks.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

def multiple_queries_rag(question, n_queries=5, n_candidates=10, top_k=5, model="gpt-4.1"):
    """
    Multiple Queries RAG: expand the question into related questions, retrieve
    chunks for all of them, re-rank with the cross-encoder, answer from the best.

    Args:
        question: Natural-language question about the TCS Annual Report.
        n_queries: Maximum number of related questions to generate (default 5).
        n_candidates: Chunks retrieved per query before re-ranking (default 10).
        top_k: Number of re-ranked chunks passed to the LLM (default 5).
        model: OpenAI model name used for both LLM calls (default "gpt-4.1").

    Returns:
        {"answer": str, "runtime": float, "generated_queries": list}.
        runtime is elapsed seconds rounded to 2 decimals. generated_queries
        holds the distinct related questions actually used (0 to n_queries
        entries; it is no longer padded with duplicates). Failures of the
        final OpenAI call are reported in "answer" rather than raised.
    """
    import warnings  # local import: only needed for best-effort fallbacks

    # perf_counter is monotonic, so the duration is immune to clock adjustments
    start_time = time.perf_counter()

    def elapsed():
        return round(time.perf_counter() - start_time, 2)

    # Step 1: Generate related questions
    try:
        queries_response = client.responses.create(
            model=model,
            input=f"""You are a helpful expert financial research assistant. Your users are asking questions about the TCS Annual Report.

Suggest up to {n_queries} additional related questions to help them find the information they need, for the provided question.
Suggest only short questions without compound sentences. Suggest a variety of questions that cover different aspects of the topic.
Make sure they are complete questions, and that they are related to the original question.
Output one question per line. Do not number the questions.

Original question: {question}

Generate {n_queries} related questions:"""
        )

        content = queries_response.output_text if hasattr(queries_response, 'output_text') else str(queries_response)
        stripped_lines = (line.strip() for line in content.split("\n"))

        # dict.fromkeys removes repeats while keeping first-seen order. Padding
        # the list with copies of one question (as before) only triggered
        # identical retrievals whose results were deduplicated away again.
        generated_queries = [
            q for q in dict.fromkeys(stripped_lines) if q and q != question
        ][:n_queries]

    except Exception as e:
        # Best-effort step: continue with the original question only
        warnings.warn(f"Related-question generation failed, using the question alone: {e}")
        generated_queries = []

    # Step 2: Create list of all queries (original + related)
    all_queries = [question] + generated_queries

    # Step 3: Retrieve candidates for every query in one batched call;
    # 'documents' then holds one list of chunks per query.
    try:
        results = chroma_collection.query(query_texts=all_queries, n_results=n_candidates)
        documents_per_query = results['documents'] or []
    except Exception as e:
        warnings.warn(f"Chunk retrieval failed: {e}")
        documents_per_query = []

    # Step 4: Deduplicate chunks by exact text, preserving retrieval order
    unique_chunks = list(dict.fromkeys(
        chunk
        for documents in documents_per_query
        for chunk in (documents or [])
    ))

    if not unique_chunks:
        return {
            "answer": "No relevant information found in TCS report.",
            "runtime": elapsed(),
            "generated_queries": generated_queries
        }

    # Step 5: Cross-encoder re-ranking - score all chunks against the ORIGINAL question
    pairs = [[question, chunk] for chunk in unique_chunks]
    scores = cross_encoder.predict(pairs)

    # Step 6: Keep the top_k highest scoring chunks (argsort ascends, so reverse)
    top_indices = np.argsort(scores)[::-1][:top_k]
    top_chunks = [unique_chunks[i] for i in top_indices]

    # Step 7: Generate final answer using the re-ranked chunks
    context = "\n\n".join(top_chunks)

    try:
        response = client.responses.create(
            model=model,
            input=f"""Based on the following excerpts from the TCS Annual Report, please answer this question: {question}

Context from TCS Annual Report:
{context}

Please provide a clear, accurate answer based only on the information provided above. If the context doesn't contain enough information to fully answer the question, please say so."""
        )

        answer = response.output_text if hasattr(response, 'output_text') else str(response)

        return {
            "answer": answer.strip(),
            "runtime": elapsed(),
            "generated_queries": generated_queries
        }

    except Exception as e:
        return {
            "answer": f"Error calling OpenAI API: {str(e)}",
            "runtime": elapsed(),
            "generated_queries": generated_queries
        }

# Confirm the definition ran and recap the function's contract
print(
    "✅ Multiple Queries + Cross-Encoder RAG function implemented!",
    "📋 Function signature: multiple_queries_rag(question)",
    "📤 Returns: {'answer': str, 'runtime': float, 'generated_queries': list}",
    "🎯 Uses cross-encoder re-ranking for better chunk selection!",
    sep="\n",
)

In [None]:
# Step 13: Test Multiple Queries + Cross-Encoder RAG Function
# Same question as the earlier tests so all three approaches can be compared

test_question = "What are TCS's main business segments?"
print(
    f"🧪 Testing Multiple Queries + Cross-Encoder RAG with: '{test_question}'",
    "=" * 60,
    sep="\n",
)

result = multiple_queries_rag(test_question)

print(
    "📊 Result:",
    f"⏱️  Runtime: {result['runtime']}s",
    "🔍 Generated Related Queries:",
    "-" * 40,
    sep="\n",
)
# List the related questions with 1-based numbering
for i, query in enumerate(result['generated_queries'], start=1):
    print(f"  {i}. {query}")
print(
    "",
    "💡 Final Answer (Cross-Encoder Re-ranked):",
    "-" * 40,
    word_wrap(result['answer']),
    "",
    "✅ Multiple Queries + Cross-Encoder RAG test complete!",
    sep="\n",
)

In [None]:
# Step 18: Quick Test - Compare All Three Approaches
# Test the three RAG approaches side by side on a sample question

def compare_all_three_approaches(question):
    """
    Run all three RAG approaches on a single question for comparison.

    Calls basic_rag, query_expansion_rag and multiple_queries_rag (the
    functions defined earlier in this notebook) in turn, printing each
    answer and its runtime under a section header.

    Returns:
        (original_answer, expansion_answer, multiple_answer) - the three
        answer strings. If an approach raises, its entry is "Error: <message>"
        and the remaining approaches still run.
    """
    # (section header, label used in status lines, function to call).
    # The previous version called ask_tcs_report* helpers that do not exist in
    # this notebook, so every approach "failed" with a NameError.
    approaches = [
        ("🔵 APPROACH 1: ORIGINAL RAG", "Original RAG", basic_rag),
        ("🟠 APPROACH 2: QUERY EXPANSION (HYPOTHETICAL ANSWER)", "Query Expansion (HyDE)", query_expansion_rag),
        ("🟢 APPROACH 3: MULTIPLE QUERY EXPANSION", "Multiple Query Expansion", multiple_queries_rag),
    ]

    print(f"🔍 TEST QUESTION: {question}")
    print("=" * 80)
    print()

    answers = []
    for position, (header, label, rag_function) in enumerate(approaches):
        # Two blank lines separate sections; real newlines, not the literal
        # "\\n" text the old double-escaped strings printed.
        print(("\n\n" if position else "") + header)
        print("-" * 40)
        try:
            result = rag_function(question)
            answer = result["answer"]
            print(word_wrap(answer))
            print(f"⏱️  Runtime: {result['runtime']}s")
            print(f"✅ {label} completed")
        except Exception as e:
            answer = f"Error: {str(e)}"
            print(f"❌ {label} failed: {str(e)}")
        answers.append(answer)

    print("\n" + "🔹" * 80)
    print("✅ All three approaches tested!")
    print("🔹" * 80)

    original_answer, expansion_answer, multiple_answer = answers
    return original_answer, expansion_answer, multiple_answer

# Run the three-way comparison on a sample question
print("🧪 TESTING ALL THREE RAG APPROACHES")
print("=" * 80)
print("This will show how each approach handles the same question")
print()

test_question = "What are TCS's main business segments?"
results = compare_all_three_approaches(test_question)

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n
print("\n" + "✅" * 50)
print("🎉 THREE-WAY COMPARISON COMPLETE!")
print("✅" * 50)
print()
print("📋 What we've tested:")
# basic_rag retrieves 5 chunks, so the summary says 5 (it previously said 3)
print("  🔵 Original RAG: Standard semantic search with 5 chunks")
print("  🟠 Query Expansion (HyDE): Hypothetical answer + original query")
print("  🟢 Multiple Query Expansion: Original + 5 related questions")
print()
print("🎯 Next: Generate answers for all 15 evaluation questions using the new approach!")

In [None]:
# Step 14: Summary - Three RAG Approaches Implemented

# Closing recap: one print call, one argument per output line
print(
    "🎉 TCS RAG SYSTEM COMPLETE!",
    "=" * 50,
    "",
    "📋 Three approaches implemented and ready for testing:",
    "",
    "🔵 1. Basic RAG",
    "   • Function: basic_rag(question)",
    "   • Retrieves 5 chunks directly",
    "   • Returns: {'answer': str, 'runtime': float}",
    "",
    "🟠 2. Query Expansion (HyDE)",
    "   • Function: query_expansion_rag(question)",
    "   • Generates hypothetical answer for better retrieval",
    "   • Returns: {'answer': str, 'runtime': float, 'hypothetical_answer': str}",
    "",
    "🟢 3. Multiple Queries + Cross-Encoder",
    "   • Function: multiple_queries_rag(question)",
    "   • Generates 5 related questions, cross-encoder re-ranks",
    "   • Returns: {'answer': str, 'runtime': float, 'generated_queries': list}",
    "",
    "🧪 All functions tested with: 'What are TCS's main business segments?'",
    "✅ Ready for evaluation and comparison!",
    "",
    "🎯 Next steps:",
    "   • Test with different questions",
    "   • Compare answer quality across approaches",
    "   • Build evaluation framework when needed",
    sep="\n",
)