# TCS Annual Report RAG System

Learn how embeddings work by building a simple question-answering system over the TCS Annual Report.

This notebook follows a step-by-step approach to understand how embeddings work in retrieval-augmented generation (RAG).

In [None]:
# Step 1: Helper function for readable text display
def word_wrap(text, width=80):
    """
    Re-flow ``text`` so that each output line is at most ``width`` characters.

    The input is split on any whitespace (so existing line breaks and repeated
    spaces are discarded) and the words are greedily packed into lines joined
    by single spaces. A word that is longer than ``width`` is never broken; it
    is placed on a line of its own.

    Args:
        text: The string to wrap.
        width: Maximum number of characters per line (default 80).

    Returns:
        The wrapped text as a single string with lines separated by '\\n'.
        An empty or whitespace-only input yields an empty string.
    """
    wrapped = []
    pending = []       # words collected for the line currently being built
    pending_chars = 0  # total characters in `pending`, not counting spaces

    for token in text.split():
        # Length of the line if `token` were appended: all letters plus one
        # separating space for every word already on the line.
        projected = pending_chars + len(token) + len(pending)

        if projected <= width:
            pending.append(token)
            pending_chars += len(token)
            continue

        if not pending:
            # Nothing buffered, so the word alone exceeds the limit:
            # emit it unbroken on its own line.
            wrapped.append(token)
            pending_chars = 0
            continue

        # Flush the full line and start the next one with this word.
        wrapped.append(' '.join(pending))
        pending = [token]
        pending_chars = len(token)

    if pending:
        wrapped.append(' '.join(pending))

    return '\n'.join(wrapped)

# Smoke test: wrap one long sentence at the default width and show the result
test_text = (
    "This is a very long sentence that we will use to test our word wrapping "
    "function to make sure it works correctly and makes text readable."
)
print(word_wrap(test_text))

In [None]:
# Step 2: PDF Reading - Extract text from TCS Annual Report
from pypdf import PdfReader

# Path of the report, relative to the notebook's working directory
PDF_PATH = "TCS_Annual_Report.pdf"

# Load the PDF and extract the text of every page
reader = PdfReader(PDF_PATH)
pdf_texts = [page.extract_text().strip() for page in reader.pages]

# Filter out empty strings (blank pages)
pdf_texts = [text for text in pdf_texts if text]

# Fail early with a descriptive error instead of an IndexError on pdf_texts[0]
# below (and confusing failures in every later cell) when nothing was extracted.
if not pdf_texts:
    raise ValueError(
        f"No extractable text found in {PDF_PATH!r}: "
        "every page produced an empty string."
    )

print(f"Total pages with content: {len(pdf_texts)}")
print("\nFirst page content:")
print("=" * 50)
print(word_wrap(pdf_texts[0]))

In [None]:
# Step 3: Character Chunking - Split into 1000-character chunks with overlap
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Create character splitter with 50-character overlap (improvement over reference)
character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=50  # Adding overlap to preserve context
)

# Join all pages (blank line between pages) and split into character chunks
character_split_texts = character_splitter.split_text('\n\n'.join(pdf_texts))

# Everything below indexes into the chunk list, so stop here with a clear
# message rather than an IndexError if the splitter returned nothing.
if not character_split_texts:
    raise ValueError(
        "Character splitting produced no chunks; check that pdf_texts contains text."
    )

# Preview chunk 10 when it exists; for short documents fall back to the last
# available chunk instead of raising IndexError.
char_sample_index = min(10, len(character_split_texts) - 1)

print(f"Sample chunk (index {char_sample_index}):")
print("=" * 40)
print(word_wrap(character_split_texts[char_sample_index]))
print(f"\nTotal character chunks: {len(character_split_texts)}")
print(f"First chunk length: {len(character_split_texts[0])} characters")
print(f"Last chunk length: {len(character_split_texts[-1])} characters")

In [None]:
# Step 4: Token Chunking - Further split into 256-token chunks with overlap
from langchain.text_splitter import SentenceTransformersTokenTextSplitter

# Create token splitter with 20-token overlap (improvement over reference)
# NOTE(review): no model is named here, nor for the embedding function in
# Step 5, so each library falls back to its own default model. Confirm both
# use compatible tokenizers so that 256 tokens here fits the embedder's limit.
token_splitter = SentenceTransformersTokenTextSplitter(
    chunk_overlap=20,  # Adding overlap to preserve context
    tokens_per_chunk=256
)

# Split each character chunk into token chunks. The list is rebuilt from
# scratch so re-running the cell does not accumulate duplicates.
token_split_texts = []
for text in character_split_texts:
    token_split_texts.extend(token_splitter.split_text(text))

# The preview and the ratio below need at least one chunk; this guard also
# rules out a ZeroDivisionError, because an empty character_split_texts
# always yields an empty token_split_texts.
if not token_split_texts:
    raise ValueError(
        "Token splitting produced no chunks; check that character_split_texts contains text."
    )

# Preview chunk 10 when it exists; for short documents fall back to the last
# available chunk instead of raising IndexError.
token_sample_index = min(10, len(token_split_texts) - 1)

print(f"Sample token chunk (index {token_sample_index}):")
print("=" * 40)
print(word_wrap(token_split_texts[token_sample_index]))
print(f"\nTotal token chunks: {len(token_split_texts)}")

# Let's also check a few more details
print(f"Character chunks: {len(character_split_texts)}")
print(f"Token chunks: {len(token_split_texts)}")
print(f"Ratio (token/char chunks): {len(token_split_texts)/len(character_split_texts):.1f}")

In [None]:
# Step 5: Embedding Generation - Convert text chunks to numerical vectors
import chromadb
import numpy as np
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# Create embedding function (uses sentence-transformers model)
embedding_function = SentenceTransformerEmbeddingFunction()

# The demo below embeds one chunk, so there must be at least one to pick.
if not token_split_texts:
    raise ValueError("token_split_texts is empty; there is nothing to embed.")

# Use chunk 10 when it exists; for short documents fall back to the last
# available chunk instead of raising IndexError.
embed_sample_index = min(10, len(token_split_texts) - 1)

# Test with one chunk to see what embeddings look like.
# The function takes a list of texts and returns one vector per text.
sample_embedding = embedding_function([token_split_texts[embed_sample_index]])
print("Sample embedding (first 10 values):")
print(sample_embedding[0][:10])
print(f"\nEmbedding dimensions: {len(sample_embedding[0])}")
print(f"Data type: {type(sample_embedding[0][0])}")

# Quick check - if the model emits unit-length (normalized) vectors, the L2
# norm (square root of the sum of squared components) should be ~1.0.
magnitude = np.linalg.norm(sample_embedding[0])
print(f"Vector magnitude (should be ~1.0): {magnitude:.3f}")

In [None]:
# Step 6: ChromaDB Setup - Create collection and store all document chunks
# Use persistent storage in the repo directory
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Number of chunks sent to Chroma per add() call. Chroma rejects a single
# add() above its maximum batch size (the exact limit depends on the
# installed version/SQLite build), so large reports are added in slices.
ADD_BATCH_SIZE = 1000

# Fetch the collection if it exists, otherwise create it. This replaces a
# bare `except:` around get_collection(), which also swallowed unrelated
# failures (including KeyboardInterrupt) and depended on which exception type
# the installed chromadb version raises for a missing collection.
chroma_collection = chroma_client.get_or_create_collection(
    "tcs_annual_report_2024",
    embedding_function=embedding_function
)

# Decide from the stored document count rather than from mere existence: a
# collection left empty by an earlier run that stopped before add() must
# still be populated.
skip_adding = chroma_collection.count() > 0

if skip_adding:
    print("📁 Using existing collection from disk")
else:
    print("📁 Created new persistent collection")

    # Create IDs for each chunk (simple sequential numbering)
    ids = [str(i) for i in range(len(token_split_texts))]

    # Add all chunks to the collection (this will generate embeddings for all chunks)
    print(f"Adding {len(token_split_texts)} chunks to ChromaDB...")
    for start in range(0, len(token_split_texts), ADD_BATCH_SIZE):
        stop = start + ADD_BATCH_SIZE
        chroma_collection.add(
            ids=ids[start:stop],
            documents=token_split_texts[start:stop]
        )

# Verify the collection
count = chroma_collection.count()

# A persisted collection is reused as-is, so flag the case where it no longer
# matches the chunks produced by this run (different PDF, chunk settings, or
# an interrupted earlier load).
if count != len(token_split_texts):
    print(
        f"⚠️ Collection holds {count} documents but this run produced "
        f"{len(token_split_texts)} chunks; delete ./chroma_db/ and re-run to rebuild."
    )

print("✅ Collection ready!")
print(f"Total documents in collection: {count}")
print(f"Collection name: {chroma_collection.name}")
print("Storage location: ./chroma_db/")