# RAG System

**Phase A:** Load Documents → Chunk → Embed → Store in Vector DB

**Phase B:** Query → Retrieve Similar Chunks → Generate Answer

In [2]:
# CELL 1: Install dependencies (run once; uncomment the line below)
# %pip install -r requirements.txt   # %pip installs into the environment of the running kernel

In [3]:
# CELL 2: Imports
import os, warnings
warnings.filterwarnings('ignore')

from dotenv import load_dotenv
load_dotenv()

from core import DocumentLoader, TextChunker, EmbeddingGenerator, VectorStore, Retriever, ResponseGenerator
from config import RAGConfig
from widgets import create_embedding_visualization, create_similarity_chart, create_chunk_statistics_dashboard

# Confirm the imports ran and report whether the Gemini key is present in the
# environment (only SET / NOT SET is printed, never the key itself).
gemini_key_status = 'SET' if os.getenv('GEMINI_API_KEY') else 'NOT SET'
print("Imports OK!")
print(f"GEMINI_API_KEY: {gemini_key_status}")

Imports OK!
GEMINI_API_KEY: SET


In [4]:
# CELL 3: Configuration - EDIT THESE VALUES
# Settings are grouped by pipeline stage and passed to RAGConfig as keyword
# arguments, in the same order as before.
chunking_settings = {
    'chunk_size': 500,
    'chunk_overlap': 50,
    'chunking_strategy': 'sentence',
}
retrieval_settings = {
    'embedding_model': 'all-MiniLM-L6-v2',
    'top_k': 5,
}
generation_settings = {
    'llm_model': 'gemini-2.5-flash-lite',
    'temperature': 0.7,
    'max_tokens': 1024,
}

config = RAGConfig(**chunking_settings, **retrieval_settings, **generation_settings)
print("Config set!")

Config set!


---
## Phase A: Indexing
---

In [None]:
# CELL 4: Load Documents - EDIT PATH HERE
DATA_DIR = './data/lotr'
documents = DocumentLoader.load_directory(DATA_DIR)

# Stop here with a clear message if nothing was loaded, instead of letting an
# empty corpus surface as a confusing error in the chunking/embedding cells.
assert documents, f"No documents loaded from {DATA_DIR!r} - check the path"

for doc in documents:
    print(f"{doc.source}: {len(doc):,} chars")
print(f"\nTotal: {len(documents)} docs")

sample_ai.txt: 4,677 chars
sample_python.txt: 5,589 chars

Total: 2 docs


In [6]:
# CELL 5: Chunk documents
# Build the chunker from the values in `config`, then split every loaded document.
chunker = TextChunker(
    chunk_size=config.chunk_size,
    overlap=config.chunk_overlap,
    strategy=config.chunking_strategy,
)
chunks = chunker.chunk_documents(documents)

# Report how many chunks were produced and their average length.
chunk_stats = chunker.get_statistics(chunks)
print(f"Created {len(chunks)} chunks (avg {chunk_stats['avg_length']:.0f} chars)")

Created 19 chunks (avg 539 chars)


In [7]:
# CELL 6: Visualize chunks (optional)
# Build the statistics dashboard for the chunks, then render it.
chunk_dashboard = create_chunk_statistics_dashboard(chunks)
chunk_dashboard.show()

In [None]:
# CELL 7: Generate embeddings
import torch  # local import: only needed in this cell to probe for a GPU

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
# so the notebook still runs on machines without a GPU.
# NOTE(review): assumes EmbeddingGenerator accepts 'cpu' as a device string the
# same way it accepts 'cuda:0' - confirm against core.EmbeddingGenerator.
embedding_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

embedder = EmbeddingGenerator(model_name=config.embedding_model, device=embedding_device)
embedded_chunks = embedder.embed_chunks(chunks, batch_size=32, show_progress=True)
print(f"\nGenerated {len(embedded_chunks)} embeddings")

In [None]:
# CELL 8: Store in vector database
# NOTE(review): reset=True presumably clears any existing collection before the
# new vectors are added - confirm against core.VectorStore.
store_options = {
    'collection_name': 'rag_collection',
    'persist_directory': './chroma_db',
    'reset': True,
}
vector_store = VectorStore(**store_options)

# add() returns a count, which is reported below.
count = vector_store.add(embedded_chunks)
print(f"Stored {count} vectors")

In [None]:
# CELL 9: Visualize embedding space (optional)
# Read every stored embedding and its metadata back from the vector store,
# then plot them with the UMAP method.
stored_vectors = vector_store.get_all_embeddings()
all_embeddings, all_metadata = stored_vectors

embedding_figure = create_embedding_visualization(all_embeddings, all_metadata, method='UMAP')
embedding_figure.show()

---
## Phase B: Query
---

In [None]:
# CELL 10: Initialize retriever and generator
# The retriever wraps the embedder and the vector store built in Phase A.
retriever = Retriever(embedder, vector_store)

# The generator takes its model and sampling settings from `config`.
llm_settings = {
    'model': config.llm_model,
    'temperature': config.temperature,
    'max_tokens': config.max_tokens,
}
generator = ResponseGenerator(**llm_settings)

print(f"Ready! Using {config.llm_model}")

In [None]:
# CELL 11: ASK A QUESTION - EDIT YOUR QUERY HERE
query = "What is machine learning?"

# Retrieval: fetch the top-k chunks for the query and list them.
results = retriever.retrieve(query=query, k=config.top_k)

chunk_rule = '=' * 80
print(f"Query: {query}\n")
print("Retrieved chunks:")
for hit in results:
    hit_header = f"[{hit.rank}] Score: {hit.score:.3f} | Source: {hit.source}"
    print(f"\n{chunk_rule}")
    print(hit_header)
    print(chunk_rule)
    print(hit.text)

# Generation: answer the query using the retrieved chunks as context.
response = generator.generate(query=query, context_chunks=results)

answer_rule = '=' * 50
print(f"\n{answer_rule}\nANSWER:\n{answer_rule}")
print(response.response)
source_list = ', '.join(response.sources)
print(f"\nSources: {source_list}")

In [None]:
# CELL 12: Visualize query in embedding space with score coloring
# Fetch the stored vectors here instead of relying on the optional CELL 9 having
# been run; skipping that cell would otherwise leave all_embeddings and
# all_metadata undefined and this cell would fail with a NameError.
all_embeddings, all_metadata = vector_store.get_all_embeddings()

# `results` comes from CELL 11; the retriever exposes the embedding of the
# query it last processed.
query_embedding = retriever.last_query_embedding
retrieved_indices = retriever.get_retrieved_indices(results, all_metadata)
scores = [r.score for r in results]

fig = create_embedding_visualization(
    all_embeddings, all_metadata, method='UMAP',
    query_embedding=query_embedding,
    retrieved_indices=retrieved_indices,
    retrieval_scores=scores
)
fig.show()

In [None]:
# CELL 13: Quick ask function - use for more questions
def ask(question, top_k=None):
    """Retrieve context for a question, generate an answer, print it and return it.

    Args:
        question: Query text passed to both the retriever and the generator.
        top_k: Number of chunks to retrieve. When omitted, ``config.top_k`` is
            used so this helper follows the notebook configuration, the same
            way CELL 11 does.

    Returns:
        The object returned by ``generator.generate``; its ``.response`` and
        ``.sources`` attributes are what gets printed.
    """
    # Resolve the default at call time so later edits to `config` are honoured.
    k = config.top_k if top_k is None else top_k
    results = retriever.retrieve(query=question, k=k)
    response = generator.generate(query=question, context_chunks=results)
    print(f"Q: {question}\n\nA: {response.response}\n\nSources: {', '.join(response.sources)}")
    return response

In [None]:
# CELL 14: Ask more questions
# Uses the ask() helper from CELL 13, which prints the answer and its sources.
ask(question="What is deep learning?")

In [None]:
# Another question through the ask() helper from CELL 13.
ask(question="How do I create a virtual environment in Python?")