# GTD RAG System

This notebook implements a Retrieval-Augmented Generation (RAG) system over the Getting Things Done (GTD) book chapters.

## Features
- Document loading and preprocessing from chapters/ directory
- Text chunking for optimal retrieval
- Vector embeddings using sentence-transformers
- Semantic search and retrieval
- Integration with OpenAI for generation
- Interactive query interface

In [None]:
# Install required packages
%pip install -qU sentence-transformers faiss-cpu openai python-dotenv tiktoken

# Import necessary libraries
import os
import glob
import re
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict, Tuple
import json

# For text processing
import tiktoken
from sentence_transformers import SentenceTransformer

# For vector storage and retrieval
import faiss

# For OpenAI integration
import openai
from dotenv import load_dotenv

# For display
from IPython.display import display, Markdown

print("✅ Required packages installed and imported successfully!")

Looking in indexes: https://pypi.org/simple, https://packagecloud.io/github/git-lfs/pypi/simple
Note: you may need to restart the kernel to use updated packages.
✅ Required packages installed and imported successfully!


In [2]:
# Configuration
class RAGConfig:
    def __init__(self):
        self.chapters_dir = "chapters/"
        self.chunk_size = 1000
        self.chunk_overlap = 200
        self.embedding_model = "all-MiniLM-L6-v2"
        self.max_tokens = 8192
        self.top_k = 5
        
        # Load environment variables
        load_dotenv()
        
        # Set up OpenAI API key
        openai.api_key = os.getenv("OPENAI_API_KEY")
        if not openai.api_key:
            print("⚠️  Warning: OPENAI_API_KEY not found in environment variables.")
            print("   You can still use the retrieval functionality, but generation will be limited.")

config = RAGConfig()
print("✅ Configuration loaded successfully!")
print(f"   Chapters directory: {config.chapters_dir}")
print(f"   Chunk size: {config.chunk_size}")
print(f"   Embedding model: {config.embedding_model}")


✅ Configuration loaded successfully!
   Chapters directory: chapters/
   Chunk size: 1000
   Embedding model: all-MiniLM-L6-v2


In [3]:
# Document Loading
class DocumentLoader:
    def __init__(self, chapters_dir: str):
        self.chapters_dir = chapters_dir
        
    def load_documents(self) -> List[Dict[str, str]]:
        """Load all markdown documents from the chapters directory."""
        documents = []
        
        # Get all markdown files
        md_files = glob.glob(os.path.join(self.chapters_dir, "*.md"))
        
        print(f"Found {len(md_files)} markdown files to process...")
        
        for file_path in sorted(md_files):
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()
                    
                    # Extract filename without extension
                    filename = os.path.basename(file_path)
                    
                    # Skip README file
                    if filename.lower() == 'readme.md':
                        continue
                    
                    documents.append({
                        'filename': filename,
                        'filepath': file_path,
                        'content': content,
                        'word_count': len(content.split())
                    })
                    
                print(f"✅ Loaded: {filename} ({len(content.split())} words)")
                    
            except Exception as e:
                print(f"❌ Error loading {file_path}: {str(e)}")
                
        return documents

# Load documents
loader = DocumentLoader(config.chapters_dir)
documents = loader.load_documents()

print(f"\n📚 Successfully loaded {len(documents)} documents")
print(f"   Total words: {sum(doc['word_count'] for doc in documents):,}")

# Display document summary
doc_summary = pd.DataFrame([
    {
        'Chapter': doc['filename'], 
        'Words': doc['word_count']
    } 
    for doc in documents
])
print("\n📊 Document Summary:")
display(doc_summary)


Found 16 markdown files to process...
✅ Loaded: chapter-01-the-art-of-getting-things-done.md (8819 words)
✅ Loaded: chapter-02-getting-control-of-y-our-life-the-five-steps.md (10388 words)
✅ Loaded: chapter-03-getting-projects-creatively-under-w-ay-the.md (8751 words)
✅ Loaded: chapter-04-getting-started-setting-up-the-t-ime-space.md (7915 words)
✅ Loaded: chapter-05-capturing-corralling-y-our-stuf-f.md (4628 words)
✅ Loaded: chapter-06-clarifying-getting-in-to-empty.md (6630 words)
✅ Loaded: chapter-07-or-ganizing-setting-up-the-right-buckets.md (17748 words)
✅ Loaded: chapter-08-reflecting-keeping-it-all-fresh-and.md (4667 words)
✅ Loaded: chapter-09-engaging-making-the-best-action-choices.md (8491 words)
✅ Loaded: chapter-10-getting-projects-under-control.md (4032 words)
✅ Loaded: chapter-11-the-power-of-the-capturing-habit.md (3830 words)
✅ Loaded: chapter-12-the-power-of-the-next-action-decision.md (4613 words)
✅ Loaded: chapter-13-the-power-of-outcome-focusing.md (3093 words)
✅ L

Unnamed: 0,Chapter,Words
0,chapter-01-the-art-of-getting-things-done.md,8819
1,chapter-02-getting-control-of-y-our-life-the-f...,10388
2,chapter-03-getting-projects-creatively-under-w...,8751
3,chapter-04-getting-started-setting-up-the-t-im...,7915
4,chapter-05-capturing-corralling-y-our-stuf-f.md,4628
5,chapter-06-clarifying-getting-in-to-empty.md,6630
6,chapter-07-or-ganizing-setting-up-the-right-bu...,17748
7,chapter-08-reflecting-keeping-it-all-fresh-and.md,4667
8,chapter-09-engaging-making-the-best-action-cho...,8491
9,chapter-10-getting-projects-under-control.md,4032


In [4]:
# Text Chunking
class TextChunker:
    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.tokenizer = tiktoken.get_encoding("cl100k_base")
        
    def clean_text(self, text: str) -> str:
        """Clean and normalize text."""
        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)
        # Remove markdown headers (keep content)
        text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
        # Remove excessive newlines
        text = re.sub(r'\n\s*\n', '\n\n', text)
        return text.strip()
    
    def chunk_text(self, text: str, source_info: Dict[str, str]) -> List[Dict[str, str]]:
        """Split text into overlapping chunks."""
        cleaned_text = self.clean_text(text)
        
        # Split into sentences for better chunk boundaries
        sentences = re.split(r'(?<=[.!?])\s+', cleaned_text)
        
        chunks = []
        current_chunk = ""
        current_tokens = 0
        
        for sentence in sentences:
            sentence_tokens = len(self.tokenizer.encode(sentence))
            
            # If adding this sentence would exceed chunk size, save current chunk
            if current_tokens + sentence_tokens > self.chunk_size and current_chunk:
                chunks.append({
                    'text': current_chunk.strip(),
                    'source': source_info['filename'],
                    'filepath': source_info['filepath'],
                    'chunk_id': len(chunks),
                    'tokens': current_tokens
                })
                
                # Start new chunk with overlap
                if self.chunk_overlap > 0:
                    # Take last few sentences for overlap
                    overlap_sentences = current_chunk.split('. ')[-2:]
                    current_chunk = '. '.join(overlap_sentences) + '. ' + sentence
                    current_tokens = len(self.tokenizer.encode(current_chunk))
                else:
                    current_chunk = sentence
                    current_tokens = sentence_tokens
            else:
                current_chunk += " " + sentence if current_chunk else sentence
                current_tokens += sentence_tokens
        
        # Add the last chunk
        if current_chunk.strip():
            chunks.append({
                'text': current_chunk.strip(),
                'source': source_info['filename'],
                'filepath': source_info['filepath'],
                'chunk_id': len(chunks),
                'tokens': current_tokens
            })
        
        return chunks
    
    def process_documents(self, documents: List[Dict[str, str]]) -> List[Dict[str, str]]:
        """Process all documents and return chunks."""
        all_chunks = []
        
        print("🔄 Processing documents into chunks...")
        
        for doc in documents:
            chunks = self.chunk_text(doc['content'], doc)
            all_chunks.extend(chunks)
            print(f"✅ {doc['filename']}: {len(chunks)} chunks created")
        
        print(f"\n📝 Total chunks created: {len(all_chunks)}")
        print(f"   Average tokens per chunk: {np.mean([chunk['tokens'] for chunk in all_chunks]):.0f}")
        
        return all_chunks

# Create text chunks
chunker = TextChunker(config.chunk_size, config.chunk_overlap)
chunks = chunker.process_documents(documents)

# Display chunk statistics
chunk_stats = pd.DataFrame([
    {
        'Source': chunk['source'],
        'Chunk ID': chunk['chunk_id'],
        'Tokens': chunk['tokens'],
        'Preview': chunk['text'][:100] + "..." if len(chunk['text']) > 100 else chunk['text']
    }
    for chunk in chunks[:10]  # Show first 10 chunks
])

print("\n📊 Sample Chunks:")
display(chunk_stats)


🔄 Processing documents into chunks...
✅ chapter-01-the-art-of-getting-things-done.md: 12 chunks created
✅ chapter-02-getting-control-of-y-our-life-the-five-steps.md: 14 chunks created
✅ chapter-03-getting-projects-creatively-under-w-ay-the.md: 12 chunks created
✅ chapter-04-getting-started-setting-up-the-t-ime-space.md: 11 chunks created
✅ chapter-05-capturing-corralling-y-our-stuf-f.md: 7 chunks created
✅ chapter-06-clarifying-getting-in-to-empty.md: 9 chunks created
✅ chapter-07-or-ganizing-setting-up-the-right-buckets.md: 24 chunks created
✅ chapter-08-reflecting-keeping-it-all-fresh-and.md: 6 chunks created
✅ chapter-09-engaging-making-the-best-action-choices.md: 11 chunks created
✅ chapter-10-getting-projects-under-control.md: 5 chunks created
✅ chapter-11-the-power-of-the-capturing-habit.md: 5 chunks created
✅ chapter-12-the-power-of-the-next-action-decision.md: 7 chunks created
✅ chapter-13-the-power-of-outcome-focusing.md: 4 chunks created
✅ chapter-14-gtd-and-cognitive-science

Unnamed: 0,Source,Chunk ID,Tokens,Preview
0,chapter-01-the-art-of-getting-things-done.md,0,987,Chapter 1: The Art of Getting Things Done *Par...
1,chapter-01-the-art-of-getting-things-done.md,1,992,And most people are to some degree frustrated ...
2,chapter-01-the-art-of-getting-things-done.md,2,992,And if you could keep life in general more in ...
3,chapter-01-the-art-of-getting-things-done.md,3,999,(Even in the 1980s many professionals consider...
4,chapter-01-the-art-of-getting-things-done.md,4,980,The Promise: The “Ready State” of the Martial ...
5,chapter-01-the-art-of-getting-things-done.md,5,979,"Y ou probably had a sense of being in control,..."
6,chapter-01-the-art-of-getting-things-done.md,6,994,"Now , describe, in a single written sentence, ..."
7,chapter-01-the-art-of-getting-things-done.md,7,975,—Kerry Gleeson you haven’ t decided what the v...
8,chapter-01-the-art-of-getting-things-done.md,8,992,Stuf f is not inherently a bad thing. Things t...
9,chapter-01-the-art-of-getting-things-done.md,9,968,"Clarifying things on the front end, when they ..."


In [5]:
# Embedding and Vector Storage
class VectorStore:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        print(f"🔄 Loading embedding model: {model_name}")
        self.model = SentenceTransformer(model_name)
        self.dimension = self.model.get_sentence_embedding_dimension()
        self.index = None
        self.chunks = []
        print(f"✅ Model loaded! Embedding dimension: {self.dimension}")
        
    def create_embeddings(self, chunks: List[Dict[str, str]]) -> np.ndarray:
        """Create embeddings for all chunks."""
        print("🔄 Creating embeddings...")
        
        texts = [chunk['text'] for chunk in chunks]
        
        # Create embeddings in batches to manage memory
        batch_size = 32
        embeddings = []
        
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            batch_embeddings = self.model.encode(batch_texts, show_progress_bar=True)
            embeddings.append(batch_embeddings)
            print(f"   Processed batch {i//batch_size + 1}/{(len(texts)-1)//batch_size + 1}")
        
        embeddings = np.vstack(embeddings)
        print(f"✅ Created {embeddings.shape[0]} embeddings")
        
        return embeddings
    
    def build_index(self, chunks: List[Dict[str, str]]):
        """Build FAISS index from chunks."""
        self.chunks = chunks
        embeddings = self.create_embeddings(chunks)
        
        print("🔄 Building FAISS index...")
        
        # Create FAISS index
        self.index = faiss.IndexFlatIP(self.dimension)  # Inner product for cosine similarity
        
        # Normalize embeddings for cosine similarity
        faiss.normalize_L2(embeddings)
        
        # Add embeddings to index
        self.index.add(embeddings.astype('float32'))
        
        print(f"✅ FAISS index built with {self.index.ntotal} vectors")
    
    def search(self, query: str, top_k: int = 5) -> List[Dict[str, any]]:
        """Search for similar chunks."""
        if self.index is None:
            raise ValueError("Index not built. Call build_index() first.")
        
        # Create query embedding
        query_embedding = self.model.encode([query])
        faiss.normalize_L2(query_embedding)
        
        # Search
        scores, indices = self.index.search(query_embedding.astype('float32'), top_k)
        
        # Format results
        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx < len(self.chunks):  # Valid index
                result = self.chunks[idx].copy()
                result['similarity_score'] = float(score)
                results.append(result)
        
        return results

# Create vector store and build index
vector_store = VectorStore(config.embedding_model)
vector_store.build_index(chunks)

print(f"\n🎯 Vector store ready!")
print(f"   Index size: {vector_store.index.ntotal} vectors")
print(f"   Embedding dimension: {vector_store.dimension}")

# Test search
test_query = "What is the two-minute rule?"
test_results = vector_store.search(test_query, top_k=3)

print(f"\n🔍 Test search: '{test_query}'")
for i, result in enumerate(test_results, 1):
    print(f"\n{i}. [{result['source']}] (Score: {result['similarity_score']:.3f})")
    print(f"   {result['text'][:200]}...")


🔄 Loading embedding model: all-MiniLM-L6-v2
✅ Model loaded! Embedding dimension: 384
🔄 Creating embeddings...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

   Processed batch 1/5


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

   Processed batch 2/5


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

   Processed batch 3/5


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

   Processed batch 4/5


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

   Processed batch 5/5
✅ Created 139 embeddings
🔄 Building FAISS index...
✅ FAISS index built with 139 vectors

🎯 Vector store ready!
   Index size: 139 vectors
   Embedding dimension: 384

🔍 Test search: 'What is the two-minute rule?'

1. [chapter-06-clarifying-getting-in-to-empty.md] (Score: 0.485)
   That’ s a rather dramatic testimonial, but it’ s an indication of just how critical some of these simple processing behaviors can be, especially as the volume and speed of the input increase for you p...

2. [chapter-09-engaging-making-the-best-action-choices.md] (Score: 0.253)
   Y ou have three pages of scribbled notes from the conversation. There’ s a meeting scheduled with your staf f at eleven, about half an hour from now .. Y ou were out late last night with your spouse’ ...

3. [chapter-01-the-art-of-getting-things-done.md] (Score: 0.225)
   —Kerry Gleeson you haven’ t decided what the very next physical action step is; and/or you haven’ t put reminders of the outcome and the act

In [None]:
# RAG Generation System
class RAGSystem:
    def __init__(self, vector_store: VectorStore, config: RAGConfig):
        self.vector_store = vector_store
        self.config = config
        self.client = openai.OpenAI() if openai.api_key else None
        
    def retrieve_context(self, query: str, top_k: int = None) -> str:
        """Retrieve relevant context for a query."""
        if top_k is None:
            top_k = self.config.top_k
            
        results = self.vector_store.search(query, top_k)
        
        context_parts = []
        for i, result in enumerate(results, 1):
            context_parts.append(
                f"**Source {i}: {result['source']}** (Relevance: {result['similarity_score']:.3f})\n"
                f"{result['text']}\n"
            )
        
        return "\n---\n".join(context_parts)
    
    def generate_prompt(self, query: str, context: str) -> str:
        """Generate a prompt for the language model."""
        prompt = f"""You are an AI assistant specialized in David Allen's "Getting Things Done" (GTD) methodology. Use the provided context from the GTD book to answer the user's question accurately and comprehensively.

**Context from GTD book:**
{context}

**User Question:** {query}

**Instructions:**
- Answer based primarily on the provided context
- If the context doesn't fully address the question, indicate what information is missing
- Use GTD terminology and concepts accurately
- Provide practical, actionable advice when appropriate
- Maintain David Allen's tone and approach
- If you reference specific GTD principles or techniques, explain them clearly

**Answer:**"""
        return prompt
    
    def generate_answer(self, query: str, max_tokens: int = None) -> Dict[str, any]:
        """Generate an answer using RAG."""
        if max_tokens is None:
            max_tokens = self.config.max_tokens
            
        # Retrieve context
        context = self.retrieve_context(query)
        
        # Generate prompt
        prompt = self.generate_prompt(query, context)
        
        result = {
            'query': query,
            'context': context,
            'prompt': prompt,
            'answer': None,
            'error': None
        }
        
        # Generate answer if OpenAI is available
        if self.client:
            try:
                response = self.client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": "You are a GTD (Getting Things Done) expert assistant."},
                        {"role": "user", "content": prompt}
                    ],
                    max_tokens=max_tokens,
                    temperature=0.7
                )
                result['answer'] = response.choices[0].message.content
                
            except Exception as e:
                result['error'] = f"OpenAI API error: {str(e)}"
        else:
            result['error'] = "OpenAI API key not configured. Only retrieval available."
        
        return result
    
    def display_result(self, result: Dict[str, any], show_context: bool = False):
        """Display RAG result in a formatted way."""
        print(f"🔍 **Query:** {result['query']}")
        print("=" * 60)
        
        if result['answer']:
            print("📝 **Answer:**")
            display(Markdown(result['answer']))
        elif result['error']:
            print(f"❌ **Error:** {result['error']}")
        
        if show_context:
            print("\n📚 **Retrieved Context:**")
            print("-" * 40)
            display(Markdown(result['context']))

# Initialize RAG system
rag_system = RAGSystem(vector_store, config)

# Test the RAG system
test_questions = [
    "What is the two-minute rule in GTD?",
    "How do I organize my GTD system?",
    "What are the five steps of GTD workflow?"
]

print("🤖 Testing RAG System:")
print("=" * 50)

for question in test_questions:
    print(f"\n💭 Testing: {question}")
    result = rag_system.generate_answer(question)
    
    if result['answer']:
        print("✅ Generated answer successfully")
    else:
        print(f"⚠️  {result['error']}")
        print("📚 Retrieved context available for manual review")

print("\n🎯 RAG System is ready for use!")


In [None]:
# Interactive Query Interface

def ask_gtd(question: str, show_context: bool = False, top_k: int = None):
    """
    Ask a question about GTD methodology.
    
    Args:
        question (str): Your question about GTD
        show_context (bool): Whether to display the retrieved context
        top_k (int): Number of relevant chunks to retrieve (default: 5)
    """
    if top_k:
        rag_system.config.top_k = top_k
    
    result = rag_system.generate_answer(question)
    rag_system.display_result(result, show_context=show_context)
    return result

# Demo queries
print("🎯 **GTD RAG System - Ready to Answer Your Questions!**")
print("=" * 60)
print("\n📋 **Usage Examples:**")
print("1. `ask_gtd('What is the two-minute rule?')`")
print("2. `ask_gtd('How do I do a weekly review?', show_context=True)`")
print("3. `ask_gtd('What are contexts in GTD?', top_k=3)`")

print("\n💡 **Sample Questions to Try:**")
sample_questions = [
    "What is the two-minute rule in GTD?",
    "How do I set up a GTD system?",
    "What is the difference between projects and next actions?",
    "How often should I do a weekly review?",
    "What are contexts and how do I use them?",
    "How do I capture everything in my head?",
    "What is the GTD workflow process?",
    "How do I organize my reference materials?",
    "What is the purpose of the inbox in GTD?",
    "How do I handle waiting-for items?"
]

for i, q in enumerate(sample_questions, 1):
    print(f"{i:2d}. {q}")

print(f"\n🔧 **System Status:**")
print(f"   📚 Documents loaded: {len(documents)}")
print(f"   📝 Text chunks: {len(chunks)}")
print(f"   🔍 Vector index size: {vector_store.index.ntotal}")
print(f"   🤖 OpenAI integration: {'✅ Ready' if rag_system.client else '❌ Not configured'}")

print(f"\n🚀 **Ready to answer your GTD questions!**")


In [None]:
# Demo: Try the RAG System!

# Example 1: Basic question
print("🎬 **Demo 1: Basic GTD Question**")
print("=" * 40)
ask_gtd("What is the two-minute rule in GTD?")


In [None]:
# Demo: Show Context Retrieval

print("\n🎬 **Demo 2: Context Retrieval**")
print("=" * 40)
print("This shows how the system retrieves relevant context from the GTD chapters:")

# Example with context display
ask_gtd("How do I organize my next actions?", show_context=True, top_k=3)

print("\n" + "="*60)
print("🎉 **GTD RAG System Complete!**")
print("The system is now ready to answer questions about Getting Things Done methodology.")
print("Use the `ask_gtd()` function to query the system interactively.")
