In [None]:
# Setup: Add project root to path
import sys
import os
from pathlib import Path

# Make the project root importable so that `src.*` modules resolve from this
# notebook. The root is taken to be the parent of the current working
# directory, i.e. the kernel is assumed to start in a subfolder of the project
# (adjust `project_root` if it starts elsewhere).
project_root = Path.cwd().parent
root_entry = str(project_root)

# Re-running this cell must not stack duplicate entries on sys.path: drop any
# existing copy first, then put the root at the front so it keeps priority
# over same-named packages found later on the path.
sys.path[:] = [entry for entry in sys.path if entry != root_entry]
sys.path.insert(0, root_entry)

print(f"Project root: {project_root}")
print("Python path configured")

## 1. Initialize Pipeline

Load the RAG pipeline with default configuration.

In [None]:
from src.main import RAGPipeline
from src.utils.logger import setup_logger

# Configure the demo logger; file logging is switched off via log_to_file=False.
setup_logger("Pipeline_Demo", log_dir="../logs", log_to_file=False)

# Build the pipeline from the configuration directory one level up.
pipeline = RAGPipeline(config_dir="../config")

# Echo the values exposed on `pipeline.settings` under a banner.
banner = "=" * 60
settings = pipeline.settings

print("\n" + banner)
print("RAG Pipeline Initialized")
print(banner)
print(f"Chunk size: {settings['chunk_size']} words")
print(f"Overlap: {settings['overlap']} words")
print(f"Top-K retrieval: {settings['top_k']}")
print(f"Similarity threshold: {settings['similarity_threshold']}")

## 2. Check Database Status

View current vector database statistics.

In [None]:
# Fetch a snapshot of the vector database statistics.
stats = pipeline.get_database_stats()

print("\nDatabase Statistics:")
print(f"Total chunks: {stats['total_chunks']}")
print(f"Unique PDFs: {stats['num_pdfs']}")

# Show a per-PDF chunk breakdown, or a hint when nothing is listed yet.
chunks_per_pdf = stats['pdfs']
if not chunks_per_pdf:
    print("\nNo PDFs processed yet. Upload a PDF to data/raw_pdfs/ and process it.")
else:
    print("\nProcessed PDFs:")
    for name, n_chunks in chunks_per_pdf.items():
        print(f"  - {name}: {n_chunks} chunks")

## 3. Process a PDF (Optional)

If you have a PDF in `data/raw_pdfs/`, uncomment the code in the cell below, set `pdf_path` to your file's name, and run the cell to process it.

In [None]:
# Optional step: process a PDF through the pipeline.
# To enable, uncomment the block below and point `pdf_path` at a real file.
# The block relies on `os` (imported in the setup cell) and on `pipeline`
# (created in the initialization cell), so run those cells first.
#
# pdf_path = "../data/raw_pdfs/your_document.pdf"
#
# if os.path.exists(pdf_path):
#     print(f"Processing: {pdf_path}")
#     success = pipeline.process_pdf(pdf_path)
#
#     if success:
#         print("\nPDF processed successfully!")
#         # Refresh stats
#         stats = pipeline.get_database_stats()
#         print(f"Updated total chunks: {stats['total_chunks']}")
#     else:
#         print("\nPDF processing failed. Check logs for details.")
# else:
#     print(f"PDF not found: {pdf_path}")

# NOTE: this message is printed unconditionally, so it still appears after the
# block above has been uncommented; comment it out when enabling processing.
print("Skipping PDF processing (uncomment to enable)")

## 4. Test Retrieval

Query the vector database to see what chunks are retrieved.

In [None]:
# Sample question used to probe retrieval; the answer-generation cell reuses it.
test_query = "What is the main topic of this document?"

print(f"Query: {test_query}\n")

# Only query the retriever when the database reports at least one chunk.
stats = pipeline.get_database_stats()
if stats['total_chunks'] != 0:
    retriever_result = pipeline.retriever.retrieve_and_format(test_query)

    print(f"Retrieved {retriever_result['num_chunks']} chunks:\n")

    # Preview at most the first three returned chunks.
    preview = retriever_result['chunks'][:3]
    for rank, hit in enumerate(preview, start=1):
        print(f"[Chunk {rank}]")
        print(f"PDF: {hit['pdf_name']}")
        print(f"Page: {hit['page_number']}")
        print(f"Similarity: {hit['similarity']:.3f}")
        print(f"Text preview: {hit['text'][:200]}...\n")
        print("-" * 60)
else:
    print("No chunks in database. Process a PDF first.")

## 5. Generate Answer

Use the LLM to generate an answer based on retrieved chunks.

In [None]:
# Answer `test_query` (defined in the retrieval cell) and list its sources.
# Nothing is asked while the database reports zero chunks.
stats = pipeline.get_database_stats()
if stats['total_chunks'] != 0:
    result = pipeline.ask_question(test_query, show_sources=True)

    rule = "=" * 60
    print("\n" + rule)
    print("ANSWER")
    print(rule)
    print(result['answer'])

    # The sources section is printed only when the result carries any.
    cited = result['sources']
    if cited:
        print("\n" + rule)
        print("SOURCES")
        print(rule)
        for src in cited:
            print(f"\n[{src['pdf_name']} - Page {src['page_number']}]")
            print(f"Similarity: {src['similarity']:.3f}")
            print(f"Text: {src['text'][:150]}...")
else:
    print("No chunks in database. Process a PDF first.")

## 6. Test Conversation Memory

Demonstrate how conversation history affects answers.

In [None]:
# Two-turn exchange: the second question is passed the first turn as history.
stats = pipeline.get_database_stats()
if stats['total_chunks'] != 0:
    conversation_history = []

    # Turn 1: asked without any history.
    q1 = "What is this document about?"
    print(f"Q1: {q1}")
    result1 = pipeline.ask_question(q1)
    print(f"A1: {result1['answer'][:200]}...\n")

    # Record the full (untruncated) answer for the follow-up.
    first_turn = dict(question=q1, answer=result1['answer'])
    conversation_history.append(first_turn)

    # Turn 2: a vague follow-up, asked with the recorded history.
    q2 = "Tell me more about that"
    print(f"Q2: {q2}")
    result2 = pipeline.ask_question(q2, conversation_history=conversation_history)
    print(f"A2: {result2['answer'][:200]}...\n")

    print("\nConversation memory allows follow-up questions to reference previous context.")
else:
    print("No chunks in database. Process a PDF first.")

## 7. Performance Analysis

Measure retrieval and generation performance.

In [None]:
import time

# Time end-to-end question answering for a few sample queries.
# Nothing is benchmarked while the database reports zero chunks.
stats = pipeline.get_database_stats()
if stats['total_chunks'] == 0:
    print("No chunks in database. Process a PDF first.")
else:
    queries = [
        "What is the main topic?",
        "Who is the author?",
        "What are the key findings?"
    ]

    print("Performance Benchmark:\n")

    for query in queries:
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring intervals; time.time() is wall-clock time and can jump
        # if the system clock is adjusted mid-measurement.
        start_time = time.perf_counter()
        result = pipeline.ask_question(query)
        elapsed = time.perf_counter() - start_time

        # `sources` may be missing or None here (the return shape of
        # ask_question is not visible in this notebook); count both as zero
        # instead of letting len(None) raise.
        sources = result.get('sources') or []

        print(f"Query: {query}")
        print(f"Time: {elapsed:.2f}s")
        print(f"Answer length: {len(result['answer'])} chars")
        print(f"Sources: {len(sources)}")
        print("-" * 40)

## Summary

This notebook demonstrated:
- Pipeline initialization and configuration
- Database statistics and management
- Semantic retrieval testing
- Answer generation with citations
- Conversation memory functionality
- Performance benchmarking

**Next Steps**:
- Try different queries to test retrieval quality
- Adjust `settings.yaml` parameters and observe effects
- Use `prompt_testing.ipynb` to experiment with prompt engineering