In [16]:
# Simple RAG System with PDF Section-Based Chunking

# This notebook implements a streamlined RAG system with:
# 1. PDF section-based chunking using tiktoken
# 2. Business documents generated from the Northwind PostgreSQL database
# 3. OpenAI embeddings
# 4. Qdrant vector store (in-memory)
# 5. Score-filtered retriever
# 6. Simple RAG chain


In [17]:
# Install required packages if needed
# !pip install -qU langchain==0.2.14 langchain_openai==0.1.23 langchain_core==0.2.35 langchain-community
# !pip install -qU qdrant-client pymupdf tiktoken


In [18]:
import os
import getpass

# Set up OpenAI API key
# Prompt only when the key is not already present in the environment, so that
# re-running this cell (or a Restart & Run All with the key already exported)
# does not block waiting for input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")


In [22]:
# Import the PDF section chunker
from pdf_section_chunker import chunk_northwind_pdf, get_encoding_info, TIKTOKEN_AVAILABLE

# Report the encodings exposed by the chunker module, or note the
# character-based fallback when tiktoken could not be imported.
if not TIKTOKEN_AVAILABLE:
    print("⚠️ Tiktoken not available - using character-based chunking")
else:
    print("🔢 Available Tiktoken Encodings:")
    for encoding_label, supported_models in get_encoding_info().items():
        print(f"   {encoding_label}: {supported_models}")


🔢 Available Tiktoken Encodings:
   cl100k_base: GPT-4, GPT-3.5-turbo, text-embedding-ada-002
   p50k_base: text-davinci-002, text-davinci-003
   r50k_base: GPT-3 models (davinci, curie, babbage, ada)
   gpt2: GPT-2 models


In [23]:
# Chunk the Northwind overview PDF section by section, measuring size in tokens.
print("🔄 Processing Northwind_Traders_Database_Overview.pdf...")

chunks = chunk_northwind_pdf(
    pdf_path="data/Northwind_Traders_Database_Overview.pdf",
    use_tokens=True,
    encoding_name="cl100k_base",  # Optimal for OpenAI models
)

print(f"✅ Created {len(chunks)} intelligent chunks from the PDF")

# Size statistics are only meaningful (and min/max only defined) when at
# least one chunk was produced.
if chunks:
    # Fall back to the raw content length / zero when the chunker did not
    # record a count in the chunk metadata.
    char_counts = [c.metadata.get('char_count', len(c.content)) for c in chunks]
    token_counts = [c.metadata.get('token_count', 0) for c in chunks]

    print(f"\n📊 Chunk Statistics:")
    print(
        f"   Character count - Min: {min(char_counts)}, Max: {max(char_counts)}, "
        f"Avg: {sum(char_counts)//len(char_counts)}"
    )

    # Token details are shown only if at least one chunk carries a non-zero count.
    if any(token_counts):
        print(
            f"   Token count - Min: {min(token_counts)}, Max: {max(token_counts)}, "
            f"Avg: {sum(token_counts)//len(token_counts)}"
        )
        print(f"   Chunking method: {chunks[0].metadata.get('chunking_method', 'unknown')}")
        print(f"   Encoding: {chunks[0].metadata.get('encoding', 'N/A')}")


🔄 Processing Northwind_Traders_Database_Overview.pdf...
✓ Using token-based chunking with cl100k_base encoding
✅ Created 34 intelligent chunks from the PDF

📊 Chunk Statistics:
   Character count - Min: 159, Max: 2049, Avg: 1257
   Token count - Min: 34, Max: 396, Avg: 248
   Chunking method: tokens
   Encoding: cl100k_base


In [25]:
# Import the database document creation function
from Create_RAG_docs_from_db import create_comprehensive_northwind_business_documents

# Generate comprehensive business documents from database
print("🔄 Generating comprehensive business documents from Northwind database...")

try:
    # Connection settings are read from environment variables so that no secret
    # is stored in the notebook. The password has no default: it comes from
    # NORTHWIND_DB_PASSWORD or, failing that, an interactive prompt.
    # SECURITY: a database password was previously committed in this cell;
    # it should be considered compromised and rotated.
    db_password = os.environ.get("NORTHWIND_DB_PASSWORD") or getpass.getpass(
        "Northwind database password:"
    )

    business_docs = create_comprehensive_northwind_business_documents(
        host=os.environ.get(
            "NORTHWIND_DB_HOST",
            "ep-aged-leaf-a5sdyft6-pooler.us-east-2.aws.neon.tech",
        ),
        username=os.environ.get("NORTHWIND_DB_USER", "neondb_owner"),
        password=db_password,
        database=os.environ.get("NORTHWIND_DB_NAME", "neondb"),
        schema=os.environ.get("NORTHWIND_DB_SCHEMA", "northwind"),
    )

    print(f"✅ Generated {len(business_docs)} comprehensive business documents")
    print(f"📄 Total content length: {sum(len(doc) for doc in business_docs):,} characters")

except Exception as e:
    # Deliberate best-effort: the rest of the notebook can still run on the
    # PDF chunks alone, so report the failure and continue with no DB documents.
    print(f"⚠️ Could not connect to database: {e}")
    print("📝 Using empty list for business documents - PDF documents will still work")
    business_docs = []


🔄 Generating comprehensive business documents from Northwind database...
Connecting to Northwind database and generating comprehensive business documents...
Generating customer analysis document...
Generating customer purchasing behavior analysis...
Generating comprehensive product analysis...
Generating supplier analysis...
Generating employee and territory analysis...
Generating shipping and logistics analysis...
Generating comprehensive financial analysis...
Generating advanced business intelligence insights...
Successfully created 8 comprehensive business documents from Northwind PostgreSQL database
Total document length: 37,030 characters
✅ Generated 8 comprehensive business documents
📄 Total content length: 37,030 characters


In [26]:
# Convert database documents to LangChain format
def convert_business_docs_to_langchain(business_docs):
    """Convert database business documents to LangChain Document format.

    Args:
        business_docs: Sequence of document texts. Each item is used as the
            ``page_content`` and measured with ``len`` for ``char_count``.

    Returns:
        A list with one ``Document`` per input item, in the same order. Each
        document's metadata records a positional ``document_id``, a
        ``source``/``section_title`` derived from the document type, and fixed
        tags marking it as database-generated. An empty input yields ``[]``.
    """
    # Imported inside the function so this cell works on a fresh kernel run
    # top to bottom: the notebook's module-level import of Document lives in a
    # later cell than this definition and its first call.
    from langchain.schema import Document

    # Document type mapping for better metadata. Types are assigned by
    # position, so the order must match the order of the generated documents.
    doc_types = [
        "customer_analysis", "customer_behavior", "product_catalog",
        "supplier_analysis", "employee_performance", "shipping_logistics",
        "financial_performance", "business_intelligence", "operational_efficiency"
    ]

    langchain_docs = []
    for i, doc in enumerate(business_docs):
        # Inputs beyond the known types get a generic positional label.
        doc_type = doc_types[i] if i < len(doc_types) else f"business_doc_{i}"

        langchain_docs.append(
            Document(
                page_content=doc,
                metadata={
                    "source": f"northwind_database_{doc_type}",
                    "type": "business_analysis",
                    "document_id": i,
                    "comprehensive": True,
                    "data_source": "postgresql_database",
                    "section_title": doc_type.replace('_', ' ').title(),
                    "section_level": 1,
                    "char_count": len(doc),
                    "chunk_type": "database_generated"
                }
            )
        )

    return langchain_docs

# Wrap the generated database texts as LangChain documents.
db_documents = convert_business_docs_to_langchain(business_docs)

print(f"📚 Converted {len(db_documents)} database documents to LangChain format")

# Show one metadata sample, or say so when the database step produced nothing.
if not db_documents:
    print("📝 No database documents to convert")
else:
    print(f"📄 Sample DB metadata: {db_documents[0].metadata}")


📚 Converted 8 database documents to LangChain format
📄 Sample DB metadata: {'source': 'northwind_database_customer_analysis', 'type': 'business_analysis', 'document_id': 0, 'comprehensive': True, 'data_source': 'postgresql_database', 'section_title': 'Customer Analysis', 'section_level': 1, 'char_count': 5610, 'chunk_type': 'database_generated'}


In [27]:
# Convert our DocumentChunk objects to LangChain Document format
from langchain.schema import Document

def convert_chunks_to_langchain_docs(chunks):
    """Wrap each DocumentChunk in a LangChain ``Document``.

    The chunk's ``content`` becomes ``page_content``. The metadata combines the
    chunk's own attributes (``page_number``, ``title``, ``section_level``) with
    values looked up in ``chunk.metadata``, substituting a default for any key
    that is absent.

    Returns:
        A list of ``Document`` objects in the same order as ``chunks``.
    """

    def _metadata_for(chunk):
        # Values recorded by the chunker; defaults cover keys it did not set.
        recorded = chunk.metadata
        return {
            "source": recorded.get('source', 'Northwind_Traders_Database_Overview.pdf'),
            "page": chunk.page_number,
            "section_title": chunk.title,
            "section_level": chunk.section_level,
            "char_count": recorded.get('char_count', len(chunk.content)),
            "token_count": recorded.get('token_count', 0),
            "chunking_method": recorded.get('chunking_method', 'unknown'),
            "encoding": recorded.get('encoding', 'N/A'),
            "chunk_type": recorded.get('chunk_type', 'section'),
            "is_split": recorded.get('is_split', False)
        }

    return [
        Document(page_content=chunk.content, metadata=_metadata_for(chunk))
        for chunk in chunks
    ]

# Wrap the PDF chunks as LangChain documents.
pdf_documents = convert_chunks_to_langchain_docs(chunks)

# One corpus: PDF chunks first, then the database-generated documents.
all_documents = [*pdf_documents, *db_documents]

print(
    "\n".join(
        [
            f"📚 Combined Documents Summary:",
            f"   • PDF chunks: {len(pdf_documents)} documents",
            f"   • Database docs: {len(db_documents)} documents",
            f"   • Total documents: {len(all_documents)} documents",
        ]
    )
)

# Print one metadata sample per source; a source with no documents is skipped.
if pdf_documents:
    print(f"📄 Sample PDF metadata: {pdf_documents[0].metadata}")
if db_documents:
    print(f"📄 Sample DB metadata: {db_documents[0].metadata}")


📚 Combined Documents Summary:
   • PDF chunks: 34 documents
   • Database docs: 8 documents
   • Total documents: 42 documents
📄 Sample PDF metadata: {'source': 'data/Northwind_Traders_Database_Overview.pdf', 'page': 1, 'section_title': 'Northwind Traders Database Overview', 'section_level': 1, 'char_count': 427, 'token_count': 77, 'chunking_method': 'tokens', 'encoding': 'cl100k_base', 'chunk_type': 'section', 'is_split': False}
📄 Sample DB metadata: {'source': 'northwind_database_customer_analysis', 'type': 'business_analysis', 'document_id': 0, 'comprehensive': True, 'data_source': 'postgresql_database', 'section_title': 'Customer Analysis', 'section_level': 1, 'char_count': 5610, 'chunk_type': 'database_generated'}


In [28]:
# Initialize OpenAI embeddings
from langchain_openai.embeddings import OpenAIEmbeddings

# Embedding model handed to the vector store when the documents are indexed.
embedding_model = OpenAIEmbeddings(
    model="text-embedding-3-small",
)
print("✅ Initialized OpenAI embeddings model")


✅ Initialized OpenAI embeddings model


In [29]:
# Create Qdrant vector store in memory with all documents
from langchain_community.vectorstores import Qdrant

# We may need to split large database documents for better retrieval
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split large database documents while keeping PDF chunks intact
def smart_split_documents(all_documents, chunk_size=1500, chunk_overlap=200, split_threshold=2000):
    """Split only large database documents while preserving PDF chunks.

    A document is split when its metadata marks it as coming from the
    PostgreSQL database (``data_source == 'postgresql_database'``) and its
    ``page_content`` is longer than ``split_threshold`` characters. Every other
    document is passed through unchanged (the same object is returned).

    Args:
        all_documents: Iterable of documents exposing ``page_content`` and a
            ``metadata`` dict.
        chunk_size: Maximum characters per split, passed to the text splitter.
        chunk_overlap: Characters of overlap between splits, passed to the
            text splitter.
        split_threshold: Database documents longer than this are split.

    Returns:
        A list containing pass-through documents and splits, in input order.
        Each split's metadata gains ``split_index``, ``original_length`` and
        ``is_split=True``, and its ``char_count`` is set to the split's own
        length rather than the length inherited from the parent document.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    final_documents = []
    for doc in all_documents:
        is_database_doc = doc.metadata.get('data_source') == 'postgresql_database'
        original_length = len(doc.page_content)

        if not (is_database_doc and original_length > split_threshold):
            # Keep PDF chunks and smaller DB docs as-is
            final_documents.append(doc)
            continue

        splits = text_splitter.split_documents([doc])
        # NOTE(review): this relies on the splitter giving every split its own
        # metadata dict (otherwise the per-split values would overwrite each
        # other) — confirm against the splitter's documentation.
        for split_index, split in enumerate(splits):
            split.metadata.update({
                'split_index': split_index,
                'original_length': original_length,
                'is_split': True,
                # The inherited char_count describes the parent document and
                # would merely duplicate original_length.
                'char_count': len(split.page_content),
            })
        final_documents.extend(splits)

    return final_documents

# Split the oversized database documents; everything else passes through.
processed_documents = smart_split_documents(all_documents)

print(
    "\n".join(
        [
            f"📊 Document Processing Summary:",
            f"   • Original documents: {len(all_documents)}",
            f"   • After smart splitting: {len(processed_documents)}",
        ]
    )
)

# Embed and index every processed document in an in-memory Qdrant collection.
qdrant_vectorstore = Qdrant.from_documents(
    processed_documents,
    embedding_model,
    collection_name="northwind_comprehensive_data",
    location=":memory:",
)

print(f"✅ Created Qdrant vector store with {len(processed_documents)} documents (PDF + Database)")


📊 Document Processing Summary:
   • Original documents: 42
   • After smart splitting: 68
✅ Created Qdrant vector store with 68 documents (PDF + Database)


In [30]:
# Create a simple score-filtered retriever
from langchain.schema import BaseRetriever, Document
from typing import List, Any
from pydantic import Field

class ScoreFilteredRetriever(BaseRetriever):
    """Retriever that drops vector-store hits scoring below a threshold.

    The vector store is asked for the top ``k`` hits together with their
    scores; only hits whose score is at least ``score_threshold`` are returned.
    """

    # Store queried through ``similarity_search_with_score(query, k=...)``.
    vectorstore: Any = Field()
    # Minimum score (inclusive) a hit needs in order to be returned.
    # NOTE(review): assumes the store reports scores where larger means more
    # similar — confirm for the configured distance metric.
    score_threshold: float = Field(default=0.5)
    # Number of candidates requested from the store before filtering.
    k: int = Field(default=5)

    class Config:
        arbitrary_types_allowed = True

    def _get_relevant_documents(self, query: str) -> List[Document]:
        """Return the documents among the top ``k`` hits that meet the threshold."""
        scored_hits = self.vectorstore.similarity_search_with_score(query, k=self.k)

        kept_documents = []
        for document, score in scored_hits:
            if score >= self.score_threshold:
                kept_documents.append(document)

        return kept_documents

# Retriever over the Qdrant store: request 8 candidates per query and keep
# those scoring at least 0.3.
retriever = ScoreFilteredRetriever(
    vectorstore=qdrant_vectorstore,
    k=8,
    score_threshold=0.3,
)

print("✅ Created score-filtered retriever")


✅ Created score-filtered retriever


In [31]:
# Define the enhanced RAG prompt template
from langchain_core.prompts import ChatPromptTemplate

# Prompt text with two placeholders: {context} (filled from the retriever) and
# {question} (the user's query). In the rendered prompt the context and query
# come first and the assistant instructions follow them.
RAG_PROMPT = """
CONTEXT:
{context}

QUERY:
{question}

You are a helpful assistant with access to comprehensive Northwind Traders information from both:
1. Database Overview PDF documentation (structural information)
2. Live database analysis reports (current business data and performance metrics)

Use the provided context to answer the question thoroughly. If you can't answer the question based on the context, say you don't know. When possible, distinguish between structural/design information and actual business performance data.

Answer:
"""

# Build the chat prompt object from the template string.
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
print("✅ Created enhanced RAG prompt template")


✅ Created enhanced RAG prompt template


In [32]:
# Initialize the LLM
from langchain_openai import ChatOpenAI

# Chat model that generates the final answer in the RAG chain.
llm = ChatOpenAI(
    model="gpt-4o-mini",
)
print("✅ Initialized OpenAI chat model")


✅ Initialized OpenAI chat model


In [33]:
# Create the RAG chain
from operator import itemgetter
from langchain.schema.output_parser import StrOutputParser

# Pipeline: the input dict's "question" is sent to the retriever to produce the
# context and is also forwarded unchanged; both fill the prompt, the model
# answers, and the parser reduces the model output to a plain string.
# The retriever's result is handed to the prompt as-is (no explicit formatting
# step is applied to the retrieved documents here).
rag_chain = (
    {
        "context": itemgetter("question") | retriever,
        "question": itemgetter("question"),
    }
    | rag_prompt
    | llm
    | StrOutputParser()
)

print("✅ Created RAG chain")


✅ Created RAG chain


In [34]:
# Smoke-test the RAG chain with questions spanning both the PDF documentation
# and the database-generated reports.
test_questions = [
    "What are the main entities in the Northwind database?",
    "Who are the top performing customers and employees?",
    "What are the best selling products and their suppliers?",
    "How does the order processing system work?",
    "What are the shipping and logistics insights?",
    "Describe customer demographics and geographic distribution.",
    "What inventory management challenges does Northwind face?"
]

print("🧪 Testing Enhanced RAG System (PDF + Database):\n")
print("=" * 80)

for question_number, question in enumerate(test_questions, start=1):
    print(f"\n🔍 Question {question_number}: {question}")
    print("-" * 60)

    # Answer produced by the full chain.
    response = rag_chain.invoke({"question": question})
    print(response)

    # Query the retriever again on its own to report how many documents came
    # from each source.
    docs = retriever.invoke(question)
    db_docs = sum(1 for doc in docs if doc.metadata.get('data_source') == 'postgresql_database')
    pdf_docs = sum(1 for doc in docs if doc.metadata.get('data_source') != 'postgresql_database')

    print(f"\n📊 Retrieved {len(docs)} documents ({pdf_docs} PDF, {db_docs} Database)")

    # Heavy separator between questions, but not after the last one.
    is_last_question = question_number >= len(test_questions)
    if not is_last_question:
        print("\n" + "=" * 80)


🧪 Testing Enhanced RAG System (PDF + Database):


🔍 Question 1: What are the main entities in the Northwind database?
------------------------------------------------------------
The main entities in the Northwind database, as per the structural information provided, include:

1. **Customers**: Represents the companies or individuals who purchase products. The Customers table stores customer information such as company name, contact person, address, and other contact details.

2. **Products**: Represents the items that Northwind offers for sale. The Products table includes details like product name, description, packaging, price, and stock levels.

3. **Orders**: Represents customer orders. Each order links to a specific customer and employee handling the order, capturing information on order details.

4. **Order Details**: Associates each order with the products sold within that order. This indicates the quantity of each product and connects to the relevant Product entry.

5. **Employ

In [35]:
# Interactive question-answering function
def ask_question(question: str):
    """Run one question through the RAG chain, print the exchange, and return the answer.

    Prints the question, a divider, and the chain's answer. The retriever is
    then invoked separately with the same question so the number of source
    documents can be reported.

    Args:
        question: The question text passed to the chain and the retriever.

    Returns:
        The value returned by ``rag_chain.invoke``.
    """
    divider = "-" * 60
    print(f"🔍 Question: {question}")
    print(divider)

    answer = rag_chain.invoke({"question": question})
    print(f"🤖 Answer: {answer}")

    # Separate retrieval call, used only for the count shown below.
    source_docs = retriever.invoke(question)
    print(f"\n📚 Used {len(source_docs)} source documents for context")

    return answer

# Example usage:
# ask_question("Who are the employees mentioned in the Northwind database?")


In [15]:
# ask_question already prints the answer; the trailing semicolon stops the
# notebook from echoing the returned string a second time.
ask_question("Who are the employees mentioned in the Northwind database?");

🔍 Question: Who are the employees mentioned in the Northwind database?
------------------------------------------------------------
🤖 Answer: The context does not provide specific names of employees in the Northwind database. It discusses employee roles and organizational structure but does not list any individual employees. Therefore, I don't know the names of the employees mentioned in the Northwind database.

📚 Used 8 source documents for context


"The context does not provide specific names of employees in the Northwind database. It discusses employee roles and organizational structure but does not list any individual employees. Therefore, I don't know the names of the employees mentioned in the Northwind database."