# RAG Pipeline Demo

This notebook demonstrates the complete RAG (Retrieval-Augmented Generation) pipeline, including:
1. Data Ingestion Pipeline
2. Query Pipeline
3. Graph-based Query Processing

## 1. Setup and Imports

In [None]:
import os
import sys

# Add src directory to Python path
sys.path.append(os.path.join(os.getcwd(), 'src'))

from src import DataIngestionPipeline, QueryPipeline, QueryProcessorGraph
from pprint import pprint

## 2. Data Ingestion Pipeline

This section demonstrates how to ingest documents into the system. This only needs to be run once.

The ingestion pipeline will automatically process all documents in the specified folder and determine the company and service information from the document content.

In [None]:
# Folder that holds the documents to ingest
input_folder = "path/to/your/documents"  # Replace with your document folder path

# Build the ingestion pipeline for that folder, then run it over every document
ingestion_pipeline = DataIngestionPipeline(input_folder)
print("Starting document ingestion...")
results = ingestion_pipeline.process_folder()

# One summary line per processed file
print("\nIngestion Results:")
for result in results:
    print(f"- Processed {result['file_name']}: {result['total_chunks']} chunks")
    # Company/service are shown only when the result carries both keys
    if all(key in result for key in ('company', 'service')):
        print(f"  Company: {result['company']}, Service: {result['service']}")

print("\nData ingestion completed successfully!")

## 3. Query Pipeline

This section demonstrates how to query the ingested documents. This can be run multiple times.

In [None]:
# Query pipeline bound to the same folder that was ingested above
query_pipeline = QueryPipeline(input_folder)

# Sample questions to run through the pipeline
queries = [
    "What are the benefits of cloud computing?",
    "How does cloud computing improve business operations?",
    "What are the security considerations for cloud computing?"
]

# Ask each question in turn and print the answer followed by its sources
for query in queries:
    print(f"\nQuery: {query}")
    print("-" * 50)

    # Retrieve the top three results for this question
    result = query_pipeline.process_query(query=query, n_results=3)

    print(f"Answer: {result['answer']}")
    print("\nSources:")
    for source in result['sources']:
        print(f"- {source}")
    print("-" * 50)

## 4. Interactive Query

This section allows you to input your own queries interactively.

In [None]:
def interactive_query(pipeline=None, n_results=3):
    """Prompt for queries in a loop and print each answer with its sources.

    The loop ends when the user types 'quit' (case-insensitive), when input
    is exhausted (EOFError), or when the prompt is interrupted
    (KeyboardInterrupt). Blank input is rejected with a message instead of
    being sent to the pipeline.

    Args:
        pipeline: Object exposing ``process_query(query=..., n_results=...)``
            that returns a mapping with ``'answer'`` and ``'sources'``.
            Defaults to the notebook-level ``query_pipeline``.
        n_results: Value forwarded as ``n_results`` for every query.
    """
    if pipeline is None:
        # Fall back to the pipeline created in the "Query Pipeline" cell.
        pipeline = query_pipeline

    while True:
        print("\nEnter your query (or 'quit' to exit):")
        try:
            query = input().strip()
        except (EOFError, KeyboardInterrupt):
            # No more input (or the user interrupted): leave without a traceback.
            break

        if query.lower() == 'quit':
            break

        if not query:
            print("Query cannot be empty. Please try again.")
            continue

        result = pipeline.process_query(
            query=query,
            n_results=n_results
        )

        print(f"\nAnswer: {result['answer']}")
        print("\nSources:")
        for source in result['sources']:
            print(f"- {source}")

# Start the interactive loop; it blocks on input() and returns once the user types 'quit'.
interactive_query()

## 5. Additional Features

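This section runs a multi-part query through the query pipeline, retrieving more results (`n_results=5`) than the earlier examples, and prints the query analysis returned alongside the answer.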

In [None]:
# A multi-part question that touches several topics at once
complex_query = "What are the benefits and challenges of implementing cloud computing, and how do they affect business operations?"

print(f"\nComplex Query: {complex_query}")
print("-" * 50)

# Ask for five results here rather than the three used in the earlier examples
result = query_pipeline.process_query(query=complex_query, n_results=5)

print(f"Answer: {result['answer']}")
print("\nSources:")
for source in result['sources']:
    print(f"- {source}")
print("-" * 50)

# Show the analysis the pipeline returned with the answer
print("\nQuery Analysis:")
pprint(result['query_analysis'])

## 6. Graph-based Query Processing

This section demonstrates how to use the graph-based query processor, which provides a more structured and flexible approach to query processing.

The `QueryProcessorGraph` uses LangGraph to manage the query processing workflow, including:
1. Query rewriting
2. Complexity analysis
3. Query decomposition (for complex queries)
4. Document retrieval
5. Result reranking
6. Answer generation

In [None]:
# Graph-based processor bound to the same document folder
graph_processor = QueryProcessorGraph(input_folder)

# Start with a short, single-topic question
simple_query = "What is cloud computing?"
print(f"\nSimple Query: {simple_query}")
print("-" * 50)

# The query is passed positionally; no other arguments are supplied here
result = graph_processor.process_query(simple_query)
print(f"Answer: {result['answer']}")
print("\nSources:")
for source in result['sources']:
    print(f"- {source}")
print("-" * 50)

# Show the analysis returned with the answer
print("\nQuery Analysis:")
pprint(result['query_analysis'])

In [None]:
# A multi-part question for the graph processor
complex_query = "Compare the benefits and challenges of cloud computing, and explain how to implement a cloud migration strategy."
print(f"\nComplex Query: {complex_query}")
print("-" * 50)

result = graph_processor.process_query(complex_query)
print(f"Answer: {result['answer']}")
print("\nSources:")
for source in result['sources']:
    print(f"- {source}")
print("-" * 50)

# Show the analysis, then the decomposition when one was produced
print("\nQuery Analysis:")
pprint(result['query_analysis'])

# Sub-questions and sub-answers are read only when the query was flagged as complex
if result['query_analysis']['is_complex']:
    print("\nSub-questions:")
    for i, sub_q in enumerate(result['sub_questions'], start=1):
        print(f"{i}. {sub_q}")
    print("\nSub-answers:")
    for i, sub_a in enumerate(result['sub_answers'], start=1):
        print(f"{i}. {sub_a['answer']}")

In [None]:
# Interactive query with graph processor and conversation memory
def _prompt_line(prompt):
    """Print *prompt* and return one stripped line of input, or None when input ends.

    None is returned when stdin is exhausted (EOFError) or the prompt is
    interrupted (KeyboardInterrupt), so the caller can leave its loop
    without a traceback.
    """
    print(prompt)
    try:
        return input().strip()
    except (EOFError, KeyboardInterrupt):
        return None


def _print_graph_result(result):
    """Print the answer, sources, query analysis and any sub-question breakdown."""
    print(f"\nAnswer: {result['answer']}")
    print("\nSources:")
    for source in result['sources']:
        print(f"- {source}")

    print("\nQuery Analysis:")
    pprint(result['query_analysis'])

    # The decomposition keys are only read when the query was flagged as complex.
    if result['query_analysis']['is_complex']:
        print("\nSub-questions:")
        for i, sub_q in enumerate(result['sub_questions'], 1):
            print(f"{i}. {sub_q}")
        print("\nSub-answers:")
        for i, sub_a in enumerate(result['sub_answers'], 1):
            print(f"{i}. {sub_a['answer']}")


def interactive_graph_query(processor=None):
    """Run an interactive question loop against a graph-based query processor.

    Commands (case-insensitive): 'quit' leaves the loop, 'history' prints the
    processor's ``conversation_history``, 'clear' resets it to an empty list.
    Any other non-empty input is treated as a query; the user is then asked
    for an optional company and service, which are forwarded to
    ``process_query`` only when non-empty. The loop also ends cleanly when
    input is exhausted or interrupted at any prompt.

    Args:
        processor: Object exposing ``process_query(**params)`` and a
            ``conversation_history`` attribute. Defaults to the
            notebook-level ``graph_processor``.
    """
    if processor is None:
        # Fall back to the processor created in the earlier graph cell.
        processor = graph_processor

    print("Welcome to the interactive query system with conversation memory!")
    print("You can ask follow-up questions based on previous answers.")
    print("Type 'quit' to exit, 'history' to see conversation history, or 'clear' to clear history.\n")

    while True:
        query = _prompt_line("\nEnter your query (required, or 'quit'/'history'/'clear'):")
        if query is None:
            break

        command = query.lower()
        if command == 'quit':
            break
        if command == 'history':
            if processor.conversation_history:
                print("\nConversation History:")
                for i, msg in enumerate(processor.conversation_history, 1):
                    print(f"\n{i}. Q: {msg['query']}")
                    print(f"   A: {msg['answer']}")
            else:
                print("No conversation history yet.")
            continue
        if command == 'clear':
            processor.conversation_history = []
            print("Conversation history cleared.")
            continue

        if not query:
            print("Query cannot be empty. Please try again.")
            continue

        company = _prompt_line("Enter company name (optional, press Enter to skip):")
        if company is None:
            break

        service = _prompt_line("Enter service name (optional, press Enter to skip):")
        if service is None:
            break

        # Only pass the optional filters the user actually supplied.
        params = {"query": query}
        if company:
            params["company"] = company
        if service:
            params["service"] = service

        _print_graph_result(processor.process_query(**params))

# Start the graph-based interactive loop; it blocks on input() and returns once the user types 'quit'.
interactive_graph_query()