# Document Querying System

This notebook implements a document querying system that retrieves information from both a Milvus vector database and a Neo4j graph database. For each user query, the system applies keyword-based heuristics to decide whether to run semantic (vector) search, graph traversal, or both.

## Import Required Libraries

We'll import the necessary libraries for connecting to Milvus and Neo4j, handling queries, processing language, and visualizing results.

In [None]:
# Standard libraries
import os
import re
import json
from typing import List, Dict, Union, Optional, Tuple, Any

# Vector database and embeddings
from langchain_community.vectorstores import Milvus
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document as LangchainDocument

# Graph database
from langchain_community.graphs import Neo4jGraph
import networkx as nx

# Natural language processing
from langchain.chains import RetrievalQA
from langchain_community.llms import LlamaCpp

# Visualization
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from pyvis.network import Network

# Interactive widgets
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

## Configure Database Connections

Let's set up connections to both the Milvus vector database and Neo4j graph database with proper configuration parameters and error handling.

In [None]:
class DatabaseConnector:
    """Handles connections to both Milvus and Neo4j databases"""
    
    def __init__(self, 
                 milvus_host: str = "localhost", 
                 milvus_port: int = 19530,
                 neo4j_uri: str = "bolt://localhost:7687", 
                 neo4j_user: str = "neo4j", 
                 neo4j_password: str = "venev",
                 embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
        """
        Initialize connections to both databases
        """
        self.milvus_config = {
            "host": milvus_host,
            "port": milvus_port
        }
        self.neo4j_config = {
            "uri": neo4j_uri,
            "username": neo4j_user,
            "password": neo4j_password
        }
        self.embedding_model_name = embedding_model_name
        self.neo4j_graph = None
        self.embeddings = None
        
    def connect(self):
        """
        Load the embedding model and open the Neo4j connection.

        Milvus is not contacted by this method.

        Returns:
            bool: True when every step succeeded, False if any step raised
            (the error is printed, not re-raised)
        """
        try:
            # Step 1: embedding model
            self.embeddings = HuggingFaceEmbeddings(model_name=self.embedding_model_name)

            # Step 2: Neo4j handle built from the stored settings
            neo4j_settings = self.neo4j_config
            self.neo4j_graph = Neo4jGraph(
                url=neo4j_settings["uri"],
                username=neo4j_settings["username"],
                password=neo4j_settings["password"]
            )

            # Step 3: run a trivial count query to prove the connection works
            rows = self.neo4j_graph.query("MATCH (n) RETURN count(n) as count LIMIT 1")
            node_total = rows[0]['count']
            print(f"Successfully connected to Neo4j. Node count: {node_total}")

            # Milvus is deliberately left for first use
            print("Embedding model loaded successfully")

        except Exception as e:
            print(f"Error connecting to databases: {str(e)}")
            return False

        return True
            
    def list_milvus_collections(self):
        """
        List the collections available in Milvus.

        A throwaway LangChain `Milvus` wrapper is built from `milvus_config`
        to open the connection, and the collection names are then read
        through pymilvus on that same connection.

        Returns:
            list: Collection names, or an empty list if anything failed
            (the error is printed, not raised)
        """
        try:
            # The wrapper is only needed for the connection it opens
            temp_store = Milvus(
                embedding_function=self.embeddings,
                collection_name="temp_check",
                connection_args=self.milvus_config,
                auto_id=True
            )

            # Imported here so pymilvus is only required when Milvus is used
            from pymilvus import utility

            # The wrapper registers its connection under its own alias, which
            # is not necessarily pymilvus' "default" alias. Query through that
            # alias; fall back to "default" if the wrapper does not expose one.
            alias = getattr(temp_store, "alias", "default")
            return utility.list_collections(using=alias)
        except Exception as e:
            print(f"Error listing Milvus collections: {str(e)}")
            return []
            
    def test_connections(self):
        """Test both database connections and return status"""
        status = {
            "neo4j": False,
            "milvus": False,
            "embeddings": False
        }
        
        # Test embeddings
        try:
            if self.embeddings:
                test_embedding = self.embeddings.embed_query("test")
                status["embeddings"] = len(test_embedding) > 0
        except Exception as e:
            print(f"Embeddings test failed: {str(e)}")
            
        # Test Neo4j
        try:
            if self.neo4j_graph:
                result = self.neo4j_graph.query("RETURN 1 as test")
                status["neo4j"] = len(result) > 0
        except Exception as e:
            print(f"Neo4j test failed: {str(e)}")
            
        # Test Milvus
        try:
            collections = self.list_milvus_collections()
            status["milvus"] = True
            print(f"Available Milvus collections: {collections}")
        except Exception as e:
            print(f"Milvus test failed: {str(e)}")
            
        return status

In [None]:
# Build a connector with the default settings and try to connect
db_connector = DatabaseConnector()
connection_successful = db_connector.connect()

if not connection_successful:
    print("Failed to initialize database connections")
else:
    # Connected: report the health of each backend on its own line
    connection_status = db_connector.test_connections()
    print("Connection status:")
    for db, status in connection_status.items():
        print(f"- {db}: {'Connected' if status else 'Not connected'}")

## Create Document Query Interface Class

We'll implement a unified class that serves as the main interface for querying documents, with methods to access both vector and graph databases.

In [None]:
class DocumentQuerySystem:
    """
    A unified system for querying document knowledge from both vector and graph databases
    """
    
    def __init__(self, db_connector, llm_path=None):
        """
        Initialize the document query system
        
        Args:
            db_connector: DatabaseConnector instance with connections to Milvus and Neo4j
            llm_path: Path to a local LLM for query analysis (optional)
        """
        self.db_connector = db_connector
        self.llm = None
        self.vector_retrievers = {}
        
        # If a local LLM path is provided, initialize it
        if llm_path and os.path.exists(llm_path):
            try:
                self.llm = LlamaCpp(
                    model_path=llm_path,
                    temperature=0.1,
                    max_tokens=512,
                    top_p=0.95,
                    verbose=False
                )
            except Exception as e:
                print(f"Warning: Failed to load LLM from {llm_path}: {str(e)}")
                print("Proceeding without LLM-based query analysis")
        
        # Initialize vector retrievers for each collection
        self._initialize_vector_retrievers()
    
    def _initialize_vector_retrievers(self):
        """Initialize vector retrievers for available collections"""
        collections = self.db_connector.list_milvus_collections()
        
        for collection_name in collections:
            try:
                # Skip temporary or system collections
                if collection_name.startswith("temp_") or collection_name.startswith("_"):
                    continue
                    
                retriever = Milvus(
                    embedding_function=self.db_connector.embeddings,
                    collection_name=collection_name,
                    connection_args=self.db_connector.milvus_config
                ).as_retriever(
                    search_type="similarity",
                    search_kwargs={"k": 5}  # Return top 5 results by default
                )
                
                self.vector_retrievers[collection_name] = retriever
                print(f"Initialized retriever for collection: {collection_name}")
            except Exception as e:
                print(f"Failed to initialize retriever for '{collection_name}': {str(e)}")
    
    def analyze_query_type(self, query):
        """
        Analyze the query to determine if it's better suited for vector search or graph traversal
        
        Returns:
            dict: Query analysis with type and target collection
        """
        # Define patterns for different query types
        relation_patterns = [
            r"(relationship|connection|link|relate|connect|between)",
            r"(how does .* interact with)",
            r"(depends on|uses|calls|connects to)",
            r"(who|what) (is|are) (connected|linked|related) to"
        ]
        
        rule_patterns = [
            r"(rule|regulation|policy|requirement)",
            r"(what (is|are) the (guidelines|rules|requirements))",
            r"(how should|how must|how to)"
        ]
        
        business_patterns = [
            r"(business (logic|flow|process|rule))",
            r"(how does the business)",
            r"(what is the process for)"
        ]
        
        user_patterns = [
            r"(user scenario|user story|use case)",
            r"(how (do|can|would) (user|users))",
            r"(what (does|can|would) (user|users))"
        ]
        
        system_patterns = [
            r"(system (component|module|function|architecture))",
            r"(how (does|do) the system)",
            r"(technical (design|architecture|implementation))"
        ]
        
        # Check which pattern matches best
        query_lower = query.lower()
        
        for pattern in relation_patterns:
            if re.search(pattern, query_lower):
                return {"type": "graph", "confidence": 0.9}
        
        # For vector searches, determine the most appropriate collection
        collections_confidence = {}
        
        for pattern in rule_patterns:
            if re.search(pattern, query_lower):
                collections_confidence["rule_explanations"] = 0.8
                
        for pattern in business_patterns:
            if re.search(pattern, query_lower):
                collections_confidence["business_logic_flows"] = 0.8
                
        for pattern in user_patterns:
            if re.search(pattern, query_lower):
                collections_confidence["user_scenarios"] = 0.8
                
        for pattern in system_patterns:
            if re.search(pattern, query_lower):
                collections_confidence["system_interactions"] = 0.8
                
        # If we have a good match for a collection
        if collections_confidence:
            best_collection = max(collections_confidence.items(), key=lambda x: x[1])
            return {
                "type": "vector", 
                "collection": best_collection[0],
                "confidence": best_collection[1]
            }
        
        # Default to searching all vector stores if we can't determine
        return {"type": "vector", "collection": "all", "confidence": 0.4}
    
    def vector_search(self, query, collection_name=None, k=5):
        """
        Perform a semantic search in the vector database
        
        Args:
            query: The search query string
            collection_name: Specific collection to search (None for all)
            k: Number of results to return
            
        Returns:
            List of document chunks matching the query
        """
        results = []
        
        if collection_name and collection_name in self.vector_retrievers:
            # Search in a specific collection
            retriever = self.vector_retrievers[collection_name]
            retriever.search_kwargs["k"] = k
            docs = retriever.get_relevant_documents(query)
            
            for doc in docs:
                results.append({
                    "content": doc.page_content,
                    "metadata": doc.metadata,
                    "collection": collection_name
                })
                
        elif collection_name == "all" or collection_name is None:
            # Search across all collections
            for name, retriever in self.vector_retrievers.items():
                retriever.search_kwargs["k"] = max(2, k // len(self.vector_retrievers))
                try:
                    docs = retriever.get_relevant_documents(query)
                    for doc in docs:
                        results.append({
                            "content": doc.page_content,
                            "metadata": doc.metadata,
                            "collection": name
                        })
                except Exception as e:
                    print(f"Error searching collection {name}: {str(e)}")
                    
        else:
            print(f"Collection '{collection_name}' not found")
            
        # Sort by relevance (if available in metadata)
        return results
    
    def graph_search(self, query):
        """
        Perform a graph traversal search in Neo4j
        
        Args:
            query: The search query string
            
        Returns:
            Dict containing nodes and relationships found
        """
        if not self.db_connector.neo4j_graph:
            return {"error": "Neo4j connection not available"}
            
        # Extract potential entities from the query
        entities = self._extract_entities_from_query(query)
        
        results = {"nodes": [], "relationships": []}
        
        if not entities:
            # If no specific entities found, return overview of graph structure
            try:
                node_count = self.db_connector.neo4j_graph.query(
                    "MATCH (n) RETURN count(n) as count"
                )
                
                rel_types = self.db_connector.neo4j_graph.query(
                    "MATCH ()-[r]-() RETURN DISTINCT type(r) as relation_type, count(r) as count"
                )
                
                node_types = self.db_connector.neo4j_graph.query(
                    "MATCH (n) RETURN DISTINCT labels(n)[0] as node_type, count(n) as count"
                )
                
                results["summary"] = {
                    "node_count": node_count[0]["count"] if node_count else 0,
                    "relation_types": rel_types,
                    "node_types": node_types
                }
                
            except Exception as e:
                results["error"] = f"Error retrieving graph summary: {str(e)}"
                
        else:
            # Search for paths involving the identified entities
            for entity in entities:
                try:
                    # Find nodes that match this entity name (case-insensitive)
                    entity_nodes = self.db_connector.neo4j_graph.query(
                        """
                        MATCH (n) 
                        WHERE toLower(n.id) CONTAINS toLower($entity_name)
                        RETURN n.id as id, labels(n) as labels
                        LIMIT 5
                        """,
                        {"entity_name": entity}
                    )
                    
                    # If we found matching nodes, find their relationships
                    if entity_nodes:
                        for node in entity_nodes:
                            # Add the node to results
                            if node not in results["nodes"]:
                                results["nodes"].append(node)
                                
                            # Find relationships for this node (up to 2 hops)
                            paths = self.db_connector.neo4j_graph.query(
                                """
                                MATCH path = (n)-[r*1..2]-(m)
                                WHERE n.id = $node_id
                                RETURN nodes(path) as nodes, relationships(path) as rels
                                LIMIT 10
                                """,
                                {"node_id": node["id"]}
                            )
                            
                            for path in paths:
                                # Add all nodes and relationships from the path
                                for n in path["nodes"]:
                                    if n not in results["nodes"]:
                                        results["nodes"].append({
                                            "id": n["id"],
                                            "labels": n["labels"] if "labels" in n else []
                                        })
                                
                                for rel in path["rels"]:
                                    relation = {
                                        "source": rel["source"],
                                        "target": rel["target"],
                                        "type": rel["type"]
                                    }
                                    if relation not in results["relationships"]:
                                        results["relationships"].append(relation)
                    
                except Exception as e:
                    print(f"Error searching for entity '{entity}': {str(e)}")
        
        return results
    
    def _extract_entities_from_query(self, query):
        """
        Extract potential entity names from the query
        
        Args:
            query: The search query string
            
        Returns:
            List of potential entity names
        """
        # Extract noun phrases as potential entities
        # This is a simple approximation - in production you'd use NLP/NER
        
        # Look for specific patterns like "relationship between X and Y"
        relation_pattern = r"(?:relationship|connection|link|relate|connect)\s+(?:between|of|from|to)\s+(\w+)\s+(?:and|to|with)\s+(\w+)"
        matches = re.findall(relation_pattern, query, re.IGNORECASE)
        
        if matches:
            return [entity for match in matches for entity in match]
        
        # Extract capitalized words and words after keywords
        entities = []
        
        # Find words after keywords like "system", "user", "component", etc.
        keyword_pattern = r"(?:system|component|module|entity|table|user|business)\s+(\w+)"
        keyword_matches = re.findall(keyword_pattern, query, re.IGNORECASE)
        entities.extend(keyword_matches)
        
        # Find capitalized words (potential proper nouns)
        capitalized_pattern = r'\b([A-Z][a-z]+)\b'
        capitalized_matches = re.findall(capitalized_pattern, query)
        entities.extend(capitalized_matches)
        
        return list(set(entities))
    
    def hybrid_search(self, query, k=5):
        """
        Perform a hybrid search using both vector and graph databases based on query analysis
        
        Args:
            query: The search query string
            k: Number of results to return
            
        Returns:
            Dict containing search results and analysis info
        """
        # Analyze the query to determine the best approach
        analysis = self.analyze_query_type(query)
        
        results = {
            "query": query,
            "analysis": analysis,
            "vector_results": [],
            "graph_results": None
        }
        
        # Perform search based on analysis
        if analysis["type"] == "graph" or analysis["confidence"] < 0.5:
            # Either the query is explicitly for graph, or we're not confident
            # so we'll try both approaches
            results["graph_results"] = self.graph_search(query)
            
            # Also perform vector search as fallback
            results["vector_results"] = self.vector_search(
                query, 
                collection_name="all", 
                k=k
            )
            
        else:
            # For vector search
            collection = analysis.get("collection", "all")
            results["vector_results"] = self.vector_search(
                query,
                collection_name=collection,
                k=k
            )
        
        return results

## Implement Hybrid Search Functions

The core hybrid search functionality lives in the `DocumentQuerySystem` class above. Based on the wording of the user's query, it decides whether to run semantic search (Milvus), graph traversal (Neo4j), or both.

The key components are:
1. `analyze_query_type()` - Analyzes the user query to determine search strategy
2. `vector_search()` - Performs semantic search in Milvus vector database
3. `graph_search()` - Performs traversal in Neo4j graph database
4. `hybrid_search()` - Combines both approaches based on query analysis

Now, let's create the query processing pipeline.

## Create Query Processing Pipeline

We'll build a pipeline that analyzes user queries, routes them to the appropriate search method, and formats the results in a consistent way.

In [None]:
class QueryProcessor:
    """
    Processes queries and formats results consistently
    """
    
    def __init__(self, query_system):
        """
        Initialize the query processor
        
        Args:
            query_system: DocumentQuerySystem instance
        """
        self.query_system = query_system
        
    def process_query(self, query):
        """
        Process a user query and return formatted results
        
        Args:
            query: User query string
            
        Returns:
            Dict with formatted results and metadata
        """
        # Validate input
        if not query or not query.strip():
            return {
                "success": False,
                "message": "Please provide a valid query",
                "results": None
            }
        
        try:
            # Perform hybrid search
            raw_results = self.query_system.hybrid_search(query)
            
            # Format the results
            formatted_results = self._format_results(raw_results)
            
            return {
                "success": True,
                "message": "Query processed successfully",
                "results": formatted_results,
                "raw": raw_results  # Include raw results for debugging
            }
            
        except Exception as e:
            return {
                "success": False,
                "message": f"Error processing query: {str(e)}",
                "results": None
            }
    
    def _format_results(self, raw_results):
        """
        Format raw search results into a consistent structure
        
        Args:
            raw_results: Results from hybrid search
            
        Returns:
            Dict with formatted sections
        """
        formatted = {
            "query": raw_results["query"],
            "query_type": raw_results["analysis"]["type"],
            "confidence": raw_results["analysis"].get("confidence", 0.0),
            "vector_results": [],
            "graph_results": {
                "summary": {},
                "nodes": [],
                "relationships": []
            }
        }
        
        # Format vector results
        for result in raw_results["vector_results"]:
            formatted["vector_results"].append({
                "content": result["content"],
                "source": result.get("collection", "unknown"),
                "metadata": result.get("metadata", {})
            })
        
        # Format graph results
        if raw_results["graph_results"]:
            # Add summary if available
            if "summary" in raw_results["graph_results"]:
                formatted["graph_results"]["summary"] = raw_results["graph_results"]["summary"]
            
            # Add nodes
            if "nodes" in raw_results["graph_results"]:
                formatted["graph_results"]["nodes"] = raw_results["graph_results"]["nodes"]
            
            # Add relationships
            if "relationships" in raw_results["graph_results"]:
                formatted["graph_results"]["relationships"] = raw_results["graph_results"]["relationships"]
        
        return formatted
    
    def generate_html_report(self, formatted_results):
        """
        Generate an HTML report from formatted results
        
        Args:
            formatted_results: Results from _format_results method
            
        Returns:
            HTML string representation of results
        """
        if not formatted_results:
            return "<div class='error'>No results available</div>"
            
        html = [
            "<div class='query-results'>",
            f"<h3>Results for Query: <span class='query-text'>{formatted_results['query']}</span></h3>",
            f"<div class='query-info'>Query analyzed as: <span class='query-type'>{formatted_results['query_type']}</span> ",
            f"(Confidence: {int(formatted_results['confidence'] * 100)}%)</div>",
            "<hr>"
        ]
        
        # Vector Results Section
        if formatted_results["vector_results"]:
            html.append("<h4>Semantic Search Results</h4>")
            html.append("<div class='vector-results'>")
            
            for i, result in enumerate(formatted_results["vector_results"], 1):
                html.append(f"<div class='result-item'>")
                html.append(f"<h5>Result #{i} <span class='source-tag'>Source: {result['source']}</span></h5>")
                
                # Format content with newlines preserved
                content = result["content"].replace("\n", "<br>")
                html.append(f"<div class='content'>{content}</div>")
                
                # Add metadata if available
                if result["metadata"]:
                    meta_html = ["<div class='metadata'>"]
                    for key, value in result["metadata"].items():
                        meta_html.append(f"<span class='meta-item'>{key}: {value}</span>")
                    meta_html.append("</div>")
                    html.append("".join(meta_html))
                
                html.append("</div>")  # Close result-item
            
            html.append("</div>")  # Close vector-results
            html.append("<hr>")
            
        # Graph Results Section
        graph_results = formatted_results["graph_results"]
        if graph_results and (graph_results.get("nodes") or graph_results.get("summary")):
            html.append("<h4>Graph Search Results</h4>")
            
            # Graph Summary
            if graph_results.get("summary"):
                summary = graph_results["summary"]
                html.append("<div class='graph-summary'>")
                if "node_count" in summary:
                    html.append(f"<p>Total nodes: {summary['node_count']}</p>")
                
                # Node types
                if "node_types" in summary and summary["node_types"]:
                    html.append("<div class='node-types'>")
                    html.append("<h5>Node Types</h5>")
                    html.append("<ul>")
                    for type_info in summary["node_types"]:
                        html.append(f"<li>{type_info['node_type']}: {type_info['count']} nodes</li>")
                    html.append("</ul>")
                    html.append("</div>")
                
                # Relation types
                if "relation_types" in summary and summary["relation_types"]:
                    html.append("<div class='relation-types'>")
                    html.append("<h5>Relation Types</h5>")
                    html.append("<ul>")
                    for rel_info in summary["relation_types"]:
                        html.append(f"<li>{rel_info['relation_type']}: {rel_info['count']} relationships</li>")
                    html.append("</ul>")
                    html.append("</div>")
                
                html.append("</div>")  # Close graph-summary
            
            # Graph Entities
            if graph_results.get("nodes"):
                html.append("<div class='graph-entities'>")
                html.append("<h5>Entities Found</h5>")
                html.append("<ul class='entity-list'>")
                
                for node in graph_results["nodes"]:
                    labels = ", ".join(node.get("labels", ["Entity"]))
                    html.append(f"<li><span class='entity-id'>{node['id']}</span> <span class='entity-label'>[{labels}]</span></li>")
                
                html.append("</ul>")
                html.append("</div>")
                
                # Relationships
                if graph_results.get("relationships"):
                    html.append("<div class='graph-relationships'>")
                    html.append("<h5>Relationships</h5>")
                    html.append("<ul class='relationship-list'>")
                    
                    for rel in graph_results["relationships"]:
                        html.append(f"<li><span class='rel-source'>{rel['source']}</span> " +
                                    f"<span class='rel-type'>-[{rel['type']}]-></span> " +
                                    f"<span class='rel-target'>{rel['target']}</span></li>")
                    
                    html.append("</ul>")
                    html.append("</div>")
                
            html.append("<p class='graph-note'>Note: Use the visualization feature to see a graph representation.</p>")
        
        html.append("</div>")  # Close query-results
        
        # Add CSS styling
        html.append("""
        <style>
            .query-results { font-family: Arial, sans-serif; padding: 10px; }
            .query-text { color: #3366cc; }
            .query-type { color: #009933; font-weight: bold; }
            .result-item { margin-bottom: 15px; padding: 10px; border: 1px solid #ddd; border-radius: 4px; }
            .source-tag { font-size: 0.8em; color: #666; margin-left: 10px; }
            .content { margin: 10px 0; white-space: pre-line; }
            .metadata { margin-top: 5px; font-size: 0.8em; color: #666; }
            .meta-item { margin-right: 10px; }
            .graph-summary, .graph-entities, .graph-relationships { margin: 10px 0; }
            .entity-id { font-weight: bold; color: #3366cc; }
            .entity-label { color: #666; }
            .rel-source, .rel-target { color: #3366cc; }
            .rel-type { color: #009933; font-weight: bold; }
            .error { color: #cc0000; }
        </style>
        """)
        
        return "".join(html)

## Build Interactive Query Interface

Let's create an interactive interface using IPython widgets that allows users to input queries and view results without needing to write code.

In [None]:
class QueryInterface:
    """
    Interactive query interface using IPython widgets

    Connects a text box and Search / Clear / Visualize Graph buttons to a
    query processor. The HTML report is rendered in a "Results" tab and the
    network view of any graph results in a "Graph Visualization" tab.
    """

    # (label substring, node color) pairs, checked in order; first match wins.
    _LABEL_COLORS = (
        ("User", '#ffca3a'),      # Yellow
        ("System", '#8ac926'),    # Green
        ("Table", '#ff595e'),     # Red
        ("Business", '#9d4edd'),  # Purple
    )
    _DEFAULT_NODE_COLOR = '#97c2fc'  # Default blue

    def __init__(self, query_processor):
        """
        Initialize the query interface

        Args:
            query_processor: QueryProcessor instance; its `process_query` and
                `generate_html_report` methods are called by the handlers
        """
        self.query_processor = query_processor
        self.last_results = None      # Raw result of the most recent query
        self.last_graph_data = None   # {"nodes": [...], "relationships": [...]} or None

        # Create widgets
        self.query_input = widgets.Text(
            value='',
            placeholder='Enter your query here...',
            description='Query:',
            disabled=False,
            layout=widgets.Layout(width='80%')
        )

        self.search_button = widgets.Button(
            description='Search',
            disabled=False,
            button_style='primary',
            tooltip='Run query',
            icon='search'
        )

        self.clear_button = widgets.Button(
            description='Clear',
            disabled=False,
            button_style='',
            tooltip='Clear results',
            icon='trash'
        )

        # Starts disabled; enabled only once a query returns graph data
        self.viz_button = widgets.Button(
            description='Visualize Graph',
            disabled=True,
            button_style='info',
            tooltip='Visualize graph results',
            icon='project-diagram'
        )

        self.results_output = widgets.Output(
            layout=widgets.Layout(border='1px solid #ddd', padding='10px', width='100%', overflow='auto')
        )

        self.viz_output = widgets.Output(
            layout=widgets.Layout(border='1px solid #ddd', padding='10px', width='100%', height='500px')
        )

        # Set up button actions
        self.search_button.on_click(self._on_search_button_click)
        self.clear_button.on_click(self._on_clear_button_click)
        self.viz_button.on_click(self._on_viz_button_click)

        # Enable search on Enter key
        # NOTE(review): Text.on_submit is reported as deprecated in ipywidgets 8;
        # kept as-is to preserve behavior - confirm against the installed version.
        self.query_input.on_submit(self._on_search_button_click)

    def display(self):
        """Display the interface"""
        # Create input area with buttons
        input_area = widgets.HBox([
            self.query_input,
            self.search_button,
            self.clear_button
        ])

        # Create tabs for results and visualization
        tabs = widgets.Tab()
        tabs.children = [self.results_output, self.viz_output]
        tabs.set_title(0, 'Results')
        tabs.set_title(1, 'Graph Visualization')

        # Display the interface
        display(widgets.VBox([
            input_area,
            self.viz_button,
            tabs
        ]))

    @staticmethod
    def _extract_graph_data(results):
        """
        Pull the graph portion out of a query's results

        Args:
            results: Mapping that may contain a "graph_results" entry

        Returns:
            Dict with "nodes" and "relationships" lists, or None when there
            are no graph results or both lists are empty
        """
        graph_results = (results or {}).get("graph_results")
        if not graph_results:
            return None

        # `or []` also normalizes an explicit None so later loops can iterate
        nodes = graph_results.get("nodes") or []
        relationships = graph_results.get("relationships") or []
        if not nodes and not relationships:
            return None

        return {"nodes": nodes, "relationships": relationships}

    @classmethod
    def _color_for_label(cls, label):
        """
        Pick a node color for a label string

        Args:
            label: Comma-joined label string of a node

        Returns:
            Hex color of the first matching entry in `_LABEL_COLORS`,
            or the default blue when nothing matches
        """
        for needle, color in cls._LABEL_COLORS:
            if needle in label:
                return color
        return cls._DEFAULT_NODE_COLOR

    def _on_search_button_click(self, button):
        """
        Handle search button click (also used as the Enter-key handler)

        Args:
            button: Widget that triggered the event (unused)
        """
        # Local import keeps this cell self-contained (stdlib only)
        from html import escape

        query = self.query_input.value

        with self.results_output:
            clear_output()
            if not query:
                display(HTML("<div class='error'>Please enter a query</div>"))
                return

            print(f"Searching for: {query}")

            # Process the query
            result = self.query_processor.process_query(query)
            self.last_results = result

            if not result["success"]:
                # Drop graph state from any earlier query so the Visualize
                # button cannot show a graph unrelated to the failed query
                self.last_graph_data = None
                self.viz_button.disabled = True
                # Escape so characters like '<' in the message render as text
                message = escape(str(result.get("message", "Query failed")))
                display(HTML(f"<div class='error'>{message}</div>"))
                return

            # Store graph data and enable visualization only when there is some
            self.last_graph_data = self._extract_graph_data(result["results"])
            self.viz_button.disabled = self.last_graph_data is None

            # Display HTML results
            html_report = self.query_processor.generate_html_report(result["results"])
            display(HTML(html_report))

    def _on_clear_button_click(self, button):
        """
        Handle clear button click: reset the input, both outputs and cached state

        Args:
            button: Widget that triggered the event (unused)
        """
        self.query_input.value = ''
        with self.results_output:
            clear_output()
        with self.viz_output:
            clear_output()
        self.viz_button.disabled = True
        self.last_results = None
        self.last_graph_data = None

    def _on_viz_button_click(self, button):
        """
        Handle visualize button click

        Args:
            button: Widget that triggered the event (unused)
        """
        with self.viz_output:
            clear_output()
            if not self.last_graph_data:
                display(HTML("<div class='error'>No graph data available to visualize</div>"))
                return
            self._create_graph_visualization(self.last_graph_data)

    def _create_graph_visualization(self, graph_data):
        """
        Create a graph visualization from the data

        Tries an interactive PyVis rendering first and falls back to a static
        matplotlib drawing if that raises.

        Args:
            graph_data: Dict with "nodes" (each having "id" and optional
                "labels") and "relationships" (each having "source",
                "target" and "type")
        """
        # Create a networkx graph
        G = nx.DiGraph()

        # Add nodes
        for node in graph_data["nodes"]:
            # Fall back to "Entity" when labels are missing, None or empty
            labels = node.get("labels") or ["Entity"]
            G.add_node(node["id"], label=", ".join(labels))

        # Add edges (endpoints not listed under "nodes" are created implicitly)
        for rel in graph_data["relationships"]:
            G.add_edge(
                rel["source"],
                rel["target"],
                label=rel["type"]
            )

        # If we have no nodes or only one node, add a message
        if len(G.nodes) <= 1:
            display(HTML("<p>Not enough nodes to create a meaningful visualization.</p>"))
            return

        # Create visualization
        try:
            # Create PyVis network
            net = Network(height="450px", width="100%", notebook=True, directed=True)

            # Add nodes with color based on label
            for node, attrs in G.nodes(data=True):
                label = attrs.get("label", "Entity")
                net.add_node(node, label=node, title=label, color=self._color_for_label(label))

            # Add edges with titles
            for source, target, attrs in G.edges(data=True):
                label = attrs.get("label", "")
                net.add_edge(source, target, title=label, label=label)

            # Set physics layout options
            net.set_options("""
            {
              "physics": {
                "forceAtlas2Based": {
                  "springLength": 100
                },
                "minVelocity": 0.75,
                "solver": "forceAtlas2Based"
              }
            }
            """)

            # Display the graph. In notebook mode show() hands back an IFrame
            # rather than rendering it; since this is not the last expression
            # of a cell, it must be displayed explicitly to appear in the tab.
            frame = net.show("temp_graph.html")
            if frame is not None:
                display(frame)

        except Exception as e:
            print(f"Error creating visualization: {str(e)}")

            # Fallback to matplotlib
            try:
                plt.figure(figsize=(10, 8))
                pos = nx.spring_layout(G)
                nx.draw(
                    G,
                    pos,
                    with_labels=True,
                    node_color='skyblue',
                    node_size=1500,
                    arrowsize=20
                )

                # Draw edge labels
                edge_labels = {(u, v): d.get('label', '') for u, v, d in G.edges(data=True)}
                nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)

                plt.title("Graph Search Results")
                plt.axis('off')
                plt.show()
            except Exception as e2:
                print(f"Error with fallback visualization: {str(e2)}")

## Test with Example Queries

Now let's demonstrate the system with example queries showing different types of information retrieval from both databases.

In [None]:
# Initialize the system: each object wraps the one created on the line before it.
# NOTE(review): assumes `db_connector`, `DocumentQuerySystem` and `QueryProcessor`
# were defined by earlier cells - run those first.
document_query_system = DocumentQuerySystem(db_connector)
query_processor = QueryProcessor(document_query_system)
interface = QueryInterface(query_processor)

# Display the interface (input row, Visualize button, and the two result tabs)
interface.display()

## Example Queries to Try

Here are some example queries you can try with the system:

### Vector Search Queries (Milvus)
- "What are the requirements for document processing?"
- "Explain the business logic for data extraction"
- "How does a user upload a file?"
- "What are the steps in document analysis?"

### Graph Search Queries (Neo4j)
- "What systems interact with the document processor?"
- "Show the relationship between document extraction and storage"
- "How do users connect to the database?"
- "What components depend on the vector database?"

### Hybrid Queries
- "How does the system store data from tables?"
- "What is the relationship between document analysis and search functionality?"

## Visualize Knowledge Graph Results

The system includes built-in visualization capabilities for graph query results. When a query returns graph data, the "Visualize Graph" button becomes enabled; click it and open the "Graph Visualization" tab to see a network visualization of the nodes and relationships.

The visualization helps to better understand the relationships between entities in the knowledge graph. Different types of entities are color-coded for easier identification:
- Users: Yellow
- Systems: Green
- Tables: Red
- Business: Purple
- Other entities: Blue

The visualization is interactive - you can:
1. Drag nodes to rearrange the graph
2. Hover over nodes and edges to see additional information
3. Zoom in and out using the mouse wheel
4. Click and drag the background to pan around

This makes it much easier to understand complex relationships compared to looking at text-based results alone.

In [None]:
# Example of manually creating a visualization with sample data
# This is helpful if you want to see what the visualization looks like without running a query

# Sample graph in the same shape the query results use: a list of nodes
# (id plus a single-element label list) and a list of typed, directed
# relationships between node ids.
sample_graph_data = {
    "nodes": [
        {"id": node_id, "labels": [node_label]}
        for node_id, node_label in (
            ("DocumentProcessor", "System"),
            ("Milvus", "Database"),
            ("Neo4j", "Database"),
            ("PDFExtractor", "Component"),
            ("User", "User"),
            ("TableData", "Data"),
        )
    ],
    "relationships": [
        {"source": origin, "target": destination, "type": rel_type}
        for origin, rel_type, destination in (
            ("DocumentProcessor", "STORES_IN", "Milvus"),
            ("DocumentProcessor", "STORES_RELATIONS_IN", "Neo4j"),
            ("PDFExtractor", "PART_OF", "DocumentProcessor"),
            ("User", "USES", "DocumentProcessor"),
            ("DocumentProcessor", "EXTRACTS", "TableData"),
        )
    ],
}

# Example function to visualize sample data
def visualize_sample_graph():
    """
    Draw `sample_graph_data` as a static matplotlib figure

    Builds a directed networkx graph from the sample nodes and relationships,
    colors each node by the first palette key found in its label string, and
    shows the figure with node names and relationship types as labels.
    """
    graph = nx.DiGraph()

    # Nodes carry their labels joined into one display string
    for entry in sample_graph_data["nodes"]:
        graph.add_node(entry["id"], label=", ".join(entry.get("labels", ["Entity"])))

    # Edges carry the relationship type as their label
    for link in sample_graph_data["relationships"]:
        graph.add_edge(link["source"], link["target"], label=link["type"])

    plt.figure(figsize=(10, 8))
    layout = nx.spring_layout(graph, seed=42)  # Fixed seed for a reproducible layout

    # Ordered palette: "Database" must come before "Data" because matching is
    # by substring and the first hit wins.
    palette = (
        ("User", '#ffca3a'),      # Yellow
        ("System", '#8ac926'),    # Green
        ("Database", '#ff595e'),  # Red
        ("Data", '#9d4edd'),      # Purple
    )
    fallback_color = '#97c2fc'    # Default blue

    node_colors = []
    for _, attrs in graph.nodes(data=True):
        text = attrs.get("label", "")
        chosen = fallback_color
        for key, shade in palette:
            if key in text:
                chosen = shade
                break
        node_colors.append(chosen)

    # Nodes, then edges, then node names
    nx.draw_networkx_nodes(graph, layout, node_color=node_colors, node_size=1200)
    nx.draw_networkx_edges(graph, layout, width=1.5, arrowsize=20)
    nx.draw_networkx_labels(graph, layout, font_size=12, font_weight="bold")

    # Relationship types along the edges
    edge_text = {}
    for start, end, attrs in graph.edges(data=True):
        edge_text[(start, end)] = attrs.get('label', '')
    nx.draw_networkx_edge_labels(graph, layout, edge_labels=edge_text, font_size=10)

    plt.title("Sample Document Processing System Graph")
    plt.axis('off')
    plt.tight_layout()
    plt.show()

# Uncomment to show sample visualization
# visualize_sample_graph()

## Conclusion

We've built a comprehensive document querying system that:

1. Integrates with both Milvus vector database and Neo4j graph database
2. Intelligently determines whether to use semantic search or graph traversal based on query analysis
3. Provides an interactive user interface for querying and visualizing results
4. Extracts and displays relationships between entities in the knowledge base

This hybrid approach allows us to get the best of both worlds:
- Semantic search through vector embeddings for finding relevant content
- Graph traversal for exploring relationships between entities

The system can be extended with:
- More sophisticated query analysis using LLMs or rule-based systems
- Additional visualization options for different types of data
- Integration with a full document processing pipeline
- User feedback mechanisms to improve search results over time