In [None]:
# Configure AWS profile for local development
%env AWS_PROFILE=platform-developer

# Imports and Neptune client initialization
# All dependencies managed by uv and pyproject.toml
import pprint
from typing import Any

import pandas as pd
from tabulate import tabulate

from ingestor.queries.concept_queries import CONCEPT_QUERY

# Import catalogue graph modules (should work if running in proper uv environment)
from utils.aws import get_neptune_client

# Create the shared Neptune client that `run_query` sends every query through.
# `is_local=True` asks `get_neptune_client` for its local-development setup —
# presumably this relies on the AWS_PROFILE set at the top of this cell;
# confirm in `utils.aws`.
neptune_client = get_neptune_client(is_local=True)


# Utility functions for querying and result handling
def run_query(query: str, parameters: dict[str, Any] | None = None) -> list[dict]:
    """Run an openCypher query through the notebook's shared Neptune client.

    Args:
        query: The openCypher query text.
        parameters: Optional mapping of ``$param`` names to values; passed
            through unchanged (including ``None``).

    Returns:
        Whatever ``neptune_client.run_open_cypher_query`` returns for the query
        (annotated here as a list of row dictionaries).
    """
    rows = neptune_client.run_open_cypher_query(query, parameters)
    return rows


def display_results(results: list[dict], limit: int = 10) -> None:
    """Print query rows as a grid table, truncated to ``limit`` rows.

    Args:
        results: Row dictionaries to show; an empty/falsy value prints a
            placeholder message instead of a table.
        limit: Maximum number of rows to render (defaults to 10).

    Output is, in order: a one-line summary with the total row and column
    counts, the table itself, and — when rows were cut off — a line saying
    how many were left out.
    """
    if not results:
        print("No results to display")
        return

    total_rows = len(results)
    frame = pd.DataFrame(results)
    print(f"Results: {total_rows} rows, {len(frame.columns)} columns")

    table_text = tabulate(
        frame.head(limit),
        headers="keys",
        tablefmt="grid",
        showindex=False,
    )
    print(table_text)

    # Only mention the remainder when something was actually truncated.
    remaining_rows = total_rows - limit
    if remaining_rows > 0:
        print(f"... and {remaining_rows} more rows")


def display_raw_results(results: list[dict], limit: int = 3) -> None:
    """Pretty-print the first ``limit`` raw result rows for exploration.

    Args:
        results: Row dictionaries to inspect; an empty/falsy value prints a
            placeholder message and nothing else.
        limit: Maximum number of rows to print (defaults to 3).

    Each row is printed under a ``--- Result N ---`` heading with nested
    structures collapsed below depth 2 and lines wrapped at 80 columns.
    """
    # Imported locally under a private alias on purpose: a later cell runs
    # `from pprint import pprint`, which rebinds the notebook-global name
    # `pprint` from the module to the function. Looking up `pprint.pprint`
    # through that global would then raise AttributeError.
    from pprint import pprint as _pretty_print

    if not results:
        print("No results to display")
        return

    # Report the size of the slice actually printed, so the header always
    # matches the number of "--- Result N ---" sections that follow.
    shown = results[:limit]
    print(
        f"🔍 Raw results structure (showing {len(shown)} of {len(results)}):"
    )
    for position, row in enumerate(shown, start=1):
        print(f"\n--- Result {position} ---")
        _pretty_print(row, width=80, depth=2)


def export_results_to_csv(results: list[dict], filename: str) -> None:
    """Write query rows to ``filename`` as CSV (header row, no index column).

    Args:
        results: Row dictionaries to export.
        filename: Destination path handed to ``DataFrame.to_csv``.

    When ``results`` is empty no file is written; a message says so, matching
    the other helpers in this cell instead of returning silently.
    """
    if not results:
        print("No results to export")
        return

    pd.DataFrame(results).to_csv(filename, index=False)
    print(f"✓ Results exported to {filename}")

# Catalogue Graph Query Notebook

This notebook provides an interface for querying the Wellcome Collection catalogue graph using openCypher queries. It assumes you are running inside the project's `uv` environment and using VS Code with the matching Python kernel selected.

## Prerequisites
- Ensure you're running this notebook in the proper `uv` environment
- Have the `platform-developer` AWS profile configured
- Have access to the Wellcome Collection VPN/network

## Quick Start
1. Make sure you're in the `catalogue_graph` directory
2. Run `uv sync` to ensure all dependencies are installed
3. Create or activate a virtual environment using `uv venv`
4. Select the appropriate kernel `catalogue_graph/.venv/bin/python` in VS Code
5. Run the setup cell above to initialize the Neptune connection
6. Use the pre-built query functions or execute custom queries below

---

## Basic Graph Exploration

Let's start with some basic queries to understand the structure of the catalogue graph.

In [None]:
# ==== Catalogue Graph Primer (Read-only, fast) ====
# Quick tour of the graph & Cypher querying patterns.
# Core syntax reminders:
#   (n:Label)            single label
#   (n:LabelA:LabelB)    multi-label
#   (a)-[:REL_TYPE]->(b)  directed relationship
#   Use $param for parameters, passed as dict.
# All queries here are read-only (MATCH only) and limited / aggregated for speed.

# Node labels this primer counts (extend if new labels are added in the model layer).
KNOWN_LABELS = [
    "Concept",
    "SourceLocation",
    "SourceName",
    # Base concept from external vocabularies (may also carry SourceLocation/SourceName)
    "SourceConcept",
    "Work",
    "PathIdentifier",
]

print("\n==== 1. Node label counts (targeted)")

# Phase 1: decide which count queries to run, as (row label, query) pairs.
# SourceConcept is reported twice: every node carrying the label ("total") and
# only those without the SourceLocation / SourceName labels ("pure").
label_count_queries: list[tuple[str, str]] = []
for label in KNOWN_LABELS:
    if label != "SourceConcept":
        label_count_queries.append((label, f"""
MATCH (n:`{label}`)
RETURN count(n) AS c
"""))
        continue

    label_count_queries.append(("SourceConcept (total)", """
MATCH (n:SourceConcept)
RETURN count(n) AS c
"""))
    label_count_queries.append(("SourceConcept (pure)", """
MATCH (n:SourceConcept)
WHERE NOT n:SourceLocation AND NOT n:SourceName
RETURN count(n) AS c
"""))

# Phase 2: run each query; a query that returns no row counts as 0.
label_count_rows: list[dict] = []
for row_label, count_query in label_count_queries:
    count_rows = run_query(count_query)
    node_count = count_rows[0]["c"] if count_rows else 0
    label_count_rows.append({"label": row_label, "count": node_count})

# Largest labels first; show every row rather than the default first 10.
label_count_rows.sort(key=lambda row: row["count"], reverse=True)
display_results(label_count_rows, limit=len(label_count_rows))

print("\n==== 2. Relationship type counts")
# Every directed relationship in the graph, grouped by its type.
# The query itself is unbounded; display_results shows the first 10 rows.
relationship_types_query = """
MATCH ()-[r]->()
RETURN type(r) AS relationship_type, count(r) AS count
ORDER BY count DESC
"""
relationship_type_rows = run_query(relationship_types_query)
display_results(relationship_type_rows)

print("\n==== 3. SourceConcept specialisations")
# Buckets each SourceConcept node by its extra label. CASE takes the first
# matching branch, so a node carrying both extra labels counts as SourceLocation.
source_concept_kinds_query = """
MATCH (sc:SourceConcept)
WITH sc,
     CASE
       WHEN 'SourceLocation' IN labels(sc) THEN 'SourceLocation'
       WHEN 'SourceName'     IN labels(sc) THEN 'SourceName'
       ELSE 'SourceConcept'
     END AS kind
RETURN kind, count(*) AS count
ORDER BY count DESC
"""
source_concept_kind_rows = run_query(source_concept_kinds_query)
display_results(source_concept_kind_rows)

print("\n==== 4. Concept → SourceConcept links (top 10 source combinations)")
# HAS_SOURCE_CONCEPT links grouped by the `source` property of both endpoints.
concept_to_source_summary_query = """
MATCH (c:Concept)-[:HAS_SOURCE_CONCEPT]->(sc:SourceConcept)
RETURN c.source AS concept_source, sc.source AS source_concept_source, count(*) AS links
ORDER BY links DESC
LIMIT 10
"""
concept_to_source_rows = run_query(concept_to_source_summary_query)
display_results(concept_to_source_rows)

print("\n==== 5. Work → Concept relationship referenced_type sample (top 10)")
# HAS_CONCEPT edges grouped by the edge's `referenced_type` property.
work_concept_rel_types_query = """
MATCH (:Work)-[hc:HAS_CONCEPT]->(c:Concept)
RETURN hc.referenced_type AS referenced_type, count(*) AS count
ORDER BY count DESC
LIMIT 10
"""
work_concept_rel_type_rows = run_query(work_concept_rel_types_query)
display_results(work_concept_rel_type_rows)

print("\n==== 6. SAME_AS equivalence edges sample (top 10)")
# SAME_AS edges between SourceConcept nodes, grouped by (from, to) `source`.
same_as_sample_query = """
MATCH (a:SourceConcept)-[:SAME_AS]->(b:SourceConcept)
WITH a, b
RETURN a.source AS from_source, b.source AS to_source, count(*) AS edges
ORDER BY edges DESC
LIMIT 10
"""
same_as_rows = run_query(same_as_sample_query)
display_results(same_as_rows)

print("\n==== 7. PathIdentifier presence")
# Single-row result: how many PathIdentifier nodes exist.
path_identifier_query = """
MATCH (p:PathIdentifier)
RETURN count(p) AS path_identifier_count
"""
path_identifier_rows = run_query(path_identifier_query)
display_results(path_identifier_rows)

print("\n==== 8a. Concept detail (parameterised)")
example_concept_id = "w2k2tx9h"   # Adjust if needed
# Looks up one concept by id and reports how many source concepts it links to
# and how many works reference it.
#
# The source-concept count is aggregated in its own WITH stage before the
# works are matched. Chaining both OPTIONAL MATCH clauses directly would build
# one intermediate row per (source concept, work) pair, which grows quickly for
# a concept referenced by many works. The returned columns are unchanged.
concept_detail_query = """
MATCH (c:Concept {id: $concept_id})
OPTIONAL MATCH (c)-[:HAS_SOURCE_CONCEPT]->(sc:SourceConcept)
WITH c, count(DISTINCT sc) AS source_concept_count
OPTIONAL MATCH (c)<-[:HAS_CONCEPT]-(w:Work)
RETURN c.id AS concept_id,
       c.source AS concept_source,
       source_concept_count,
       count(DISTINCT w) AS referencing_work_count
"""
display_results(run_query(concept_detail_query, {"concept_id": example_concept_id}))

print("\n==== 8b. Work detail (parameterised)")
example_work_id = "tsayk6g3"      # Adjust if needed
# One row for the given work: its id and title, the number of distinct concepts
# it links to, and up to five distinct `referenced_type` values from those links.
# A work without any HAS_CONCEPT edge produces no row at all.
work_detail_query = """
MATCH (w:Work {id: $work_id})-[hc:HAS_CONCEPT]->(c:Concept)
RETURN w.id AS work_id,
       w.title AS title,
       count(DISTINCT c) AS concept_count,
       collect(DISTINCT hc.referenced_type)[0..5] AS concept_types_sample
"""
work_detail_params = {"work_id": example_work_id}
work_detail_rows = run_query(work_detail_query, work_detail_params)
display_results(work_detail_rows)

## Concept Queries

Query and explore concepts in the catalogue graph.

In [None]:
# Fetch one small page of concepts with the ingestor's pre-built CONCEPT_QUERY
# (per the original note, it returns concepts with their linked source concepts).
# The page is kept to 3 rows so the raw output at the end of the cell stays readable.
concept_sample_params = dict(start_offset=0, limit=3)

print("Running pre-built concept query...")
print(f"Query parameters: {concept_sample_params}")
print(CONCEPT_QUERY)

results = run_query(CONCEPT_QUERY, parameters=concept_sample_params)

# Tabular summary first ...
display_results(results)

# ... then the bare expression, so Jupyter also renders the unabridged rows.
results

In [None]:
from pprint import pprint

# Search for concepts by label (case-insensitive)
def search_concepts(search_term: str, limit: int = 10):
    """Find concepts whose label contains ``search_term``, ignoring case.

    Args:
        search_term: Substring to look for in ``Concept.label``.
        limit: Maximum number of rows the query returns (defaults to 10).

    Returns:
        The rows from ``run_query``, each with ``concept_id`` and ``label``,
        ordered by label.
    """
    return run_query(
        """
    MATCH (c:Concept)
    WHERE toLower(c.label) CONTAINS toLower($search_term)
    RETURN c.id as concept_id, c.label as label
    ORDER BY c.label
    LIMIT $limit
    """,
        {"search_term": search_term, "limit": limit},
    )


# Lookup a concept by ID
def lookup_by_concept_id(concept_id: str):
    """Fetch the ``Concept`` node(s) whose ``id`` equals ``concept_id``.

    Args:
        concept_id: Value compared against ``Concept.id``.

    Returns:
        The rows from ``run_query``; each row holds the matched node under
        the ``concept`` key.
    """
    return run_query(
        """
    MATCH (c:Concept)
    WHERE c.id = $concept_id
    RETURN c as concept
    """,
        {"concept_id": concept_id},
    )

# Example: search for concepts whose label contains "Great Britain. Army."
# (matching is case-insensitive; at most 10 rows come back by default).
search_term = "Great Britain. Army."
search_results = search_concepts(search_term)
display_results(search_results)

# Example: fetch a single concept node by its ID and pretty-print the raw rows.
# NOTE(review): `pprint` here is the function imported at the top of this cell,
# not the `pprint` module imported in the setup cell.
concept_id = "ck4h8gj9"
lookup_result = lookup_by_concept_id(concept_id)
pprint(lookup_result)

## Work Queries

Query and explore works in the catalogue graph and their relationships to concepts.

In [None]:
# The 10 works with the most HAS_CONCEPT links (ranked, not a random sample).
# Each row carries the work's id and title, its concept count, and up to three
# of its concepts, each paired with the edge's `referenced_type`.
works_query = """
MATCH (w:Work)-[hc:HAS_CONCEPT]->(c:Concept)
WITH w, collect({concept: c, type: hc.referenced_type}) as concepts
RETURN w.id as work_id, 
       w.title as title,
       size(concepts) as concept_count,
       concepts[0..3] as sample_concepts
ORDER BY concept_count DESC
LIMIT 10
"""

works_results = run_query(works_query)
# The display limit is spelled out to match the query's own LIMIT 10.
display_results(works_results, limit=10)

## Custom Queries

Use this section to run your own openCypher queries.

In [None]:
# Custom query template: edit `custom_query` (and `custom_params` if the query
# uses $parameters), then run the cell.

# The default query counts every node in the graph.
custom_query = """
MATCH (n)
RETURN count(n) as total_nodes
"""

# Values for any $param placeholders used in the query above.
custom_params = {}

# Run the query and show the first rows as a table.
custom_results = run_query(custom_query, parameters=custom_params)
display_results(custom_results)