# Import modules and functions

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import warnings

warnings.filterwarnings("ignore")

In [24]:
import os
import networkx as nx
from sentence_transformers import SentenceTransformer
from IPython.display import display, Markdown

In [4]:
from turingdb.exceptions import TuringDBException

In [26]:
from turingdb_kgsearch.embeddings import (
    build_node_only_embeddings,
    build_context_enriched_embeddings,
    build_smart_enriched_embeddings,
    build_sparse_embeddings,
    build_node2vec_embeddings,
)
from turingdb_kgsearch.search import (
    dense_search,
    sparse_search,
    print_results,
    hybrid_search,
    compare_search_methods,
)
from turingdb_kgsearch.subgraph import get_subgraph_around_query
from turingdb_kgsearch.visualization import (
    visualize_graph_with_pyvis,
    extract_and_visualize_subgraph,
)
from turingdb_kgsearch.workflow import (
    search_and_expand_hybrid_filtered,
    generate_report_hybrid_workflow_results,
)
from turingdb_kgsearch.statistics import get_subgraph_stats, print_subgraph_stats
from turingdb_kgsearch.ranking import (
    rank_nodes_by_importance,
    rank_nodes_by_importance_with_context,
    print_node_rankings,
    compare_node_importance,
    diagnose_rankings,
)
from turingdb_kgsearch.explain_results import (
    explain_retrieval,
    explain_top_results,
    print_explanation,
)
from turingdb_kgsearch.llm import (
    create_llm_prompt_with_graph,
    query_llm,
)

In [5]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

True

# Set path to data

In [6]:
# Name of the example dataset; the GML file name and the graph name are derived from it.
example_name = "software_dependencies"
# The data folder is resolved against the current working directory, so the
# notebook has to be started from the folder that contains `data/`.
path_data = f"{os.getcwd()}/data/{example_name}"
if not os.path.exists(path_data):
    # Fail early with the offending path instead of a later, less obvious I/O error.
    raise ValueError(f"{path_data} does not exist")

# Create graph using `turingdb` python package

<div class="alert alert-block alert-info">
    <h2>
        See <a href="https://docs.turingdb.ai/quickstart">TuringDB Get started documentation</a> for the important steps to follow :
    </h2>
    <h3>
        <ul>
            <li>Create your TuringDB account</li>
            <li>Create your instance in the <a href="https://console.turingdb.ai/auth">TuringDB Cloud UI</a></li>
            <li>Copy your Instance ID from the Database Instances management page</li>
            <li>Get API Key from the Settings in UI</li>
        </ul>
        Remember to have your instance active while working in this notebook !
    </h3>
</div>

## Connect to instance and transfer data

In [7]:
from turingdb import TuringDB

# Create TuringDB client
# As written the client targets the `host` URL below. To use a TuringDB Cloud
# instance instead, drop `host` and uncomment the two parameters, which are read
# from the INSTANCE_ID and AUTH_TOKEN environment variables.
client = TuringDB(
    host="http://localhost:6666"  # Remove this parameter and set the two parameters below
    # instance_id=os.getenv("INSTANCE_ID"),
    # auth_token=os.getenv("AUTH_TOKEN"),
)

In [8]:
%%time

# Connect the client to S3. The credentials are read from the AWS_ACCESS_KEY and
# AWS_SECRET_KEY environment variables, so they never appear in the notebook.
# NOTE(review): `os.getenv` returns None when a variable is unset — confirm how
# `s3_connect` behaves in that case.
client.s3_connect(
    bucket_name="turing-internal",
    region="eu-west-2",
    access_key=os.getenv("AWS_ACCESS_KEY"),
    secret_key=os.getenv("AWS_SECRET_KEY"),
)

CPU times: user 142 ms, sys: 56.9 ms, total: 198 ms
Wall time: 6.29 s


In [9]:
%%time

gml_filename = f"{example_name}.gml"
client.transfer(
    src=f"data/{example_name}/{gml_filename}",
    dst=f"turingdb://{gml_filename}",  # to s3 bucket or TuringDB instance or local .turing
)

CPU times: user 85.1 ms, sys: 21.8 ms, total: 107 ms
Wall time: 424 ms


In [10]:
! tree /home/dev/.turing/data

[01;34m/home/dev/.turing/data[0m
‚îú‚îÄ‚îÄ ai_gov_control_mappings_full.csv
‚îú‚îÄ‚îÄ reactome.dump
‚îú‚îÄ‚îÄ sec_8k_raw_text_filings_2024.csv
‚îú‚îÄ‚îÄ sec_8k_raw_text_filings_2024.gml
‚îî‚îÄ‚îÄ software_dependencies.gml

0 directories, 5 files


## Check data files are available

In [11]:
# Confirm the example GML file is present in the local data folder.
list_files = sorted(os.listdir(path_data))
if gml_filename not in list_files:
    # Name the missing file so the failure is actionable.
    raise ValueError(f"{gml_filename} is not available in {path_data}")

## Import and format data

In [12]:
# Per-user TuringDB folder. `expanduser` resolves the home directory even when
# the HOME variable is unset, where `os.getenv('HOME')` would produce "None/.turing".
path_turing_folder = f"{os.path.expanduser('~')}/.turing"

In [13]:
# Load the GML copy stored in the TuringDB data folder into a NetworkX graph.
turing_gml_path = f"{path_turing_folder}/data/{example_name}.gml"
G = nx.read_gml(turing_gml_path)
print(G)

DiGraph with 21 nodes and 40 edges


In [14]:
# Import the GML file as a graph named after the example.
import_command = f"""
IMPORT GRAPH {example_name}
FROM "{example_name}.gml"
"""
client.query(import_command)

In [17]:
# Load the graph. A failure is printed rather than raised so that re-running
# the cell does not stop the notebook.
try:
    client.load_graph(example_name)
except TuringDBException as e:
    # The exception can have causes other than "already loaded", so report the
    # actual error and only suggest the likely reason.
    print(f"{e}: could not load graph '{example_name}' (it may already be loaded)")

GRAPH_LOAD_ERROR: Graph already loaded


In [16]:
# Select the example graph on the client.
# NOTE(review): presumably this makes it the target of the queries below — confirm in the TuringDB client docs.
client.set_graph(example_name)

# Query TuringDB

## Use metaqueries to have insight on graph overall structure

<h3>
    To learn more about 📮 Metaqueries, please check the TuringDB documentation at this <a href="https://turingdb.mintlify.app/query/cypher_subset#%F0%9F%93%AE-metaqueries">link</a>
</h3>

In [18]:
%%time

# CALL PROPERTIES() - returns a column of all the different node and edge properties and their types in the database
command = """
CALL PROPERTIES()
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = ["Property_ID", "Property_name", "Property_type"]
    display(df)

Unnamed: 0,Property_ID,Property_name,Property_type
0,0,label (String),String
1,1,type (String),String
2,2,industry (String),String
3,3,founded (String),String
4,4,category (String),String
5,5,provider (String),String
6,6,relationship (String),String
7,7,criticality (String),String


CPU times: user 9.1 ms, sys: 2.91 ms, total: 12 ms
Wall time: 10.9 ms


In [19]:
%%time

# CALL LABELS () - returns a column of all the different node labels
command = """
CALL LABELS()
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = ["Node_type_ID", "Node_label"]
    display(df)

Unnamed: 0,Node_type_ID,Node_label
0,0,GMLNode


CPU times: user 5.82 ms, sys: 973 Œºs, total: 6.8 ms
Wall time: 6.16 ms


In [20]:
%%time

# CALL EDGETYPES() - returns a column of all the different edge types (edge equivalent of node labels)
command = """
CALL EDGETYPES()
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = ["Edge_type_ID", "Edge_label"]
    display(df)

Unnamed: 0,Edge_type_ID,Edge_label
0,0,GMLEdge


CPU times: user 5.67 ms, sys: 950 Œºs, total: 6.62 ms
Wall time: 5.94 ms


In [21]:
%%time

# CALL LABELSETS() - returns a two columns describing combinations of node labels
command = """
CALL LABELSETS()
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = ["Node_type_ID", "Node_label"]
    display(df)

Unnamed: 0,Node_type_ID,Node_label
0,0,GMLNode


CPU times: user 6.44 ms, sys: 73 Œºs, total: 6.52 ms
Wall time: 5.9 ms


## Simple queries

In [22]:
%%time

# Match all edges and return them
command = """
MATCH (n)-[e]-(m)
RETURN n, e, m
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    #df.columns = get_return_statements(command)
    display(df)

Unnamed: 0,0,1,2
0,0,0,6
1,0,1,7
2,1,2,8
3,1,3,3
4,2,4,8
5,2,5,7
6,2,6,3
7,9,7,4
8,9,8,3
9,10,9,3


CPU times: user 8.38 ms, sys: 66 Œºs, total: 8.44 ms
Wall time: 7.83 ms


# Load the embedding model

In [27]:
%%time

# This will convert text to vectors
model = SentenceTransformer("paraphrase-MiniLM-L3-v2")
print(f"‚úì Model loaded: {model.get_sentence_embedding_dimension()} dimensions")

‚úì Model loaded: 384 dimensions
CPU times: user 135 ms, sys: 22.4 ms, total: 158 ms
Wall time: 1.65 s


# Build vector index on the graph

## Vector Search Implementation - Dense (semantic) search

#### How It Works

Each node's text is converted to a **384-dimensional vector** using a pre-trained language model (`paraphrase-MiniLM-L3-v2`).

**Search process:**
1. Convert user query to vector
2. Calculate cosine similarity with all node vectors
3. Rank by similarity score (0-1)
4. Return top-k most relevant nodes

#### Why Vectors?

- **Semantic understanding**: "data protection" matches "privacy safeguards"
- **Handles synonyms**: "AI governance" finds "artificial intelligence oversight"
- **No keyword dependency**: Works even without exact term matches

### Use the three different approaches

In [38]:
%%time

# Build different versions
node_only = build_node_only_embeddings(G, model)  # Node-only embeddings
heavy = build_context_enriched_embeddings(
    G, model, strategy="heavy"  # "lightweight"
)  # Context-enriched embeddings

print("\n" + "=" * 80 + "\n")

‚úì Vector index built using node-only embeddings approach: 21 vectors
‚úì Vector index built using context-enriched embeddings approach (strategy heavy): 21 vectors


CPU times: user 291 ms, sys: 6.28 ms, total: 297 ms
Wall time: 19.9 ms


In [47]:
# NOTE(review): this writes G back to `{path_data}/{example_name}.gml`, i.e. over the
# example's source data file, with whatever G holds at this point — confirm this is intended.
nx.write_gml(G, f"{path_data}/{example_name}.gml")

## Vector Search Implementation - Sparse (keyword) search

In [29]:
%%time

# Build sparse embeddings (the cell output reports a TF-IDF index)
# NOTE(review): `node_texts` assigned here is reassigned further down from `heavy[1]`,
# so the sparse and hybrid searches receive that later value — confirm this is intended.
sparse_vectors, node_texts, sparse_vectorizer = build_sparse_embeddings(
    G=G, max_features=500, ngram_range=(1, 2)
)

Building sparse index (TF-IDF)...
‚úì Sparse index built: 21 vectors
  Vocabulary size: 48
  Sample terms: ['amazon', 'api', 'app', 'aws', 'aws api', 'azure', 'azure cloud', 'banking', 'banking api', 'cloud', 'cloud api', 'commerce', 'commerce platforms', 'gpt', 'gpt api', 'microsoft', 'mobile', 'mobile app', 'netflix', 'netflix streaming']
CPU times: user 5.8 ms, sys: 947 Œºs, total: 6.74 ms
Wall time: 6.05 ms


## Node2Vec

In [30]:
%%time

# Build structural embeddings from the graph; 384 matches the dimension reported
# for the sentence-embedding model.
# NOTE(review): no random seed is passed; if Node2Vec training is stochastic the
# vectors will differ between runs — confirm and seed if the helper supports it.
structural_vectors = build_node2vec_embeddings(G, dimensions=384)

Training Node2Vec on graph structure...


  import pkg_resources
  import pkg_resources


‚úì Node2Vec trained: 21 structural vectors
CPU times: user 263 ms, sys: 113 ms, total: 376 ms
Wall time: 1.23 s


In [35]:
# Sanity check: shape of the first structural vector
next(iter(structural_vectors.values())).shape

(384,)

# Search capabilities

## Vector Search Implementation - Dense (semantic) search

Find nodes relevant to any natural language query:

```python
results = search("data privacy protection", k=5)
```

**Use cases:**
- Exploratory research: "What controls cover AI model governance?"
- Concept-based lookup: "security monitoring requirements"
- Gap analysis: "What's missing in our risk management?"

In [39]:
# Choose most relevant approach
node_vectors = heavy[0]
node_texts = heavy[1]

### Query

In [54]:
%%time

# Try different queries
queries = [
    "AI providers",
    "companies with risks of failure due to cloud providers",
    "AI model governance",
    "risk management controls",
    "security monitoring requirements",
    "companies with risks linked to goods transportations/delivery",
    "Video",
    "API"
]

for query in queries:
    print(f"\n{'=' * 80}")
    print(f"QUERY: '{query}'")
    print("=" * 80)
    results = dense_search(
        query=query,
        node_vectors=node_vectors,
        node_texts=node_texts,
        G=G,
        model=model,
        k=3,
        # node_type='control'
    )
    print_results(results)


QUERY: 'AI providers'

Found 3 results:

1. Similarity: 0.3034
   Node: Azure Cloud API
   Type: api
   Text: Azure Cloud API...

2. Similarity: 0.2950
   Node: Ride Sharing Services
   Type: customer_segment
   Text: Ride Sharing Services...

3. Similarity: 0.2789
   Node: Amazon
   Type: company
   Text: Amazon...

QUERY: 'companies with risks of failure due to cloud providers'

Found 3 results:

1. Similarity: 0.3760
   Node: Azure Cloud API
   Type: api
   Text: Azure Cloud API...

2. Similarity: 0.2482
   Node: Microsoft
   Type: company
   Text: Microsoft...

3. Similarity: 0.1934
   Node: Twilio
   Type: company
   Text: Twilio...

QUERY: 'AI model governance'

Found 3 results:

1. Similarity: 0.2739
   Node: Azure Cloud API
   Type: api
   Text: Azure Cloud API...

2. Similarity: 0.2501
   Node: AWS API
   Type: api
   Text: AWS API...

3. Similarity: 0.2290
   Node: Amazon
   Type: company
   Text: Amazon...

QUERY: 'risk management controls'

Found 3 results:

1. Similarity:

## Vector Search Implementation - Sparse (keyword) search

In [56]:
%%time

## Try different queries
#queries = [
#    "AI providers",
#    "companies with risks of failure due to cloud providers",
#    "AI model governance",
#    "risk management controls",
#    "security monitoring requirements",
#    "companies with risks linked to goods transportations/delivery",
#    "Video",
#    "API"
#]

for query in queries:
    print(f"\n{'=' * 80}")
    print(f"QUERY: '{query}'")
    print("=" * 80)
    results = sparse_search(
        query=query,
        sparse_vectors=sparse_vectors,
        sparse_vectorizer=sparse_vectorizer,
        node_texts=node_texts,
        G=G,
        k=3
    )
    print_results(results)


QUERY: 'AI providers'

Found 3 results:

1. Similarity: 0.0000
   Node: Video Streaming Services
   Type: customer_segment
   Text: Video Streaming Services...

2. Similarity: 0.0000
   Node: E-commerce Platforms
   Type: customer_segment
   Text: E-commerce Platforms...

3. Similarity: 0.0000
   Node: Ride Sharing Services
   Type: customer_segment
   Text: Ride Sharing Services...

QUERY: 'companies with risks of failure due to cloud providers'

Found 3 results:

1. Similarity: 0.4768
   Node: Azure Cloud API
   Type: api
   Text: Azure Cloud API...

2. Similarity: 0.0000
   Node: Video Streaming Services
   Type: customer_segment
   Text: Video Streaming Services...

3. Similarity: 0.0000
   Node: E-commerce Platforms
   Type: customer_segment
   Text: E-commerce Platforms...

QUERY: 'AI model governance'

Found 3 results:

1. Similarity: 0.0000
   Node: Video Streaming Services
   Type: customer_segment
   Text: Video Streaming Services...

2. Similarity: 0.0000
   Node: E-commerc

## Hybrid search: Best of Both Worlds

### The Problem

- **Dense (semantic) search**: Great for concepts, misses exact terms
- **Sparse (keyword) search**: Finds exact matches, misses semantics

### The Solution

**Hybrid search** combines both approaches:

```
final_score = α × semantic_score + (1 - α) × keyword_score
```

### Alpha Parameter Guide

| Alpha | Behavior | Best For |
|-------|----------|----------|
| 1.0 | Pure semantic | Conceptual queries |
| 0.7 | Favor semantics | General use (recommended) |
| 0.5 | Balanced | Mixed queries |
| 0.3 | Favor keywords | Technical lookups |
| 0.0 | Pure keywords | Exact term matching |

### When to Use What

**Dense (α=1.0)**
- Query: "What covers security?"
- Finds: Controls about protection, safeguards, defense

**Sparse (α=0.0)**
- Query: "ISO27001 A.8.24"
- Finds: Exact standard reference

**Hybrid (α=0.7)**
- Query: "NIST risk management frameworks"
- Finds: Both NIST references AND risk-related controls

### Query

In [57]:
%%time

## Try different queries
#queries = [
#    "AI providers",
#    "companies with risks of failure due to cloud providers",
#    "AI model governance",
#    "risk management controls",
#    "security monitoring requirements",
#    "companies with risks linked to goods transportations/delivery",
#    "Video",
#    "API"
#]

for query in queries:
    print(f"\n{'=' * 80}")
    print(f"QUERY: '{query}'")
    print("=" * 80)
    results = hybrid_search(
        query=query,
        node_vectors=node_vectors,
        node_texts=node_texts,
        G=G,
        sparse_vectors=sparse_vectors,
        sparse_vectorizer=sparse_vectorizer,
        model=model,
        k=3,
        alpha=0.7,  # 70% semantic, 30% keywords
    )
    print_results(results)


QUERY: 'AI providers'

Found 3 results:

1. Similarity: 0.9254
   Node: Ride Sharing Services
   Type: customer_segment
   Text: Ride Sharing Services...

2. Similarity: 0.7818
   Node: Amazon
   Type: company
   Text: Amazon...

3. Similarity: 0.7000
   Node: Azure Cloud API
   Type: api
   Text: Azure Cloud API...

QUERY: 'companies with risks of failure due to cloud providers'

Found 3 results:

1. Similarity: 1.0000
   Node: Azure Cloud API
   Type: api
   Text: Azure Cloud API...

2. Similarity: 0.3580
   Node: Microsoft
   Type: company
   Text: Microsoft...

3. Similarity: 0.2114
   Node: Twilio
   Type: company
   Text: Twilio...

QUERY: 'AI model governance'

Found 3 results:

1. Similarity: 0.7373
   Node: Amazon
   Type: company
   Text: Amazon...

2. Similarity: 0.7000
   Node: Azure Cloud API
   Type: api
   Text: Azure Cloud API...

3. Similarity: 0.5607
   Node: AWS API
   Type: api
   Text: AWS API...

QUERY: 'risk management controls'

Found 3 results:

1. Similarity:

## Compare Dense vs Sparse vs Hybrid

In [64]:
%%time

# Limit results to specific node type (all node types by default)
node_type = None  # "control"
k = 5
alpha = 0.7

for query in queries:
    compare_search_methods(
        query=query,
        node_vectors=node_vectors,
        node_texts=node_texts,
        G=G,
        sparse_vectors=sparse_vectors,
        sparse_vectorizer=sparse_vectorizer,
        model=model,
        k=k,
        alpha=alpha,
    )


QUERY: 'AI providers'

1. DENSE ONLY (Semantic):
--------------------------------------------------------------------------------
1. 0.303 | Azure Cloud API...
2. 0.295 | Ride Sharing Services...
3. 0.279 | Amazon...

2. SPARSE ONLY (Keywords):
--------------------------------------------------------------------------------
1. 0.000 | Video Streaming Services...
2. 0.000 | E-commerce Platforms...
3. 0.000 | Ride Sharing Services...

3. HYBRID (alpha=0.7):
--------------------------------------------------------------------------------
1. 0.963 (raw: D:0.30/S:0.00) | (norm: D:0.95/S:1.00) | Ride Sharing Services...
2. 0.892 (raw: D:0.28/S:0.00) | (norm: D:0.85/S:1.00) | Amazon...
3. 0.883 (raw: D:0.28/S:0.00) | (norm: D:0.83/S:1.00) | Microsoft...

QUERY: 'companies with risks of failure due to cloud providers'

1. DENSE ONLY (Semantic):
--------------------------------------------------------------------------------
1. 0.376 | Azure Cloud API...
2. 0.248 | Microsoft...
3. 0.193 | Twil

# Get subgraph around query results and visualise

### Graph-Based Context Retrieval

Get full context around relevant nodes:

```python
subgraph = get_subgraph_around_query("risk management", k=3, hops=1)
```

**Returns:**
- Relevant controls
- Related topics and domains
- All mapped standard references
- Network connections

In [82]:
# Pick the second example query ("companies with risks of failure due to cloud providers")
# as the query used by the cells that follow.
query = queries[1]

In [83]:
print(f"Query: '{query}'")

method_to_test = "hybrid"
possible_method_to_test = ["dense", "sparse", "hybrid"]
if method_to_test not in possible_method_to_test:
    raise ValueError(f"method_to_test has to be one of {possible_method_to_test}")

# Pick the search function and its inputs once, then expand the subgraph with a single call
if method_to_test == "dense":
    search_func = dense_search
    search_params = {
        "node_vectors": node_vectors,
        "node_texts": node_texts,
        "model": model,
    }
elif method_to_test == "sparse":
    search_func = sparse_search
    search_params = {
        "sparse_vectors": sparse_vectors,
        "sparse_vectorizer": sparse_vectorizer,
        "node_texts": node_texts,
    }
else:  # "hybrid" — the only value left after the validation above
    search_func = hybrid_search
    search_params = {
        "node_vectors": node_vectors,
        "node_texts": node_texts,
        "sparse_vectors": sparse_vectors,
        "sparse_vectorizer": sparse_vectorizer,
        "model": model,
        "alpha": 0.7,
    }

subgraph, results = get_subgraph_around_query(
    query=query,
    G=G,
    search_func=search_func,
    search_params=search_params,
    k=3,
    hops=1,
)

Query: companies with risks of failure due to cloud providers


In [71]:
print(f"Query: '{query}'")
node_total = subgraph.number_of_nodes()
edge_total = subgraph.number_of_edges()
print(f"Subgraph: {node_total} nodes, {edge_total} edges")

# Tally the nodes per "type" attribute; nodes without one are counted as "unknown"
types_in_subgraph = {}
for _, attrs in subgraph.nodes(data=True):
    node_kind = attrs.get("type", "unknown")
    types_in_subgraph.setdefault(node_kind, 0)
    types_in_subgraph[node_kind] += 1

print("\nSubgraph composition:")
for node_kind, node_count in types_in_subgraph.items():
    print(f"  {node_kind}: {node_count}")

Query: 'companies with risks of failure due to cloud providers'
Subgraph: 9 nodes, 12 edges

Subgraph composition:
  api: 4
  company: 5


# Visualization Capabilities

## Interactive Visualization (PyVis)

- Hover to see full control text
- Click to explore connections
- Physics-based layout
- Filterable and zoomable

**Features:**
- Color legend for node types
- Relevance-based sizing
- Relationship labels on edges
- Responsive browser-based interface

In [84]:
print(f"Query: '{query}'")

# Inputs forwarded to hybrid_search by the extraction helper
hybrid_search_params = {
    "node_vectors": node_vectors,
    "node_texts": node_texts,
    "sparse_vectors": sparse_vectors,
    "sparse_vectorizer": sparse_vectorizer,
    "model": model,
    "alpha": 0.7,
}

# With hybrid search
extract_and_visualize_subgraph(
    query=query,
    G=G,
    search_func=hybrid_search,
    search_params=hybrid_search_params,
    k=4,
    hops=2,
    output_file=f"{example_name}.html",  # f"{path_data}/{example_name}.html",
)

Query: 'companies with risks of failure due to cloud providers'
Auto-generated color map for 3 node types:
  api: #ff6b6b
  company: #4ecdc4
  customer_segment: #95e1d3
‚úì Interactive graph saved to: software_dependencies.html
  Nodes: 11
  Edges: 15


# Workflow

In [99]:
print(f"Query: '{query}'")

# Indexes and model built earlier in the notebook
workflow_indexes = {
    "node_vectors": node_vectors,
    "node_texts": node_texts,
    "sparse_vectors": sparse_vectors,
    "sparse_vectorizer": sparse_vectorizer,
    "structural_vectors": structural_vectors,
    "model": model,
}
# Search and expansion settings
workflow_settings = {
    "k_search": 5,
    "max_hops": 10,
    "min_structural_sim": 0.3,  # Must be structurally similar
    "min_semantic_sim": 0.3,  # AND semantically relevant to query
    "structural_weight": 0.5,  # 50-50 balance
    "alpha": 0.7,  # Weight given to semantic (dense) search; (1 - alpha) goes to keyword (sparse) search
}

semantic_results, expanded, subgraph = search_and_expand_hybrid_filtered(
    query=query,
    G=G,
    **workflow_indexes,
    **workflow_settings,
)

report = generate_report_hybrid_workflow_results(semantic_results, expanded)
print(report)

Query: 'companies with risks of failure due to cloud providers'
Stage 1: Hybrid search for 'companies with risks of failure due to cloud providers'...
--------------------------------------------------------------------------------

Found 5 semantically relevant seed nodes:
  1. Azure Cloud API (score: 1.000)
     Azure Cloud API...
  2. Microsoft (score: 0.430)
     Microsoft...
  3. Twilio (score: 0.315)
     Twilio...
  4. Ride Sharing Services (score: 0.290)
     Ride Sharing Services...
  5. Amazon (score: 0.287)
     Amazon...

Stage 2: Hybrid filtering (structural + semantic)...
  - Max hops: 10
  - Min structural similarity: 0.3
  - Min semantic similarity: 0.3
  - Structural weight: 0.5
--------------------------------------------------------------------------------

  Expanding from: Azure Cloud API
  Found 0 neighbors (after hybrid filtering):

  Expanding from: Microsoft
  Found 0 neighbors (after hybrid filtering):

  Expanding from: Twilio
  Found 0 neighbors (after hybri

In [100]:
# Print every node of the subgraph with its attributes
print(f"\nSubgraph ({subgraph}) node attributes:")
for node, attributes in subgraph.nodes(data=True):
    print(f"\n* {node}:")
    for key, value in attributes.items():
        if key == "statement":  # Skip long text
            continue
        print(f"  {key}: {value}")

# Export subgraph if needed
# nx.write_gml(subgraph, "filtered_subgraph.gml")

# Visualise subgraph
visualize_graph_with_pyvis(subgraph, output_file="hybrid_filtered_graph.html")


Subgraph (DiGraph with 5 nodes and 2 edges) node attributes:

* Amazon:
  type: company
  founded: 1994
  category: multi_sector
  is_seed: True
  seed_score: 0.28730838674412373
  dense_score: 0.1802797019481659
  sparse_score: 0.0

* Ride Sharing Services:
  type: customer_segment
  industry: Transportation
  is_seed: True
  seed_score: 0.2897629087093686
  dense_score: 0.181443989276886
  sparse_score: 0.0

* Azure Cloud API:
  type: api
  provider: Microsoft
  category: cloud_infrastructure
  is_seed: True
  seed_score: 1.0
  dense_score: 0.3760374188423157
  sparse_score: 0.47680959904304493

* Microsoft:
  type: company
  founded: 1975
  category: cloud_software
  is_seed: True
  seed_score: 0.4304882797314395
  dense_score: 0.24819619953632355
  sparse_score: 0.0

* Twilio:
  type: company
  founded: 2008
  category: communications
  is_seed: True
  seed_score: 0.31494461272890195
  dense_score: 0.19338877499103546
  sparse_score: 0.0
Auto-generated color map for 3 node types:


# Results exploration

## Graph statistics

In [101]:
# Compute the subgraph statistics with every optional section switched on, then print them
stats_sections = {
    "include_node_breakdown": True,
    "include_centrality": True,
    "include_paths": True,
}
stats = get_subgraph_stats(subgraph, **stats_sections)
print_subgraph_stats(stats, verbose=True)


SUBGRAPH STATISTICS

üìä Basic Metrics:
   nodes: 5
   edges: 2
   density: 0.1
   is_connected: False

üîó Degree Statistics:
   average: 0.80
   max: 2.00
   min: 0.00
   median: 1.00

üè∑Ô∏è  Node Types:
   company: 3
   customer_segment: 1
   api: 1

üéØ Node Roles:
   seed: 5
   found: 0
   intermediate: 0

‚≠ê Most Central Nodes:
   By Degree:
      Azure Cloud API: 0.500
      Microsoft: 0.250
      Twilio: 0.250
      Amazon: 0.000
      Ride Sharing Services: 0.000
   By Betweenness:
      Amazon: 0.000
      Ride Sharing Services: 0.000
      Azure Cloud API: 0.000
      Microsoft: 0.000
      Twilio: 0.000

üîó Edge Types:
   company_to_api: 2



## Node importance ranking

In [106]:
# Node inspected individually in the comparison and explanation cells
node_to_check = "Amazon"
# Node type the focused ranking and its diagnosis are restricted to
focus_type = "company"

In [107]:
# Rank the nodes of the subgraph
ranking_options = {
    "methods": "all",  # or ['pagerank', 'degree', 'relevance']
    "top_k": 10,
    "aggregate": "average",  # or 'max' or {'pagerank': 0.4, 'degree': 0.3, 'relevance': 0.3}
}
rankings = rank_nodes_by_importance(subgraph, **ranking_options)

print_node_rankings(rankings, subgraph, show_details=True)


NODE IMPORTANCE RANKINGS

Ranked 5 nodes using: pagerank, degree, betweenness, eigenvector


üìä PAGERANK (Top 5):
   1. Azure Cloud API (api): 0.4030
   2. Amazon (company): 0.1493
   3. Ride Sharing Services (customer_segment): 0.1493
   4. Microsoft (company): 0.1493
   5. Twilio (company): 0.1493

üìä DEGREE (Top 5):
   1. Azure Cloud API (api): 0.5000
   2. Microsoft (company): 0.2500
   3. Twilio (company): 0.2500
   4. Amazon (company): 0.0000
   5. Ride Sharing Services (customer_segment): 0.0000

üìä BETWEENNESS (Top 5):
   1. Amazon (company): 0.0000
   2. Ride Sharing Services (customer_segment): 0.0000
   3. Azure Cloud API (api): 0.0000
   4. Microsoft (company): 0.0000
   5. Twilio (company): 0.0000

‚ùå CLOSENESS: graph is not connected enough to compute closeness centrality

üìä EIGENVECTOR (Top 5):
   1. Azure Cloud API (api): 1.0000
   2. Amazon (company): 0.0008
   3. Ride Sharing Services (customer_segment): 0.0008
   4. Microsoft (company): 0.0008
   5. Twilio

In [108]:
# Compare how one specific node ranks under each method.
# Check that the node exists first.
if node_to_check in subgraph:
    node_comparison = compare_node_importance(node_to_check, rankings)
    if node_comparison:
        print(f"\nHow {node_to_check} ranks:")
        for method, info in node_comparison.items():
            print(f"  {method}: #{info['rank']} (score: {info['score']:.3f})")
    else:
        # "\n" (was the invalid escape "\{", which printed a literal backslash)
        print(f"\n{node_to_check} not in top rankings")
else:
    print(f"\n{node_to_check} not in subgraph")


How Amazon ranks:
  pagerank: #2 (score: 0.149)
  degree: #4 (score: 0.000)
  betweenness: #1 (score: 0.000)
  eigenvector: #2 (score: 0.001)


In [109]:
# Rank the nodes, keeping only those of type `focus_type` in the first returned value
filtered_rankings, full_rankings = rank_nodes_by_importance_with_context(
    subgraph,
    focus_type=focus_type,
    methods="all",
    top_k=10,
    aggregate="average",  # or 'max' or {'pagerank': 0.4, 'degree': 0.3, 'relevance': 0.3}
)

# The header names the node type actually ranked instead of a fixed "CONTROL" label
print(f"\n=== {str(focus_type).upper()} NODE RANKINGS (computed on full graph) ===")
print_node_rankings(filtered_rankings, subgraph)


=== CONTROL NODE RANKINGS (computed on full graph) ===

NODE IMPORTANCE RANKINGS

Ranked 5 nodes using: pagerank, degree, betweenness, eigenvector


üìä PAGERANK (Top 5):
   1. Amazon (company): 0.1493
   2. Microsoft (company): 0.1493
   3. Twilio (company): 0.1493

üìä DEGREE (Top 5):
   1. Microsoft (company): 0.2500
   2. Twilio (company): 0.2500
   3. Amazon (company): 0.0000

üìä BETWEENNESS (Top 5):
   1. Amazon (company): 0.0000
   2. Microsoft (company): 0.0000
   3. Twilio (company): 0.0000

‚ùå CLOSENESS: graph is not connected enough to compute closeness centrality

üìä EIGENVECTOR (Top 5):
   1. Amazon (company): 0.0008
   2. Microsoft (company): 0.0008
   3. Twilio (company): 0.0008

‚≠ê COMBINED RANKING (Top 10):
   1. Microsoft (company): 0.1250
   2. Twilio (company): 0.1250
   3. Amazon (company): 0.0000



In [110]:
# Debug why rankings might be zero for some nodes in some metrics:
# prints a diagnosis of the subgraph restricted to nodes of type `focus_type`
diagnose_rankings(subgraph, focus_type)

Total nodes: 5
Filtered nodes (company): 3
Filtered subgraph connected: False
Filtered subgraph edges: 0
Isolated nodes: 3/3

   Recommendation: Don't filter by node_type for ranking.
   Rank on full graph, then filter results for display.


## Explain node retrieval

In [111]:
# Explain why one specific node was retrieved
explain_inputs = {
    "node_vectors": node_vectors,
    "sparse_vectors": sparse_vectors,
    "sparse_vectorizer": sparse_vectorizer,
    "structural_vectors": structural_vectors,
    "model": model,
}
explanation = explain_retrieval(
    node_id=node_to_check,
    query=query,
    subgraph=subgraph,
    **explain_inputs,
)
print_explanation(explanation, verbose=True)


RETRIEVAL EXPLANATION: Amazon

Node Type: company
Query: 'companies with risks of failure due to cloud providers'

üéØ Reason: SEED NODE
   Initial search score: 0.287
   - Semantic component: 0.180

üìù Semantic Similarity: 0.180
   Very low similarity

üîç Keyword Matching: 0.000
   Very low similarity
   Top TF-IDF terms in node:
      - amazon: 1.000

üï∏Ô∏è  Structural Similarity:
   Shares graph structure with seed nodes

üìÑ Content Preview:
   No text content



In [112]:
# Explain the three best seed results of the workflow
top_explain_inputs = {
    "node_vectors": node_vectors,
    "sparse_vectors": sparse_vectors,
    "sparse_vectorizer": sparse_vectorizer,
    "structural_vectors": structural_vectors,
    "model": model,
}
explanations = explain_top_results(
    query=query,
    results=semantic_results,
    subgraph=subgraph,
    top_k=3,
    **top_explain_inputs,
)


RETRIEVAL EXPLANATION: Azure Cloud API

Node Type: api
Query: 'companies with risks of failure due to cloud providers'

üéØ Reason: SEED NODE
   Initial search score: 1.000
   - Semantic component: 0.376
   - Keyword component: 0.477

üìù Semantic Similarity: 0.376
   Low similarity

üîç Keyword Matching: 0.477
   Low similarity
   Top TF-IDF terms in node:
      - cloud: 0.477
      - cloud api: 0.477
      - azure: 0.477

üï∏Ô∏è  Structural Similarity:
   Shares graph structure with seed nodes

üìÑ Content Preview:
   No text content


RETRIEVAL EXPLANATION: Microsoft

Node Type: company
Query: 'companies with risks of failure due to cloud providers'

üéØ Reason: SEED NODE
   Initial search score: 0.430
   - Semantic component: 0.248

üìù Semantic Similarity: 0.248
   Very low similarity

üîç Keyword Matching: 0.000
   Very low similarity
   Top TF-IDF terms in node:
      - microsoft: 1.000

üï∏Ô∏è  Structural Similarity:
   Shares graph structure with seed nodes

üìÑ Con

# Take exploration results and ask LLM to answer original client query only using subgraph

In [119]:
# Environment variable that holds the API key of each supported LLM provider
provider_env_vars = {
    "Anthropic": "ANTHROPIC_API_KEY",
    "Mistral": "MISTRAL_API_KEY",
    "OpenAI": "OPENAI_API_KEY",
}
# A provider whose variable is unset maps to None
api_keys = {
    provider_name: os.getenv(env_var)
    for provider_name, env_var in provider_env_vars.items()
}

## Create prompt

In [125]:
print(f"Query: '{query}'")

# Build the complete prompt from the query, the filtered subgraph and the workflow report
prompt = create_llm_prompt_with_graph(
    query=query,
    subgraph=subgraph,
    report=report,
    format="natural",  # or 'markdown'
)

# Show the prompt for inspection (it is sent to the LLM in a later cell)
print(prompt)

# Save to file. The encoding is explicit so non-ASCII characters in the prompt
# are written the same way whatever the platform's default encoding is.
with open("llm_prompt.txt", "w", encoding="utf-8") as f:
    f.write(prompt)

Query: 'companies with risks of failure due to cloud providers'
# Graph-Based Query Response

## User Query
"companies with risks of failure due to cloud providers"

## Search Results
HYBRID-FILTERED WORKFLOW RESULTS

1. SEED NODE (Hybrid Search Match):
   Node: Azure Cloud API
   Semantic Score: 1.000
   Text: Azure Cloud API...

   HYBRID-FILTERED NEIGHBORS:
   (Must pass BOTH structural AND semantic thresholds)

2. SEED NODE (Hybrid Search Match):
   Node: Microsoft
   Semantic Score: 0.430
   Text: Microsoft...

   HYBRID-FILTERED NEIGHBORS:
   (Must pass BOTH structural AND semantic thresholds)

3. SEED NODE (Hybrid Search Match):
   Node: Twilio
   Semantic Score: 0.315
   Text: Twilio...

   HYBRID-FILTERED NEIGHBORS:
   (Must pass BOTH structural AND semantic thresholds)

4. SEED NODE (Hybrid Search Match):
   Node: Ride Sharing Services
   Semantic Score: 0.290
   Text: Ride Sharing Services...

   HYBRID-FILTERED NEIGHBORS:
   (Must pass BOTH structural AND semantic threshold

## Query LLM

In [126]:
%%time

# Provider to query; must be one of the keys of `api_keys`
provider = "Anthropic"

# Send the prompt to the selected provider.
# NOTE(review): `api_keys[provider]` is None when the matching environment
# variable is unset — confirm how `query_llm` handles a missing key.
result = query_llm(
    prompt=prompt,
    # system_prompt=system_prompt,
    provider=provider,
    api_key=api_keys[provider],
    temperature=0.2,
)

CPU times: user 23.2 ms, sys: 2.01 ms, total: 25.2 ms
Wall time: 7.94 s


In [127]:
# Render the LLM answer as Markdown instead of printing it as plain text
display(Markdown(result))

Based on the provided graph data, here's an analysis of companies with risks of failure due to cloud providers:

Key Insights:
1. Cloud Provider Dependency:
- Microsoft and Twilio are both connected to Azure Cloud API
- This suggests potential vulnerability to cloud service disruptions
- Both companies rely on the same cloud infrastructure (Azure)

2. Risk Patterns:
- Companies like Microsoft and Twilio demonstrate direct dependency on cloud APIs
- The graph reveals a potential single point of failure through Azure Cloud API
- Ride Sharing Services appears as a customer segment, which might also be impacted by cloud service risks

3. Structural Observations:
- Limited graph connectivity (only 2 edges)
- Semantic scores vary, with Azure Cloud API having the highest relevance (1.000)
- Amazon is present in the graph but not directly linked to the cloud API

Potential Risks:
- Concentration risk: Multiple companies depending on a single cloud provider (Azure)
- Service disruption could simultaneously impact multiple organizations
- Limited graph data suggests more comprehensive analysis would be beneficial

Limitations of Analysis:
- Small sample size (5 nodes)
- Incomplete relationship mapping
- Lack of detailed risk assessment information

Recommendation:
Companies should consider:
- Multi-cloud strategies
- Redundancy in cloud service providers
- Robust disaster recovery plans

Note: This analysis is strictly based on the provided graph data and may not represent a comprehensive risk assessment.

In [128]:
# End-of-notebook marker
print("Notebook finished !")

Notebook finished !
