<div class="alert alert-block alert-success">
    <h1>
        Example notebook - Reactome subgraph
    </h1>
</div>

# Import modules and functions

In [168]:
%load_ext autoreload
%autoreload 2

import os
import glob
import re
import networkx as nx
import pandas as pd

from turingdb_examples.graph import (
    build_create_command_from_networkx,
)
from turingdb_examples.utils import get_return_statements
from turingdb_examples.llm import natural_language_to_cypher, query_llm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Check data files are available

In [2]:
folder_name = "reactome"
path_data = f"{os.getcwd()}/data/{folder_name}"
if not os.path.exists(path_data):
    raise ValueError(f"{path_data} does not exists")

list_csv_files = sorted([os.path.basename(file) for file in glob.glob(os.path.join(path_data, '*'))])
if not list_csv_files == [
    "entities_pairwise.gml"
]:
    raise ValueError(
        f"At least one of the {len(list_csv_files)} csv files is not available in {path_data}"
    )

# Import `gml` file

In [3]:
G = nx.read_gml(f"{path_data}/entities_pairwise.gml")
print(G)

MultiGraph with 40 nodes and 57 edges


In [133]:
from pyvis.network import Network
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

net = Network(
    height="750px", width="100%", notebook=True, bgcolor="#ffffff", font_color="#000000", directed=True
)

# Choose your palette (can be changed easily)
palette_name = 'Pastel1'  # Options: 'tab10', 'Set3', 'Paired', 'viridis', 'plasma', etc.

# Get unique node types
unique_types = list(set(data.get('schemaClass') for _, data in G.nodes(data=True)))

# Get colors from matplotlib palette
cmap = plt.get_cmap(palette_name)
colors = [mcolors.rgb2hex(cmap(i / len(unique_types))) for i in range(len(unique_types))]

# Map types to colors
type_colors = {node_type: colors[i] for i, node_type in enumerate(unique_types)}

# Then use in your visualization
for node, data in G.nodes(data=True):
    node_type = data.get('schemaClass', 'Unknown')
    color = type_colors.get(node_type, '#95a5a6')

    net.add_node(
        node,
        label=data.get("displayName", str(node)),
        title=f"{data.get('displayName', '')}",
        color=color,
        size=25,
    )

for source, target, data in G.edges(data=True):
    net.add_edge(source, target, color="#95a5a6", width=3)

net.toggle_physics(status=True)
net.show(f"{folder_name}_graph.html")

reactome_graph.html


# Create graph using `turingdb` python package

<div class="alert alert-block alert-info">
    <h2>
        See <a href="https://docs.turingdb.ai/quickstart">TuringDB Get started documentation</a> for the important steps to follow :
    </h2>
    <h4>
        <ul>
            <li>Create your TuringDB account</li>
            <li>Create your instance in the <a href="https://console.turingdb.ai/auth">TuringDB Cloud UI</a></li>
            <li>Copy your Instance ID from the Database Instances management page</li>
            <li>Get API Key from the Settings in UI</li>
        </ul>
        Remember to have your instance active while working in this notebook !
    </h4>
</div>

In [54]:
from turingdb import TuringDB

# Create TuringDB client
client = TuringDB(
    host="http://localhost:6666"  # Remove this parameter and set the two parameters below
    # instance_id="...",  # Replace by your instance id
    # auth_token="...",  # Replace by your API token
)

In [55]:
# Get list of available graphs
list_graphs = client.query("LIST GRAPH").loc[:, 0].tolist()

In [56]:
# Set graph name
graph_name_prefix = folder_name
graph_name_nb_suffix = str(
    max(
        [
            int(re.sub(graph_name_prefix, "", g))
            for g in list_graphs
            if g.startswith(graph_name_prefix)
            and re.sub(graph_name_prefix, "", g).isdigit()
        ]
        + [0]
    )
    + 1
)
graph_name = graph_name_prefix + graph_name_nb_suffix
graph_name

'reactome4'

In [57]:
%%time

# Create a new graph
client.query(f"CREATE GRAPH {graph_name}")
client.set_graph(graph_name)

# Create a new change on the graph
change = client.query("CHANGE NEW").loc[0, 0]

# Checkout into the change
client.checkout(change=change)

CPU times: user 2.6 ms, sys: 129 μs, total: 2.73 ms
Wall time: 2.33 ms


In [58]:
# Build CREATE command from networkx object
create_command = build_create_command_from_networkx(G, node_type_key="schemaClass")
print(f"Cypher CREATE command :\n\n{100 * '*'}\n{create_command}\n{100 * '*'}")

Cypher CREATE command :

****************************************************************************************************
CREATE (n0:Reaction {"id":"TP53 binds the PMAIP1 (NOXA) promoter", "schemaClass":"Reaction", "stId":"R-HSA-4331331", "oldStId":"REACT_169265", "releaseDate":"2016-03-23", "name":"[ TP53 binds the PMAIP1 (NOXA) promoter ]", "stIdVersion":"R-HSA-4331331.6", "speciesName":"Homo sapiens", "category":"binding", "displayName":"TP53 binds the PMAIP1 (NOXA) promoter"}),
(n1:Complex {"id":"p-S15,S20-TP53 Tetramer [nucleoplasm]", "schemaClass":"Complex", "stId":"R-HSA-3222171", "name":"[ p-S15,S20-TP53 Tetramer ]", "stIdVersion":"R-HSA-3222171.1", "speciesName":"Homo sapiens", "displayName":"p-S15,S20-TP53 Tetramer [nucleoplasm]"}),
(n2:Reaction {"id":"TP53 binds the APAF1 gene promoter", "schemaClass":"Reaction", "stId":"R-HSA-6791349", "releaseDate":"2016-03-23", "name":"[ TP53 binds the APAF1 gene promoter ]", "stIdVersion":"R-HSA-6791349.3", "speciesName":"Homo sapien

In [59]:
%%time

# Run CREATE command
client.query(create_command)

# Commit the change
client.query("COMMIT")
client.query("CHANGE SUBMIT")

# Checkout into main
client.checkout()

CPU times: user 3.49 ms, sys: 104 μs, total: 3.59 ms
Wall time: 7.16 ms


<div class="alert alert-block alert-info">
    <h2>
        Visualize your graph in TuringDB Graph Visualizer ! Now that your instance is running:
    </h2>
    <h3>
        <ul>
            <li>Go to <a href="https://console.turingdb.ai/databases">TuringDB Console - Database Instances</a></li>
            <li>In your current instance panel, click on "Open Visualizer" button</li>
            <li>Visualizer opens, now you can choose your graph in the dropdown menu at the top-right corner</li>
        </ul>
        You can then play with your graph and visualize the nodes you want !
    </h3>
</div>

# Query TuringDB

## Use metaqueries to have insight on graph overall structure

<h3>
    To learn more about 📮 Metaqueries, please check TuringDB documentation on this <a href="https://turingdb.mintlify.app/query/cypher_subset#%F0%9F%93%AE-metaqueries">link</a>
</h3>

In [60]:
%%time

# CALL LABELS () - returns a column of all the different node labels
command = """
CALL LABELS()
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = ["Node_type_ID", "Node_type"]
    display(df)

Unnamed: 0,Node_type_ID,Node_type
0,0,Reaction
1,1,Complex
2,2,BlackBoxEvent
3,3,EntityWithAccessionedSequence
4,4,PositiveGeneExpressionRegulation
5,5,PositiveRegulation


CPU times: user 4.57 ms, sys: 121 μs, total: 4.69 ms
Wall time: 4.23 ms


In [61]:
%%time

# CALL EDGETYPES() - returns a column of all the different edge types (edge equivalent of node labels)
command = """
CALL EDGETYPES()
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = ["Edge_type_ID", "Edge_type"]
    display(df)

Unnamed: 0,Edge_type_ID,Edge_type
0,0,CONNECTED


CPU times: user 4.18 ms, sys: 35 μs, total: 4.21 ms
Wall time: 3.96 ms


## Simple queries

In [62]:
%%time

# Match all edges and return them
command = """
MATCH (n)-[e]-(m) RETURN n.displayName, e, m.displayName
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = get_return_statements(command)
    display(df)

Unnamed: 0,n.displayName,e,m.displayName
0,TP53 binds the PMAIP1 (NOXA) promoter,0,"p-S15,S20-TP53 Tetramer [nucleoplasm]"
1,TP53 binds the PMAIP1 (NOXA) promoter,1,PMAIP1 Gene [nucleoplasm]
2,TP53 binds the PMAIP1 (NOXA) promoter,2,"p-S15,S20-TP53 Tetramer:PMAIP1 Gene [nucleoplasm]"
3,TP53 binds the PMAIP1 (NOXA) promoter,3,TP53 stimulates PMAIP1 (NOXA) expression
4,TP53 binds the APAF1 gene promoter,4,TP53 stimulates APAF1 gene expression
5,NRF1:PPARGC1B binds the CYCS promoter,5,Expression of CYCS
6,NRF1:PPARGC1B binds the CYCS promoter,6,Expression of NRF1
7,CYCS binds to APAF1,7,Release of Cytochrome c from mitochondria
8,CYCS binds to APAF1,8,CYCS [cytosol]
9,E2F1 binds APAF1 gene promoter,9,APAF1 gene expression is stimulated by E2F1 an...


CPU times: user 7.11 ms, sys: 55 μs, total: 7.17 ms
Wall time: 6.68 ms


In [73]:
%%time

# Find all nodes of type "EntityWithAccessionedSequence" (which is also schemaClass) and referenceType "ReferenceGeneProduct"
command = """
MATCH (n:EntityWithAccessionedSequence {"referenceType"="ReferenceGeneProduct"})
RETURN n.displayName, n.schemaClass, n.referenceType
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = get_return_statements(command)
    display(df)

Unnamed: 0,n.displayName,n.schemaClass,n.referenceType
0,E2F1 [nucleoplasm],EntityWithAccessionedSequence,ReferenceGeneProduct
1,ESRRA [nucleoplasm],EntityWithAccessionedSequence,ReferenceGeneProduct
2,CYCS [cytosol],EntityWithAccessionedSequence,ReferenceGeneProduct
3,EP300 [nucleoplasm],EntityWithAccessionedSequence,ReferenceGeneProduct
4,PPARGC1A [nucleoplasm],EntityWithAccessionedSequence,ReferenceGeneProduct
5,NRF1 [nucleoplasm],EntityWithAccessionedSequence,ReferenceGeneProduct
6,CYCS [mitochondrial intermembrane space],EntityWithAccessionedSequence,ReferenceGeneProduct
7,PMAIP1 [cytosol],EntityWithAccessionedSequence,ReferenceGeneProduct
8,APAF1 [cytosol],EntityWithAccessionedSequence,ReferenceGeneProduct


CPU times: user 9.55 ms, sys: 123 μs, total: 9.68 ms
Wall time: 9.13 ms


In [79]:
%%time

# Find all nodes of schemaClass "EntityWithAccessionedSequence" and referenceType "ReferenceGeneProduct"
command = """
MATCH (n)
RETURN n.schemaClass
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = get_return_statements(command)
    display(pd.DataFrame(df.value_counts()))

Unnamed: 0_level_0,count
n.schemaClass,Unnamed: 1_level_1
Reaction,12
EntityWithAccessionedSequence,10
Complex,9
BlackBoxEvent,6
PositiveGeneExpressionRegulation,2
PositiveRegulation,1


CPU times: user 10.3 ms, sys: 1.86 ms, total: 12.1 ms
Wall time: 10.7 ms


In [83]:
%%time

# Find all nodes of type "PositiveGeneExpressionRegulation" (which is also schemaClass)
command = """
MATCH (n:PositiveGeneExpressionRegulation)
RETURN n, n.displayName, n.schemaClass
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = get_return_statements(command)
    display(df)

Unnamed: 0,n,n.displayName,n.schemaClass
0,37,"Positive gene expression regulation by p-S15,S...",PositiveGeneExpressionRegulation
1,38,Positive gene expression regulation by E2F1:TF...,PositiveGeneExpressionRegulation


CPU times: user 9.05 ms, sys: 1.01 ms, total: 10.1 ms
Wall time: 8.69 ms


In [84]:
%%time

# Find all edges involving a node of type Complex
command = """
MATCH (n:Complex)-[e]-(m)
RETURN n.displayName, n.schemaClass, e, m.displayName, m.schemaClass, m.category
"""
df1 = client.query(command)
if df1.empty:
    print("No result found")
else:
    df1.columns = get_return_statements(command)

command = """
MATCH (n)-[e]-(m:Complex)
RETURN n.displayName, n.schemaClass, e, m.displayName, m.schemaClass, m.category
"""
df2 = client.query(command)
if df2.empty:
    print("No result found")
else:
    df2.columns = get_return_statements(command)

df = pd.concat([df1, df2], axis=0).sort_values("e")
df.index = range(len(df))
df

CPU times: user 5 ms, sys: 1 ms, total: 6.01 ms
Wall time: 5.24 ms


Unnamed: 0,n.displayName,n.schemaClass,e,m.displayName,m.schemaClass,m.category
0,TP53 binds the PMAIP1 (NOXA) promoter,Reaction,0,"p-S15,S20-TP53 Tetramer [nucleoplasm]",Complex,
1,TP53 binds the PMAIP1 (NOXA) promoter,Reaction,2,"p-S15,S20-TP53 Tetramer:PMAIP1 Gene [nucleoplasm]",Complex,
2,"E2F1:(TFDP1,TFDP2) [nucleoplasm]",Complex,20,E2F1 binds APAF1 gene promoter,Reaction,binding
3,CYCS gene:NRF1:PPARGC1B [nucleoplasm],Complex,21,NRF1:PPARGC1B binds the CYCS promoter,Reaction,binding
4,CYCS gene:NRF1:PPARGC1B [nucleoplasm],Complex,22,NRF1 [nucleoplasm],EntityWithAccessionedSequence,
5,RORA:Coactivator [nucleoplasm],Complex,23,EP300 [nucleoplasm],EntityWithAccessionedSequence,
6,"p-S15,S20-TP53 Tetramer [nucleoplasm]",Complex,24,"p-S15,S20-TP53:EP300:PRMT1:CARM1:GADD45A Gene ...",Complex,
7,"p-S15,S20-TP53 Tetramer [nucleoplasm]",Complex,24,"p-S15,S20-TP53:EP300:PRMT1:CARM1:GADD45A Gene ...",Complex,
8,"p-S15,S20-TP53 Tetramer [nucleoplasm]",Complex,25,"Positive gene expression regulation by p-S15,S...",PositiveGeneExpressionRegulation,
9,"p-S15,S20-TP53 Tetramer [nucleoplasm]",Complex,26,TP53 binds the APAF1 gene promoter,Reaction,binding


In [85]:
%%time

# Find all edges involving one node related to mitochondria using string approximation
command = """
MATCH (n {"displayName"~="mitochond"})-[e]-(m)
RETURN n.displayName, n.schemaClass, e, m.displayName, m.schemaClass
"""
df1 = client.query(command)
if df1.empty:
    print("No result found")
else:
    df1.columns = get_return_statements(command)

command = """
MATCH (n)-[e]-(m {"displayName"~="mitochond"})
RETURN n.displayName, n.schemaClass, e, m.displayName, m.schemaClass
"""
df2 = client.query(command)
if df2.empty:
    print("No result found")
else:
    df2.columns = get_return_statements(command)

df = pd.concat([df1, df2], axis=0).sort_values("e")
df.index = range(len(df))
df

CPU times: user 8.35 ms, sys: 216 μs, total: 8.56 ms
Wall time: 7.28 ms


Unnamed: 0,n.displayName,n.schemaClass,e,m.displayName,m.schemaClass
0,CYCS binds to APAF1,Reaction,7,Release of Cytochrome c from mitochondria,Reaction
1,Translocation of PMAIP1 (NOXA) to mitochondria,Reaction,14,PMAIP1 [cytosol],EntityWithAccessionedSequence
2,Translocation of PMAIP1 (NOXA) to mitochondria,Reaction,15,BH3-only proteins associate with and inactivat...,Reaction
3,Translocation of PMAIP1 (NOXA) to mitochondria,Reaction,16,Transactivation of PMAIP1 (NOXA) by E2F1,BlackBoxEvent
4,BH3-only proteins associate with and inactivat...,Reaction,17,Translocation of tBID to mitochondria,Reaction
5,Translocation of tBID to mitochondria,Reaction,18,Release of Cytochrome c from mitochondria,Reaction
6,Translocation of tBID to mitochondria,Reaction,18,Release of Cytochrome c from mitochondria,Reaction
7,Release of Cytochrome c from mitochondria,Reaction,19,CYCS [mitochondrial intermembrane space],EntityWithAccessionedSequence
8,Release of Cytochrome c from mitochondria,Reaction,19,CYCS [mitochondrial intermembrane space],EntityWithAccessionedSequence
9,TP53 stimulates PMAIP1 (NOXA) expression,BlackBoxEvent,37,Translocation of PMAIP1 (NOXA) to mitochondria,Reaction


In [86]:
%%time

# Find all edges with a node of category "binding" going to an other node
command = """
MATCH (n {"category"="binding"})-[e]-(m)
RETURN n.displayName, n.schemaClass, n.category, e, m.displayName, m.schemaClass, m.category
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = get_return_statements(command)
    display(df)

Unnamed: 0,n.displayName,n.schemaClass,n.category,e,m.displayName,m.schemaClass,m.category
0,TP53 binds the PMAIP1 (NOXA) promoter,Reaction,binding,0,"p-S15,S20-TP53 Tetramer [nucleoplasm]",Complex,
1,TP53 binds the PMAIP1 (NOXA) promoter,Reaction,binding,1,PMAIP1 Gene [nucleoplasm],EntityWithAccessionedSequence,
2,TP53 binds the PMAIP1 (NOXA) promoter,Reaction,binding,2,"p-S15,S20-TP53 Tetramer:PMAIP1 Gene [nucleoplasm]",Complex,
3,TP53 binds the PMAIP1 (NOXA) promoter,Reaction,binding,3,TP53 stimulates PMAIP1 (NOXA) expression,BlackBoxEvent,omitted
4,TP53 binds the APAF1 gene promoter,Reaction,binding,4,TP53 stimulates APAF1 gene expression,BlackBoxEvent,omitted
5,NRF1:PPARGC1B binds the CYCS promoter,Reaction,binding,5,Expression of CYCS,BlackBoxEvent,omitted
6,NRF1:PPARGC1B binds the CYCS promoter,Reaction,binding,6,Expression of NRF1,BlackBoxEvent,omitted
7,CYCS binds to APAF1,Reaction,binding,7,Release of Cytochrome c from mitochondria,Reaction,transition
8,CYCS binds to APAF1,Reaction,binding,8,CYCS [cytosol],EntityWithAccessionedSequence,
9,E2F1 binds APAF1 gene promoter,Reaction,binding,9,APAF1 gene expression is stimulated by E2F1 an...,BlackBoxEvent,omitted


CPU times: user 11.8 ms, sys: 1.7 ms, total: 13.5 ms
Wall time: 12.4 ms


## Complex queries

In [100]:
def build_query_chain(
    hop_count: int,
    start_node_label: str = None,
    start_node_property: str = None,
    start_node_value: str = None,
    end_node_label: str = None,
    end_node_property: str = None,
    end_node_value: str = None,
    edge_type: str = None,
    intermediate_node_label: str = None,
    return_properties: list = None
) -> tuple[str, list[str]]:
    """
    Build a general query to find chains between nodes.
    
    Parameters:
    -----------
    hop_count : int
        Number of hops/edges in the path (REQUIRED)
    start_node_label : str, optional
        Label of the starting node (e.g., 'Process', 'Station')
    start_node_property : str, optional
        Property name to match on start node (e.g., 'displayName', 'id')
    start_node_value : str, optional
        Value to match for start node
    end_node_label : str, optional
        Label of the ending node (None for any node)
    end_node_property : str, optional
        Property name to match on end node
    end_node_value : str, optional
        Value to match for end node
    edge_type : str, optional
        Type of edges to traverse (None for any edge type)
    intermediate_node_label : str, optional
        Label constraint for intermediate nodes (None for any label)
    return_properties : list, optional
        List of property names to return for nodes (default: ['displayName'])
    
    Returns:
    --------
    tuple : (query_string, column_names)
    """
    
    if return_properties is None:
        return_properties = ['displayName']
    
    # Build MATCH clause
    query = "MATCH "
    
    # Start node - build based on what's provided
    start_parts = []
    if start_node_label:
        start_parts.append(f":{start_node_label}")
    if start_node_property and start_node_value:
        start_parts.append(f'{{{start_node_property}:"{start_node_value}"}}')
    
    if start_parts:
        query += f'(start{"".join(start_parts)})'
    else:
        query += "(start)"
    
    # Intermediate nodes and edges
    for k in range(1, hop_count + 1):
        # Edge
        if edge_type:
            query += f"-[e{k}:{edge_type}]-"
        else:
            query += f"-[e{k}]-"
        
        # Last hop - end node
        if k == hop_count:
            end_parts = []
            if end_node_label:
                end_parts.append(f":{end_node_label}")
            if end_node_property and end_node_value:
                end_parts.append(f'{{{end_node_property}:"{end_node_value}"}}')
            
            if end_parts:
                query += f'(end{"".join(end_parts)})'
            else:
                query += "(end)"
        # Intermediate nodes
        else:
            if intermediate_node_label:
                query += f"(s{k}:{intermediate_node_label})"
            else:
                query += f"(s{k})"
    
    # Build RETURN clause
    query += " RETURN start"
    column_names = ['start']
    
    # Add start node properties
    for prop in return_properties:
        query += f", start.{prop}"
        column_names.append(f'start.{prop}')
    
    # Add intermediate nodes/edges
    for k in range(1, hop_count + 1):
        query += f", e{k}"
        column_names.append(f'e{k}')
        
        if k == hop_count:
            query += ", end"
            column_names.append('end')
            for prop in return_properties:
                query += f", end.{prop}"
                column_names.append(f'end.{prop}')
        else:
            query += f", s{k}"
            column_names.append(f's{k}')
            for prop in return_properties:
                query += f", s{k}.{prop}"
                column_names.append(f's{k}.{prop}')
    
    return query, column_names

In [175]:
%%time

# Find all paths (with maximum length of 15 hops) between a node of type "EntityWithAccessionedSequence" and a node of type "BlackBoxEvent"
max_hops = 15
longest_df = None

for hop in range(1, max_hops):
    print(100 * "*")
    print(f"{hop} hop(s) :\n")
    
    query, cols = build_query_chain(
        hop_count=hop,
        start_node_label="EntityWithAccessionedSequence",
        end_node_label="BlackBoxEvent",
        return_properties=['id', 'displayName', 'schemaClass', 'category']
    )
    
    # Use with client
    df = client.query(query)
    if df.empty:
        print("No result found")
    else:
        df.columns = cols
        display(df)
        longest_df = df

print(100 * "*")

****************************************************************************************************
1 hop(s) :



Unnamed: 0,start,start.id,start.displayName,start.schemaClass,start.category,e1,end,end.id,end.displayName,end.schemaClass,end.category
0,29,CYCS [cytosol],CYCS [cytosol],EntityWithAccessionedSequence,,43,23,Expression of CYCS,Expression of CYCS,BlackBoxEvent,omitted
1,31,PMAIP1 Gene [nucleoplasm],PMAIP1 Gene [nucleoplasm],EntityWithAccessionedSequence,,46,26,TP53 stimulates PMAIP1 (NOXA) expression,TP53 stimulates PMAIP1 (NOXA) expression,BlackBoxEvent,omitted
2,31,PMAIP1 Gene [nucleoplasm],PMAIP1 Gene [nucleoplasm],EntityWithAccessionedSequence,,47,22,Transactivation of PMAIP1 (NOXA) by E2F1,Transactivation of PMAIP1 (NOXA) by E2F1,BlackBoxEvent,omitted
3,33,NRF1 [nucleoplasm],NRF1 [nucleoplasm],EntityWithAccessionedSequence,,53,24,Expression of NRF1,Expression of NRF1,BlackBoxEvent,omitted


****************************************************************************************************
2 hop(s) :



Unnamed: 0,start,start.id,start.displayName,start.schemaClass,start.category,e1,s1,s1.id,s1.displayName,s1.schemaClass,s1.category,e2,end,end.id,end.displayName,end.schemaClass,end.category
0,33,NRF1 [nucleoplasm],NRF1 [nucleoplasm],EntityWithAccessionedSequence,,51,6,"NRF1:p-PPARGC1A, NRF2 bind the TFB2M promoter","NRF1:p-PPARGC1A, NRF2 bind the TFB2M promoter",Reaction,binding,13,24,Expression of NRF1,Expression of NRF1,BlackBoxEvent,omitted
1,33,NRF1 [nucleoplasm],NRF1 [nucleoplasm],EntityWithAccessionedSequence,,52,2,NRF1:PPARGC1B binds the CYCS promoter,NRF1:PPARGC1B binds the CYCS promoter,Reaction,binding,5,23,Expression of CYCS,Expression of CYCS,BlackBoxEvent,omitted
2,33,NRF1 [nucleoplasm],NRF1 [nucleoplasm],EntityWithAccessionedSequence,,52,2,NRF1:PPARGC1B binds the CYCS promoter,NRF1:PPARGC1B binds the CYCS promoter,Reaction,binding,6,24,Expression of NRF1,Expression of NRF1,BlackBoxEvent,omitted


****************************************************************************************************
3 hop(s) :



Unnamed: 0,start,start.id,start.displayName,start.schemaClass,start.category,e1,s1,s1.id,s1.displayName,s1.schemaClass,...,s2.id,s2.displayName,s2.schemaClass,s2.category,e3,end,end.id,end.displayName,end.schemaClass,end.category
0,27,E2F1 [nucleoplasm],E2F1 [nucleoplasm],EntityWithAccessionedSequence,,39,12,"E2F1:(TFDP1,TFDP2) [nucleoplasm]","E2F1:(TFDP1,TFDP2) [nucleoplasm]",Complex,...,E2F1 binds APAF1 gene promoter,E2F1 binds APAF1 gene promoter,Reaction,binding,9,21,APAF1 gene expression is stimulated by E2F1 an...,APAF1 gene expression is stimulated by E2F1 an...,BlackBoxEvent,omitted
1,31,PMAIP1 Gene [nucleoplasm],PMAIP1 Gene [nucleoplasm],EntityWithAccessionedSequence,,46,26,TP53 stimulates PMAIP1 (NOXA) expression,TP53 stimulates PMAIP1 (NOXA) expression,BlackBoxEvent,...,Translocation of PMAIP1 (NOXA) to mitochondria,Translocation of PMAIP1 (NOXA) to mitochondria,Reaction,transition,16,22,Transactivation of PMAIP1 (NOXA) by E2F1,Transactivation of PMAIP1 (NOXA) by E2F1,BlackBoxEvent,omitted
2,33,NRF1 [nucleoplasm],NRF1 [nucleoplasm],EntityWithAccessionedSequence,,51,6,"NRF1:p-PPARGC1A, NRF2 bind the TFB2M promoter","NRF1:p-PPARGC1A, NRF2 bind the TFB2M promoter",Reaction,...,p38 MAPK phosphorylates PPARGC1A,p38 MAPK phosphorylates PPARGC1A,Reaction,transition,10,24,Expression of NRF1,Expression of NRF1,BlackBoxEvent,omitted


****************************************************************************************************
4 hop(s) :

No result found
****************************************************************************************************
5 hop(s) :

No result found
****************************************************************************************************
6 hop(s) :

No result found
****************************************************************************************************
7 hop(s) :



Unnamed: 0,start,start.id,start.displayName,start.schemaClass,start.category,e1,s1,s1.id,s1.displayName,s1.schemaClass,...,s6.id,s6.displayName,s6.schemaClass,s6.category,e7,end,end.id,end.displayName,end.schemaClass,end.category
0,31,PMAIP1 Gene [nucleoplasm],PMAIP1 Gene [nucleoplasm],EntityWithAccessionedSequence,,47,22,Transactivation of PMAIP1 (NOXA) by E2F1,Transactivation of PMAIP1 (NOXA) by E2F1,BlackBoxEvent,...,E2F1 binds APAF1 gene promoter,E2F1 binds APAF1 gene promoter,Reaction,binding,9,21,APAF1 gene expression is stimulated by E2F1 an...,APAF1 gene expression is stimulated by E2F1 an...,BlackBoxEvent,omitted


****************************************************************************************************
8 hop(s) :

No result found
****************************************************************************************************
9 hop(s) :



Unnamed: 0,start,start.id,start.displayName,start.schemaClass,start.category,e1,s1,s1.id,s1.displayName,s1.schemaClass,...,s8.id,s8.displayName,s8.schemaClass,s8.category,e9,end,end.id,end.displayName,end.schemaClass,end.category
0,31,PMAIP1 Gene [nucleoplasm],PMAIP1 Gene [nucleoplasm],EntityWithAccessionedSequence,,46,26,TP53 stimulates PMAIP1 (NOXA) expression,TP53 stimulates PMAIP1 (NOXA) expression,BlackBoxEvent,...,E2F1 binds APAF1 gene promoter,E2F1 binds APAF1 gene promoter,Reaction,binding,9,21,APAF1 gene expression is stimulated by E2F1 an...,APAF1 gene expression is stimulated by E2F1 an...,BlackBoxEvent,omitted


****************************************************************************************************
10 hop(s) :

No result found
****************************************************************************************************
11 hop(s) :

No result found
****************************************************************************************************
12 hop(s) :

No result found
****************************************************************************************************
13 hop(s) :

No result found
****************************************************************************************************
14 hop(s) :

No result found
****************************************************************************************************
CPU times: user 46.2 ms, sys: 2.34 ms, total: 48.5 ms
Wall time: 45.3 ms


# Create subgraph to visualise

In [176]:
# Get subgraph
subset_nodes = longest_df.filter(regex="id$", axis=1).iloc[0].values.tolist()
subG = G.subgraph(subset_nodes).copy()
print(subG)

# Build CREATE command from subgraph
create_command_subG = build_create_command_from_networkx(subG)
print(f"Cypher CREATE command :\n\n{100 * '*'}\n{create_command_subG}\n{100 * '*'}")

MultiGraph with 10 nodes and 10 edges
Cypher CREATE command :

****************************************************************************************************
CREATE (n0:Node {"id":"Translocation of PMAIP1 (NOXA) to mitochondria", "schemaClass":"Reaction", "stId":"R-HSA-140216", "oldStId":"REACT_1585", "releaseDate":"2004-10-27", "name":"[ Translocation of PMAIP1 (NOXA) to mitochondria ]", "stIdVersion":"R-HSA-140216.4", "speciesName":"Homo sapiens", "category":"transition", "displayName":"Translocation of PMAIP1 (NOXA) to mitochondria"}),
(n1:Node {"id":"TP53 stimulates PMAIP1 (NOXA) expression", "schemaClass":"BlackBoxEvent", "stId":"R-HSA-140214", "oldStId":"REACT_2201", "releaseDate":"2004-10-27", "name":"[ TP53 stimulates PMAIP1 (NOXA) expression ]", "stIdVersion":"R-HSA-140214.6", "speciesName":"Homo sapiens", "category":"omitted", "displayName":"TP53 stimulates PMAIP1 (NOXA) expression"}),
(n2:Node {"id":"E2F1 [nucleoplasm]", "schemaClass":"EntityWithAccessionedSequence", "

In [177]:
subgraph_name = f"{graph_name}_subgraph"
subgraph_name

'reactome4_subgraph'

In [178]:
%%time

# Create new graph
client.query(f"CREATE GRAPH {subgraph_name}")
client.set_graph(subgraph_name)

# Create a new change on the graph
change = client.query("CHANGE NEW").loc[0, 0]

# Checkout into the change
client.checkout(change=change)

# Run CREATE command
client.query(create_command_subG)

# Commit the change
client.query("COMMIT")
client.query("CHANGE SUBMIT")

# Checkout into main
client.checkout()

CPU times: user 5.22 ms, sys: 1.72 ms, total: 6.94 ms
Wall time: 10.4 ms


<div class="alert alert-block alert-info">
    <h2>
        You can visualise the subgraph directly in the notebook below. For more details on nodes and edges, you can go to TuringDB visualizer (running on your instance)
    </h2>
</div>

<div class="alert alert-block alert-info">
    <h2>
        Visualize your graph in TuringDB Graph Visualizer ! Now that your instance is running:
    </h2>
    <h3>
        <ul>
            <li>Go to <a href="https://console.turingdb.ai/databases">TuringDB Console - Database Instances</a></li>
            <li>In your current instance panel, click on "Open Visualizer" button</li>
            <li>Visualizer opens, now you can choose your graph in the dropdown menu at the top-right corner</li>
        </ul>
        You can then play with your graph and visualize the nodes you want !
    </h3>
</div>

In [179]:
from pyvis.network import Network
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

net = Network(
    height="750px", width="100%", notebook=True, bgcolor="#ffffff", font_color="#000000", directed=True
)

# Choose your palette (can be changed easily)
palette_name = 'Pastel1'  # Options: 'tab10', 'Set3', 'Paired', 'viridis', 'plasma', etc.

# Get unique node types
unique_types = list(set(data.get('schemaClass') for _, data in G.nodes(data=True)))

# Get colors from matplotlib palette
cmap = plt.get_cmap(palette_name)
colors = [mcolors.rgb2hex(cmap(i / len(unique_types))) for i in range(len(unique_types))]

# Map types to colors
type_colors = {node_type: colors[i] for i, node_type in enumerate(unique_types)}

# Then use in your visualization
for node, data in subG.nodes(data=True):
    node_type = data.get('schemaClass', 'Unknown')
    color = type_colors.get(node_type, '#95a5a6')

    net.add_node(
        node,
        label=data.get("displayName", str(node)),
        title=f"{data.get('displayName', '')}",
        color=color,
        size=25,
    )

for source, target, data in subG.edges(data=True):
    net.add_edge(source, target, color="#95a5a6", width=3)

net.toggle_physics(status=True)
net.show(f"{folder_name}_graph.html")

reactome_graph.html


# Use LLM to generate Cypher query

Before running this section, create a `.env` file in the project root with your API keys:

```env
ANTHROPIC_API_KEY=your_key_here
OPENAI_API_KEY=your_key_here
MISTRAL_API_KEY=your_key_here

In [180]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv(override=True)

True

In [181]:
api_keys = {
    "Anthropic": os.getenv("ANTHROPIC_API_KEY"),
    "Mistral": os.getenv("MISTRAL_API_KEY"),
    "OpenAI": os.getenv("OPENAI_API_KEY"),
}

In [182]:
"""Build system prompt with TuringDB schema and examples"""

turingdb_cypher_system_prompt = """
You are an expert at converting natural language questions into TuringDB queries.

Your task is to generate syntactically correct TuringDB queries based on natural language input.

VERY IMPORTANT - TuringDB Syntax Guidelines:
1. Return ONLY the TuringDB query, no explanations or markdown formatting
2. Use MATCH or CREATE operations only
3. Nodes: (n:Label{property="value"}) or (n:Label{property:value})
4. Edges: Use UNDIRECTED syntax with - (NOT ->)
5. Pattern matching: MATCH (n)-[e]-(m)
6. Property matching: Use = or : operators for exact matching
7. String approximation: Use ~= for approximate string matching
8. Node ID injection: Use @ operator or AT keyword: (n @ 1) or (n AT 1)
9. Multiple constraints: (n:Person,Engineer{name="John", age=30})
10. Return all matched entities: RETURN n, e, m or use RETURN * for all

VERY IMPORTANT - FORBIDDEN in TuringDB:
- Do NOT use directed edges (-> or <-)
- Do NOT use AS aliases
- Do NOT use LIMIT, SKIP clauses
- Do NOT use WHERE clauses
- Do NOT use WITH clauses
- Do NOT use CALL (except for metaqueries)
- Do NOT use toLower() or other functions

Supported TuringDB Operations:
- MATCH queries: MATCH (n:Label)-[e:Type]-(m) RETURN n, m
- CREATE queries: CREATE (n:Label{property="value"})-[e:Type]-(m:Label)
- Metaqueries: CALL PROPERTIES(), CALL LABELS(), CALL EDGETYPES(), CALL LABELSETS()
- Property types: String ("text" or `text`), Boolean (true/false), Integer (20), Unsigned (20u), Double (20.5)

Examples for few-shot learning:
- Find all persons: MATCH (n:Person) RETURN n
- Find connections: MATCH (n:Person)-[e]-(m:Person) RETURN n, e, m
- Create person: CREATE (n:Person{name="John", age=30})
- String approximation: MATCH (n{name~="John"}) RETURN n
- Node by ID: MATCH (n @ 1) RETURN n
- Multiple IDs: MATCH (n:Person @ 1, 2, 3) RETURN n
- Path with 1 hop between Station Paddington and Blackfriars:  MATCH (start:Station{displayName:"Paddington"})-[e1:CONNECTED]-(end:Station{displayName="Blackfriars"}) RETURN start, start.displayName, start.Note, e1.Line, end, end.displayName, end.Note
- Path with 2 hops between Station Paddington and Blackfriars: MATCH (start:Station{displayName:"Paddington"})-[e1:CONNECTED]-(s1:Station)-[e2:CONNECTED]-(end:Station{displayName="Blackfriars"}) RETURN start, start.displayName, start.Note, e1.Line, s1, s1.displayName, s1.Note, e2.Line, end, end.displayName, end.Note
- Path with 8 hops between Station Paddington and Blackfriars: MATCH (start:Station{displayName:"Paddington"})-[e1:CONNECTED]-(s1:Station)-[e2:CONNECTED]-(s2:Station)-[e3:CONNECTED]-(s3:Station)-[e4:CONNECTED]-(s4:Station)-[e5:CONNECTED]-(s5:Station)-[e6:CONNECTED]-(s6:Station)-[e7:CONNECTED]-(s7:Station)-[e8:CONNECTED]-(end:Station{displayName="Blackfriars"}) RETURN start, start.displayName, start.Note, e1.Line, s1, s1.displayName, s1.Note, e2.Line, s2, s2.displayName, s2.Note, e3.Line, s3, s3.displayName, s3.Note, e4.Line, s4, s4.displayName, s4.Note, e5.Line, s5, s5.displayName, s5.Note, e6.Line, s6, s6.displayName, s6.Note, e7.Line, s7, s7.displayName, s7.Note, e8.Line, end, end.displayName, end.Note
- Find all Chinese providers and what they supply: MATCH (n{provider_country:"CHN"}) RETURN n, n.provider_name, n.displayName, n.share_provided, n.type
- Find all deposition tools and their types: MATCH (specific)-[e:IS_TYPE_OF]-(general:Tool_Resource{displayName:"Deposition tools"}) RETURN specific, specific.displayName, specific.provider_name, e, general, general.displayName
"""

In [192]:
# Get subset of CREATE command to avoid exceeding context window
create_command_subset = create_command.split("\n")[:5] + create_command.split("\n")[-5:]

# Create system_prompt
system_prompt = f"""
TuringDB Cypher prompt :
{turingdb_cypher_system_prompt}

Here is a subset of the CREATE command used to create the graph, this way you know graph structure.
Only a subset is passed because the whole command is to long :
{create_command_subset}

Here is also the output of "CALL LABELS ()" command, showing the different node types of the graph :
{client.query("CALL LABELS ()")}

Here is also the output of "CALL EDGETYPES ()" command, showing the different edge types of the graph :
{client.query("CALL EDGETYPES ()")}

Very important :
- You MUST follow current TuringDB Syntax Guidelines
- You MUST NOT USE what is FORBIDDEN in TuringDB
- By default, RETURN ALL THE MATCHED NODES AND EDGES AND THEIR PROPERTIES in the RETURN section (except contrary demand from user)
- Use the correct node and edge properties name in the MATCH section.
- Use the correct node and edge properties name in the RETURN section.
- Pay attention to which properties come from nodes or edges, to create a functioning query
- Pay attention to lower and uppercases in properties
- If some properties contain spaces, be careful to wrap them

Give me the query FOLLOWING TURINGDB GUIDELINES AND NOT USING WHAT IS FORBIDDEN for this specific question :
"""

In [193]:
question = """
Is there a path of any size linking gene PMAIP1 and gene E2F1 ?
"""

In [194]:
%%time

provider = "Anthropic"

cypher_query = natural_language_to_cypher(
    question=question,
    system_prompt=system_prompt,
    provider=provider,
    api_key=api_keys[provider],
    temperature=0.0,
    #model="claude-3-haiku-20240307",
)
print(f"cypher_query : {cypher_query}")

cypher_query : MATCH (start{displayName:"PMAIP1"})-[e1]-(n1)-[e2]-(n2)-[e3]-(n3)-[e4]-(n4)-[e5]-(n5)-[e6]-(n6)-[e7]-(end{displayName:"E2F1"}) RETURN start, end, e1, e2, e3, e4, e5, e6, e7, n1, n2, n3, n4, n5, n6
CPU times: user 27.8 ms, sys: 1.21 ms, total: 29.1 ms
Wall time: 2.45 s


In [195]:
%%time

# Set original graph
client.set_graph(graph_name)

df_path = client.query(cypher_query)
df_path.columns = get_return_statements(cypher_query)
if df_path.empty:
    print("--> No result found\n")
else:
    display(df_path)

--> No result found

CPU times: user 2.42 ms, sys: 89 μs, total: 2.51 ms
Wall time: 1.91 ms


# Use LLM to get subgraph summary

In [197]:
%%time

prompt = f"""
Give me a summary of this graph. It represents biological entities, tell me more about the entities involved and the interactions.
Here is the graph :
{G.nodes(data=True)} {G.edges(data=True)}
"""

system_prompt = """
You are a specialist in analysing graphs and their structure.
You will use your knowledge to add more information about the entities and relationships in the graph.
Add information only when you are sure it is relevant.
"""

provider = "OpenAI"

response = query_llm(
    prompt=prompt,
    system_prompt=system_prompt,
    provider=provider,
    api_key=api_keys[provider],
    temperature=0.0
)

CPU times: user 24.9 ms, sys: 0 ns, total: 24.9 ms
Wall time: 11.8 s


In [198]:
from IPython.display import display, Markdown, HTML

In [199]:
display(Markdown(response))

The graph represents a complex network of interactions involving various biological entities primarily related to gene regulation, apoptosis, and cellular signaling in *Homo sapiens*. Here’s a summary of the key entities and their interactions:

### Key Entities:
1. **TP53 (Tumor Protein p53)**: A crucial tumor suppressor gene that regulates the cell cycle and functions in preventing cancer. It binds to the promoters of various genes, including PMAIP1 and APAF1, to stimulate their expression.
   - **p-S15,S20-TP53 Tetramer**: The phosphorylated form of TP53 that is active in gene regulation.

2. **PMAIP1 (NOXA)**: A pro-apoptotic gene that is regulated by TP53. It plays a role in apoptosis and is translocated to mitochondria to promote cell death.

3. **APAF1 (Apoptotic Protease Activating Factor 1)**: A key component in the apoptosome that activates caspases, leading to apoptosis. Its expression is stimulated by TP53 and E2F1.

4. **CYCS (Cytochrome c)**: A protein involved in the electron transport chain and apoptosis. It binds to APAF1 and is released from mitochondria during apoptosis.

5. **E2F1**: A transcription factor that regulates the expression of genes involved in cell cycle progression and apoptosis. It binds to the promoters of PMAIP1 and APAF1.

6. **NRF1 (Nuclear Respiratory Factor 1)**: A transcription factor that regulates the expression of genes involved in mitochondrial biogenesis and function, including CYCS.

7. **PPARGC1A (Peroxisome Proliferator-Activated Receptor Gamma Coactivator 1-alpha)**: A coactivator that regulates genes involved in energy metabolism and mitochondrial function.

8. **ESRRA (Estrogen-Related Receptor Alpha)**: A transcription factor that positively regulates the expression of various genes, including those involved in mitochondrial function.

### Key Interactions:
- **TP53 Binding**: TP53 binds to the promoters of PMAIP1 and APAF1, stimulating their expression, which is crucial for apoptosis.
- **E2F1 Regulation**: E2F1 also binds to the promoters of PMAIP1 and APAF1, indicating a collaborative role in regulating apoptosis and cell cycle.
- **CYCS and APAF1 Interaction**: CYCS binds to APAF1, leading to the formation of the apoptosome and subsequent activation of caspases, which are essential for the apoptotic process.
- **Translocation Events**: PMAIP1 is translocated to mitochondria, where it can exert its pro-apoptotic effects, and CYCS is released from mitochondria, further promoting apoptosis.
- **Positive Regulation**: The interactions between TP53, E2F1, and other transcription factors like NRF1 and ESRRA indicate a complex regulatory network that ensures proper gene expression in response to cellular stress and damage.

### Conclusion:
This graph illustrates a network of interactions that highlight the roles of TP53, E2F1, and other transcription factors in regulating apoptosis and gene expression in response to cellular signals. The interplay between these entities is crucial for maintaining cellular homeostasis and preventing tumorigenesis.

In [200]:
print("Notebook finished !")

Notebook finished !
