<div class="alert alert-block alert-success">
    <h1>
        Example notebook - Healthcare
    </h1>
    <p>
        Link to dataset : <a href="https://www.kaggle.com/datasets/prasad22/healthcare-dataset">Kaggle link</a>
    </p>
</div>

# Import modules and functions

In [1]:
import os
import pandas as pd
import re
import time
from tqdm.auto import tqdm

from turingdb_examples.utils import create_ID_column
from turingdb_examples.graph import (
    create_graph_from_df,
    build_create_command_from_networkx,
    split_cypher_commands
)
from turingdb_examples.llm import natural_language_to_cypher

In [2]:
%load_ext autoreload
%autoreload 2

# Check data files are available

In [3]:
example_name = "healthcare_dataset"
path_data = f"{os.getcwd()}/data/{example_name}"
if not os.path.exists(path_data):
    raise ValueError(f"{path_data} does not exists")

filename = "healthcare_dataset.csv"
list_csv_files = sorted(os.listdir(path_data))
if filename not in list_csv_files:
    raise ValueError(
        f"{filename} csv file is not available in {path_data}"
    )

# Import and format data

In [4]:
df = pd.read_csv(f"{path_data}/healthcare_dataset.csv")
df["Name"] = df["Name"].apply(
    lambda x: f"{x.split(' ')[0].capitalize()} {x.split(' ')[1].upper()}"
)
df["Doctor"] = df["Doctor"].apply(
    lambda x: f"{x.split(' ')[0].capitalize()} {x.split(' ')[1].upper()}"
)
df = create_ID_column(df)
# Keep only 10 patients to reduce graph for now
# You can comment the following line to generate the whole graph
df = df.iloc[:5000, :]
df

Unnamed: 0,Patient ID,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,00000,Bobby JACKSON,30,Male,B-,Cancer,2024-01-31,Matthew SMITH,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,00001,Leslie TERRY,62,Male,A+,Obesity,2019-08-20,Samantha DAVIES,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,00002,Danny SMITH,76,Female,A-,Obesity,2022-09-22,Tiffany MITCHELL,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,00003,Andrew WATTS,28,Female,O+,Diabetes,2020-11-18,Kevin WELLS,"Hernandez Rogers and Vang,",Medicare,37909.782410,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,00004,Adrienne BELL,43,Female,AB+,Cancer,2022-09-19,Kathleen HANNA,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,04995,Marcus LONG,41,Female,AB+,Hypertension,2023-02-12,Melissa DUNCAN,"and Mosley, Williams Russell",Aetna,10769.289595,177,Urgent,2023-03-02,Penicillin,Inconclusive
4996,04996,Bethany GUZMAN,77,Male,AB+,Arthritis,2020-01-18,Mr. OSCAR,Rodriguez Inc,UnitedHealthcare,5500.283823,490,Elective,2020-02-08,Paracetamol,Inconclusive
4997,04997,Deborah MACDONALD,85,Female,O+,Arthritis,2019-09-05,Cathy SWANSON,Calderon and Sons,UnitedHealthcare,5153.940251,499,Urgent,2019-09-21,Paracetamol,Inconclusive
4998,04998,Dr. ANA,28,Male,O+,Cancer,2022-11-03,Kathryn DAVIS,LLC Jensen,UnitedHealthcare,5260.553923,115,Elective,2022-11-20,Ibuprofen,Inconclusive


# Create graph from dataframe

In [5]:
label_str = "displayName"

G = create_graph_from_df(
    df,
    directed=True,
    source_node_col={"id": "Patient ID", label_str: "Name", "type": "Patient"},
    attributes_source_node_cols=["Age", "Date of Admission", "Discharge Date"],
    optional_nodes_cols={
        "Gender": {"link_to_source": True, "edge_type_to_source": "is"},
        "Blood Type": {"link_to_source": True, "edge_type_to_source": "is"},
        "Medical Condition": {"link_to_source": True, "edge_type_to_source": "has"},
        "Doctor": {"link_to_source": True, "edge_type_to_source": "is_treated_by"},
        "Hospital": {
            "attributes": ["Room Number"],
            "link_to_source": True,
            "edge_type_to_source": "is_treated_in",
        },
        "Insurance Provider": {
            "attributes": ["Billing Amount"],
            "link_to_source": True,
            "edge_type_to_source": "is_client_of",
        },
        "Admission Type": {"link_to_source": True},
        "Medication": {
            "link_to_source": True,
            "edge_type_to_source": "took_medication",
        },
        "Test Results": {"link_to_source": True, "edge_type_to_source": "has_result"},
    },
)
print(f"Resulting graph : {G}")

Resulting graph : DiGraph with 14571 nodes and 45000 edges


In [6]:
# Show first few nodes with properties
for node in list(G.nodes(data=True))[:20]:
    print(node)

('00000', {'displayName': 'Bobby JACKSON', 'type': 'Patient', 'Age': 30, 'Date of Admission': '2024-01-31', 'Discharge Date': '2024-02-02'})
('Male', {'displayName': 'Male', 'type': 'Gender'})
('B-', {'displayName': 'B-', 'type': 'Blood Type'})
('Cancer', {'displayName': 'Cancer', 'type': 'Medical Condition'})
('Matthew SMITH', {'displayName': 'Matthew SMITH', 'type': 'Doctor'})
('Sons and Miller', {'displayName': 'Sons and Miller', 'type': 'Hospital', 'Room Number': 328})
('Blue Cross', {'displayName': 'Blue Cross', 'type': 'Insurance Provider', 'Billing Amount': 18856.281305978155})
('Urgent', {'displayName': 'Urgent', 'type': 'Admission Type'})
('Paracetamol', {'displayName': 'Paracetamol', 'type': 'Medication'})
('Normal', {'displayName': 'Normal', 'type': 'Test Results'})
('00001', {'displayName': 'Leslie TERRY', 'type': 'Patient', 'Age': 62, 'Date of Admission': '2019-08-20', 'Discharge Date': '2019-08-26'})
('A+', {'displayName': 'A+', 'type': 'Blood Type'})
('Obesity', {'displa

In [7]:
# Show first few edge with properties
for edge in list(G.edges(data=True))[:20]:
    print(edge)

('00000', 'Male', {'type': 'is'})
('00000', 'B-', {'type': 'is'})
('00000', 'Cancer', {'type': 'has'})
('00000', 'Matthew SMITH', {'type': 'is_treated_by'})
('00000', 'Sons and Miller', {'type': 'is_treated_in'})
('00000', 'Blue Cross', {'type': 'is_client_of'})
('00000', 'Urgent', {})
('00000', 'Paracetamol', {'type': 'took_medication'})
('00000', 'Normal', {'type': 'has_result'})
('00001', 'Male', {'type': 'is'})
('00001', 'A+', {'type': 'is'})
('00001', 'Obesity', {'type': 'has'})
('00001', 'Samantha DAVIES', {'type': 'is_treated_by'})
('00001', 'Kim Inc', {'type': 'is_treated_in'})
('00001', 'Medicare', {'type': 'is_client_of'})
('00001', 'Emergency', {})
('00001', 'Ibuprofen', {'type': 'took_medication'})
('00001', 'Inconclusive', {'type': 'has_result'})
('00002', 'Female', {'type': 'is'})
('00002', 'A-', {'type': 'is'})


# Create Cypher CREATE command

## Build CREATE command

In [8]:
%%time

# Build CREATE command
graph_CREATE_command = build_create_command_from_networkx(G)
print(f"""
Cypher CREATE command :
* size: {len(graph_CREATE_command.encode('utf-8'))/1024/1000:.4f} MB\n
{100 * '*'}
{graph_CREATE_command if len(graph_CREATE_command.split("\n")) < 10000 else "\n".join(graph_CREATE_command.split('\n')[:5]) + "\n...\n" + "\n".join(graph_CREATE_command.split('\n')[-5:])}
{100 * '*'}
""")

Cypher query will create graph with 14,571 nodes and 45,000 edges

Cypher CREATE command :
* size: 5.8096 MB

****************************************************************************************************
CREATE (:Patient {id: "00000", displayName: "Bobby JACKSON", type: "Patient", Age: 30, `Date of Admission`: "2024-01-31", `Discharge Date`: "2024-02-02"}),
(:Gender {id: "Male", displayName: "Male", type: "Gender"}),
(:BloodType {id: "B-", displayName: "B-", type: "Blood Type"}),
(:MedicalCondition {id: "Cancer", displayName: "Cancer", type: "Medical Condition"}),
(:Doctor {id: "Matthew SMITH", displayName: "Matthew SMITH", type: "Doctor"}),
...
MATCH (source {id: "04999"}), (target {id: "Russell-Knight"}) CREATE (source)-[:IS_TREATED_IN]->(target)
MATCH (source {id: "04999"}), (target {id: "Blue Cross"}) CREATE (source)-[:IS_CLIENT_OF]->(target)
MATCH (source {id: "04999"}), (target {id: "Emergency"}) CREATE (source)-[:CONNECTED]->(target)
MATCH (source {id: "04999"}), (target 

## Split command into chunks

In [9]:
%%time

chunks = split_cypher_commands(graph_CREATE_command, max_size_mb=1)

print(f"âœ“ Split into {len(chunks['node_chunks'])} node chunk(s) and {len(chunks['edge_chunks'])} edge chunk(s)")

print("\nNode chunks:")
for i, chunk in enumerate(chunks['node_chunks']):
    print(f"  Node chunk {i+1}: {len(chunk.encode('utf-8'))/1024:.1f} KB")
    if i == 10:
        print("  ...")
        break

print("\nEdge chunks:")
for i, chunk in enumerate(chunks['edge_chunks']):
    print(f"  Edge chunk {i+1}: {len(chunk.encode('utf-8'))/1024:.1f} KB")
    if i == 10:
        print("  ...")
        break

âœ“ Split into 2 node chunk(s) and 45000 edge chunk(s)

Node chunks:
  Node chunk 1: 976.4 KB
  Node chunk 2: 635.8 KB

Edge chunks:
  Edge chunk 1: 0.1 KB
  Edge chunk 2: 0.1 KB
  Edge chunk 3: 0.1 KB
  Edge chunk 4: 0.1 KB
  Edge chunk 5: 0.1 KB
  Edge chunk 6: 0.1 KB
  Edge chunk 7: 0.1 KB
  Edge chunk 8: 0.1 KB
  Edge chunk 9: 0.1 KB
  Edge chunk 10: 0.1 KB
  Edge chunk 11: 0.1 KB
  ...
CPU times: user 92.4 ms, sys: 4.9 ms, total: 97.3 ms
Wall time: 96.6 ms


# Create graph using `turingdb` python package

<div class="alert alert-block alert-info">
    <h2>
        See <a href="https://docs.turingdb.ai/quickstart">TuringDB Get started documentation</a> for the important steps to follow :
    </h2>
    <h4>
        <ul>
            <li>Create your TuringDB account</li>
            <li>Create your instance in the <a href="https://console.turingdb.ai/auth">TuringDB Cloud UI</a></li>
            <li>Copy your Instance ID from the Database Instances management page</li>
            <li>Get API Key from the Settings in UI</li>
        </ul>
        Remember to have your instance active while working in this notebook !
    </h4>
</div>

In [10]:
from turingdb import TuringDB

# Create TuringDB client
# set host parameter to the URL (as string) on which TuringDB is running,
# default "http://localhost:6666"
client = TuringDB(host="http://localhost:6666")
try:
    client.warmup()
except Exception as e:
    print(f"TuringDB not started, please run `uv run turingdb` in your terminal")

In [11]:
# Get list of available graphs
list_graphs = client.list_available_graphs()

In [12]:
client.list_loaded_graphs()

['crypto_orbitaal_fraud_detection1', 'citeab_antibody1', 'default']

In [13]:
# Set graph name
graph_name_prefix = example_name
graph_name_nb_suffix = str(
    max(
        [
            int(re.sub(graph_name_prefix, "", g))
            for g in list_graphs
            if g.startswith(graph_name_prefix)
            and re.sub(graph_name_prefix, "", g).isdigit()
        ]
        + [0]
    )
    + 1
)
graph_name = graph_name_prefix + graph_name_nb_suffix
graph_name = re.sub("-", "_", graph_name)
graph_name

'healthcare_dataset1'

In [14]:
from turingdb.exceptions import TuringDBException

In [15]:
%%time

# Set graph
try:
    client.create_graph(graph_name)
except TuringDBException as e:
    print(e)

# Set working graph
client.set_graph(graph_name)

CPU times: user 1.95 ms, sys: 73 Î¼s, total: 2.02 ms
Wall time: 8.53 ms


In [16]:
%%time

# Create a new change on the graph
client.checkout()
change = client.new_change()
print(f"Current change {change}")

# Checkout into the change
client.checkout(change=change)

Current change 0
CPU times: user 1.85 ms, sys: 67 Î¼s, total: 1.91 ms
Wall time: 1.42 ms


In [17]:
%%time

# Run CREATE command
print("\nExecuting query on TuringDB...")
start_time = time.time()

print(f"âœ“ Split into {len(chunks['node_chunks'])} node chunk(s) and {len(chunks['edge_chunks'])} edge chunk(s)")

# CREATE nodes
print("\nNode chunks:")
for i, chunk in enumerate(tqdm(chunks['node_chunks'])):
    result = client.query(chunk)
# Commit the change
client.query("COMMIT")
print(f"âœ“ {len(chunks['node_chunks'])} node chunks done")

# CREATE edges
print("\nEdge chunks:")
for i, chunk in enumerate(tqdm(chunks['edge_chunks'])):
    result = client.query(chunk)
# Commit the change
client.query("COMMIT")
print(f"âœ“ {len(chunks['edge_chunks'])} edge chunks done")

execution_time = time.time() - start_time
print(f"\nâœ“ Graph created successfully in {execution_time:.2f} seconds")

# Submit changes
start_time = time.time()
client.query("CHANGE SUBMIT")
execution_time = time.time() - start_time
print(f"\nâœ“ Changes successfully submitted in {execution_time:.2f} seconds")

# Checkout into main
client.checkout()


Executing query on TuringDB...
âœ“ Split into 2 node chunk(s) and 45000 edge chunk(s)

Node chunks:


  0%|          | 0/2 [00:00<?, ?it/s]

âœ“ 2 node chunks done

Edge chunks:


  0%|          | 0/45000 [00:00<?, ?it/s]

âœ“ 45000 edge chunks done

âœ“ Graph created successfully in 110.12 seconds

âœ“ Changes successfully submitted in 0.06 seconds
CPU times: user 38.3 s, sys: 2.44 s, total: 40.7 s
Wall time: 1min 50s


In [18]:
# Returns the commit history
client.query("CALL db.history()")

Unnamed: 0,commit,nodeCount,edgeCount,partCount
0,f76179e32b9ea7a2,0,0,0
1,702428bde18d46d7,14571,0,1
2,6e41926621d4891c,0,45000,1
3,a0fd7f3ede04ba65,0,0,0


<div class="alert alert-block alert-info">
    <h2>
        Visualize your graph in TuringDB Graph Visualizer ! Now that your instance is running:
    </h2>
    <h3>
        <ul>
            <li>Go to <a href="https://console.turingdb.ai/databases">TuringDB Console - Database Instances</a></li>
            <li>In your current instance panel, click on "Open Visualizer" button</li>
            <li>Visualizer opens, now you can choose your graph in the dropdown menu at the top-right corner</li>
        </ul>
        You can then play with your graph and visualize the nodes you want !
    </h3>
</div>

# Query TuringDB

## Use metaqueries to have insight on graph overall structure

<h3>
    To learn more about ðŸ“® Metaqueries, please check TuringDB documentation on this <a href="https://turingdb.mintlify.app/query/cypher_subset#%F0%9F%93%AE-metaqueries">link</a>
</h3>

In [19]:
%%time

# CALL propertyTypes() - returns a column of all the different node and edge properties and their types in the database
command = """
CALL db.propertyTypes()
"""
df_propertyTypes = client.query(command)
if df_propertyTypes.empty:
    print("No result found")
else:
    display(df_propertyTypes)

Unnamed: 0,id,propertyType,valueType
0,0,Date of Admission,String
1,1,Age,Int64
2,2,type,String
3,3,Discharge Date,String
4,4,displayName,String
5,5,id,String
6,6,Room Number,Int64
7,7,Billing Amount,Double


CPU times: user 3.72 ms, sys: 987 Î¼s, total: 4.71 ms
Wall time: 3.97 ms


In [20]:
# Get node properties
nodes_properties = df_propertyTypes["propertyType"].values.tolist()
print(f"Node properties: {nodes_properties}")

Node properties: ['Date of Admission', 'Age', 'type', 'Discharge Date', 'displayName', 'id', 'Room Number', 'Billing Amount']


In [21]:
%%time

# CALL labels () - returns a column of all the different node labels
command = """
CALL db.labels()
"""
df_labels = client.query(command)
if df_labels.empty:
    print("No result found")
else:
    display(df_labels)

Unnamed: 0,id,label
0,0,Patient
1,1,Gender
2,2,BloodType
3,3,MedicalCondition
4,4,Doctor
5,5,Hospital
6,6,InsuranceProvider
7,7,AdmissionType
8,8,Medication
9,9,TestResults


CPU times: user 4.05 ms, sys: 1 Î¼s, total: 4.05 ms
Wall time: 3.63 ms


In [22]:
%%time

# CALL edgeTypes() - returns a column of all the different edge types (edge equivalent of node labels)
command = """
CALL db.edgeTypes()
"""
df_edgeTypes = client.query(command)
if df_edgeTypes.empty:
    print("No result found")
else:
    display(df_edgeTypes)

Unnamed: 0,id,edgeType
0,0,IS
1,1,HAS
2,2,IS_TREATED_BY
3,3,IS_TREATED_IN
4,4,IS_CLIENT_OF
5,5,CONNECTED
6,6,TOOK_MEDICATION
7,7,HAS_RESULT


CPU times: user 2.93 ms, sys: 1 ms, total: 3.94 ms
Wall time: 3.4 ms


## Counts

In [23]:
%%time

# Find number of nodes and number of edges in the graph
n_nodes = len(client.query("MATCH (n) RETURN n"))
n_edges = len(client.query("MATCH (n)-->(m) RETURN n, m"))
print(f"Graph: {n_nodes:,} nodes and {n_edges:,} edges\n")

Graph: 14,571 nodes and 45,000 edges

CPU times: user 8.99 ms, sys: 3.08 ms, total: 12.1 ms
Wall time: 12.9 ms


In [24]:
%%time

# Count all nodes
command = """
MATCH (n)
RETURN COUNT(n)
"""
df_count_nodes = client.query(command)
display(df_count_nodes)

# Count all edges
command = """
MATCH (n)-->()
RETURN COUNT(n)
"""
df_count_edges = client.query(command)
display(df_count_edges)

# Find number of nodes and number of edges in the graph
n_nodes = int(df_count_nodes.loc[0, "COUNT(n)"])
n_edges = int(df_count_edges.loc[0, "COUNT(n)"])
print(f"Graph: {n_nodes:,} nodes and {n_edges:,} edges\n")

Unnamed: 0,COUNT(n)
0,14571


Unnamed: 0,COUNT(n)
0,45000


Graph: 14,571 nodes and 45,000 edges

CPU times: user 7.11 ms, sys: 944 Î¼s, total: 8.06 ms
Wall time: 8.52 ms


In [25]:
# Count number of nodes for each label
for label in df_labels["label"]:
    print(100 * '-')
    print(f"label: {label}")
    df_curr_label = client.query(f"""
    MATCH (n:{label})
    RETURN count(n)
    """)
    display(df_curr_label)
    print()
print(100 * '-')

----------------------------------------------------------------------------------------------------
label: Patient


Unnamed: 0,count(n)
0,5000



----------------------------------------------------------------------------------------------------
label: Gender


Unnamed: 0,count(n)
0,2



----------------------------------------------------------------------------------------------------
label: BloodType


Unnamed: 0,count(n)
0,8



----------------------------------------------------------------------------------------------------
label: MedicalCondition


Unnamed: 0,count(n)
0,6



----------------------------------------------------------------------------------------------------
label: Doctor


Unnamed: 0,count(n)
0,4820



----------------------------------------------------------------------------------------------------
label: Hospital


Unnamed: 0,count(n)
0,4719



----------------------------------------------------------------------------------------------------
label: InsuranceProvider


Unnamed: 0,count(n)
0,5



----------------------------------------------------------------------------------------------------
label: AdmissionType


Unnamed: 0,count(n)
0,3



----------------------------------------------------------------------------------------------------
label: Medication


Unnamed: 0,count(n)
0,5



----------------------------------------------------------------------------------------------------
label: TestResults


Unnamed: 0,count(n)
0,3



----------------------------------------------------------------------------------------------------


## Queries

In [26]:
%%time

# Match all edges and return them
command = """
MATCH (n)-[e]->(m)
RETURN n.displayName, e, m.displayName
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    display(df)

Unnamed: 0,n.displayName,e,m.displayName
0,Brett GRIFFIN,0,Male
1,Brett GRIFFIN,1,A+
2,Brett GRIFFIN,2,Asthma
3,Brett GRIFFIN,3,David PERRY
4,Brett GRIFFIN,4,Nelson-Drake
...,...,...,...
44995,Adrian BOWMAN,44995,Baldwin-Marshall
44996,Adrian BOWMAN,44996,Julie WARREN
44997,Adrian BOWMAN,44997,Cancer
44998,Adrian BOWMAN,44998,A-


CPU times: user 14.2 ms, sys: 7.06 ms, total: 21.3 ms
Wall time: 29.6 ms


In [27]:
%%time

# Match all edges linking a Patient to an other node
# Return displayName and type properties
command = """
MATCH (n:Patient)-[e]->(m)
RETURN n.type, n.displayName, e, m.type, m.displayName
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    display(df)

Unnamed: 0,n.type,n.displayName,e,m.type,m.displayName
0,Patient,Brett GRIFFIN,0,Gender,Male
1,Patient,Brett GRIFFIN,1,Blood Type,A+
2,Patient,Brett GRIFFIN,2,Medical Condition,Asthma
3,Patient,Brett GRIFFIN,3,Doctor,David PERRY
4,Patient,Brett GRIFFIN,4,Hospital,Nelson-Drake
...,...,...,...,...,...
44995,Patient,Adrian BOWMAN,44995,Hospital,Baldwin-Marshall
44996,Patient,Adrian BOWMAN,44996,Doctor,Julie WARREN
44997,Patient,Adrian BOWMAN,44997,Medical Condition,Cancer
44998,Patient,Adrian BOWMAN,44998,Blood Type,A-


CPU times: user 18.5 ms, sys: 17 ms, total: 35.6 ms
Wall time: 49.8 ms


In [28]:
%%time

# Find all patients
command = """
MATCH (p:Patient)
RETURN p.id, p.displayName, p.Age
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    display(df)

Unnamed: 0,p.id,p.displayName,p.Age
0,02012,Brett GRIFFIN,74
1,02011,Phillip WALLACE,75
2,04303,Victor DAVIS,42
3,02010,Chelsea MENDOZA,84
4,02009,Eugene HOFFMAN,79
...,...,...,...
4995,00070,Carol PATTERSON,29
4996,00138,Melissa SCOTT,38
4997,00071,Jose LOPEZ,18
4998,01136,Hannah CAREY,60


CPU times: user 8.94 ms, sys: 0 ns, total: 8.94 ms
Wall time: 9.49 ms


In [29]:
%%time

# Find all doctors
command = """
MATCH (d:Doctor)
RETURN d.displayName
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    display(df)

Unnamed: 0,d.displayName
0,Matthew THOMAS
1,Laurie GATES
2,Catherine MORA
3,Kathryn ROJAS
4,Erica WILLIAMS
...,...
4815,Kaitlyn ROGERS
4816,Wayne MORALES
4817,Kristen AGUIRRE
4818,Ashley JOHNSTON


CPU times: user 3.33 ms, sys: 1.03 ms, total: 4.36 ms
Wall time: 4.11 ms


In [30]:
%%time

# Find all medications
command = """
MATCH (d:Medication)
RETURN d.displayName
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    display(df)

Unnamed: 0,d.displayName
0,Aspirin
1,Lipitor
2,Ibuprofen
3,Penicillin
4,Paracetamol


CPU times: user 2.8 ms, sys: 992 Î¼s, total: 3.8 ms
Wall time: 9.33 ms


In [31]:
%%time

# Find patient with specific ID and return all their information
command = """
MATCH (p:Patient)
WHERE p.id = "00000"
RETURN p, p.id, p.displayName, p.type, p.Age, p.`Date of Admission`, p.`Discharge Date`
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    display(df)

Unnamed: 0,p,p.id,p.displayName,p.type,p.Age,p.`Date of Admission`,p.`Discharge Date`
0,4953,0,Bobby JACKSON,Patient,30,2024-01-31,2024-02-02


CPU times: user 5.16 ms, sys: 1.02 ms, total: 6.17 ms
Wall time: 8 ms


In [32]:
%%time

# Find female patients
command = """
MATCH (p:Patient)-[:IS]->(g:Gender)
WHERE g.displayName = "Female"
RETURN p.displayName, p.Age
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    display(df)

Unnamed: 0,p.displayName,p.Age
0,Victor DAVIS,42
1,Chelsea MENDOZA,84
2,Adrian GREEN,37
3,Kelly HARRIS,77
4,Katrina ADAMS,81
...,...,...
2473,Tony SALAS,58
2474,Carol PATTERSON,29
2475,Melissa SCOTT,38
2476,Hannah CAREY,60


CPU times: user 4.88 ms, sys: 59 Î¼s, total: 4.94 ms
Wall time: 6.98 ms


In [33]:
%%time

# Find patients with Cancer
command = """
MATCH (p:Patient)-[:HAS]->(mc:MedicalCondition)
WHERE mc.displayName = "Cancer"
RETURN p.displayName, p.Age
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    display(df)

Unnamed: 0,p.displayName,p.Age
0,Chelsea MENDOZA,84
1,Eugene HOFFMAN,79
2,Matthew GLOVER,35
3,Luis PATTERSON,27
4,Leah TAYLOR,49
...,...,...
857,Marcus BROWN,41
858,Kevin SMITH,48
859,Carol PATTERSON,29
860,Hannah CAREY,60


CPU times: user 4.34 ms, sys: 91 Î¼s, total: 4.43 ms
Wall time: 5.48 ms


In [34]:
%%time

# Find all patients who are treated by a doctor
command = """
MATCH (p:Patient)-[:IS_TREATED_BY]->(d:Doctor)
RETURN p.displayName, d.displayName
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    display(df)

Unnamed: 0,p.displayName,d.displayName
0,Brett GRIFFIN,David PERRY
1,Phillip WALLACE,William TAYLOR
2,Victor DAVIS,Michelle CRUZ
3,Chelsea MENDOZA,Robyn HOWARD
4,Eugene HOFFMAN,Justin TAPIA
...,...,...
4995,Carol PATTERSON,Jamie BAKER
4996,Melissa SCOTT,Kari MOORE
4997,Jose LOPEZ,Erika JACKSON
4998,Hannah CAREY,Emma YANG


CPU times: user 3.46 ms, sys: 2.05 ms, total: 5.51 ms
Wall time: 6.9 ms


In [35]:
%%time

# Find all patients treated by doctor Kelly OLSON
command = """
MATCH (p:Patient)-[:IS_TREATED_BY]->(d:Doctor)
WHERE d.displayName = "Kelly OLSON"
RETURN p.displayName, d.displayName
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    display(df)

Unnamed: 0,p.displayName,d.displayName
0,Edward EDWARDS,Kelly OLSON


CPU times: user 2.77 ms, sys: 990 Î¼s, total: 3.76 ms
Wall time: 4.7 ms


In [36]:
%%time

# Find all patients with blood type A+
command = """
MATCH (p:Patient)-[:IS]->(bt:BloodType)
WHERE bt.displayName = "A+"
RETURN p.displayName, p.Age
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    display(df)

Unnamed: 0,p.displayName,p.Age
0,Brett GRIFFIN,74
1,Phillip WALLACE,75
2,Eugene HOFFMAN,79
3,Katrina ADAMS,81
4,Kelly WILLIAMS,75
...,...,...
640,Cynthia BRYANT,70
641,Mr. KENNETH,34
642,Tammy COLE,28
643,Linda JONES,82


CPU times: user 3.35 ms, sys: 1.03 ms, total: 4.38 ms
Wall time: 5.77 ms


In [37]:
%%time

# Find all patients who took Paracetamol
command = """
MATCH (p:Patient)-[:TOOK_MEDICATION]->(m:Medication)
WHERE m.displayName = "Paracetamol"
RETURN p.id, p.displayName, m.displayName
"""
df = client.query(command)
df.columns = ["Patient ID", "Patient Name", "Medication"]
display(df)

Unnamed: 0,Patient ID,Patient Name,Medication
0,02012,Brett GRIFFIN,Paracetamol
1,02004,Kelly WILLIAMS,Paracetamol
2,01998,Holly NELSON,Paracetamol
3,01995,Leroy PRICE,Paracetamol
4,02027,Shawn RAMIREZ,Paracetamol
...,...,...,...
941,00072,Kevin SIMMONS,Paracetamol
942,01097,Jasmine LUNA,Paracetamol
943,04983,Melissa MCDONALD,Paracetamol
944,00139,Gary HOPKINS,Paracetamol


CPU times: user 4.92 ms, sys: 0 ns, total: 4.92 ms
Wall time: 6.01 ms


# Create subgraph to visualise

In [38]:
import numpy as np

In [39]:
# Get subgraph
subset_nodes = np.unique(df.loc[:100, ["Patient ID", "Medication"]].values).tolist()
subG = G.subgraph(subset_nodes).copy()
print(subG)

# Build CREATE command from subgraph
create_command_subG = build_create_command_from_networkx(subG)
print(f"""
Cypher CREATE command :
* size: {len(create_command_subG.encode('utf-8'))/1024/1000:.4f} MB\n
{100 * '*'}
{create_command_subG \
if len(create_command_subG.split("\n")) < 10000 \
else "\n".join(create_command_subG.split('\n')[:5]) + "\n...\n" + "\n".join(create_command_subG.split('\n')[-5:])}
{100 * '*'}
""")

DiGraph with 102 nodes and 101 edges
Cypher query will create graph with 102 nodes and 101 edges

Cypher CREATE command :
* size: 0.0250 MB

****************************************************************************************************
CREATE (:Patient {id: "02399", displayName: "Thomas JOHNSON", type: "Patient", Age: 39, `Date of Admission`: "2022-06-16", `Discharge Date`: "2022-07-08"}),
(:Patient {id: "02333", displayName: "David WARREN", type: "Patient", Age: 60, `Date of Admission`: "2021-07-16", `Discharge Date`: "2021-07-20"}),
(:Patient {id: "04301", displayName: "Joseph SANTIAGO", type: "Patient", Age: 59, `Date of Admission`: "2024-04-15", `Discharge Date`: "2024-04-17"}),
(:Patient {id: "04196", displayName: "Kendra JOHNSON", type: "Patient", Age: 57, `Date of Admission`: "2021-09-20", `Discharge Date`: "2021-10-05"}),
(:Patient {id: "01825", displayName: "Lucas WILSON", type: "Patient", Age: 39, `Date of Admission`: "2019-07-28", `Discharge Date`: "2019-08-02"}),
(:Pa

In [40]:
subgraph_name = f"{graph_name}_subgraph"
subgraph_name

'healthcare_dataset1_subgraph'

In [41]:
%%time

# Set graph
try:
    client.create_graph(subgraph_name)
except TuringDBException as e:
    print(e)

# Set working graph
client.set_graph(subgraph_name)

# Create a new change on the graph
client.checkout()
change = client.new_change()
print(f"Current change {change}")

# Checkout into the change
client.checkout(change=change)

Current change 0
CPU times: user 2.71 ms, sys: 0 ns, total: 2.71 ms
Wall time: 9.39 ms


In [42]:
%%time

chunks = split_cypher_commands(create_command_subG, max_size_mb=1)

print(f"âœ“ Split into {len(chunks['node_chunks'])} node chunk(s) and {len(chunks['edge_chunks'])} edge chunk(s)")

print("\nNode chunks:")
for i, chunk in enumerate(chunks['node_chunks']):
    print(f"  Node chunk {i+1}: {len(chunk.encode('utf-8'))/1024:.1f} KB")
    if i == 10:
        print("  ...")
        break

print("\nEdge chunks:")
for i, chunk in enumerate(chunks['edge_chunks']):
    print(f"  Edge chunk {i+1}: {len(chunk.encode('utf-8'))/1024:.1f} KB")
    if i == 10:
        print("  ...")
        break

âœ“ Split into 1 node chunk(s) and 101 edge chunk(s)

Node chunks:
  Node chunk 1: 14.8 KB

Edge chunks:
  Edge chunk 1: 0.1 KB
  Edge chunk 2: 0.1 KB
  Edge chunk 3: 0.1 KB
  Edge chunk 4: 0.1 KB
  Edge chunk 5: 0.1 KB
  Edge chunk 6: 0.1 KB
  Edge chunk 7: 0.1 KB
  Edge chunk 8: 0.1 KB
  Edge chunk 9: 0.1 KB
  Edge chunk 10: 0.1 KB
  Edge chunk 11: 0.1 KB
  ...
CPU times: user 989 Î¼s, sys: 0 ns, total: 989 Î¼s
Wall time: 964 Î¼s


In [43]:
%%time

# Run CREATE command
print("\nExecuting query on TuringDB...")
start_time = time.time()

print(f"âœ“ Split into {len(chunks['node_chunks'])} node chunk(s) and {len(chunks['edge_chunks'])} edge chunk(s)")

# CREATE nodes
print("\nNode chunks:")
for i, chunk in enumerate(tqdm(chunks['node_chunks'])):
    result = client.query(chunk)
# Commit the change
client.query("COMMIT")
print(f"âœ“ {len(chunks['node_chunks'])} node chunks done")

# CREATE edges
print("\nEdge chunks:")
for i, chunk in enumerate(tqdm(chunks['edge_chunks'])):
    result = client.query(chunk)
# Commit the change
client.query("COMMIT")
print(f"âœ“ {len(chunks['edge_chunks'])} edge chunks done")

execution_time = time.time() - start_time
print(f"\nâœ“ Graph created successfully in {execution_time:.2f} seconds")

# Submit changes
start_time = time.time()
client.query("CHANGE SUBMIT")
execution_time = time.time() - start_time
print(f"\nâœ“ Changes successfully submitted in {execution_time:.2f} seconds")

# Checkout into main
client.checkout()


Executing query on TuringDB...
âœ“ Split into 1 node chunk(s) and 101 edge chunk(s)

Node chunks:


  0%|          | 0/1 [00:00<?, ?it/s]

âœ“ 1 node chunks done

Edge chunks:


  0%|          | 0/101 [00:00<?, ?it/s]

âœ“ 101 edge chunks done

âœ“ Graph created successfully in 0.09 seconds

âœ“ Changes successfully submitted in 0.05 seconds
CPU times: user 90.9 ms, sys: 8.22 ms, total: 99.1 ms
Wall time: 141 ms


<div class="alert alert-block alert-info">
    <h2>
        You can visualise the subgraph directly in the notebook below. For more details on nodes and edges, you can go to TuringDB visualizer (running on your instance)
    </h2>
</div>

<div class="alert alert-block alert-info">
    <h2>
        Visualize your graph in TuringDB Graph Visualizer ! Now that your instance is running:
    </h2>
    <h3>
        <ul>
            <li>Go to <a href="https://console.turingdb.ai/databases">TuringDB Console - Database Instances</a></li>
            <li>In your current instance panel, click on "Open Visualizer" button</li>
            <li>Visualizer opens, now you can choose your graph in the dropdown menu at the top-right corner</li>
        </ul>
        You can then play with your graph and visualize the nodes you want !
    </h3>
</div>

In [44]:
from pyvis.network import Network

net = Network(
    height="750px",
    width="100%",
    notebook=True,
    bgcolor="#f8f9fa",
    font_color="#212529",
    directed=True,
)

# Node type colors
type_colors = {"Patient": "#3498db", "Medication": "#e74c3c"}

for node, data in subG.nodes(data=True):
    node_type = data.get("type", "Unknown")
    color = type_colors.get(node_type, "#7f8c8d")

    label = data.get("displayName", str(node))

    # Build title based on node type
    if node_type == "Patient":
        title = f"<b>{label}</b><br>Age: {data.get('Age', 'N/A')}<br>Admitted: {data.get('Date of Admission', 'N/A')}<br>Discharged: {data.get('Discharge Date', 'N/A')}"
    else:
        title = f"<b>{label}</b><br>Type: {node_type}"

    net.add_node(node, label=label, color=color, title=title, size=25)

# Edge colors by type
edge_colors = {"took_medication": "#27ae60"}

for source, target, data in subG.edges(data=True):
    edge_type = data.get("type", "")
    color = edge_colors.get(edge_type, "#95a5a6")
    net.add_edge(source, target, title=edge_type, color=color, width=3)

net.toggle_physics(True)
net.show(f"{example_name}_subgraph.html")

healthcare_dataset_subgraph.html


# Use LLM to generate Cypher query

Before running this section, create a `.env` file in the project root with your API keys:

```env
ANTHROPIC_API_KEY=your_key_here
OPENAI_API_KEY=your_key_here
MISTRAL_API_KEY=your_key_here

In [45]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

True

In [46]:
api_keys = {
    "Anthropic": os.getenv("ANTHROPIC_API_KEY"),
    "Mistral": os.getenv("MISTRAL_API_KEY"),
    "OpenAI": os.getenv("OPENAI_API_KEY"),
}

In [47]:
"""Build system prompt with TuringDB schema and examples"""

turingdb_cypher_system_prompt = """
You are an expert at converting natural language questions into TuringDB queries.

Your task is to generate syntactically correct TuringDB queries based on natural language input.

VERY IMPORTANT - TuringDB Syntax Guidelines:
1. Return ONLY the TuringDB query, no explanations or markdown formatting
2. Use MATCH, CREATE and WHERE operations only
3. Nodes: (n:Label {property = "value"}) or (n:Label {property: value})
4. Edges: Use DIRECTED syntax with ->
5. Pattern matching: MATCH (n)-[e]->(m)
6. Property matching: Use = operator for exact matching
7. Multiple constraints: (n:Person:Engineer {name = "John", age = 30})
8. Return all matched entities: RETURN n, e, m or use RETURN * for all
9. Filter using WHERE clause: MATCH (n:Person) WHERE n.name = 'John' RETURN n.firstname, n.lastname

VERY IMPORTANT - FORBIDDEN in TuringDB:
- Do NOT use AS aliases
- Do NOT use LIMIT, SKIP clauses
- Do NOT use WITH clauses
- Do NOT use CALL (except for metaqueries)
- Do NOT use toLower() or other functions

Supported TuringDB Operations:
- MATCH queries: MATCH (n:Label)-[e:Type]->(m) RETURN n, m
- CREATE queries: CREATE (n:Label{property="value"})-[e:Type]->(m:Label)
- Metaqueries: CALL db.propertyTypes(), CALL db.labels(), CALL db.edgeTypes()
- Property types: String ("text" or `text`), Boolean (true/false), Integer (20), Double (20.5)

Examples for few-shot learning:
- Find all persons: MATCH (n:Person) RETURN n
- Find connections: MATCH (n:Person)-[e]->(m:Person) RETURN n, e, m
- Create person: CREATE (n:Person{name="John", age=30})
- Match person with specific name: MATCH (p:Person) WHERE p.name = "John" RETURN p
- Path with 1 hop between Station Paddington and Blackfriars:  MATCH (start:Station{displayName:"Paddington"})-[e1:CONNECTED]->(end:Station{displayName="Blackfriars"}) RETURN start, start.displayName, start.Note, e1.Line, end, end.displayName, end.Note
- Path with 2 hops between Station Paddington and Blackfriars: MATCH (start:Station{displayName:"Paddington"})-[e1:CONNECTED]->(s1:Station)-[e2:CONNECTED]->(end:Station{displayName="Blackfriars"}) RETURN start, start.displayName, start.Note, e1.Line, s1, s1.displayName, s1.Note, e2.Line, end, end.displayName, end.Note
- Path with 8 hops between Station Paddington and Blackfriars: MATCH (start:Station{displayName:"Paddington"})-[e1:CONNECTED]->(s1:Station)-[e2:CONNECTED]->(s2:Station)-[e3:CONNECTED]->(s3:Station)-[e4:CONNECTED]->(s4:Station)-[e5:CONNECTED]->(s5:Station)-[e6:CONNECTED]->(s6:Station)-[e7:CONNECTED]->(s7:Station)-[e8:CONNECTED]->(end:Station{displayName="Blackfriars"}) RETURN start, start.displayName, start.Note, e1.Line, s1, s1.displayName, s1.Note, e2.Line, s2, s2.displayName, s2.Note, e3.Line, s3, s3.displayName, s3.Note, e4.Line, s4, s4.displayName, s4.Note, e5.Line, s5, s5.displayName, s5.Note, e6.Line, s6, s6.displayName, s6.Note, e7.Line, s7, s7.displayName, s7.Note, e8.Line, end, end.displayName, end.Note
- Find all Chinese providers and what they supply: MATCH (n{provider_country:"CHN"}) RETURN n, n.provider_name, n.displayName, n.share_provided, n.type
- Find all deposition tools and their types: MATCH (specific)-[e:IS_TYPE_OF]->(general:Tool_Resource{displayName:"Deposition tools"}) RETURN specific, specific.displayName, specific.provider_name, e, general, general.displayName
"""

In [48]:
# Get subset of CREATE command to avoid exceeding context window
create_command_subset = create_command_subG.split("\n")[:5] + create_command_subG.split("\n")[-5:]

# Create system_prompt
system_prompt = f"""
TuringDB Cypher prompt :
{turingdb_cypher_system_prompt}

Here is a subset of the CREATE command used to create the graph, this way you know graph structure.
Only a subset is passed because the whole command is to long :
{create_command_subset}

Here is also the output of "CALL LABELS ()" command, showing the different node types of the graph :
{client.query("CALL db.labels()")}

Here is also the output of "CALL EDGETYPES ()" command, showing the different edge types of the graph :
{client.query("CALL db.edgeTypes()")}

Very important :
- You MUST follow current TuringDB Syntax Guidelines
- You MUST NOT USE what is FORBIDDEN in TuringDB
- By default, RETURN ALL THE MATCHED NODES AND EDGES AND THEIR PROPERTIES in the RETURN section (except contrary demand from user)
- Use the correct node and edge properties name in the MATCH section.
- Use the correct node and edge properties name in the RETURN section.
- Pay attention to which properties come from nodes or edges, to create a functioning query
- Pay attention to lower and uppercases in properties
- If some properties contain spaces, be careful to wrap them

Give me the query FOLLOWING TURINGDB GUIDELINES AND NOT USING WHAT IS FORBIDDEN for this specific question :
"""

In [49]:
# Set natural language query
question = """
Find all patients who took Paracetamol
"""

In [50]:
%%time

provider = "Anthropic"

cypher_query = natural_language_to_cypher(
    question=question,
    system_prompt=system_prompt,
    provider=provider,
    api_key=api_keys[provider],
)
print(f"cypher_query : {cypher_query}")

cypher_query : MATCH (p:Patient)-[e:TOOK_MEDICATION]->(m:Medication{displayName:"Paracetamol"}) RETURN p, e, m, p.id, p.displayName, p.Age, p.`Date of Admission`, p.`Discharge Date`
CPU times: user 183 ms, sys: 15.1 ms, total: 198 ms
Wall time: 2.09 s


In [51]:
df_path = client.query(cypher_query)
if df_path.empty:
    print("--> No result found\n")
else:
    display(df_path)

Unnamed: 0,p,e,m,p.id,p.displayName,p.Age,p.`Date of Admission`,p.`Discharge Date`
0,0,0,101,01907,Brenda COLLINS,70,2021-05-03,2021-06-02
1,1,1,101,02346,Michael DECKER,41,2019-06-26,2019-07-10
2,2,2,101,02027,Shawn RAMIREZ,40,2022-04-25,2022-04-29
3,3,3,101,04367,Nichole LEE,85,2021-06-27,2021-07-05
4,4,4,101,01962,Brianna BELL,68,2022-07-28,2022-08-07
...,...,...,...,...,...,...,...,...
96,96,96,101,02312,Cathy RODRIGUEZ,39,2023-08-13,2023-08-14
97,97,97,101,02040,Benjamin BERNARD,76,2022-06-29,2022-07-13
98,98,98,101,01798,Victoria CRAWFORD,34,2021-01-23,2021-02-19
99,99,99,101,02285,Mr. JAMES,80,2023-02-01,2023-02-24


In [52]:
print("Notebook finished !")

Notebook finished !
