<div class="alert alert-block alert-success">
    <h1>
        Example notebook - Healthcare
    </h1>
    <p>
        Link to dataset : <a href="https://www.kaggle.com/datasets/prasad22/healthcare-dataset">Kaggle link</a>
    </p>
</div>

# Import modules and functions

In [1]:
import os
import pandas as pd
import re

# Check that the CSV file is available

In [2]:
# Locate the dataset folder relative to the notebook's working directory
folder_name = "healthcare_dataset"
path_data = f"{os.getcwd()}/data/{folder_name}"
if not os.path.isdir(path_data):
    raise ValueError(f"{path_data} does not exist")

# Only look at CSV files so that stray entries created by the OS or by Jupyter
# (e.g. .DS_Store, .ipynb_checkpoints) do not make the check fail
list_csv_files = sorted(
    entry for entry in os.listdir(path_data) if entry.lower().endswith(".csv")
)
if "healthcare_dataset.csv" not in list_csv_files:
    raise ValueError(f"csv file is not available in {path_data}")

# Import and format data

In [3]:
def create_ID_column(df_):
    """
    Return a copy of ``df_`` with a zero-padded string "Patient ID" column.

    The index of ``df_`` is moved into a column (via ``reset_index``), the
    resulting "index" column is renamed to "Patient ID", and every ID is
    left-padded with "0" so that all IDs share the width of the longest one.

    Parameters
    ----------
    df_ : pd.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pd.DataFrame
        New frame whose first column is "Patient ID" (strings). An empty input
        yields an empty frame with the "Patient ID" column present.
    """
    # reset_index already returns a new frame, so the input is left untouched
    df = df_.reset_index().rename(columns={"index": "Patient ID"})
    patient_ids = df["Patient ID"].astype(str)

    # default=0 keeps an empty frame from raising "max() arg is an empty sequence"
    width = max((len(patient_id) for patient_id in patient_ids), default=0)
    df["Patient ID"] = [patient_id.rjust(width, "0") for patient_id in patient_ids]

    return df

In [4]:
def format_person_name(full_name):
    """
    Format a raw name as "Firstname LASTNAME".

    Only the first two space-separated tokens are used: the first is
    capitalized and the second is upper-cased. A name made of a single token
    is returned capitalized instead of raising an IndexError.
    """
    tokens = full_name.split(" ")
    if len(tokens) < 2:
        return tokens[0].capitalize()
    return f"{tokens[0].capitalize()} {tokens[1].upper()}"


df = pd.read_csv(f"{path_data}/healthcare_dataset.csv")
# Patient and doctor names share the same normalization
for name_col in ("Name", "Doctor"):
    df[name_col] = df[name_col].apply(format_person_name)
df = create_ID_column(df)
# Keep only 10 patients to reduce graph for now
# You can comment the following line to generate the whole graph
df = df.iloc[:10, :]
df

Unnamed: 0,Patient ID,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,0,Bobby JACKSON,30,Male,B-,Cancer,2024-01-31,Matthew SMITH,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,1,Leslie TERRY,62,Male,A+,Obesity,2019-08-20,Samantha DAVIES,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,2,Danny SMITH,76,Female,A-,Obesity,2022-09-22,Tiffany MITCHELL,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,3,Andrew WATTS,28,Female,O+,Diabetes,2020-11-18,Kevin WELLS,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,4,Adrienne BELL,43,Female,AB+,Cancer,2022-09-19,Kathleen HANNA,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal
5,5,Emily JOHNSON,36,Male,A+,Asthma,2023-12-20,Taylor NEWTON,Nunez-Humphrey,UnitedHealthcare,48145.110951,389,Urgent,2023-12-24,Ibuprofen,Normal
6,6,Edward EDWARDS,21,Female,AB-,Diabetes,2020-11-03,Kelly OLSON,Group Middleton,Medicare,19580.872345,389,Emergency,2020-11-15,Paracetamol,Inconclusive
7,7,Christina MARTINEZ,20,Female,A+,Cancer,2021-12-28,Suzanne THOMAS,"Powell Robinson and Valdez,",Cigna,45820.462722,277,Emergency,2022-01-07,Paracetamol,Inconclusive
8,8,Jasmine AGUILAR,82,Male,AB+,Asthma,2020-07-01,Daniel FERGUSON,Sons Rich and,Cigna,50119.222792,316,Elective,2020-07-14,Aspirin,Abnormal
9,9,Christopher BERG,58,Female,AB-,Cancer,2021-05-23,Heather DAY,Padilla-Walker,UnitedHealthcare,19784.631062,249,Elective,2021-06-22,Paracetamol,Inconclusive


# Create graph from dataframe

In [5]:
# Define create_graph_from_df function
from typing import Union, Dict, List, Optional
import networkx as nx


def create_graph_from_df(
    df: pd.DataFrame,
    *,
    directed: bool = True,
    source_node_col: Union[str, Dict[str, str]] = "source",
    target_node_col: Union[str, Dict[str, str], None] = None,
    attributes_source_node_cols: Union[str, List[str], None] = None,
    attributes_target_node_cols: Union[str, List[str], None] = None,
    optional_nodes_cols: Optional[
        Dict[str, Dict[str, Union[str, List[str], bool]]]
    ] = None,
    attributes_edges: Union[str, List[str], None] = None,
    edge_col: Optional[str] = None,
    edge_col_label: Optional[str] = None,
    node_attributes_df: Optional[pd.DataFrame] = None,
    node_attributes_key_col: str = "id",
) -> Union[nx.Graph, nx.DiGraph]:
    """
    Create a NetworkX graph from a pandas DataFrame.

    This function converts a DataFrame where each row represents either:
    - An interaction (if target_node_col is provided)
    - A single node (if target_node_col is None)

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame where each row represents a node or an interaction.
        It is copied; the caller's frame is not modified.

    directed : bool, default=True
        Whether to create a directed graph. If True, creates nx.DiGraph, otherwise nx.Graph.

    source_node_col : str or Dict[str, str], default='source'
        Column(s) specifying the source node (or main node when target is None).
        If str: Name of column containing source node IDs.
        If dict: {
            'id': 'column_name',  # Required: Column containing node IDs
            'displayName': 'column_name',  # Optional: Column containing node labels
            'type': value  # Optional: Can be either:
                           # - A column name containing node types
                           # - A constant string value to use as the type for all nodes
        }

    target_node_col : str, Dict[str, str], or None, default=None
        Column(s) specifying the target node. If None, each row creates only a source node.
        Same format as source_node_col when provided.

    attributes_source_node_cols : str, List[str], or None, default=None
        Column(s) containing attributes for source nodes.

    attributes_target_node_cols : str, List[str], or None, default=None
        Column(s) containing attributes for target nodes. Ignored if target_node_col is None.

    optional_nodes_cols : Dict[str, Dict[str, Union[str, List[str], bool]]] or None, default=None
        Specifications for additional node sets in the DataFrame.
        Format: {
            'node_set_name': {
                'id': 'column_name',  # Optional: Column containing node IDs (defaults to the node set name)
                'displayName': 'column_name',  # Optional: Column containing node labels
                'type': value,  # Optional: column name containing node types, or a constant
                                # string; defaults to the node set name used as a constant
                'attributes': ['col1', 'col2', ...],  # Optional: Columns for node attributes
                'link_to_source': True,  # Optional: Whether to link to source nodes
                'link_to_target': False,  # Optional: Whether to link to target nodes (ignored if target_node_col is None)
                'edge_type_to_source': 'label',  # Optional: stored as the 'type' attribute of the source -> node edge
                'edge_type_to_target': 'label',  # Optional: stored as the 'type' attribute of the node -> target edge
                'edge_attributes': ['col1', 'col2', ...]  # Optional: Edge attribute columns
            },
            ...
        }

    attributes_edges : str, List[str], or None, default=None
        Column(s) containing edge attributes between source and target nodes.
        Ignored if target_node_col is None.

    edge_col : str or None, default=None
        Column containing the type/label for edges between source and target nodes.
        Ignored if target_node_col is None.

    edge_col_label : str or None, default=None
        Name of the edge attribute under which the value of edge_col is stored.
        Defaults to the name of edge_col.

    node_attributes_df : pd.DataFrame or None, default=None
        Optional DataFrame containing additional attributes for nodes.
        Must contain a column specified by node_attributes_key_col to match nodes.

    node_attributes_key_col : str, default='id'
        Column name in node_attributes_df used to match nodes.

    Returns
    -------
    G : nx.DiGraph or nx.Graph
        NetworkX graph with nodes, edges, and attributes as specified.

    Raises
    ------
    ValueError
        If a column referenced by the arguments is not present in df.

    Notes
    -----
    - Node IDs must be unique and not NaN; rows whose source ID is missing are skipped,
      and a missing target/optional ID only skips that node and its edges
    - Values of object columns are converted to stripped strings; missing values stay missing
    - If label is not specified, node ID is used as label
    - Node attributes are added as node properties in the graph, and only when the
      node is first created
    - Edge attributes are added as edge properties in the graph
    - When target_node_col is None, each row creates a standalone node that can be linked to optional nodes

    Examples
    --------
    >>> # Simple example with standalone nodes (no target)
    >>> G = create_graph_from_df(df, source_node_col='person', target_node_col=None)

    >>> # Nodes with optional connections
    >>> G = create_graph_from_df(
    ...     df,
    ...     source_node_col={'id': 'person_id', 'displayName': 'person_name', 'type': 'Person'},
    ...     target_node_col=None,
    ...     optional_nodes_cols={
    ...         'skills': {
    ...             'id': 'skill_id',
    ...             'displayName': 'skill_name',
    ...             'type': 'Skill',
    ...             'link_to_source': True
    ...         }
    ...     }
    ... )

    >>> # Traditional source-target relationship
    >>> G = create_graph_from_df(df, source_node_col='person', target_node_col='movie')
    """
    label_str = "displayName"

    def strip_object_columns(frame):
        """Copy ``frame`` with object columns stringified and stripped, keeping missing values."""
        cleaned = frame.copy()
        for col in cleaned.select_dtypes(include=["object"]).columns:
            original = cleaned[col]
            stripped = original.astype(str).str.strip()
            # astype(str) turns NaN/None into "nan"/"None"; restore the original
            # missing values so the pd.isna() guards below can still see them
            cleaned[col] = stripped.where(original.notna(), original)
        return cleaned

    def normalize_attr(attr):
        """Normalize None / a single column name / an iterable of names to a list."""
        if attr is None:
            return []
        if isinstance(attr, str):
            return [attr]
        return list(attr)

    def columns_if_set(value):
        """Columns to validate for an optional argument (nothing when it is falsy)."""
        return normalize_attr(value) if value else []

    def process_node_info(col_spec):
        """Expand a source/target specification into id / label / type information."""
        if isinstance(col_spec, str):
            return {
                "id": col_spec,
                label_str: None,
                "type": None,
                "is_type_column": False,
            }
        type_value = col_spec.get("type")
        return {
            "id": col_spec.get("id"),
            label_str: col_spec.get(label_str),
            "type": type_value,
            # A type naming an existing column is a column reference,
            # anything else is used as a constant value
            "is_type_column": isinstance(type_value, str) and type_value in df.columns,
        }

    # Work on cleaned copies so the caller's frames are never modified
    df = strip_object_columns(df)
    if node_attributes_df is not None:
        node_attributes_df = strip_object_columns(node_attributes_df)

    # Create a directed or undirected graph
    G = nx.DiGraph() if directed else nx.Graph()

    # Process node column specifications
    source_info = process_node_info(source_node_col)
    target_info = (
        process_node_info(target_node_col) if target_node_col is not None else None
    )

    # Collect every column the arguments refer to. Type entries are not listed:
    # they are either constants or, by construction, existing columns.
    required_cols = [source_info["id"]]
    if source_info[label_str]:
        required_cols.append(source_info[label_str])
    if target_info:
        required_cols.append(target_info["id"])
        if target_info[label_str]:
            required_cols.append(target_info[label_str])
    required_cols.extend(columns_if_set(attributes_source_node_cols))
    if target_info:
        # Target and edge settings only matter when a target is specified
        required_cols.extend(columns_if_set(attributes_target_node_cols))
        required_cols.extend(columns_if_set(attributes_edges))
        if edge_col:
            required_cols.append(edge_col)

    # Resolve the optional node sets once, instead of once per row
    optional_specs = []
    for node_set, config in (optional_nodes_cols or {}).items():
        if "type" in config:
            # Explicit type: column reference if it names a column, else constant
            opt_type_val = config["type"]
            opt_is_type_column = (
                isinstance(opt_type_val, str) and opt_type_val in df.columns
            )
        else:
            # Default to the node set key, always as a constant
            opt_type_val = node_set
            opt_is_type_column = False

        spec = {
            "config": config,
            "id": config.get("id", node_set),
            label_str: config.get(label_str),
            "type": opt_type_val,
            "is_type_column": opt_is_type_column,
            "attributes": normalize_attr(config.get("attributes", [])),
            "edge_attributes": normalize_attr(config.get("edge_attributes")),
        }
        optional_specs.append(spec)

        required_cols.append(spec["id"])
        if spec[label_str]:
            required_cols.append(spec[label_str])
        required_cols.extend(columns_if_set(config.get("attributes")))
        required_cols.extend(columns_if_set(config.get("edge_attributes")))

    # Check if all required columns exist in DataFrame
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns in DataFrame: {missing_cols}")

    # Normalize attribute lists
    source_attrs = normalize_attr(attributes_source_node_cols)
    target_attrs = normalize_attr(attributes_target_node_cols) if target_info else []
    edge_attrs = normalize_attr(attributes_edges) if target_info else []

    # Attribute name used to store the value of edge_col on source -> target edges
    edge_label_key = edge_col_label if edge_col_label is not None else edge_col

    # Create lookup dict for node attributes if provided
    node_attrs_lookup = {}
    if node_attributes_df is not None:
        # Drop duplicates based on key column to ensure unique index
        unique_node_attrs = node_attributes_df.drop_duplicates(
            subset=[node_attributes_key_col]
        )
        node_attrs_lookup = unique_node_attrs.set_index(
            node_attributes_key_col
        ).to_dict("index")

    def add_node_with_attrs(
        row, node_id_col, node_label_col, node_type_val, is_type_column, attr_cols
    ):
        """Add the node described by ``row`` if it is new; return its ID, or None when the ID is missing."""
        node_id = row[node_id_col]
        if pd.isna(node_id):
            return None

        # Existing nodes are left untouched: attributes come from the first row seen
        if node_id not in G:
            node_attrs = {}

            # Label: dedicated column when given (falling back to the ID), else the ID
            if node_label_col:
                label = (
                    row[node_label_col] if not pd.isna(row[node_label_col]) else node_id
                )
                node_attrs[label_str] = label
            else:
                node_attrs[label_str] = str(node_id)

            # Type: read from a column or used as a constant
            if node_type_val is not None:
                if is_type_column:
                    node_type = (
                        row[node_type_val] if not pd.isna(row[node_type_val]) else None
                    )
                    if node_type:
                        node_attrs["type"] = node_type
                else:
                    node_attrs["type"] = node_type_val

            # Attributes taken from the current row
            for attr_col in attr_cols:
                if attr_col in row and not pd.isna(row[attr_col]):
                    node_attrs[attr_col] = row[attr_col]

            # Attributes taken from node_attributes_df, if available
            if node_id in node_attrs_lookup:
                for attr_name, attr_value in node_attrs_lookup[node_id].items():
                    if not pd.isna(attr_value):
                        node_attrs[attr_name] = attr_value

            G.add_node(node_id, **node_attrs)

        return node_id

    def link_edge_attrs(row, config, edge_type_key, edge_attr_cols):
        """Build the attributes of an edge between an optional node and the source/target."""
        link_attrs = {}
        if edge_type_key in config:
            link_attrs["type"] = config[edge_type_key]
        for attr_col in edge_attr_cols:
            if attr_col in row and not pd.isna(row[attr_col]):
                link_attrs[attr_col] = row[attr_col]
        return link_attrs

    # Process each row in the DataFrame
    for _, row in df.iterrows():
        # Add source node (always required)
        source_id = add_node_with_attrs(
            row,
            source_info["id"],
            source_info[label_str],
            source_info["type"],
            source_info["is_type_column"],
            source_attrs,
        )

        # A row without a source ID contributes nothing
        if source_id is None:
            continue

        # Add target node only if target_node_col is specified
        target_id = None
        if target_info:
            target_id = add_node_with_attrs(
                row,
                target_info["id"],
                target_info[label_str],
                target_info["type"],
                target_info["is_type_column"],
                target_attrs,
            )

            # Add edge between source and target (only if both nodes exist)
            if target_id is not None:
                edge_attributes = {}

                # Add edge type if specified
                if edge_col and edge_col in row and not pd.isna(row[edge_col]):
                    edge_attributes[edge_label_key] = row[edge_col]

                # Add edge attributes
                for attr_col in edge_attrs:
                    if attr_col in row and not pd.isna(row[attr_col]):
                        edge_attributes[attr_col] = row[attr_col]

                G.add_edge(source_id, target_id, **edge_attributes)

        # Process optional node sets
        for spec in optional_specs:
            opt_node_id = add_node_with_attrs(
                row,
                spec["id"],
                spec[label_str],
                spec["type"],
                spec["is_type_column"],
                spec["attributes"],
            )
            if opt_node_id is None:
                continue

            config = spec["config"]

            # Connect to source if specified
            if config.get("link_to_source", False):
                G.add_edge(
                    source_id,
                    opt_node_id,
                    **link_edge_attrs(
                        row, config, "edge_type_to_source", spec["edge_attributes"]
                    ),
                )

            # Connect to target if specified (only if target exists)
            if config.get("link_to_target", False) and target_id is not None:
                G.add_edge(
                    opt_node_id,
                    target_id,
                    **link_edge_attrs(
                        row, config, "edge_type_to_target", spec["edge_attributes"]
                    ),
                )

    return G

In [6]:
# Preview the formatted patient table that is passed to create_graph_from_df below
df

Unnamed: 0,Patient ID,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,0,Bobby JACKSON,30,Male,B-,Cancer,2024-01-31,Matthew SMITH,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,1,Leslie TERRY,62,Male,A+,Obesity,2019-08-20,Samantha DAVIES,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,2,Danny SMITH,76,Female,A-,Obesity,2022-09-22,Tiffany MITCHELL,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,3,Andrew WATTS,28,Female,O+,Diabetes,2020-11-18,Kevin WELLS,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,4,Adrienne BELL,43,Female,AB+,Cancer,2022-09-19,Kathleen HANNA,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal
5,5,Emily JOHNSON,36,Male,A+,Asthma,2023-12-20,Taylor NEWTON,Nunez-Humphrey,UnitedHealthcare,48145.110951,389,Urgent,2023-12-24,Ibuprofen,Normal
6,6,Edward EDWARDS,21,Female,AB-,Diabetes,2020-11-03,Kelly OLSON,Group Middleton,Medicare,19580.872345,389,Emergency,2020-11-15,Paracetamol,Inconclusive
7,7,Christina MARTINEZ,20,Female,A+,Cancer,2021-12-28,Suzanne THOMAS,"Powell Robinson and Valdez,",Cigna,45820.462722,277,Emergency,2022-01-07,Paracetamol,Inconclusive
8,8,Jasmine AGUILAR,82,Male,AB+,Asthma,2020-07-01,Daniel FERGUSON,Sons Rich and,Cigna,50119.222792,316,Elective,2020-07-14,Aspirin,Abnormal
9,9,Christopher BERG,58,Female,AB-,Cancer,2021-05-23,Heather DAY,Padilla-Walker,UnitedHealthcare,19784.631062,249,Elective,2021-06-22,Paracetamol,Inconclusive


In [37]:
label_str = "displayName"

# Each row of df describes one patient: that is the source node of the graph
patient_node_spec = {"id": "Patient ID", label_str: "Name", "type": "Patient"}
patient_attribute_cols = ["Age", "Date of Admission", "Discharge Date"]

# Every other column becomes its own node set, linked from the patient node.
# The dictionary key is the column holding the node IDs and is also the node type.
patient_linked_node_sets = {
    "Gender": {"link_to_source": True, "edge_type_to_source": "is"},
    "Blood Type": {"link_to_source": True, "edge_type_to_source": "is"},
    "Medical Condition": {"link_to_source": True, "edge_type_to_source": "has"},
    "Doctor": {"link_to_source": True, "edge_type_to_source": "is_treated_by"},
    "Hospital": {
        "attributes": ["Room Number"],
        "link_to_source": True,
        "edge_type_to_source": "is_treated_in",
    },
    "Insurance Provider": {
        "attributes": ["Billing Amount"],
        "link_to_source": True,
        "edge_type_to_source": "is_client_of",
    },
    # No edge type given here: these edges carry no "type" attribute
    "Admission Type": {"link_to_source": True},
    "Medication": {"link_to_source": True, "edge_type_to_source": "took_medication"},
    "Test Results": {"link_to_source": True, "edge_type_to_source": "has_result"},
}

G = create_graph_from_df(
    df,
    directed=True,
    source_node_col=patient_node_spec,
    attributes_source_node_cols=patient_attribute_cols,
    optional_nodes_cols=patient_linked_node_sets,
)
print(f"Resulting graph : {G}")

Resulting graph : DiGraph with 57 nodes and 90 edges


In [38]:
# Print the first 20 nodes together with their properties
for node_position, node in enumerate(G.nodes(data=True)):
    if node_position >= 20:
        break
    print(node)

('00000', {'displayName': 'Bobby JACKSON', 'type': 'Patient', 'Age': 30, 'Date of Admission': '2024-01-31', 'Discharge Date': '2024-02-02'})
('Male', {'displayName': 'Male', 'type': 'Gender'})
('B-', {'displayName': 'B-', 'type': 'Blood Type'})
('Cancer', {'displayName': 'Cancer', 'type': 'Medical Condition'})
('Matthew SMITH', {'displayName': 'Matthew SMITH', 'type': 'Doctor'})
('Sons and Miller', {'displayName': 'Sons and Miller', 'type': 'Hospital', 'Room Number': 328})
('Blue Cross', {'displayName': 'Blue Cross', 'type': 'Insurance Provider', 'Billing Amount': 18856.281305978155})
('Urgent', {'displayName': 'Urgent', 'type': 'Admission Type'})
('Paracetamol', {'displayName': 'Paracetamol', 'type': 'Medication'})
('Normal', {'displayName': 'Normal', 'type': 'Test Results'})
('00001', {'displayName': 'Leslie TERRY', 'type': 'Patient', 'Age': 62, 'Date of Admission': '2019-08-20', 'Discharge Date': '2019-08-26'})
('A+', {'displayName': 'A+', 'type': 'Blood Type'})
('Obesity', {'displa

In [39]:
# Print the first 20 edges together with their properties
for edge_position, edge in enumerate(G.edges(data=True)):
    if edge_position >= 20:
        break
    print(edge)

('00000', 'Male', {'type': 'is'})
('00000', 'B-', {'type': 'is'})
('00000', 'Cancer', {'type': 'has'})
('00000', 'Matthew SMITH', {'type': 'is_treated_by'})
('00000', 'Sons and Miller', {'type': 'is_treated_in'})
('00000', 'Blue Cross', {'type': 'is_client_of'})
('00000', 'Urgent', {})
('00000', 'Paracetamol', {'type': 'took_medication'})
('00000', 'Normal', {'type': 'has_result'})
('00001', 'Male', {'type': 'is'})
('00001', 'A+', {'type': 'is'})
('00001', 'Obesity', {'type': 'has'})
('00001', 'Samantha DAVIES', {'type': 'is_treated_by'})
('00001', 'Kim Inc', {'type': 'is_treated_in'})
('00001', 'Medicare', {'type': 'is_client_of'})
('00001', 'Emergency', {})
('00001', 'Ibuprofen', {'type': 'took_medication'})
('00001', 'Inconclusive', {'type': 'has_result'})
('00002', 'Female', {'type': 'is'})
('00002', 'A-', {'type': 'is'})


# Create graph using `turingdb` python package

<div class="alert alert-block alert-info">
    <h2>
        See <a href="https://docs.turingdb.ai/quickstart">TuringDB Get started documentation</a> for the important steps to follow :
    </h2>
    <h3>
        <ul>
            <li>Create your TuringDB account</li>
            <li>Create your instance in the <a href="https://console.turingdb.ai/auth">TuringDB Cloud UI</a></li>
            <li>Copy your Instance ID from the Database Instances management page</li>
            <li>Get API Key from the Settings in UI</li>
        </ul>
        Remember to keep your instance active while working in this notebook!
    </h3>
</div>

In [40]:
from turingdb import TuringDB

# Create TuringDB client
# The trailing comma after `host` keeps the call valid when the optional
# arguments below are uncommented and filled in with your own values.
client = TuringDB(
    host="http://localhost:6666",
    # instance_id="...",  # Replace by your instance id
    # auth_token="...",  # Replace by your API token
)

In [41]:
# Get list of available graphs
# The "graph" field of the query result is converted to a plain Python list
# (presumably the result is a pandas DataFrame - confirm against the turingdb client);
# the names are scanned in the next cell to pick an unused graph name.
list_graphs = client.query("LIST GRAPH")["graph"].tolist()

In [42]:
# Set graph name: "<prefix><n>", where n is one more than the highest numeric
# suffix already used by a graph sharing the prefix (1 when there is none)
graph_name_prefix = "healthcare_dataset"

# Slice the prefix off instead of using re.sub: re.sub would interpret the prefix
# as a regular expression and remove every occurrence, not only the leading one
existing_suffixes = [
    g[len(graph_name_prefix):] for g in list_graphs if g.startswith(graph_name_prefix)
]
# isdecimal() only accepts characters that int() can parse
used_numbers = [int(suffix) for suffix in existing_suffixes if suffix.isdecimal()]

graph_name_nb_suffix = str(max(used_numbers, default=0) + 1)
graph_name = graph_name_prefix + graph_name_nb_suffix
graph_name

'healthcare_dataset7'

In [43]:
# Create a new graph
# graph_name comes from the previous cell; the client is then pointed at the
# newly created graph so that the following queries apply to it.
client.query(f"CREATE GRAPH {graph_name}")
client.set_graph(graph_name)

# Create a new change on the graph
# The first value of the "Change ID" field of the result identifies the change.
change = client.query("CHANGE NEW")["Change ID"][0]

# Checkout into the change
# NOTE(review): presumably later writes are recorded in this change until it is
# submitted - confirm against the TuringDB documentation.
client.checkout(change=change)

In [44]:
# Define build_create_command_from_networkx function
def build_create_command_from_networkx(G, edge_type_key="type"):
    """Build a single CREATE command from a NetworkX graph.

    Every node becomes ``(n<i>:<Label> {"id":"<node id>", <attributes>})``,
    where ``<Label>`` is the node's ``"type"`` attribute converted to
    PascalCase (``Node`` when the attribute is missing).  Every edge becomes
    ``(n<i>)-[:<TYPE> {<attributes>}]-(n<j>)``, where ``<TYPE>`` is the
    upper-cased value stored under ``edge_type_key`` (``CONNECTED`` when
    missing); that key is left out of the edge properties.  All property
    values, including the node id, are written as sanitised strings.

    Parameters
    ----------
    G : graph object
        Must provide ``nodes(data=True)`` yielding ``(node_id, attrs)`` pairs
        and ``edges(data=True)`` yielding ``(source, target, attrs)`` triples.
    edge_type_key : str, default "type"
        Name of the edge attribute holding the relationship type.

    Returns
    -------
    str
        The CREATE command (patterns separated by ``",\\n"``), or an empty
        string when the graph has neither nodes nor edges.
    """
    # Allow-list of characters kept in property values; anything else
    # (double quotes, apostrophes, ...) is replaced by a space.
    disallowed_chars = re.compile(
        r"[^\w\s\-\.\,\:\;\(\)\[\]\{\}\/\@\#\$\%\&\*\+\=\<\>\?\!\~\`\|\\]"
    )

    def escape_value(value):
        """Return ``value`` as text with disallowed characters blanked out.

        Double quotes are replaced by a space rather than backslash-escaped:
        the allow-list does not contain ``"``, so an escaped quote would lose
        its quote and leave a dangling backslash that can end up right before
        the closing delimiter of the literal.
        """
        # NOTE(review): backslashes already present in the data are kept
        # verbatim - confirm how the query parser treats them.
        text = disallowed_chars.sub(" ", str(value))
        # Newlines/tabs are whitespace, so they are collapsed here as well
        return re.sub(r"\s+", " ", text).strip()

    def format_props(pairs):
        """Render (key, value) pairs as ``"key":"value", ...``."""
        return ", ".join(f'"{key}":"{escape_value(val)}"' for key, val in pairs)

    # Collect all unique nodes (insertion order defines the n<i> variables)
    all_nodes = {node_id: attrs for node_id, attrs in G.nodes(data=True)}
    node_to_var = {node_id: f"n{i}" for i, node_id in enumerate(all_nodes)}

    # Create node variable assignments
    node_parts = []
    for node_id, attrs in all_nodes.items():
        # Convert node type to PascalCase to avoid issues with spaces in queries
        raw_type = str(attrs.get("type", "Node"))
        node_type = "".join(ch for ch in raw_type.title() if not ch.isspace())
        # "id" is part of the joined list, so a node without attributes does
        # not produce a trailing comma inside the braces
        props = format_props([("id", node_id), *attrs.items()])
        node_parts.append(f"({node_to_var[node_id]}:{node_type} {{{props}}})")

    # Create edge patterns using node variables
    edge_parts = []
    for source, target, edge_attrs in G.edges(data=True):
        # Relationship type comes from the specified key, upper-cased for
        # Cypher convention
        relationship_type = str(edge_attrs.get(edge_type_key, "CONNECTED")).upper()

        # The type key is not repeated among the edge properties
        edge_props = format_props(
            (k, v) for k, v in edge_attrs.items() if k != edge_type_key
        )
        edge_props_str = f" {{{edge_props}}}" if edge_props else ""

        edge_parts.append(
            f"({node_to_var[source]})"
            f"-[:{relationship_type}{edge_props_str}]-"
            f"({node_to_var[target]})"
        )

    # Build final command
    all_parts = node_parts + edge_parts
    if not all_parts:
        return ""
    return "CREATE " + ",\n".join(all_parts)

In [45]:
# Build CREATE command from networkx object
# (G is the graph object built in an earlier cell of this notebook)
create_command = build_create_command_from_networkx(G)
# Print the whole command between two lines of 100 asterisks
print(f"Cypher CREATE command :\n\n{100 * '*'}\n{create_command}\n{100 * '*'}")

Cypher CREATE command :

****************************************************************************************************
CREATE (n0:Patient {"id":"00000", "displayName":"Bobby JACKSON", "type":"Patient", "Age":"30", "Date of Admission":"2024-01-31", "Discharge Date":"2024-02-02"}),
(n1:Gender {"id":"Male", "displayName":"Male", "type":"Gender"}),
(n2:BloodType {"id":"B-", "displayName":"B-", "type":"Blood Type"}),
(n3:MedicalCondition {"id":"Cancer", "displayName":"Cancer", "type":"Medical Condition"}),
(n4:Doctor {"id":"Matthew SMITH", "displayName":"Matthew SMITH", "type":"Doctor"}),
(n5:Hospital {"id":"Sons and Miller", "displayName":"Sons and Miller", "type":"Hospital", "Room Number":"328"}),
(n6:InsuranceProvider {"id":"Blue Cross", "displayName":"Blue Cross", "type":"Insurance Provider", "Billing Amount":"18856.281305978155"}),
(n7:AdmissionType {"id":"Urgent", "displayName":"Urgent", "type":"Admission Type"}),
(n8:Medication {"id":"Paracetamol", "displayName":"Paracetamol",

In [46]:
# Run CREATE command: send the whole command built above as a single query
client.query(create_command)

In [47]:
# Commit the change, then submit it
# NOTE(review): presumably "CHANGE SUBMIT" is what makes the committed data
# part of main - confirm against the TuringDB documentation
client.query("COMMIT")
client.query("CHANGE SUBMIT")

# Checkout into main (checkout() is called without a change argument)
client.checkout()

<div class="alert alert-block alert-info">
    <h2>
        Visualize your graph in the TuringDB Graph Visualizer! Now that your instance is running:
    </h2>
    <h3>
        <ul>
            <li>Go to <a href="https://console.turingdb.ai/databases">TuringDB Console - Database Instances</a></li>
            <li>In your current instance panel, click on "Open Visualizer" button</li>
            <li>Visualizer opens, now you can choose your graph in the dropdown menu at the top-right corner</li>
        </ul>
        You can then explore your graph and visualize the nodes you want!
    </h3>
</div>

# Query TuringDB

## Simple queries

In [48]:
# Match all edges: any edge `e` between two nodes `n` and `m` (no label filter)
# Return the displayName property of both nodes, together with `e`
command = "MATCH (n)-[e]-(m) RETURN n.displayName, e, m.displayName"
client.query(command)

Unnamed: 0,n.displayName,e,m.displayName
0,Danny SMITH,0,Normal
1,Danny SMITH,1,Aspirin
2,Danny SMITH,2,Emergency
3,Danny SMITH,3,Aetna
4,Danny SMITH,4,Cook PLC
...,...,...,...
85,Bobby JACKSON,85,Sons and Miller
86,Bobby JACKSON,86,Matthew SMITH
87,Bobby JACKSON,87,Cancer
88,Bobby JACKSON,88,B-


In [49]:
# Match all edges linking a Patient node `n` to another node `m`
# Return the type and displayName properties of both nodes, together with `e`
command = (
    "MATCH (n:Patient)-[e]-(m) RETURN n.type, n.displayName, e, m.type, m.displayName"
)
client.query(command)

Unnamed: 0,n.type,n.displayName,e,m.type,m.displayName
0,Patient,Danny SMITH,0,Test Results,Normal
1,Patient,Danny SMITH,1,Medication,Aspirin
2,Patient,Danny SMITH,2,Admission Type,Emergency
3,Patient,Danny SMITH,3,Insurance Provider,Aetna
4,Patient,Danny SMITH,4,Hospital,Cook PLC
...,...,...,...,...,...
85,Patient,Bobby JACKSON,85,Hospital,Sons and Miller
86,Patient,Bobby JACKSON,86,Doctor,Matthew SMITH
87,Patient,Bobby JACKSON,87,Medical Condition,Cancer
88,Patient,Bobby JACKSON,88,Blood Type,B-


In [50]:
# Find all patients
# Return their id, displayName and Age properties
command = """
MATCH (p:Patient)
RETURN p.id, p.displayName, p.Age
"""
client.query(command)

Unnamed: 0,p.id,p.displayName,p.Age
0,2,Danny SMITH,76
1,3,Andrew WATTS,28
2,4,Adrienne BELL,43
3,1,Leslie TERRY,62
4,5,Emily JOHNSON,36
5,6,Edward EDWARDS,21
6,7,Christina MARTINEZ,20
7,8,Jasmine AGUILAR,82
8,9,Christopher BERG,58
9,0,Bobby JACKSON,30


In [51]:
# Find all doctors and return their displayName
command = """
MATCH (d:Doctor)
RETURN d.displayName
"""
client.query(command)

Unnamed: 0,d.displayName
0,Kathleen HANNA
1,Samantha DAVIES
2,Heather DAY
3,Taylor NEWTON
4,Kevin WELLS
5,Kelly OLSON
6,Suzanne THOMAS
7,Matthew SMITH
8,Tiffany MITCHELL
9,Daniel FERGUSON


In [52]:
# Find all medications and return their displayName
# (the query variable is named `d`, so the result column is `d.displayName`)
command = """
MATCH (d:Medication)
RETURN d.displayName
"""
client.query(command)

Unnamed: 0,d.displayName
0,Paracetamol
1,Aspirin
2,Ibuprofen
3,Penicillin


In [53]:
# Find the patient whose id is "00000"
# `RETURN *` is used instead of listing what to return; the only variable
# bound by this MATCH is `p`
command = """
MATCH (p:Patient {id: "00000"})
RETURN *
"""
client.query(command)

Unnamed: 0,Unnamed: 1
0,9


In [54]:
# Find the patient whose id is "00000" and return the node `p` itself along
# with each of its properties.
# Property names containing spaces are written between double quotes.
command = """
MATCH (p:Patient {id: "00000"})
RETURN p, p.id, p.displayName, p.type, p.Age, p."Date of Admission", p."Discharge Date"
"""
client.query(command)

Unnamed: 0,p,p.id,p.displayName,p.type,p.Age,p.Date of Admission,p.Discharge Date
0,9,0,Bobby JACKSON,Patient,30,2024-01-31,2024-02-02


In [56]:
# Find female patients: Patient nodes linked by an IS edge to the Gender node
# whose displayName is "Female"
# Return their displayName and Age
command = """
MATCH (p:Patient)-[:IS]-(g:Gender {displayName: "Female"})
RETURN p.displayName, p.Age
"""
client.query(command)

Unnamed: 0,p.displayName,p.Age
0,Danny SMITH,76
1,Andrew WATTS,28
2,Adrienne BELL,43
3,Edward EDWARDS,21
4,Christina MARTINEZ,20
5,Christopher BERG,58


In [59]:
# Find patients with Cancer: Patient nodes linked by a HAS edge to the
# MedicalCondition node whose displayName is "Cancer"
# Return their displayName and Age
command = """
MATCH (p:Patient)-[:HAS]-(mc:MedicalCondition {displayName: "Cancer"})
RETURN p.displayName, p.Age
"""
client.query(command)

Unnamed: 0,p.displayName,p.Age
0,Adrienne BELL,43
1,Christina MARTINEZ,20
2,Christopher BERG,58
3,Bobby JACKSON,30


In [77]:
# Find all patients who are treated by a doctor
# Return one (patient, doctor) row per IS_TREATED_BY edge matched
command = """
MATCH (p:Patient)-[:IS_TREATED_BY]-(d:Doctor)
RETURN p.displayName, d.displayName
"""
client.query(command)

Unnamed: 0,p.displayName,d.displayName
0,Danny SMITH,Tiffany MITCHELL
1,Andrew WATTS,Kevin WELLS
2,Adrienne BELL,Kathleen HANNA
3,Leslie TERRY,Samantha DAVIES
4,Emily JOHNSON,Taylor NEWTON
5,Edward EDWARDS,Kelly OLSON
6,Christina MARTINEZ,Suzanne THOMAS
7,Jasmine AGUILAR,Daniel FERGUSON
8,Christopher BERG,Heather DAY
9,Bobby JACKSON,Matthew SMITH


In [78]:
# Find all patients treated by doctor Kelly OLSON
# The result columns are then renamed to "Patients" and "Doctors" for display
command = """
MATCH (p:Patient)-[:IS_TREATED_BY]-(d:Doctor {"displayName": "Kelly OLSON"})
RETURN p.displayName, d.displayName
"""
client.query(command).rename(
    columns={"p.displayName": "Patients", "d.displayName": "Doctors"}
)

Unnamed: 0,Patients,Doctors
0,Edward EDWARDS,Kelly OLSON


In [79]:
# Find all patients with blood type A+: Patient nodes linked by an IS edge to
# the BloodType node whose displayName is "A+"
# Return their displayName and Age
command = """
MATCH (p:Patient)-[:IS]-(bt:BloodType {displayName: "A+"})
RETURN p.displayName, p.Age
"""
client.query(command)

Unnamed: 0,p.displayName,p.Age
0,Leslie TERRY,62
1,Emily JOHNSON,36
2,Christina MARTINEZ,20


In [80]:
# Find all patients who took Paracetamol: Patient nodes linked by a
# TOOK_MEDICATION edge to the Medication node whose displayName is "Paracetamol"
# Return the patient and medication displayName
command = """
MATCH (p:Patient)-[:TOOK_MEDICATION]-(m:Medication {"displayName": "Paracetamol"})
RETURN p.displayName, m.displayName
"""
client.query(command)

Unnamed: 0,p.displayName,m.displayName
0,Edward EDWARDS,Paracetamol
1,Christina MARTINEZ,Paracetamol
2,Christopher BERG,Paracetamol
3,Bobby JACKSON,Paracetamol


## Complex queries

In [81]:
## Find patients with Cancer treated by Doctor Kelly OLSON
#command = """
#MATCH (p:Patient)-[:HAS]-(mc:MedicalCondition {displayName: "Cancer"}),
#      (p)-[:IS_TREATED_BY]-(d:Doctor {displayName: "Kelly OLSON"})
#RETURN p.displayName, d.displayName
#"""
#client.query(command)

In [82]:
## Find all male patients treated in Sons and Miller hospital and client of Medicare insurance
#command = """
#MATCH (p:Patient)-[:IS]-(g:Gender {displayName: "Male"}),
#      (p:Patient)-[:IS_TREATED_IN]-(h:Hospital {displayName: "Sons and Miller"}),
#      (p:Patient)-[:IS_CLIENT_OF]-(ip:InsuranceProvider {displayName: "Medicare"})
#RETURN p.displayName, g.displayName, h.displayName, ip.displayName
#"""
#client.query(command).rename(
#    columns={"p.displayName": "Patients", "g.displayName": "Gender", "h.displayName": "Hospital", "ip.displayName": "Insurance Provider"}
#)