# Import modules and functions

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import warnings

warnings.filterwarnings("ignore")

In [3]:
import os
import networkx as nx

In [4]:
from turingdb.exceptions import TuringDBException

In [5]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

True

# Set path to data

In [6]:
example_name = "software_dependencies"
path_data = f"{os.getcwd()}/data/{example_name}"
if not os.path.exists(path_data):
    raise ValueError(f"{path_data} does not exists")

# Create graph using `turingdb` python package

<div class="alert alert-block alert-info">
    <h2>
        See <a href="https://docs.turingdb.ai/quickstart">TuringDB Get started documentation</a> for the important steps to follow :
    </h2>
    <h3>
        <ul>
            <li>Create your TuringDB account</li>
            <li>Create your instance in the <a href="https://console.turingdb.ai/auth">TuringDB Cloud UI</a></li>
            <li>Copy your Instance ID from the Database Instances management page</li>
            <li>Get API Key from the Settings in UI</li>
        </ul>
        Remember to have your instance active while working in this notebook !
    </h3>
</div>

## Connect to instance and transfer data

In [7]:
from turingdb import TuringDB

# Create TuringDB client
client = TuringDB(
    host="http://localhost:6666"  # Remove this parameter and set the two parameters below
    # instance_id=os.getenv("INSTANCE_ID"),
    # auth_token=os.getenv("AUTH_TOKEN"),
)

In [8]:
%%time

client.s3_connect(
    bucket_name="turing-internal",
    region="eu-west-2",
    access_key=os.getenv("AWS_ACCESS_KEY"),
    secret_key=os.getenv("AWS_SECRET_KEY"),
)

CPU times: user 142 ms, sys: 56.9 ms, total: 198 ms
Wall time: 6.29 s


In [9]:
%%time

gml_filename = f"{example_name}.gml"
client.transfer(
    src=f"data/{example_name}/{gml_filename}",
    dst=f"turingdb://{gml_filename}",  # to s3 bucket or TuringDB instance or local .turing
)

CPU times: user 85.1 ms, sys: 21.8 ms, total: 107 ms
Wall time: 424 ms


In [10]:
! tree /home/dev/.turing/data

[01;34m/home/dev/.turing/data[0m
â”œâ”€â”€ ai_gov_control_mappings_full.csv
â”œâ”€â”€ reactome.dump
â”œâ”€â”€ sec_8k_raw_text_filings_2024.csv
â”œâ”€â”€ sec_8k_raw_text_filings_2024.gml
â””â”€â”€ software_dependencies.gml

0 directories, 5 files


## Check data files are available

In [11]:
list_files = sorted(os.listdir(path_data))
if gml_filename not in list_files:
    raise ValueError(f"file is not available in {path_data}")

## Import and format data

In [12]:
path_turing_folder = f"{os.getenv('HOME')}/.turing"

In [13]:
G = nx.read_gml(f"{path_turing_folder}/data/{example_name}.gml")
print(G)

DiGraph with 21 nodes and 40 edges


In [14]:
client.query(f"""
IMPORT GRAPH {example_name}
FROM "{example_name}.gml"
""")

In [17]:
try:
    client.load_graph(example_name)
except TuringDBException as e:
    print(f"{e}: Graph already loaded")

GRAPH_LOAD_ERROR: Graph already loaded


In [16]:
client.set_graph(example_name)

# Query TuringDB

## Use metaqueries to have insight on graph overall structure

<h3>
    To learn more about ðŸ“® Metaqueries, please check TuringDB documentation on this <a href="https://turingdb.mintlify.app/query/cypher_subset#%F0%9F%93%AE-metaqueries">link</a>
</h3>

In [18]:
%%time

# CALL PROPERTIES() - returns a column of all the different node and edge properties and their types in the database
command = """
CALL PROPERTIES()
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = ["Property_ID", "Property_name", "Property_type"]
    display(df)

Unnamed: 0,Property_ID,Property_name,Property_type
0,0,label (String),String
1,1,type (String),String
2,2,industry (String),String
3,3,founded (String),String
4,4,category (String),String
5,5,provider (String),String
6,6,relationship (String),String
7,7,criticality (String),String


CPU times: user 9.1 ms, sys: 2.91 ms, total: 12 ms
Wall time: 10.9 ms


In [19]:
%%time

# CALL LABELS () - returns a column of all the different node labels
command = """
CALL LABELS()
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = ["Node_type_ID", "Node_label"]
    display(df)

Unnamed: 0,Node_type_ID,Node_label
0,0,GMLNode


CPU times: user 5.82 ms, sys: 973 Î¼s, total: 6.8 ms
Wall time: 6.16 ms


In [20]:
%%time

# CALL EDGETYPES() - returns a column of all the different edge types (edge equivalent of node labels)
command = """
CALL EDGETYPES()
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = ["Edge_type_ID", "Edge_label"]
    display(df)

Unnamed: 0,Edge_type_ID,Edge_label
0,0,GMLEdge


CPU times: user 5.67 ms, sys: 950 Î¼s, total: 6.62 ms
Wall time: 5.94 ms


In [21]:
%%time

# CALL LABELSETS() - returns a two columns describing combinations of node labels
command = """
CALL LABELSETS()
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    df.columns = ["Node_type_ID", "Node_label"]
    display(df)

Unnamed: 0,Node_type_ID,Node_label
0,0,GMLNode


CPU times: user 6.44 ms, sys: 73 Î¼s, total: 6.52 ms
Wall time: 5.9 ms


## Simple queries

In [22]:
%%time

# Match all edges and return them
command = """
MATCH (n)-[e]-(m)
RETURN n, e, m
"""
df = client.query(command)
if df.empty:
    print("No result found")
else:
    #df.columns = get_return_statements(command)
    display(df)

Unnamed: 0,0,1,2
0,0,0,6
1,0,1,7
2,1,2,8
3,1,3,3
4,2,4,8
5,2,5,7
6,2,6,3
7,9,7,4
8,9,8,3
9,10,9,3


CPU times: user 8.38 ms, sys: 66 Î¼s, total: 8.44 ms
Wall time: 7.83 ms
