<div class="alert alert-block alert-success">
    <h1>
        Example notebook - Integrate RDF file
    </h1>
    <p>
        Link to dataset: <a href="https://www.w3.org/TR/owl-guide/wine.rdf">Link to Wine RDF file download</a>
    </p>
</div>

# Import modules and functions

In [1]:
import re
import time
import urllib.request

from rdflib import Graph as RDFGraph

In [2]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

True

# Check data files are available

In [3]:
# Local folder holding this example's data files (created on first run).
example_name = "wine_ontology"
path_data = f"{os.getcwd()}/data/{example_name}"
if not os.path.exists(path_data):
    print(f"{path_data} does not exist")
    os.makedirs(path_data)
    print(f"{path_data} folder created ✔")

list_files = sorted(os.listdir(path_data))
rdf_filename = "wine_ontology.rdf"
if rdf_filename not in list_files:
    print(f"Wine ontology RDF file is not available in {path_data}. \n")
    url_rdf = "https://www.w3.org/TR/owl-guide/wine.rdf"
    print(
        f"Downloading from: {url_rdf}"
        "\n...\n"
    )
    # Download with the standard library rather than shelling out to `wget`:
    # no external binary is required, no path is interpolated into a shell
    # command, and a failed download raises instead of being ignored.
    path_download = os.path.join(path_data, rdf_filename)
    try:
        urllib.request.urlretrieve(url_rdf, path_download)
    except Exception:
        # Remove any partial file so the next run attempts the download again
        # instead of finding a broken file and skipping this branch.
        if os.path.exists(path_download):
            os.remove(path_download)
        raise

# Load dataset

In [4]:
# RDF graph loading: parse the local RDF file into an in-memory rdflib graph
path_rdf = f"{path_data}/{rdf_filename}"

rg = RDFGraph()
rg.parse(path_rdf)
# len() of the graph is what is reported as the number of triples
print(f"RDF file successfully loaded with {len(rg)} triples")

RDF file successfully loaded with 1839 triples


In [5]:
# Preview the first 10 items returned by rg.objects(), with the Python type of each
for obj in list(rg.objects())[:10]:
    print(f"Object: {obj}")
    print(f"Type:   {type(obj)}")
    print()

Object: N0ec1443eadc947789b0d7d13e4632a40
Type:   <class 'rdflib.term.BNode'>

Object: N0010c8c1b9ab45f58c1cc4ce69fcdc6e
Type:   <class 'rdflib.term.BNode'>

Object: Na7352b3d797b414da857bc84cf2a7a29
Type:   <class 'rdflib.term.BNode'>

Object: 1
Type:   <class 'rdflib.term.Literal'>

Object: http://www.w3.org/TR/2003/PR-owl-guide-20031209/wine#locatedIn
Type:   <class 'rdflib.term.URIRef'>

Object: N7fc2483dc1c244249c02fcb4ffbb01c3
Type:   <class 'rdflib.term.BNode'>

Object: http://www.w3.org/TR/2003/PR-owl-guide-20031209/wine#SauvignonBlancGrape
Type:   <class 'rdflib.term.URIRef'>

Object: http://www.w3.org/TR/2003/PR-owl-guide-20031209/wine#Medium
Type:   <class 'rdflib.term.URIRef'>

Object: http://www.w3.org/2002/07/owl#Class
Type:   <class 'rdflib.term.URIRef'>

Object: Naf7eddada1f64cf8a458e5d27025a4d3
Type:   <class 'rdflib.term.BNode'>



# Graph Creation in TuringDB

## Build Cypher CREATE Commands

### Sanitize Cypher query

In [6]:
def sanitize_cypher_query(query):
    """Return ``query`` with every ``#`` and ``-`` replaced by ``_``.

    The substitution is applied to the whole string, so characters inside
    quoted property values are rewritten as well as those in labels and
    edge types.
    """
    return query.translate(str.maketrans({"#": "_", "-": "_"}))

### TuringDB v2 (not implemented yet)

In [7]:
def rdf_to_cypher_v2(rdf_file, batch_size=1000):
    """Yield Cypher statements that recreate the RDF graph stored in ``rdf_file``.

    Node ``CREATE`` statements are yielded first, at most ``batch_size`` nodes
    per statement, followed by one ``MATCH ... CREATE`` statement per edge.

    Args:
        rdf_file: Source handed to ``rdflib.Graph.parse``.
        batch_size: Maximum number of nodes per ``CREATE`` statement.

    Yields:
        str: One Cypher statement at a time. Labels, edge types and ``uri``
        values are emitted as-is (no escaping is done here).
    """
    from rdflib import BNode, Graph, URIRef

    def is_node_object(term):
        # rdflib terms (URIRef, BNode, Literal) all subclass ``str``, so the
        # previous ``not isinstance(o, str)`` test was always False and objects
        # that are blank nodes (or non-http URIs) were silently dropped.
        # Test the term class instead; the ``http`` prefix check is kept so
        # every object accepted before this fix is still accepted.
        return isinstance(term, (URIRef, BNode)) or term.startswith('http')

    g = Graph()
    g.parse(rdf_file)

    # Materialise the node set once instead of rebuilding a list per batch.
    nodes = list(
        {str(s) for s, _, _ in g} | {str(o) for _, _, o in g if is_node_object(o)}
    )

    for i in range(0, len(nodes), batch_size):
        batch = nodes[i:i+batch_size]
        yield "CREATE " + ",\n".join(
            # Label is the part of the identifier after the last '/'
            f"(:{node.split('/')[-1]} {{uri: '{node}'}})"
            for node in batch
        )

    for s, p, o in g:
        if is_node_object(o):
            yield f"MATCH (a {{uri: '{s}'}}), (b {{uri: '{o}'}}) CREATE (a)-[:{str(p).split('/')[-1]}]->(b)"

In [8]:
%%time

# Create Cypher CREATE command for TuringDB v2
# rdf_to_cypher_v2 is a generator; its statements are joined with newlines
graph_CREATE_command_v2 = '\n'.join(rdf_to_cypher_v2(path_rdf))
# Replaces '#' and '-' across the whole text, including inside the quoted uri values
graph_CREATE_command_v2 = sanitize_cypher_query(graph_CREATE_command_v2)
print(graph_CREATE_command_v2)

CREATE (:Neb69f27c00024affa20d7d46aa189b4d {uri: 'Neb69f27c00024affa20d7d46aa189b4d'}),
(:wine_McGuinnesso {uri: 'http://www.w3.org/TR/2003/PR_owl_guide_20031209/wine_McGuinnesso'}),
(:wine_WineFlavor {uri: 'http://www.w3.org/TR/2003/PR_owl_guide_20031209/wine_WineFlavor'}),
(:wine_hasWineDescriptor {uri: 'http://www.w3.org/TR/2003/PR_owl_guide_20031209/wine_hasWineDescriptor'}),
(:wine_WhiteBordeaux {uri: 'http://www.w3.org/TR/2003/PR_owl_guide_20031209/wine_WhiteBordeaux'}),
(:N6d21f21d839d4497984b6def49294aef {uri: 'N6d21f21d839d4497984b6def49294aef'}),
(:N1c03afab90354b008b2843c1e98c32f9 {uri: 'N1c03afab90354b008b2843c1e98c32f9'}),
(:Ne81ddf040f2d404cb2615eee4e25356b {uri: 'Ne81ddf040f2d404cb2615eee4e25356b'}),
(:N3afdfb73da3541f3b61524f6bd243764 {uri: 'N3afdfb73da3541f3b61524f6bd243764'}),
(:N2806345706fc42f39cb481868688f967 {uri: 'N2806345706fc42f39cb481868688f967'}),
(:wine_MountEdenVineyardEstatePinotNoir {uri: 'http://www.w3.org/TR/2003/PR_owl_guide_20031209/wine_MountEdenVine

### TuringDB v1

In [9]:
def rdf_to_cypher_v1(rdf_file):
    """Build one Cypher ``CREATE`` statement for the RDF graph in ``rdf_file``.

    NOTE: a second ``rdf_to_cypher_v1`` is defined in the following cell.
    Once both cells have run, that later definition replaces this one, so
    this version is not the one used to build the query.

    Returns:
        str: ``"CREATE "`` followed by all node patterns and then all
        directed edge patterns, joined by a comma and a newline.
    """
    from rdflib import Graph
    
    g = Graph()
    g.parse(rdf_file)
    
    # Collect nodes: every subject, plus the objects that pass the filter below
    # NOTE(review): rdflib terms appear to subclass str; if so, the
    # `not isinstance(o, str)` part is always False and only objects starting
    # with 'http' are kept - confirm against the rdflib.term documentation.
    nodes = {str(s) for s, _, _ in g} | {str(o) for _, _, o in g if not isinstance(o, str) or o.startswith('http')}
    # One Cypher variable (n0, n1, ...) per node
    node_to_var = {node: f"n{i}" for i, node in enumerate(nodes)}
    
    parts = []
    
    # Add all nodes
    for node in nodes:
        var = node_to_var[node]
        # Label is the part of the identifier after the last '/'
        label = node.split('/')[-1]
        # Prefix label with 'ID_' if it starts with a digit
        if label and label[0].isdigit():
            label = 'ID_' + label
        parts.append(f"({var}:{label} {{uri: '{node}'}})")
    
    # Add all edges (same object filter as for the node collection above)
    for s, p, o in g:
        if not isinstance(o, str) or o.startswith('http'):
            s_var = node_to_var[str(s)]
            o_var = node_to_var[str(o)]
            pred = str(p).split('/')[-1]
            # Prefix predicate with 'ID_' if it starts with a digit
            if pred and pred[0].isdigit():
                pred = 'ID_' + pred
            parts.append(f"({s_var})-[:{pred}]->({o_var})")
    
    return "CREATE " + ",\n".join(parts)

In [10]:
def sanitize_identifier(s):
    """Sanitize identifiers (labels, relationship types, property names).

    Every character that is neither alphanumeric nor ``_`` is replaced by
    ``_``. The result is then prefixed with ``ID_`` when it starts with a
    digit or an underscore.

    Args:
        s: Raw identifier text.

    Returns:
        str: The sanitized identifier, or ``"ID_empty"`` when ``s`` is empty.
    """
    if not s:
        return "ID_empty"

    # Replace every invalid character ('#', '-', '/', '.', ':', spaces, ...)
    s = ''.join(c if c.isalnum() or c == '_' else '_' for c in s)

    # Check the prefix AFTER the replacement, so a leading character that has
    # just become '_' is covered whatever it originally was (previously only
    # a leading '#', '-', '/' or '.' was, leaving e.g. ':x' as '_x').
    if s[0].isdigit() or s[0] == '_':
        s = 'ID_' + s

    return s

def sanitize_value(s):
    """Sanitize property values for use inside a quoted string literal.

    Backslashes are escaped first, then single and double quotes. Without the
    backslash step, a value containing or ending with ``\\`` would combine with
    the surrounding quote or with the escapes added here.

    Args:
        s: Raw property value.

    Returns:
        str: ``s`` with ``\\``, ``'`` and ``"`` each preceded by a backslash.
    """
    return s.replace("\\", "\\\\").replace("'", "\\'").replace('"', '\\"')

def rdf_to_cypher_v1(rdf_file):
    """Build one Cypher ``CREATE`` statement for the RDF graph in ``rdf_file``.

    Every subject, and every object that is a resource, becomes a node
    pattern ``(nK:<label> {uri: '<value>'})``. Every triple whose object is
    such a node becomes an edge pattern between the two node variables.
    Labels and edge types are the text after the last ``/`` of the
    identifier, passed through ``sanitize_identifier``; ``uri`` values are
    passed through ``sanitize_value``.

    Args:
        rdf_file: Source handed to ``rdflib.Graph.parse``.

    Returns:
        str: ``"CREATE "`` followed by all node patterns and then all edge
        patterns, joined by a comma and a newline.
    """
    from rdflib import BNode, Graph, URIRef

    def is_node_object(term):
        # rdflib terms (URIRef, BNode, Literal) all subclass ``str``, so the
        # previous ``not isinstance(o, str)`` test was always False and objects
        # that are blank nodes (or non-http URIs) were silently dropped, along
        # with the edges pointing at them. Test the term class instead; the
        # ``http`` prefix check is kept so every object accepted before this
        # fix is still accepted.
        return isinstance(term, (URIRef, BNode)) or term.startswith('http')

    g = Graph()
    g.parse(rdf_file)

    # Collect nodes and give each one a Cypher variable (n0, n1, ...)
    nodes = {str(s) for s, _, _ in g} | {str(o) for _, _, o in g if is_node_object(o)}
    node_to_var = {node: f"n{i}" for i, node in enumerate(nodes)}

    parts = []

    # Add all nodes
    for node, var in node_to_var.items():
        label = sanitize_identifier(node.split('/')[-1])
        uri_value = sanitize_value(node)
        parts.append(f"({var}:{label} {{uri: '{uri_value}'}})")

    # Add all edges
    for s, p, o in g:
        if is_node_object(o):
            s_var = node_to_var[str(s)]
            o_var = node_to_var[str(o)]
            pred = sanitize_identifier(str(p).split('/')[-1])
            # NOTE(review): the edge pattern is written without an arrow;
            # confirm this is the form the target TuringDB version expects.
            parts.append(f"({s_var})-[:{pred}]-({o_var})")

    return "CREATE " + ",\n".join(parts)

In [11]:
#%%time

# Create Cypher CREATE command for TuringDB v1
graph_CREATE_command_v1 = rdf_to_cypher_v1(path_rdf)
# sanitize_cypher_query is not applied here: rdf_to_cypher_v1 already sanitizes
# labels and edge types, and sanitize_cypher_query would also rewrite '#' and
# '-' inside the quoted uri values.
print(graph_CREATE_command_v1)

CREATE (n0:wine_McGuinnesso {uri: 'http://www.w3.org/TR/2003/PR-owl-guide-20031209/wine#McGuinnesso'}),
(n1:N45afd4d09aed44c18260becab9e5f776 {uri: 'N45afd4d09aed44c18260becab9e5f776'}),
(n2:N2d32403080e5464089334ccba8627853 {uri: 'N2d32403080e5464089334ccba8627853'}),
(n3:N685a68eabbbf46b4915436530266012d {uri: 'N685a68eabbbf46b4915436530266012d'}),
(n4:wine_WineFlavor {uri: 'http://www.w3.org/TR/2003/PR-owl-guide-20031209/wine#WineFlavor'}),
(n5:N7ef77ac676ff494ab724062aad8e4763 {uri: 'N7ef77ac676ff494ab724062aad8e4763'}),
(n6:wine_hasWineDescriptor {uri: 'http://www.w3.org/TR/2003/PR-owl-guide-20031209/wine#hasWineDescriptor'}),
(n7:wine_WhiteBordeaux {uri: 'http://www.w3.org/TR/2003/PR-owl-guide-20031209/wine#WhiteBordeaux'}),
(n8:N61981cfaf03b4aa783a51c607c080109 {uri: 'N61981cfaf03b4aa783a51c607c080109'}),
(n9:wine_Wine {uri: 'http://www.w3.org/TR/2003/PR-owl-guide-20031209/wine#Wine'}),
(n10:Ndbf0bacf82d14217a6b8e0f630e4ef22 {uri: 'Ndbf0bacf82d14217a6b8e0f630e4ef22'}),
(n11:N9b9

In [12]:
%%time

# Save Cypher query to file
with open(f"{path_data}/{example_name}.cypher", "w") as f:
    f.write(graph_CREATE_command_v1)

CPU times: user 1.73 ms, sys: 0 ns, total: 1.73 ms
Wall time: 943 μs


# Create graph using `turingdb` python package

<div class="alert alert-block alert-info">
    <h2>
        See <a href="https://docs.turingdb.ai/quickstart">TuringDB Get started documentation</a> for the important steps to follow:
    </h2>
    <h3>
        <ul>
            <li>Create your TuringDB account</li>
            <li>Create your instance in the <a href="https://console.turingdb.ai/auth">TuringDB Cloud UI</a></li>
            <li>Copy your Instance ID from the Database Instances management page</li>
            <li>Get API Key from the Settings in UI</li>
        </ul>
        Remember to keep your instance active while working in this notebook!
    </h3>
</div>

In [13]:
from turingdb import TuringDB

# Create TuringDB client
# As written, the client points at a server on localhost. To use a TuringDB
# Cloud instance, drop `host` and enable the two commented-out parameters,
# which are read from environment variables.
client = TuringDB(
    host="http://localhost:6666"  # Remove this parameter and set the two parameters below
    # instance_id=os.getenv("INSTANCE_ID"),
    # auth_token=os.getenv("AUTH_TOKEN"),
)

In [14]:
%%time

# Connect the client to an S3 bucket; both keys are read from environment variables
# (os.getenv returns None for a variable that is not set).
# NOTE(review): bucket name and region are hard-coded, and no later cell visible
# in this notebook uses the S3 connection - confirm it is needed for this example.
client.s3_connect(
    bucket_name="turing-internal",
    region="eu-west-2",
    access_key=os.getenv("AWS_ACCESS_KEY"),
    secret_key=os.getenv("AWS_SECRET_KEY"),
)

CPU times: user 151 ms, sys: 52.1 ms, total: 203 ms
Wall time: 274 ms


In [15]:
# Get list of loaded graphs (used below to pick a graph name that is not taken yet)
list_graphs = client.list_loaded_graphs()
list_graphs

['default']

In [16]:
# Set graph name: "<prefix><n>", where n is one more than the highest numeric
# suffix already used by a loaded graph with the same prefix (1 if there is none).
# Hyphens are normalised up-front so the prefix compared against existing graph
# names is the same one used to build the new name.
graph_name_prefix = example_name.replace("-", "_")
# Strip the prefix by slicing: the previous re.sub() call treated the prefix as
# a regular expression and removed every occurrence of it, not just the leading one.
existing_suffixes = [
    g[len(graph_name_prefix):]
    for g in list_graphs
    if g.startswith(graph_name_prefix)
]
graph_name_nb_suffix = str(
    max(
        (int(suffix) for suffix in existing_suffixes if suffix.isdigit()),
        default=0,
    )
    + 1
)
graph_name = graph_name_prefix + graph_name_nb_suffix
print(f"graph_name: {graph_name}")

graph_name: wine_ontology1


In [17]:
from turingdb.exceptions import TuringDBException

In [18]:
%%time

# Set graph
# A TuringDBException raised by create_graph is printed, not re-raised, so the
# cell still goes on to set_graph (presumably to tolerate an already existing
# graph - confirm against the turingdb client documentation).
try:
    client.create_graph(graph_name)
except TuringDBException as e:
    print(e)

# Set working graph
client.set_graph(graph_name)

CPU times: user 1.37 ms, sys: 91 μs, total: 1.46 ms
Wall time: 10.8 ms


In [19]:
%%time

# Create a new change on the graph
# checkout() without arguments is the same call this notebook later uses to "checkout into main"
client.checkout()
change = client.new_change()
print(f"Current change {change}")

# Checkout into the change
client.checkout(change=change)

Current change 0
CPU times: user 2.02 ms, sys: 1e+03 ns, total: 2.02 ms
Wall time: 1.62 ms


In [20]:
%%time

# Run CREATE command
print("\nExecuting query on TuringDB...")
start_time = time.time()
result = client.query(graph_CREATE_command_v1)
execution_time = time.time() - start_time
print(f"âœ“ Graph created successfully in {execution_time:.2f} seconds")

# Commit the change
client.query("COMMIT")
client.query("CHANGE SUBMIT")

# Checkout into main
client.checkout()


Executing query on TuringDB...
CPU times: user 1.63 ms, sys: 42 μs, total: 1.67 ms
Wall time: 127 ms


RemoteProtocolError: peer closed connection without sending complete message body (incomplete chunked read)

# Query TuringDB

## Use metaqueries to have insight on graph overall structure

<h3>
    To learn more about 📮 Metaqueries, please check the TuringDB documentation at this <a href="https://turingdb.mintlify.app/query/cypher_subset#%F0%9F%93%AE-metaqueries">link</a>
</h3>

In [38]:
%%time

# CALL PROPERTIES() - returns a column of all the different node and edge properties and their types in the database
command = """
CALL PROPERTIES()
"""
df_PROPERTIES = client.query(command)
# Columns are renamed only when rows came back; on an empty result the frame
# keeps whatever columns the client returned.
if df_PROPERTIES.empty:
    print("No result found")
else:
    df_PROPERTIES.columns = ["Property_ID", "Property_name", "Property_type"]
    display(df_PROPERTIES)

No result found
CPU times: user 3.26 ms, sys: 1.94 ms, total: 5.2 ms
Wall time: 3.55 ms


In [28]:
# Get node properties
# The "Property_name" column only exists when CALL PROPERTIES() returned rows
# (the columns are renamed only in that case), so fall back to an empty list
# instead of raising a KeyError on an empty result.
if df_PROPERTIES.empty:
    nodes_properties = []
else:
    nodes_properties = df_PROPERTIES["Property_name"].values.tolist()
print(f"Node properties: {nodes_properties}")

Node properties: ['id', 'type', 'amount', 'step', 'is_fraud', 'is_flagged', 'oldbalance', 'newbalance']


In [29]:
%%time

# CALL LABELS() - returns a column of all the different node labels
command = """
CALL LABELS()
"""
df_LABELS = client.query(command)
# Columns are renamed only when the result is not empty
if df_LABELS.empty:
    print("No result found")
else:
    df_LABELS.columns = ["Node_type_ID", "Node_label"]
    display(df_LABELS)

Unnamed: 0,Node_type_ID,Node_label
0,0,Account
1,1,Transaction


CPU times: user 6.76 ms, sys: 1.98 ms, total: 8.74 ms
Wall time: 7.3 ms


In [30]:
%%time

# CALL EDGETYPES() - returns a column of all the different edge types (edge equivalent of node labels)
command = """
CALL EDGETYPES()
"""
df_EDGETYPES = client.query(command)
# Columns are renamed only when the result is not empty
if df_EDGETYPES.empty:
    print("No result found")
else:
    df_EDGETYPES.columns = ["Edge_type_ID", "Edge_label"]
    display(df_EDGETYPES)

Unnamed: 0,Edge_type_ID,Edge_label
0,0,SENT
1,1,RECEIVED


CPU times: user 4.69 ms, sys: 3.96 ms, total: 8.65 ms
Wall time: 7.18 ms


In [31]:
%%time

# CALL LABELSETS() - returns two columns describing combinations of node labels
command = """
CALL LABELSETS()
"""
df_LABELSETS = client.query(command)
# Columns are renamed only when the result is not empty
# NOTE(review): the column names are the same as those used for CALL LABELS();
# confirm they describe what LABELSETS() actually returns.
if df_LABELSETS.empty:
    print("No result found")
else:
    df_LABELSETS.columns = ["Node_type_ID", "Node_label"]
    display(df_LABELSETS)

Unnamed: 0,Node_type_ID,Node_label
0,0,Account
1,1,Transaction


CPU times: user 7.24 ms, sys: 1.02 ms, total: 8.26 ms
Wall time: 7.05 ms


In [32]:
%%time

# Find number of nodes and number of edges in the graph
# Each count is len() of a full query result, so both result sets are fetched in their entirety
n_nodes = len(client.query("MATCH (n) RETURN n"))
n_edges = len(client.query("MATCH (n)--(m) RETURN n, m"))
print(f"Graph: {n_nodes:,} nodes and {n_edges:,} edges")

Graph: 1,572 nodes and 1,472 edges
CPU times: user 3.66 ms, sys: 2.99 ms, total: 6.65 ms
Wall time: 5.03 ms


## Simple queries

In [33]:
from turingdb_examples.utils import get_return_statements

In [35]:
%%time

# Match all edges and return them
command = """
MATCH (n)-[e]-(m)
RETURN n.id, n.type, e, m.id, m.type
"""
df_all_edges = client.query(command)
if df_all_edges.empty:
    print("No result found")
else:
    df_all_edges.columns = get_return_statements(command)
    display(df_all_edges)

Unnamed: 0,n.id,n.type,e,m.id,m.type
0,C1958260831,customer,0,TX307,CASH_OUT
1,C693207385,customer,1,TX592,CASH_OUT
2,C1108687791,customer,2,TX308,CASH_OUT
3,C2048633894,customer,3,TX309,TRANSFER
4,C436378709,customer,4,TX312,CASH_OUT
...,...,...,...,...,...
1467,TX340,TRANSFER,1467,C19673653,customer
1468,TX341,CASH_OUT,1468,C1241266786,customer
1469,TX342,CASH_OUT,1469,C1241266786,customer
1470,TX343,CASH_IN,1470,C1241266786,customer


CPU times: user 18.8 ms, sys: 1.03 ms, total: 19.8 ms
Wall time: 18.5 ms
