In [357]:
from neo4j import GraphDatabase
import pandas as pd
from py2neo import Graph, Node, Relationship
import json
import os
import csv
import collections
import re
from jsonpath_ng.ext import parse
import ssl
import networkx as nx

In [51]:
from gremlin_python import statics
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.process.graph_traversal import __
from gremlin_python.process.strategies import *
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.traversal import T
from gremlin_python.process.traversal import Order
from gremlin_python.process.traversal import Cardinality
from gremlin_python.process.traversal import Column
from gremlin_python.process.traversal import Direction
from gremlin_python.process.traversal import Operator
from gremlin_python.process.traversal import P
from gremlin_python.process.traversal import TextP
from gremlin_python.process.traversal import Pop
from gremlin_python.process.traversal import Scope
from gremlin_python.process.traversal import Barrier
from gremlin_python.process.traversal import Bindings
from gremlin_python.process.traversal import WithOptions
from gremlin_python.driver.aiohttp.transport import AiohttpTransport

In [217]:
# Load every GEMD JSON dump found in the model-dump directory into `json_data`.
directory_path = "ali_nicholas_EuSNbSe2_gemd_model_dumps"
json_data = []
# os.listdir returns entries in arbitrary order; sort so the load order is reproducible.
for filename in sorted(os.listdir(directory_path)):
    file_path = os.path.join(directory_path, filename)
    try:
        with open(file_path, encoding='utf-8') as file:
            json_data.append(json.load(file))
    except (OSError, ValueError) as err:
        # Best-effort load: unreadable entries (sub-directories, permission errors) raise
        # OSError; malformed JSON or non-UTF-8 bytes raise ValueError subclasses.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        print("skipping", file_path, "-", err)

skipping
skipping
skipping
skipping
skipping


In [358]:
# py2neo handle to the Neo4j server at localhost:7474; the Cypher cells below run through it.
graph = Graph("http://localhost:7474")

# Decoupling #

1. Open question: when the decoupling operator is called, should it decouple the entire GEMD graph, or only a part of the graph depending on the input parameters?
2. Right now, only the following shared nodes are identified and separated: material_template, process_template, property_template, parameter_template, condition_template, categorical_bounds. Creating a copy of material_spec for each material_run and a copy of process_spec for each process_run seems too complicated.
3. Question: Suppose I have a path process_1 -> material_1 -> ingredient -> process_2 -> material_2, and I want to identify the DAGs of the form process -> (some edges in between) -> material, where the material's name contains the word EuS. In this example, material_2 contains the word EuS but material_1 doesn't. However, the result returns the entire path process_1 -> material_1 -> ingredient -> process_2 -> material_2 instead of just process_2 -> material_2 (in other words, the output contains the correct information plus extraneous information). Is this fine?

**Shared nodes: material_template, material_template -> material_spec**

In [582]:
# Decouple shared material_template nodes so each (template, spec) pair owns a private copy.
# Step 1: for every template -> spec match, create a material_template_copy carrying the
# template's properties and a composite node_id "<template id>_<spec id>".
query = '''
    MATCH (parent:material_template)-[r:materialTemplate_to_materialSpec]- > (child: material_spec)
    CREATE (copy: material_template_copy)
    SET copy = properties(parent)
    SET copy.node_id = parent.node_id + '_' + child.node_id
'''
graph.run(query)
# Step 2: link each copy to its spec by recomputing the composite node_id.
# NOTE(review): this MATCH accepts any relationship type, while step 1 required
# materialTemplate_to_materialSpec -- confirm the asymmetry is intentional.
query = '''
    MATCH (parent:material_template)-[r]- > (child:material_spec)
    MATCH (parent_copy:material_template_copy)
    WHERE parent_copy.node_id = parent.node_id + '_' + child.node_id
    CREATE (parent_copy)-[:materialTemplate_to_materialSpec]- > (child)
'''
graph.run(query)
# Step 3: delete the original shared templates together with all their relationships.
query = "MATCH (n:material_template) DETACH DELETE n"
graph.run(query)

**Shared nodes: property_template & categorical_bounds, material_spec -> property_template -> categorical_bounds -> ingredient_spec**

In [583]:
# Group ingredient_spec ids (uids.auto) by the id of the material each one links to,
# taken from every top-level ingredient_spec object in the loaded JSON.
ingredient_spec_expr = parse('$[?(@.type == "ingredient_spec")]')
nodes = ingredient_spec_expr.find(json_data)
materialSpec_to_ingredientSpec = collections.defaultdict(list)
for n in nodes:
    ingredient_id = n.value['uids']['auto']
    material_id = n.value['material']['id']
    materialSpec_to_ingredientSpec[material_id].append(ingredient_id)

# For one (material_spec, ingredient_spec) pair, copy every property_template and
# categorical_bounds node on a path between them. Each copy gets a composite node_id
# "<material id>_<original id>_<ingredient id>". No relationships are created here.
# NOTE(review): unlike the parameter/condition cells, no mergeNodes pass follows, so
# duplicate copies remain if the pattern matches more than once for a pair -- confirm.
query = '''
    MATCH (n1:material_spec)-[r1]- > (n2:property_template)-[r2]- > 
          (n3:categorical_bounds)-[r3]- > (n4:ingredient_spec)
    WHERE n1.node_id = $material_id AND 
          n4.node_id = $ingredient_id
    CREATE (property_copy:property_template_copy)
    SET property_copy = properties(n2)
    SET property_copy.node_id = n1.node_id + '_' + n2.node_id + '_' + n4.node_id
    CREATE (categorical_copy:property_categorical_bounds)
    SET categorical_copy = properties(n3)
    SET categorical_copy.node_id = n1.node_id + '_' + n3.node_id + '_' + n4.node_id
    RETURN property_copy, categorical_copy
'''
# One query execution per (material, ingredient) pair.
for material_id, ingredient_ids in materialSpec_to_ingredientSpec.items():
    for ingredient_id in ingredient_ids:
        graph.run(query, material_id = material_id, ingredient_id = ingredient_id)

In [584]:
# Rewire each (material_spec, ingredient_spec) pair through its private copies:
# material_spec -> property_template_copy -> property_categorical_bounds -> ingredient_spec.
# Copies are located by recomputing the composite node_ids used when they were created.
query = '''
    MATCH (n1:material_spec)-[r1]- > (n2:property_template)-[r2]- > 
          (n3:categorical_bounds)-[r3]- > (n4:ingredient_spec)
    MATCH (n2_copy:property_template_copy),(n3_copy:property_categorical_bounds)
    WHERE n1.node_id = $material_id AND 
          n4.node_id = $ingredient_id AND
          n2_copy.node_id = n1.node_id + '_' + n2.node_id + '_' + n4.node_id AND 
          n3_copy.node_id = n1.node_id + '_' + n3.node_id + '_' + n4.node_id
    CREATE (n1)-[:materialSpec_to_propertyTemplate]- > (n2_copy)-[:property_to_category]- > 
           (n3_copy)-[:category_to_ingredientSpec]- > (n4)
'''
# Relies on materialSpec_to_ingredientSpec built in the previous cell.
for material_id, ingredient_ids in materialSpec_to_ingredientSpec.items():
    for ingredient_id in ingredient_ids:
        graph.run(query, material_id = material_id, ingredient_id = ingredient_id)

# Drop the original shared property_template nodes and every relationship attached to them.
query = "MATCH (n:property_template) DETACH DELETE n"
graph.run(query)

**Shared nodes: parameter_template & categorical_bounds, ingredient_spec -> parameter_template -> categorical_bounds -> process_spec**

In [585]:
# ingredient_spec -> parameter_template ->categorical_bounds -> process_spec
# Group ingredient_spec ids (uids.auto) by the id of the process each one links to.
ingredient_spec_expr = parse('$[?(@.type == "ingredient_spec")]')
nodes = ingredient_spec_expr.find(json_data)
processSpec_to_ingredientSpec = collections.defaultdict(list)
for n in nodes:
    ingredient_id = n.value['uids']['auto']
    process_id = n.value['process']['id']
    processSpec_to_ingredientSpec[process_id].append(ingredient_id)
# The bare expression below is not the last statement of the cell, so it displays nothing.
processSpec_to_ingredientSpec
# For one (ingredient_spec, process_spec) pair, copy every parameter_template and
# categorical_bounds node on a path between them; each copy gets the composite node_id
# "<ingredient id>_<original id>_<process id>". No relationships are created here.
query = '''
    MATCH (n1:ingredient_spec)-[r1]- > (n2:parameter_template)-[r2]- > 
          (n3:categorical_bounds)-[r3]- > (n4:process_spec)
    WHERE n1.node_id = $ingredient_id AND 
          n4.node_id = $process_id
    CREATE (parameter_copy:parameter_template_copy)
    SET parameter_copy = properties(n2)
    SET parameter_copy.node_id = n1.node_id + '_' + n2.node_id + '_' + n4.node_id
    CREATE (categorical_copy:parameter_categorical_bounds)
    SET categorical_copy = properties(n3)
    SET categorical_copy.node_id = n1.node_id + '_' + n3.node_id + '_' + n4.node_id
    RETURN parameter_copy, categorical_copy
'''
for process_id, ingredient_ids in processSpec_to_ingredientSpec.items():
    for ingredient_id in ingredient_ids:
        graph.run(query, ingredient_id = ingredient_id, process_id = process_id)

# Collapse parameter_template_copy nodes that share a node_id into one node (APOC mergeNodes).
query = '''
    MATCH (n:parameter_template_copy)
    WITH n.node_id AS id, COLLECT(n) AS nodelist, COUNT(*) AS count
    WHERE count > 1
    CALL apoc.refactor.mergeNodes(nodelist) YIELD node
    RETURN node
'''
graph.run(query)
# Same de-duplication for the parameter_categorical_bounds copies.
query = '''
    MATCH (n:parameter_categorical_bounds)
    WITH n.node_id AS id, COLLECT(n) AS nodelist, COUNT(*) AS count
    WHERE count > 1
    CALL apoc.refactor.mergeNodes(nodelist) YIELD node
    RETURN node
'''
graph.run(query)

In [586]:
# Rewire each (ingredient_spec, process_spec) pair through its private copies:
# ingredient_spec -> parameter_template_copy -> parameter_categorical_bounds -> process_spec.
# NOTE(review): the copy -> bounds edge is typed property_to_category although it links a
# parameter template; it matches the name used in the property_template cell and looks
# copy-pasted -- confirm before relying on relationship types.
query = '''
    MATCH (n1:ingredient_spec)-[r1]- > (n2:parameter_template)-[r2]- > 
          (n3:categorical_bounds)-[r3]- > (n4:process_spec)
    MATCH (n2_copy:parameter_template_copy),(n3_copy:parameter_categorical_bounds)
    WHERE n1.node_id = $ingredient_id AND 
          n4.node_id = $process_id AND
          n2_copy.node_id = $ingredient_id + '_' + n2.node_id + '_' + $process_id AND 
          n3_copy.node_id = $ingredient_id + '_' + n3.node_id + '_' + $process_id
    CREATE (n1)-[:ingredientSpec_to_parameterTemplate]- > (n2_copy)-[:property_to_category]- > 
           (n3_copy)-[:category_to_processSpec]- > (n4)
'''
# Relies on processSpec_to_ingredientSpec built in the previous cell.
for process_id, ingredient_ids in processSpec_to_ingredientSpec.items():
    for ingredient_id in ingredient_ids:
        graph.run(query, ingredient_id = ingredient_id, process_id = process_id)

# Merge parallel relationships: any (source, target) pair with more than one relationship
# has them combined through apoc.refactor.mergeRelationships.
# NOTE(review): the MATCH has no labels or types, so it scans the whole graph and groups
# relationships regardless of type -- confirm that is intended.
query = '''
    MATCH (source)-[r]->(target)
    WITH source, target, COLLECT(r) AS relationshipList, COUNT(r) AS count
    WHERE count > 1
    CALL apoc.refactor.mergeRelationships(relationshipList) YIELD rel
    RETURN source,target
'''
graph.run(query)

source,target
"{name: 'Time of Event', real_lower_bound: '0', real_units: 'hour', description: 'time at which an event occurs. Can be relative to the start/end of a process, or absolute, however specified', real_upper_bound: '96', type: 'parameter_template', node_id: '7b383ee6-0886-4c64-8d32-6d2debfc12e2'}","{name: 'EusNb2Se4 pellets (heated in sealed vessel)', type: 'material_run', tags: '01_23', node_id: 'e5de2faa-eb8a-4dc5-ae07-88ad536ee41f'}"
"{name: 'Time of Event', real_lower_bound: '0', real_units: 'hour', description: 'time at which an event occurs. Can be relative to the start/end of a process, or absolute, however specified', real_upper_bound: '96', type: 'parameter_template', node_id: '7b383ee6-0886-4c64-8d32-6d2debfc12e2'}","{name: 'EusNb2Se4 pellets (heated in sealed vessel)', type: 'material_run', tags: '01_10', node_id: '9e0568f5-6e65-49b3-9d0c-0ffe52e102fa'}"
"{name: 'Time of Event', real_lower_bound: '0', real_units: 'hour', description: 'time at which an event occurs. Can be relative to the start/end of a process, or absolute, however specified', real_upper_bound: '96', type: 'parameter_template', node_id: '7b383ee6-0886-4c64-8d32-6d2debfc12e2'}","{name: 'EusNb2Se4 pellets (heated in sealed vessel)', type: 'material_run', tags: '01_13', node_id: '6bfe80f4-53dc-41a5-9c97-8d1b2860092a'}"


**Shared nodes: condition_template & categorical_bounds, ingredient_spec -> condition_template -> categorical_bounds -> process_spec**

In [587]:
# ingredient_spec -> condition_template ->categorical_bounds -> process_spec
# For one (ingredient_spec, process_spec) pair, copy every condition_template and
# categorical_bounds node on a path between them; each copy gets the composite node_id
# "<ingredient id>_<original id>_<process id>". No relationships are created here.
query = '''
    MATCH (n1:ingredient_spec)-[r1]- > (n2:condition_template)-[r2]- > 
          (n3:categorical_bounds)-[r3]- > (n4:process_spec)
    WHERE n1.node_id = $ingredient_id AND 
          n4.node_id = $process_id
    CREATE (condition_copy:condition_template_copy)
    SET condition_copy = properties(n2)
    SET condition_copy.node_id = n1.node_id + '_' + n2.node_id + '_' + n4.node_id
    CREATE (categorical_copy:condition_categorical_bounds)
    SET categorical_copy = properties(n3)
    SET categorical_copy.node_id = n1.node_id + '_' + n3.node_id + '_' + n4.node_id
    RETURN condition_copy, categorical_copy
'''
# Reuses processSpec_to_ingredientSpec from the parameter_template cell above.
for process_id, ingredient_ids in processSpec_to_ingredientSpec.items():
    for ingredient_id in ingredient_ids:
        graph.run(query, ingredient_id = ingredient_id, process_id = process_id)

# Collapse condition_template_copy nodes that share a node_id into one node (APOC mergeNodes).
query = '''
    MATCH (n:condition_template_copy)
    WITH n.node_id AS id, COLLECT(n) AS nodelist, COUNT(*) AS count
    WHERE count > 1
    CALL apoc.refactor.mergeNodes(nodelist) YIELD node
    RETURN node
'''
graph.run(query)
# Same de-duplication for the condition_categorical_bounds copies.
query = '''
    MATCH (n:condition_categorical_bounds)
    WITH n.node_id AS id, COLLECT(n) AS nodelist, COUNT(*) AS count
    WHERE count > 1
    CALL apoc.refactor.mergeNodes(nodelist) YIELD node
    RETURN node
'''
graph.run(query)

In [588]:
# Rewire each (ingredient_spec, process_spec) pair through its private copies:
# ingredient_spec -> condition_template_copy -> condition_categorical_bounds -> process_spec.
query = '''
    MATCH (n1:ingredient_spec)-[r1]- > (n2:condition_template)-[r2]- > 
          (n3:categorical_bounds)-[r3]- > (n4:process_spec)
    MATCH (n2_copy:condition_template_copy),(n3_copy:condition_categorical_bounds)
    WHERE n1.node_id = $ingredient_id AND 
          n4.node_id = $process_id AND
          n2_copy.node_id = $ingredient_id + '_' + n2.node_id + '_' + $process_id AND 
          n3_copy.node_id = $ingredient_id + '_' + n3.node_id + '_' + $process_id
    CREATE (n1)-[:ingredientSpec_to_conditionTemplate]- > (n2_copy)-[:condition_to_category]- > 
           (n3_copy)-[:category_to_processSpec]- > (n4)
'''
for process_id, ingredient_ids in processSpec_to_ingredientSpec.items():
    for ingredient_id in ingredient_ids:
        graph.run(query, ingredient_id = ingredient_id, process_id = process_id)

# Merge parallel relationships between any (source, target) pair.
# NOTE(review): unlabelled, untyped MATCH -- this runs over the whole graph and ignores
# relationship type when grouping; confirm that is intended.
query = '''
    MATCH (source)-[r]->(target)
    WITH source, target, COLLECT(r) AS relationshipList, COUNT(r) AS count
    WHERE count > 1
    CALL apoc.refactor.mergeRelationships(relationshipList) YIELD rel
    RETURN source,target
'''
graph.run(query)

**Shared nodes: parameter_template & categorical_bounds, ingredient_run -> parameter_template -> categorical_bounds -> process_run**

In [589]:
# Group ingredient_run ids (uids.auto) by the id of the process each one links to,
# taken from every top-level ingredient_run object in the loaded JSON.
ingredient_run_expr = parse('$[?(@.type == "ingredient_run")]')
nodes = ingredient_run_expr.find(json_data)
processRun_to_ingredientRun = collections.defaultdict(list)
for n in nodes:
    ingredient_id = n.value['uids']['auto']
    process_id = n.value['process']['id']
    processRun_to_ingredientRun[process_id].append(ingredient_id)

# For one (ingredient_run, process_run) pair, copy every parameter_template and
# categorical_bounds node on a path between them; each copy gets the composite node_id
# "<ingredient id>_<original id>_<process id>". No relationships are created here.
query = '''
    MATCH (n1:ingredient_run)-[r1]- > (n2:parameter_template)-[r2]- > 
          (n3:categorical_bounds)-[r3]- > (n4:process_run)
    WHERE n1.node_id = $ingredient_id AND 
          n4.node_id = $process_id
    CREATE (parameter_copy:parameter_template_copy)
    SET parameter_copy = properties(n2)
    SET parameter_copy.node_id = n1.node_id + '_' + n2.node_id + '_' + n4.node_id
    CREATE (categorical_copy:parameter_categorical_bounds)
    SET categorical_copy = properties(n3)
    SET categorical_copy.node_id = n1.node_id + '_' + n3.node_id + '_' + n4.node_id
    RETURN parameter_copy, categorical_copy
'''
for process_id, ingredient_ids in processRun_to_ingredientRun.items():
    for ingredient_id in ingredient_ids:
        graph.run(query, ingredient_id = ingredient_id, process_id = process_id)

# Collapse parameter_template_copy nodes that share a node_id into one node (APOC mergeNodes).
query = '''
    MATCH (n:parameter_template_copy)
    WITH n.node_id AS id, COLLECT(n) AS nodelist, COUNT(*) AS count
    WHERE count > 1
    CALL apoc.refactor.mergeNodes(nodelist) YIELD node
    RETURN node
'''
graph.run(query)
# Same de-duplication for the parameter_categorical_bounds copies.
query = '''
    MATCH (n:parameter_categorical_bounds)
    WITH n.node_id AS id, COLLECT(n) AS nodelist, COUNT(*) AS count
    WHERE count > 1
    CALL apoc.refactor.mergeNodes(nodelist) YIELD node
    RETURN node
'''
graph.run(query)

In [590]:
# Rewire each (ingredient_run, process_run) pair through its private copies:
# ingredient_run -> parameter_template_copy -> parameter_categorical_bounds -> process_run.
# NOTE(review): the copy -> bounds edge is typed property_to_category although it links a
# parameter template -- looks copy-pasted from the property_template cell; confirm.
query = '''
    MATCH (n1:ingredient_run)-[r1]- > (n2:parameter_template)-[r2]- > 
          (n3:categorical_bounds)-[r3]- > (n4:process_run)
    MATCH (n2_copy:parameter_template_copy),(n3_copy:parameter_categorical_bounds)
    WHERE n1.node_id = $ingredient_id AND 
          n4.node_id = $process_id AND
          n2_copy.node_id = $ingredient_id + '_' + n2.node_id + '_' + $process_id AND 
          n3_copy.node_id = $ingredient_id + '_' + n3.node_id + '_' + $process_id
    CREATE (n1)-[:ingredientRun_to_parameterTemplate]- > (n2_copy)-[:property_to_category]- > 
           (n3_copy)-[:category_to_processRun]- > (n4)
'''
# Relies on processRun_to_ingredientRun built in the previous cell.
for process_id, ingredient_ids in processRun_to_ingredientRun.items():
    for ingredient_id in ingredient_ids:
        graph.run(query, ingredient_id = ingredient_id, process_id = process_id)

# Merge parallel relationships between any (source, target) pair.
# NOTE(review): unlabelled, untyped MATCH -- this runs over the whole graph and ignores
# relationship type when grouping; confirm that is intended.
query = '''
    MATCH (source)-[r]->(target)
    WITH source, target, COLLECT(r) AS relationshipList, COUNT(r) AS count
    WHERE count > 1
    CALL apoc.refactor.mergeRelationships(relationshipList) YIELD rel
    RETURN source,target
'''
graph.run(query)

source,target
"{name: 'EusNb2Se4 pellets (heated in sealed vessel) Ingredient', type: 'ingredient_run', tags: '01_13', node_id: 'b9b40025-6892-403f-a65c-ed043ed5673d'}","{name: 'Equipment Used', description: 'A parameter describing the equipment used in a particular process or measurement', type: 'parameter_template', node_id: 'b9b40025-6892-403f-a65c-ed043ed5673d_0471f9aa-d24d-4598-ac94-02c6dd1c8d1d_34fbdd88-4c2e-4df2-9edf-3cc34d2ccb32'}"
"{name: 'EusNb2Se4 pellets Ingredient', type: 'ingredient_run', tags: '01_23', node_id: '13de8a09-aa3b-413a-b622-32443578699e'}","{name: 'Equipment Used', description: 'A parameter describing the equipment used in a particular process or measurement', type: 'parameter_template', node_id: '13de8a09-aa3b-413a-b622-32443578699e_0471f9aa-d24d-4598-ac94-02c6dd1c8d1d_58bbce6d-2805-4a4f-8395-16b7397855b3'}"
"{name: 'EusNb2Se4 pellets Ingredient', type: 'ingredient_run', tags: '01_20', node_id: 'd570d7d1-401a-4d5d-906c-ab7f03f3e228'}","{name: 'Equipment Used', description: 'A parameter describing the equipment used in a particular process or measurement', type: 'parameter_template', node_id: 'd570d7d1-401a-4d5d-906c-ab7f03f3e228_0471f9aa-d24d-4598-ac94-02c6dd1c8d1d_be514bf8-c80f-45ad-9b99-e2c1d5955b23'}"


**Shared nodes: condition_template & categorical_bounds, ingredient_run -> condition_template -> categorical_bounds -> process_run**

In [591]:
# For one (ingredient_run, process_run) pair, copy every condition_template and
# categorical_bounds node on a path between them; each copy gets the composite node_id
# "<ingredient id>_<original id>_<process id>". No relationships are created here.
query = '''
    MATCH (n1:ingredient_run)-[r1]- > (n2:condition_template)-[r2]- > 
          (n3:categorical_bounds)-[r3]- > (n4:process_run)
    WHERE n1.node_id = $ingredient_id AND 
          n4.node_id = $process_id
    CREATE (condition_copy:condition_template_copy)
    SET condition_copy = properties(n2)
    SET condition_copy.node_id = n1.node_id + '_' + n2.node_id + '_' + n4.node_id
    CREATE (categorical_copy:condition_categorical_bounds)
    SET categorical_copy = properties(n3)
    SET categorical_copy.node_id = n1.node_id + '_' + n3.node_id + '_' + n4.node_id
    RETURN condition_copy, categorical_copy
'''
# Reuses processRun_to_ingredientRun from the ingredient_run / parameter_template cell.
for process_id, ingredient_ids in processRun_to_ingredientRun.items():
    for ingredient_id in ingredient_ids:
        graph.run(query, ingredient_id = ingredient_id, process_id = process_id)

# Collapse condition_template_copy nodes that share a node_id into one node (APOC mergeNodes).
query = '''
    MATCH (n:condition_template_copy)
    WITH n.node_id AS id, COLLECT(n) AS nodelist, COUNT(*) AS count
    WHERE count > 1
    CALL apoc.refactor.mergeNodes(nodelist) YIELD node
    RETURN node
'''
graph.run(query)
# Same de-duplication for the condition_categorical_bounds copies.
query = '''
    MATCH (n:condition_categorical_bounds)
    WITH n.node_id AS id, COLLECT(n) AS nodelist, COUNT(*) AS count
    WHERE count > 1
    CALL apoc.refactor.mergeNodes(nodelist) YIELD node
    RETURN node
'''
graph.run(query)

In [592]:
# Rewire each (ingredient_run, process_run) pair through its private copies:
# ingredient_run -> condition_template_copy -> condition_categorical_bounds -> process_run.
query = '''
    MATCH (n1:ingredient_run)-[r1]- > (n2:condition_template)-[r2]- > 
          (n3:categorical_bounds)-[r3]- > (n4:process_run)
    MATCH (n2_copy:condition_template_copy),(n3_copy:condition_categorical_bounds)
    WHERE n1.node_id = $ingredient_id AND 
          n4.node_id = $process_id AND
          n2_copy.node_id = n1.node_id + '_' + n2.node_id + '_' + n4.node_id AND 
          n3_copy.node_id = n1.node_id + '_' + n3.node_id + '_' + n4.node_id
    CREATE (n1)-[:ingredientRun_to_conditionTemplate]- > (n2_copy)-[:condition_to_category]- > 
           (n3_copy)-[:category_to_processRun]- > (n4)
'''
for process_id, ingredient_ids in processRun_to_ingredientRun.items():
    for ingredient_id in ingredient_ids:
        graph.run(query, ingredient_id = ingredient_id, process_id = process_id)

# Merge parallel relationships between any (source, target) pair.
# NOTE(review): unlabelled, untyped MATCH -- this runs over the whole graph and ignores
# relationship type when grouping; confirm that is intended.
query = '''
    MATCH (source)-[r]->(target)
    WITH source, target, COLLECT(r) AS relationshipList, COUNT(r) AS count
    WHERE count > 1
    CALL apoc.refactor.mergeRelationships(relationshipList) YIELD rel
    RETURN source,target
'''
graph.run(query)

**Shared nodes: condition_template & categorical_bounds, measurement_run -> condition_template -> categorical_bounds -> material_run**

In [593]:
# Group measurement_run ids (uids.auto) by the id of the material each one links to,
# taken from every top-level measurement_run object in the loaded JSON.
measurement_run_expr = parse('$[?(@.type == "measurement_run")]')
nodes = measurement_run_expr.find(json_data)
materialRun_to_measurementRun = collections.defaultdict(list)
for n in nodes:
    measurement_id = n.value['uids']['auto']
    material_id = n.value['material']['id']
    materialRun_to_measurementRun[material_id].append(measurement_id)

# For one (measurement_run, material_run) pair, copy every condition_template and
# categorical_bounds node on a path between them; each copy gets the composite node_id
# "<measurement id>_<original id>_<material id>". No relationships are created here.
query = '''
    MATCH (n1:measurement_run)-[r1]- > (n2:condition_template)-[r2]- > 
          (n3:categorical_bounds)-[r3]- > (n4:material_run)
    WHERE n1.node_id = $measurement_id AND 
          n4.node_id = $material_id
    CREATE (condition_copy:condition_template_copy)
    SET condition_copy = properties(n2)
    SET condition_copy.node_id = n1.node_id + '_' + n2.node_id + '_' + n4.node_id
    CREATE (categorical_copy:condition_categorical_bounds)
    SET categorical_copy = properties(n3)
    SET categorical_copy.node_id = n1.node_id + '_' + n3.node_id + '_' + n4.node_id
    RETURN condition_copy, categorical_copy
'''
for material_id, measurement_ids in materialRun_to_measurementRun.items():
    for measurement_id in measurement_ids:
        graph.run(query, measurement_id = measurement_id, material_id = material_id)

# Collapse condition_template_copy nodes that share a node_id into one node (APOC mergeNodes).
query = '''
    MATCH (n:condition_template_copy)
    WITH n.node_id AS id, COLLECT(n) AS nodelist, COUNT(*) AS count
    WHERE count > 1
    CALL apoc.refactor.mergeNodes(nodelist) YIELD node
    RETURN node
'''
graph.run(query)
# Same de-duplication for the condition_categorical_bounds copies.
query = '''
    MATCH (n:condition_categorical_bounds)
    WITH n.node_id AS id, COLLECT(n) AS nodelist, COUNT(*) AS count
    WHERE count > 1
    CALL apoc.refactor.mergeNodes(nodelist) YIELD node
    RETURN node
'''
graph.run(query)

In [594]:
# Rewire each (measurement_run, material_run) pair through its private copies:
# measurement_run -> condition_template_copy -> condition_categorical_bounds -> material_run.
query = '''
    MATCH (n1:measurement_run)-[r1]- > (n2:condition_template)-[r2]- > 
          (n3:categorical_bounds)-[r3]- > (n4:material_run)
    MATCH (n2_copy:condition_template_copy),(n3_copy:condition_categorical_bounds)
    WHERE n1.node_id = $measurement_id AND 
          n4.node_id = $material_id AND
          n2_copy.node_id = n1.node_id + '_' + n2.node_id + '_' + n4.node_id AND 
          n3_copy.node_id = n1.node_id + '_' + n3.node_id + '_' + n4.node_id
    CREATE (n1)-[:measurementRun_to_conditionTemplate]- > (n2_copy)-[:condition_to_category]- > 
           (n3_copy)-[:category_to_materialRun]- > (n4)
'''
# Relies on materialRun_to_measurementRun built in the previous cell.
for material_id, measurement_ids in materialRun_to_measurementRun.items():
    for measurement_id in measurement_ids:
        graph.run(query, measurement_id = measurement_id, material_id = material_id)

# Merge parallel relationships between any (source, target) pair.
# NOTE(review): unlabelled, untyped MATCH -- this runs over the whole graph and ignores
# relationship type when grouping; confirm that is intended.
query = '''
    MATCH (source)-[r]->(target)
    WITH source, target, COLLECT(r) AS relationshipList, COUNT(r) AS count
    WHERE count > 1
    CALL apoc.refactor.mergeRelationships(relationshipList) YIELD rel
    RETURN source,target
'''
graph.run(query)

source,target
"{real_name_1: 'Time of Measurement', real_value_1: '24.0', real_units_1: 'hour', name: 'First temperature measurement of pellets during heating process', type: 'measurement_run', tags: '01_13', node_id: 'ab1097bb-e6d6-4d46-a201-3c5435f0fc4d'}","{name: 'Location', description: 'A condition describing the location in which a process or measurement is performed', type: 'condition_template', node_id: 'ab1097bb-e6d6-4d46-a201-3c5435f0fc4d_00de68f0-57d8-42fc-b5f4-4a1f21d7dc68_6bfe80f4-53dc-41a5-9c97-8d1b2860092a'}"
"{real_name_1: 'Time of Measurement', real_value_1: '68.0', real_units_1: 'hour', name: 'Second temperature measurement of pellets during heating process', type: 'measurement_run', tags: '01_13', node_id: '0a44a1da-fcaa-4ac8-a827-8d6ba9cd070d'}","{name: 'Location', description: 'A condition describing the location in which a process or measurement is performed', type: 'condition_template', node_id: '0a44a1da-fcaa-4ac8-a827-8d6ba9cd070d_00de68f0-57d8-42fc-b5f4-4a1f21d7dc68_6bfe80f4-53dc-41a5-9c97-8d1b2860092a'}"
"{name: 'XRD characterization on final pellets', type: 'measurement_run', tags: '01_13', node_id: '79fcc7d7-5fe8-4e52-81f7-17bf45c25c53'}","{name: 'Location', description: 'A condition describing the location in which a process or measurement is performed', type: 'condition_template', node_id: '79fcc7d7-5fe8-4e52-81f7-17bf45c25c53_00de68f0-57d8-42fc-b5f4-4a1f21d7dc68_6bfe80f4-53dc-41a5-9c97-8d1b2860092a'}"


**Shared nodes: parameter_template & categorical_bounds, measurement_run -> parameter_template -> categorical_bounds -> material_run**

In [595]:
# For one (measurement_run, material_run) pair, copy every parameter_template and
# categorical_bounds node on a path between them; each copy gets the composite node_id
# "<measurement id>_<original id>_<material id>". No relationships are created here.
query = '''
    MATCH (n1:measurement_run)-[r1]- > (n2:parameter_template)-[r2]- > 
          (n3:categorical_bounds)-[r3]- > (n4:material_run)
    WHERE n1.node_id = $measurement_id AND 
          n4.node_id = $material_id
    CREATE (parameter_copy:parameter_template_copy)
    SET parameter_copy = properties(n2)
    SET parameter_copy.node_id = n1.node_id + '_' + n2.node_id + '_' + n4.node_id
    CREATE (categorical_copy:parameter_categorical_bounds)
    SET categorical_copy = properties(n3)
    SET categorical_copy.node_id = n1.node_id + '_' + n3.node_id + '_' + n4.node_id
    RETURN parameter_copy, categorical_copy
'''
# Reuses materialRun_to_measurementRun from the measurement_run / condition_template cell.
for material_id, measurement_ids in materialRun_to_measurementRun.items():
    for measurement_id in measurement_ids:
        graph.run(query, measurement_id = measurement_id, material_id = material_id)

# Collapse parameter_template_copy nodes that share a node_id into one node (APOC mergeNodes).
query = '''
    MATCH (n:parameter_template_copy)
    WITH n.node_id AS id, COLLECT(n) AS nodelist, COUNT(*) AS count
    WHERE count > 1
    CALL apoc.refactor.mergeNodes(nodelist) YIELD node
    RETURN node
'''
graph.run(query)
# Same de-duplication for the parameter_categorical_bounds copies.
query = '''
    MATCH (n:parameter_categorical_bounds)
    WITH n.node_id AS id, COLLECT(n) AS nodelist, COUNT(*) AS count
    WHERE count > 1
    CALL apoc.refactor.mergeNodes(nodelist) YIELD node
    RETURN node
'''
graph.run(query)

In [596]:
# Rewire each (measurement_run, material_run) pair through its private copies:
# measurement_run -> parameter_template_copy -> parameter_categorical_bounds -> material_run.
# NOTE(review): the copy -> bounds edge is typed condition_to_category although it links a
# parameter template, and the other parameter cells use property_to_category -- the
# relationship naming is inconsistent; confirm which type is wanted.
query = '''
    MATCH (n1:measurement_run)-[r1]- > (n2:parameter_template)-[r2]- > 
          (n3:categorical_bounds)-[r3]- > (n4:material_run)
    MATCH (n2_copy:parameter_template_copy),(n3_copy:parameter_categorical_bounds)
    WHERE n1.node_id = $measurement_id AND 
          n4.node_id = $material_id AND
          n2_copy.node_id = n1.node_id + '_' + n2.node_id + '_' + n4.node_id AND 
          n3_copy.node_id = n1.node_id + '_' + n3.node_id + '_' + n4.node_id
    CREATE (n1)-[:measurementRun_to_parameterTemplate]- > (n2_copy)-[:condition_to_category]- > 
           (n3_copy)-[:category_to_materialRun]- > (n4)
'''
for material_id, measurement_ids in materialRun_to_measurementRun.items():
    for measurement_id in measurement_ids:
        graph.run(query, measurement_id = measurement_id, material_id = material_id)

# Merge parallel relationships between any (source, target) pair.
# NOTE(review): unlabelled, untyped MATCH -- this runs over the whole graph and ignores
# relationship type when grouping; confirm that is intended.
query = '''
    MATCH (source)-[r]->(target)
    WITH source, target, COLLECT(r) AS relationshipList, COUNT(r) AS count
    WHERE count > 1
    CALL apoc.refactor.mergeRelationships(relationshipList) YIELD rel
    RETURN source,target
'''
graph.run(query)

**Shared nodes: process_template, process_template -> process_spec**

In [597]:
# Decouple shared process_template nodes so each (template, spec) pair owns a private copy.
# Step 1: one process_template_copy per matched template -> spec relationship, carrying the
# template's properties and the composite node_id "<template id>_<spec id>".
query = '''
    MATCH (parent:process_template)-[r]- > (child: process_spec)
    CREATE (copy: process_template_copy)
    SET copy = properties(parent)
    SET copy.node_id = parent.node_id + '_' + child.node_id
'''
graph.run(query)
# Step 2: link each copy to its spec by recomputing the composite node_id.
query = '''
    MATCH (parent:process_template)-[r]- > (child:process_spec)
    MATCH (parent_copy:process_template_copy)
    WHERE parent_copy.node_id = parent.node_id + '_' + child.node_id
    CREATE (parent_copy)-[:processTemplate_to_processSpec]- > (child)
'''
graph.run(query)
# Step 3: delete the original shared templates together with all their relationships.
query = "MATCH (n:process_template) DETACH DELETE n"
graph.run(query)

In [598]:
# Decouple shared measurement_template nodes so each (template, spec) pair owns a private copy.
# Step 1: one measurement_template_copy per matched template -> spec relationship, carrying
# the template's properties and the composite node_id "<template id>_<spec id>".
query = '''
    MATCH (parent:measurement_template)-[r]- > (child: measurement_spec)
    CREATE (copy: measurement_template_copy)
    SET copy = properties(parent)
    SET copy.node_id = parent.node_id + '_' + child.node_id
'''
graph.run(query)
# Step 2: link each copy to its spec by recomputing the composite node_id.
query = '''
    MATCH (parent:measurement_template)-[r]- > (child:measurement_spec)
    MATCH (parent_copy:measurement_template_copy)
    WHERE parent_copy.node_id = parent.node_id + '_' + child.node_id
    CREATE (parent_copy)-[:measurementTemplate_to_measurementSpec]- > (child)
'''
graph.run(query)
# Step 3: delete the original shared templates together with all their relationships.
query = "MATCH (n:measurement_template) DETACH DELETE n"
graph.run(query)

In [599]:
# Remove every original condition_template node and its relationships; the copies created
# earlier carry the label condition_template_copy and are not matched here.
query = '''
    MATCH (n:condition_template)
    DETACH DELETE n
'''
graph.run(query)

In [600]:
# Remove every original parameter_template node and its relationships; the copies created
# earlier carry the label parameter_template_copy and are not matched here.
query = '''
    MATCH (n:parameter_template)
    DETACH DELETE n
'''
graph.run(query)

In [601]:
# Remove every original categorical_bounds node and its relationships; the copies use the
# labels property_/parameter_/condition_categorical_bounds and are not matched here.
query = '''
    MATCH (n:categorical_bounds)
    DETACH DELETE n
'''
graph.run(query)

## Gremlin ##

# Match #

In [194]:
# Property keys written onto every Gremlin vertex, in write order.
# A key that a node does not carry is stored as the empty string.
_VERTEX_PROPERTY_KEYS = (
    ['node_id', 'name', 'notes', 'real_lower_bound', 'real_upper_bound', 'real_units']
    + ['real_{}_{}'.format(field, index)
       for index in range(1, 7)
       for field in ('name', 'value', 'units')]
    + ['description', 'type', 'tags']
)


def _parse_json_payload(raw):
    """Parse one serialized list; accepts valid JSON or single-quoted pseudo-JSON."""
    try:
        # Valid JSON must be tried first: blindly swapping quotes would corrupt any
        # value containing an apostrophe (e.g. "Ali's run").
        return json.loads(raw)
    except ValueError:
        # Legacy input uses single quotes as string delimiters.
        return json.loads(raw.replace("'", '"'))


def construct_subgraph(nodes_json, edges_json):
    """Load serialized nodes and edges into the Gremlin graph as vertices and edges.

    Relies on a Gremlin traversal source named ``g`` being defined at notebook level.

    Args:
        nodes_json: iterable of strings, each encoding a list of node objects with
            ``labels`` (first entry becomes the vertex label) and ``properties``
            (must contain ``node_id``).
        edges_json: iterable of strings, each encoding a list of edge objects with
            ``label`` plus ``start`` / ``end`` objects whose ``properties`` hold
            ``node_id``.

    Nodes are de-duplicated by ``node_id``. Edges are de-duplicated by their
    (start, end) id pair only, so a second edge between the same two nodes is
    dropped even when its label differs.

    Raises:
        ValueError: if a payload is neither valid JSON nor single-quoted pseudo-JSON.
        KeyError: if a node or edge lacks the fields listed above.
    """
    nodes = []
    seen_node_ids = set()
    for node_str in nodes_json:
        for node in _parse_json_payload(node_str):
            current_id = node['properties']['node_id']
            if current_id not in seen_node_ids:
                seen_node_ids.add(current_id)
                nodes.append(node)

    edges = []
    seen_edge_keys = set()
    for edge_str in edges_json:
        for edge in _parse_json_payload(edge_str):
            edge_key = (edge['start']['properties']['node_id'],
                        edge['end']['properties']['node_id'])
            if edge_key not in seen_edge_keys:
                seen_edge_keys.add(edge_key)
                edges.append(edge)

    for node in nodes:
        properties = node['properties']
        vertex = g.addV(node['labels'][0])
        # Every vertex receives the full fixed key set so all vertices share one schema.
        for key in _VERTEX_PROPERTY_KEYS:
            vertex = vertex.property(key, properties.get(key, ''))
        vertex.next()

    for edge in edges:
        start_id = edge['start']['properties']['node_id']
        end_id = edge['end']['properties']['node_id']
        label = edge['label']
        # Look up both endpoints by node_id and add an edge from start to end.
        g.V().has('node_id', start_id).as_('v1').V().has('node_id', end_id).addE(label).from_('v1').toList()

In [15]:
def convert_jsonpath(path):
    """Flatten a filter-style JSONPath predicate into ``[path, value, path, value, ...]``.

    Every ``]`` is removed from ``path`` and the remainder is split on ``&&``,
    ``||`` and ``=``. The resulting tokens are consumed two at a time as
    (key, value) pairs; a trailing token without a partner is ignored.

    A key token containing ``[`` contributes the text before the bracket to a
    running prefix that is shared with all later keys. Each emitted path is
    ``prefix + key`` followed by one ``]`` per ``[`` in the prefix.

    Tokens are not stripped, so whitespace around ``&&``/``||`` is kept in the
    returned keys and values.

    Args:
        path: predicate string, e.g. ``"$..['type'='process_run']"``.

    Returns:
        list[str]: alternating JSONPath expressions and the raw value tokens.

    Raises:
        ValueError: if a key token contains more than one ``[``.
    """
    tokens = re.split(r'&&|\|\||=', path.replace(']', ''))
    prefix = ""
    pairs = []
    # Step over the key positions only; the value sits right after each key.
    for key_pos in range(0, len(tokens) - 1, 2):
        key = tokens[key_pos]
        if '[' in key:
            # Exactly one '[' is expected here; the unpacking enforces it.
            head, key = key.split("[")
            prefix += head + "["
        # Close every bracket opened so far in the accumulated prefix.
        pairs.append(prefix + key + "]" * prefix.count("["))
        pairs.append(tokens[key_pos + 1])
    return pairs

In [16]:
def json_to_graph(nodes_json, edges_json):
    """Create nodes and relationships in the module-level ``subgraph`` from query results.

    Args:
        nodes_json: iterable of node lists; every node is a dict with a
            ``'labels'`` list (the first entry is used as the node label) and a
            ``'properties'`` dict that is copied onto the created node.
        edges_json: iterable of edge lists; every edge is a dict with
            ``'start'`` and ``'end'`` entries (each holding
            ``['properties']['node_id']``) and a ``'label'`` used as the
            relationship type.

    Returns:
        The module-level ``subgraph`` object after the inserts.

    Notes:
        ``subgraph`` is not defined in this function; it must already exist as a
        global exposing ``create`` and ``nodes.match`` (presumably a py2neo
        ``Graph`` -- confirm where it is defined).
    """
    for nodes in nodes_json:
        for node in nodes:
            subgraph.create(Node(node['labels'][0], **node["properties"]))
    for edges in edges_json:
        for edge in edges:
            start_node_id = edge['start']['properties']['node_id']
            end_node_id = edge['end']['properties']['node_id']
            relationship_type = edge['label']
            # NodeMatcher.match takes property filters as keyword arguments;
            # passing properties={...} would filter on a property literally
            # named "properties" and never find the node.
            start_node = subgraph.nodes.match(node_id=start_node_id).first()
            end_node = subgraph.nodes.match(node_id=end_node_id).first()
            # .first() returns None when nothing matches; skip dangling edges.
            if start_node is not None and end_node is not None:
                # Create the edge in the same graph that holds its endpoints
                # (and that this function returns).
                subgraph.create(Relationship(start_node, relationship_type, end_node))
    return subgraph

In [145]:
def generate_match_query(query):
    """Translate a ``"<child> FOLLOWS+ <parent>"`` predicate into a Cypher query.

    The text right of ``" FOLLOWS+ "`` (the parent) constrains ``n``, the start
    of a directed path of 1 to 5 relationships; the text left of it (the child)
    constrains ``m``, the end of that path. Both sides are converted with
    ``convert_jsonpath`` and every (path, value) pair becomes an
    ``apoc.json.path(...)[0] =~ value`` regular-expression test. All tests are
    combined with ``AND``.

    The generated query is printed before it is returned.

    Args:
        query: predicate string containing the ``" FOLLOWS+ "`` separator.

    Returns:
        str: Cypher text returning ``nodes_json`` and ``edges_json`` per path.

    Raises:
        IndexError: if ``query`` does not contain ``" FOLLOWS+ "``.
        ValueError: if neither side yields any ``key=value`` condition, since
            the WHERE clause would be empty and the Cypher invalid.
    """
    sides = query.split(" FOLLOWS+ ")
    child_path = sides[0]
    parent_path = sides[1]
    parent = convert_jsonpath(parent_path)
    child = convert_jsonpath(child_path)

    # convert_jsonpath returns a flat [path, value, path, value, ...] list.
    condition = "apoc.json.path({}, \"{}\")[0] =~ {}"
    conditions = [
        condition.format("json_n", parent[i], parent[i + 1])
        for i in range(0, len(parent), 2)
    ]
    conditions += [
        condition.format("json_m", child[i], child[i + 1])
        for i in range(0, len(child), 2)
    ]
    if not conditions:
        raise ValueError(
            "predicate contains no key=value conditions: {!r}".format(query)
        )

    query = '''
    MATCH path = (n) - [*1..5] - > (m) 
    WITH path, apoc.convert.toJson(apoc.convert.toMap(n)) AS json_n, 
         apoc.convert.toJson(apoc.convert.toMap(m)) AS json_m 
    WHERE 
    '''
    # Joining avoids a dangling "AND" when the child side has no conditions.
    query += " AND ".join(conditions) + " "
    query += '''
    WITH [node in nodes(path) | { labels: labels(node), properties: properties(node) }] AS nodes,
    relationships(path) AS relationships
    RETURN apoc.convert.toJson(nodes) as nodes_json, apoc.convert.toJson(relationships) as edges_json
    '''
    print(query)
    return query

In [146]:
def match(query_predicate):
    """Run the Cypher query built from ``query_predicate`` and collect its rows.

    The predicate is turned into Cypher by ``generate_match_query`` and executed
    through the module-level ``graph``. For every returned record the
    ``nodes_json`` and ``edges_json`` columns are parsed with ``json.loads`` and
    then converted back to text with ``str``, so each list element is the
    Python repr of the parsed value rather than a JSON document.

    Args:
        query_predicate: predicate string accepted by ``generate_match_query``.

    Returns:
        tuple[list[str], list[str]]: ``(nodes_json, edges_json)`` with one
        entry per matched path, in result order.
    """
    cypher = generate_match_query(query_predicate)
    # Consume the result once, keeping both columns of a record together.
    rows = [
        (str(json.loads(row['nodes_json'])), str(json.loads(row['edges_json'])))
        for row in graph.run(cypher)
    ]
    nodes_json = [nodes_repr for nodes_repr, _ in rows]
    edges_json = [edges_repr for _, edges_repr in rows]
    return nodes_json, edges_json

In [147]:
# Predicate layout: "<child filter> FOLLOWS+ <parent filter>".
# Here: nodes with type 'material_run' and a name matching .*EuS.* that sit at the
# end of a 1..5-relationship path starting at a node with type 'process_run'.
query_predicate = "$..['type'='material_run' && 'name'='.*EuS.*'] FOLLOWS+ $..['type'='process_run']"
# One list entry per matched path; both results are read by a later cell.
nodes_json, edges_json = match(query_predicate)


    MATCH path = (n) - [*1..5] - > (m) 
    WITH path, apoc.convert.toJson(apoc.convert.toMap(n)) AS json_n, 
         apoc.convert.toJson(apoc.convert.toMap(m)) AS json_m 
    WHERE 
    apoc.json.path(json_n, "$..['type']")[0] =~ 'process_run' AND apoc.json.path(json_m, "$..['type']")[0] =~ 'material_run'  AND apoc.json.path(json_m, "$..[ 'name']")[0] =~ '.*EuS.*' 
    WITH [node in nodes(path) | { labels: labels(node), properties: properties(node) }] AS nodes,
    relationships(path) AS relationships
    RETURN apoc.convert.toJson(nodes) as nodes_json, apoc.convert.toJson(relationships) as edges_json
    


In [211]:
# Connect to the Gremlin server on localhost:8182 and bind the global traversal
# source `g` to that connection. The connection stays open until
# remote_conn.close() is called.
remote_conn = DriverRemoteConnection('ws://localhost:8182/gremlin','g')
g = traversal().with_remote(remote_conn)

In [213]:
# Pass the match() results to construct_subgraph (defined elsewhere in the notebook).
# NOTE(review): presumably this writes through the global Gremlin traversal `g`,
# so the connection cell must have been run first -- confirm.
construct_subgraph(nodes_json, edges_json)

In [210]:
# Release the Gremlin server connection that `g` was bound to.
# NOTE(review): this cell's execution count (210) is lower than that of the cell
# opening the connection (211); run the cells top to bottom on a fresh kernel.
remote_conn.close()