### Cook labels and properties

In [288]:
import re
import random
import json
import pandas as pd

# Open and load the graph schema json file
with open('schema.json', 'r',encoding='utf-8-sig') as file:
    schema = json.load(file)
    
# Extract nodes and edges from the schema
#labels = [node['labels'][0] for node in schema[0]['nodes']]
relationships = [relationship['type'] for relationship in schema[0]['relationships']]


# Get detailed properties from the csv file
common_names = pd.read_csv('memgraph-query-results-export.csv', index_col=False)

def group_labels(df, label_col, name_col):
    grouped = df.groupby(label_col)[name_col].apply(list).to_dict()
    return grouped

# Applying the function
grouped_names = group_labels(common_names, 'label', 'commonName')

#***************************************
labels = list(grouped_names.keys())
property_labels= ["commonName"] #will be generalized later

### Building functions

Things to have:
- depthmanager class (done)
- clause class (done)
- node class (done)
- add functions (done)


In [276]:
#Imports
import random

# Define labels and properties
labels = list(grouped_names.keys())
property_labels= ["commonName"] #will be generalized later

In [277]:
class DepthManager:
    _max_depth = 5  # Default maximum depth
    _min_depth = 3

    def __init__(self):
        self.depth = 0  # Starting depth

    @classmethod
    def set_max_depth(cls, depth):
        if depth > cls._min_depth:
            cls._max_depth = depth
        else:
            print("Maximum depth cannot be smaller than the min depth! \n The default max_depth is", cls._max_depth)
    
    def depth_control(self, func):
        def wrapper(*args, **kwargs):
            if self.depth == self._max_depth:
                print("Max depth reached")
                return None
            result = func(*args, **kwargs)
            self.depth += 1  # Increment depth after function call
            return result
        return wrapper
    
    def reset_depth(self):
        self.depth = 0

class Clause():
    def __init__(self, value, children=None):
        self.value = value
        self.children = children if children is not None else []

    def __str__(self):
        if not self.children:
            return str(self.value)
        return f"{self.value} {' '.join(str(child) for child in self.children)}"

class Node():
    def __init__(self, value, children=None):
         self.value = value
         self.children = children if children is not None else []

    def __str__(self):
        if not self.children:
                return str(self.value)
        if self.value == '-':  
            return ' '.join(str(child) for child in self.children)
        return f"{self.value}({', '.join(str(child) for child in self.children)})"



# # Use the depth_control decorator from the manager instance
# @DepthManager.depth_control
# def add_node(depth):
#         label = random.choice(labels)
#         return Node(f"({label.lower()}: {label})", depth=depth)


In [278]:
#Things to import / create / access
depth_manager = DepthManager()
current_depth = depth_manager.depth

In [279]:
# Define labels and properties
labels = list(grouped_names.keys())
property_labels= ["commonName"]


# @depth_manager.depth_control
def add_node():
    label = random.choice(labels)
    return Node(f"({label.lower()}: {label})")

@depth_manager.depth_control
def add_relationship():
    """ Randomly generate a relationship between two nodes """
    current_depth = depth_manager.depth
    rel_type = random.choice(relationships)
    if current_depth>=3 and random.random() < 0.5:
        direction1 = "<-"
        direction2 = "-"
    else:
        direction1 = "-" 
        direction2 = "->"
    relationship = Node(f"{direction1} [:{rel_type}] {direction2}")
    return Node("-", [relationship])
    # return Clause("-", [relationship], depth=depth)

@depth_manager.depth_control
def add_condition():
    node_label = random.choice(labels)
    # label_lower = node_label.lower()
    property_label = random.choice(property_labels)
    possible_properties = grouped_names[node_label]
    sample_prop_type = possible_properties[0]
    
    value = random.randint(20, 50) if isinstance(sample_prop_type, int) else random.choice(possible_properties)
    operator = random.choice([">", "<", "=", "<=", ">="]) if isinstance(sample_prop_type, int) else '='
    # return Node("WHERE", [Node(f"{label_lower}.{property_label} {operator} {value}", [], depth)], depth=depth)
    return Clause("WHERE", [Clause(f"{node_label}.{property_label} {operator} '{value}'", [])])

def add_return(k):
    choices = random.sample(labels, k)
    nodes = [add_node() for choice in choices if add_node()]
    if nodes:  # Check if the list is not empty
        return Clause("RETURN", nodes)
    return None

def alternate_functions(flag):
    if flag:
        return add_node(), not flag
    else:
        return add_relationship(), not flag
    

import re
#Trying
def is_relationship(part):
    """
    Determine if the given part of a query is a relationship based on containing "[]"
    Ensures that part is a string before checking.
    """
    # pattern = re.compile(r'\[(.*?)\]')
    pattern = re.compile(r'(?:-\s*\[:.*?\]\s*->|<-\s*\[:.*?\]\s*-)')
    # Ensure part is a string or bytes-like object
    if isinstance(part, str):
        if pattern.search(part):
            return True
    return False



In [280]:
#DEBUGGING

def alternate_functions_debug(flag):
    if flag:
        print("current depth: ",depth_manager.depth)
        print("added node")
        return add_node(), not flag
    else:
        print("current depth: ",depth_manager.depth)
        print("added relationship")
        return add_relationship(), not flag

depth_manager.reset_depth()
test_where = add_condition()
print(test_where)

WHERE DrugClass.commonName = 'Increased Coagulation Factor VIII Concentration'


In [281]:
def generate_random_query():
    depth_manager.reset_depth()
    depth = depth_manager.depth
    parts = []
    flag = True  # Ensure that we start with producing nodes
    return_num = 1 #default returns ONE random label
    part_num = random.randint(1, depth_manager._max_depth-2)

    # Keep adding nodes and relationships while depth is within limit
    for _ in range(part_num+1):
        part, flag = alternate_functions(flag)
        if part is None:
            break
        parts.append(part)

    # print(parts[-1],type(parts[-1]), is_relationship(parts[-1]), is_relationship(str(parts[-1])))

    # Check if the last part is a relationship; if so, add a terminating node
    if parts and is_relationship(str(parts[-1])): #ensure the input part is in string format
        final_node = add_node()  # Generate a final node
        if final_node:
            parts.append(final_node)
            print("final_node added:", final_node)
    

    # Optionally add a WHERE clause if depth is still under max_depth
    if depth < depth_manager._max_depth-1 and random.random() > 0.5:
        condition = add_condition()
        # if condition:
        parts.append(condition)

    # Add a RETURN clause if depth is still under max_depth
    # if depth < depth_manager._max_depth:
    ret = add_return(return_num)
    if ret:
        parts.append(ret)

    # Create the MATCH node only if there are parts to include
    print("part_num is: ", part_num)
    return Clause("MATCH", parts) if parts else None
    # return Clause("MATCH", parts) if parts else None


In [282]:
depth_manager.set_max_depth(2)

Maximum depth cannot be smaller than the min depth! 
 The default max_depth is 5


In [283]:
print(add_node())

(symptom: Symptom)


In [284]:
depth_manager.set_max_depth(4)
depth_manager.reset_depth()
query = generate_random_query()
print(query)

final_node added: (pathway: Pathway)
part_num is:  1
MATCH (disease: Disease) - [:CHEMICALINCREASESEXPRESSION] -> (pathway: Pathway) WHERE Drug.commonName = 'Aluminum hydroxide' RETURN (disease: Disease)


In [285]:
depth_manager.reset_depth()
generate_random_query()

part_num is:  2


<__main__.Clause at 0x7f81c11a54c0>

In [286]:
#Test is_relationship
test_str = "- [:DRUGINCLASS] -> "
is_relationship(test_str)

True

In [287]:
# Set maximum depth dynamically
depth_manager.set_max_depth(5)

# Generate and print some random queries
for _ in range(5):
    depth_manager.reset_depth()
    query = generate_random_query()
    print(query, "\n", depth_manager.depth)
    

final_node added: (bodypart: BodyPart)
part_num is:  1
MATCH (molecularfunction: MolecularFunction) - [:GENEINTERACTSWITHGENE] -> (bodypart: BodyPart) RETURN (molecularfunction: MolecularFunction) 
 1
final_node added: (drug: Drug)
part_num is:  3
MATCH (cellularcomponent: CellularComponent) - [:GENEASSOCIATEDWITHCELLULARCOMPONENT] -> (biologicalprocess: BiologicalProcess) - [:GENECOVARIESWITHGENE] -> (drug: Drug) WHERE CellularComponent.commonName = 'calcineurin complex' RETURN (drugclass: DrugClass) 
 3
final_node added: (pathway: Pathway)
part_num is:  1
MATCH (cellularcomponent: CellularComponent) - [:GENEASSOCIATESWITHDISEASE] -> (pathway: Pathway) RETURN (bodypart: BodyPart) 
 1
final_node added: (pathway: Pathway)
part_num is:  1
MATCH (disease: Disease) - [:GENEINTERACTSWITHGENE] -> (pathway: Pathway) WHERE MolecularFunction.commonName = 'long-chain fatty acyl-CoA binding' RETURN (drug: Drug) 
 2
final_node added: (molecularfunction: MolecularFunction)
part_num is:  3
MATCH (pa

### To do
- WHERE and RETURN has to match the node called in MATCH 
- Add hops to relationship (done)
- Add specific properties to nodes like (done)

"MATCH (wallstreet:Movie {title: 'Wall Street'})<-[:ACTED_IN]-(actor)

RETURN actor.name"
- Make sure the current generated queries work 
- Mutation/Crossover function (see what GPT says can do; )


In [529]:
# import re

class Clause():
    def __init__(self, value, children=None):
        self.value = value
        self.children = children if children is not None else []

    def __str__(self):
        if not self.children:
            return str(self.value)
        if self.value == "RETURN":
            return f"{self.value} {', '.join(str(child) for child in self.children)}"
        return f"{self.value} {' '.join(str(child) for child in self.children)}"
    
class Node:
    """
    When called, will add connector to either nodes or relationships 
    """
    def __init__(self, value, children=None):
        self.value = value
        self.children = children if children is not None else []

    def __str__(self):
        if not self.children:
            return str(self.value)
        if self.value == '-':  
            return ' '.join(str(child) for child in self.children)
        return f"{self.value}({', '.join(str(child) for child in self.children)})"

class QueryManager:
    depth_manager = DepthManager()
    def __init__(self):
        self.nodes = []
        self.node_labels = []
        self.relationships = []
        
        # self.property_labels = []
        self.grouped_info = {} #need to be grouped format with {'labels':[properties]...}
        #TODO: generalize this to enable random pick of any value of any property; which means
        #need to generalize the grouped_info dict maybe to a mega dict or mega list

        self.usable_labels = set()

        # self.selected_node_label = []
        # self.selected_alias = []
        self.selected_label_alias = {}
        # self.selected_property_label = []
        # self.selected_properties = []
    
    def import_grouped_info(self, input_group):
        if input_group:
            if type(input_group) is dict:
                self.grouped_info = input_group
                self.node_labels = list(self.grouped_info.keys())
                print("loaded node_labels:",self.node_labels)
            else:
                print("input grouped info need to be dictionary type")
        else:
            print("input grouped info cannot be empty")
    
    def import_relationships(self, input_relationships):
        if input_relationships:
            self.relationships = input_relationships
        else:
            print("relationships cannot be empty")


    def create_unique_alias(self, label):
        """Creates a unique alias for a node to prevent label overlap in queries."""
        base_alias = label.lower()
        alias = base_alias
        counter = 1
        while alias in self.usable_labels:
            alias = f"{base_alias}{counter}"
            counter += 1
        return alias
    
    def add_node(self):
        """Adds a node with random properties selected from grouped_info."""
        node_label = ''
        if self.node_labels:
            node_label = random.choice(self.node_labels) 
            # self.selected_node_label.append(node_label)
            # print(self.selected_node_label, node_label, type(self.selected_node_label))

            possible_props = self.grouped_info[node_label]
            property_label, properties_list = random.choice(list(possible_props.items()))

            alias = self.create_unique_alias(node_label)
            # self.selected_alias.append(alias)

            self.selected_label_alias[alias]=node_label

            property_value = random.choice(properties_list)
            properties_str = f"{property_label}: '{property_value}'" if possible_props else ''
            node_value = f"{node_label} {{{properties_str}}}"
            node = Node(f"({alias}:{node_value})")
            self.nodes.append(node)
            self.usable_labels.add(alias)  # Store label for possible RETURN clause usage
            return node 
        print("No node labels available. Please import grouped info first.")
        return None

        
    @depth_manager.depth_control
    def add_hop(self,hop_p=0.5):
        """
        Randomly generate hops as condition to relationship based on a customizable possibility;
        the default possibility is 0.5
        """
        current_depth = depth_manager.depth
        hop = random.randint(1,10) #TODO: see if this is reasonable
        upper_hop = hop + random.randint(1,10)
        exact_hop = f"*{hop}"
        ceiling_hop = f"*..{upper_hop}"
        floor_hop = f"*{hop}.."
        interval_hop = f"*{hop}..{upper_hop}"
        hop_choices = [exact_hop, ceiling_hop, floor_hop, interval_hop]
        if random.random() > hop_p and current_depth < depth_manager._max_depth:
            hop_choice = random.choice(hop_choices)
            return hop_choice
        else:
            return None


    @depth_manager.depth_control
    def add_relationship(self, hop_p=0.5):
        """ Randomly generate a relationship between two nodes """
        current_depth = depth_manager.depth
        rel_type = random.choice(self.relationships)
        if current_depth>=3 and random.random() > 0.5: 
            direction1 = "<-"
            direction2 = "-"
        else:
            direction1 = "-" 
            direction2 = "->"
        hop_result = self.add_hop(hop_p) if self.add_hop(hop_p) else ''
        relationship = Node(f"{direction1} [:{rel_type}{hop_result}] {direction2}")
        return Node("-", [relationship])
    

    @depth_manager.depth_control
    def add_condition(self, where_p=0.5):
        """
        Randomly generate WHERE clause based on a customizable possibility;
        the default possibility where_p is 0.5
        """
        current_depth = depth_manager.depth
        if random.random() > where_p and current_depth < depth_manager._max_depth:
            # node_label = random.choice(labels)
            # label_lower = node_label.lower()
            # property_label = random.choice(property_labels)
            # node_label = random.choice(self.selected_node_label)
            # alias = random.choice
            alias, node_label = random.choice(list(self.selected_label_alias.items()))
            print(alias, node_label)

            possible_properties = self.grouped_info[node_label]
            if possible_properties:
                property_label, properties_list = random.choice(list(possible_properties.items()))
                sample_prop_type = properties_list[0]
                value = random.randint(20, 50) if isinstance(sample_prop_type, int) else random.choice(properties_list) 
            #TODO: customize the int part

                operator = random.choice([">", "<", "=", "<=", ">="]) if isinstance(sample_prop_type, int) else '='
                return Clause("WHERE", [Clause(f"{alias}.{property_label} {operator} '{value}'", [])])
            return None
        return None

    def add_return(self, return_num=None):
        if return_num:
            random_k = random.randint(1,return_num)
            # choices = random.sample(self.usable_labels, return_num)
        usable_labels = self.get_usable_labels()
        random_k = random.randint(1,len(usable_labels))
        choices = random.sample(self.usable_labels, random_k)
        if choices:  # Check if the list is not empty
            return Clause("RETURN", choices)
        return None

    # Helper functions
    def get_usable_labels(self):
        return list(self.usable_labels)
    
    def reset(self):
        self.nodes = []
        self.node_labels = []
        self.grouped_info = {} 
        self.usable_labels = set()
        # self.selected_node_label = []
        self.selected_label_alias = {}
    
        
    def is_relationship(self, part):
        """
        Determine if the given part of a query is a relationship based on containing "[]"
        Ensures that part is a string before checking.
        """
        # pattern = re.compile(r'\[(.*?)\]')
        pattern = re.compile(r'(?:-\s*\[:.*?\]\s*->|<-\s*\[:.*?\]\s*-)')
        trying = r"-\s*\[:?([A-Za-z0-9_]+)?(\*\d*(\.\.\d*)?)?\]\s*[-<>]?"
        # Ensure part is a string or bytes-like object
        if isinstance(part, str):
            # if pattern.search(part):
            if re.search(trying, part):
                return True
        return False

    
    # Integrated function
    def generate_query(self, flag=True, return_num=None, part_num=None, hop_p=0.5, where_p=0.5):
        """
        Generate cypher query with random parts; 
        part_num is by default randomly set, but can also be customized. 
        """
        def alternate_functions(flag):
            if flag:
                return self.add_node(), not flag
            else:
                return self.add_relationship(hop_p), not flag
            
        depth = depth_manager.depth
        parts = []
        #flag = True  # Ensure that we start with producing nodes
        #return_num = 1 #default returns ONE random label
        if part_num is None:
            part_num = random.randint(1, depth_manager._max_depth-2)

        # Keep adding nodes and relationships while depth is within limit
        for _ in range(part_num+1):
            part, flag = alternate_functions(flag)
            if part is None:
                break
            parts.append(part)
        # Check if the last part is a relationship; if so, add a terminating node
        if parts and self.is_relationship(str(parts[-1])): #ensure the input part is in string format
            final_node = self.add_node()  # Generate a final node if previously ended with relationship
            if final_node:
                parts.append(final_node)
                print("final_node added:", final_node)

        # Optionally add a WHERE clause if depth is still under max_depth
        # if depth < depth_manager._max_depth-1:
        condition = self.add_condition(where_p)
        if condition:
            parts.append(condition)

        # Add RETURN clause 
        ret = self.add_return(return_num)
        if ret:
            parts.append(ret)
        
        #reset all paras before a new generation
        self.reset()
        depth_manager.reset_depth()

        # Create the MATCH node only if there are parts to include
        print("part_num is: ", part_num)
        return Clause("MATCH", parts) if parts else None

        
        


In [401]:
import re

regex = r"-\s*\[:?([A-Za-z0-9_]+)?(\*\d*(\.\.\d*)?)?\]\s*[-<>]?"
tests = [
    "- [:GENECOVARIESWITHGENE*4..] ->",
    "- [:GENECOVARIESWITHGENE] ->",
    "- [*3..] -",
    "- [:GENEINTERACTSWITHGENE] ->"
]

for test in tests:
    if re.search(regex, test):
        print(f"Match found in: {test}")
    else:
        print(f"No match found in: {test}")


Match found in: - [:GENECOVARIESWITHGENE*4..] ->
Match found in: - [:GENECOVARIESWITHGENE] ->
Match found in: - [*3..] -
Match found in: - [:GENEINTERACTSWITHGENE] ->


In [524]:
test_dict = {
    'BiologicalProcess': {
        'regulation_labels': 
            'regulation of nervous system development',

        'development_labels': [
            'tube morphogenesis',
            'organ development'
        ]
    },
    'CellComponent': {
        'structure_labels': [
            'mitochondrial membrane',
            'cell cortex'
        ],
        'function_labels': [
            'protein binding',
            'ion channel activity'
        ]
    }
}
type(test_dict)


dict

In [526]:
test_dict.keys()

key, value = random.choice(list(test_dict['BiologicalProcess'].items()))
print(key,'\n',value, type(key), type(value))

development_labels 
 ['tube morphogenesis', 'organ development'] <class 'str'> <class 'list'>


### Create alzkb nested dictionary

In [494]:
# type(grouped_names)
# grouped_names.keys()
# list(grouped_names.values())[0]

geneSymbol_csv = pd.read_csv('geneSymbol.csv', index_col=False)
# type(geneSymbol_csv)
# type(geneSymbol_csv['g.geneSymbol'])

geneSymbol = list(geneSymbol_csv['g.geneSymbol'])
# geneSymbol

geneSymbol_sub_dict = {}
geneSymbol_sub_dict['geneSymbol'] = geneSymbol


alzkb_nested_dict = {}
for key in grouped_names.keys():
    sub_dict = {}
    if key == 'Gene':
        sub_dict['commonName']= grouped_names[key]
        sub_dict['geneSymbol']= geneSymbol
    else:
        sub_dict['commonName']= grouped_names[key]
    alzkb_nested_dict[key] = sub_dict

# alzkb_nested_dict['Gene'].keys()



dict_keys(['commonName', 'geneSymbol'])

In [531]:
# Test if alzkb one works
qm = QueryManager()
for _ in range(5):
    qm.reset()
    qm.import_grouped_info(alzkb_nested_dict)
    qm.import_relationships(relationships)
    depth_manager.set_max_depth(10)
    depth_manager.reset_depth()
    print(qm.generate_query())

loaded node_labels: ['BiologicalProcess', 'BodyPart', 'CellularComponent', 'Disease', 'Drug', 'DrugClass', 'Gene', 'MolecularFunction', 'Pathway', 'Symptom']
part_num is:  8
MATCH (molecularfunction:MolecularFunction {commonName: 'lysophosphatidic acid receptor activity'}) - [:CHEMICALINCREASESEXPRESSION] -> (gene:Gene {geneSymbol: 'LOC105376419'}) - [:GENEREGULATESGENENone] -> (disease:Disease {commonName: 'Alzheimer Disease, Early Onset'}) - [:GENEHASMOLECULARFUNCTIONNone] -> (biologicalprocess:BiologicalProcess {commonName: 'regulation of caveolin-mediated endocytosis'}) - [:DISEASEASSOCIATESWITHDISEASE] -> (pathway:Pathway {commonName: 'MFAP5-mediated ovarian cancer cell motility and invasiveness'}) RETURN pathway, molecularfunction, biologicalprocess
loaded node_labels: ['BiologicalProcess', 'BodyPart', 'CellularComponent', 'Disease', 'Drug', 'DrugClass', 'Gene', 'MolecularFunction', 'Pathway', 'Symptom']
disease Disease
part_num is:  4
MATCH (pathway:Pathway {commonName: 'inhibit

since Python 3.9 and will be removed in a subsequent version.
  choices = random.sample(self.usable_labels, random_k)
