In [377]:
test_dict = {
    'BiologicalProcess': {
        'regulation_labels': 
            'regulation of nervous system development',

        'development_labels': [
            'tube morphogenesis',
            'organ development'
        ]
    },
    'CellComponent': {
        'structure_labels': [
            'mitochondrial membrane',
            'cell cortex'
        ],
        'function_labels': [
            'protein binding',
            'ion channel activity'
        ]
    }
}

### Cook labels and properties
- Grep csv file containing node labels, relationships, property labels, and wanted properties from Memgraph
- Create a nested dictionary with {node_labels:{sub_dictionary of properties}}, the sub_dictionary contains corresponding {property_labels: properties}

In [378]:
import re
import random
import json
import pandas as pd

# Open and load the graph schema json file
with open('schema.json', 'r',encoding='utf-8-sig') as file:
    schema = json.load(file)
    
# Extract nodes and edges from the schema
#labels = [node['labels'][0] for node in schema[0]['nodes']]
relationships = [relationship['type'] for relationship in schema[0]['relationships']]


# Get detailed properties from the csv file
common_names = pd.read_csv('memgraph-query-results-export.csv', index_col=False)

def group_labels(df, label_col, name_col):
    grouped = df.groupby(label_col)[name_col].apply(list).to_dict()
    return grouped

# Applying the function
grouped_names = group_labels(common_names, 'label', 'commonName')

#***************************************
labels = list(grouped_names.keys())
property_labels= ["commonName"] #will be generalized later

### Create alzkb nested dictionary
- In the current test case, all node labels are used; only commonName properties are selected for all nodes except that geneSymbol for Gene node is also added.

In [379]:
# type(grouped_names)
# grouped_names.keys()
# list(grouped_names.values())[0]

geneSymbol_csv = pd.read_csv('geneSymbol.csv', index_col=False)
# type(geneSymbol_csv)
# type(geneSymbol_csv['g.geneSymbol'])

geneSymbol = list(geneSymbol_csv['g.geneSymbol'])
# geneSymbol

geneSymbol_sub_dict = {}
geneSymbol_sub_dict['geneSymbol'] = geneSymbol


alzkb_nested_dict = {}
for key in grouped_names.keys():
    sub_dict = {}
    if key == 'Gene':
        sub_dict['commonName']= grouped_names[key]
        sub_dict['geneSymbol']= geneSymbol
    else:
        sub_dict['commonName']= grouped_names[key]
    alzkb_nested_dict[key] = sub_dict

# alzkb_nested_dict['Gene'].keys()



In [380]:
import random
import re

In [381]:
class DepthManager:
    _max_depth = 5  # Default maximum depth
    _min_depth = 3

    def __init__(self):
        self.depth = 0  # Starting depth

    @classmethod
    def set_max_depth(cls, depth):
        if depth > cls._min_depth:
            cls._max_depth = depth
        else:
            print("Maximum depth cannot be smaller than the min depth! \n The default max_depth is", cls._max_depth)
    
    def depth_control(self, func):
        def wrapper(*args, **kwargs):
            if self.depth == self._max_depth:
                print("Max depth reached")
                return None
            result = func(*args, **kwargs)
            self.depth += 1  # Increment depth after function call
            return result
        return wrapper
    
    def reset_depth(self):
        self.depth = 0
 
class Clause():
    def __init__(self, value, children=None):
        self.value = value
        self.children = children if children is not None else []

    def __str__(self):
        if not self.children:
            return str(self.value)
        if self.value == "RETURN":
            return f"{self.value} {', '.join(str(child) for child in self.children)}"
        return f"{self.value} {' '.join(str(child) for child in self.children)}"
    
class Node:
    """
    When called, will add connector to either nodes or relationships 
    """
    def __init__(self, value, children=None):
        self.value = value
        self.children = children if children is not None else []

    def __str__(self):
        if not self.children:
            return str(self.value)
        if self.value == '-':  
            return ' '.join(str(child) for child in self.children)
        return f"{self.value}({', '.join(str(child) for child in self.children)})"


### Implementing Tree

In [382]:
depth_manager = DepthManager()  

In [383]:
class TreeNode:
   
    def __init__(self, value):
        self.value = value
        self.children = []
        # self.level = 0
    

    def add_child(self, node):
        """Add a TreeNode or value as a child."""
        # if not isinstance(node, TreeNode):
            # node = TreeNode(node)  # Ensure all children are TreeNode instances
        self.children.append(node)
        # self.level += 1

    def __str__(self):
        # Use the helper method for generating the string with indentation
        return self._str_recursive(level=0)

    def _str_recursive(self,level):
        # Create the string representation with indentation for current node
        ret = "\t" *level + str(self.value) + "\n"  # Indent based on the current level
        for child in self.children:
            ret += child._str_recursive(level+1)
        return ret

    def __repr__(self):
        return f'<TreeNode {self.value}>'
    
    # def reset_level(self):
    #     self.level = 0 


In [384]:
class QueryManager:
    def __init__(self):
        self.root = TreeNode("ROOT")  # All parts will be children of root
        self.current_node = self.root  # Current node context for adding parts
        self.node_labels = []
        self.relationships = []
        self.grouped_info = {}
        self.usable_labels = set()

        
        self.parts = []
        self.selected_label_alias = {}
        # self.parts = [] 

    def reset_per_gen(self):
        self.root = TreeNode("ROOT")
        self.current_node = self.root  # Reset the tree for new generation
        self.parts = []
        self.selected_label_alias = {}
        self.usable_labels.clear()

    def import_grouped_info(self, input_group):
        if input_group:
            if type(input_group) is dict:
                self.grouped_info = input_group
                self.node_labels = list(self.grouped_info.keys())
                # print("loaded node_labels:",self.node_labels)
            else:
                print("input grouped info need to be dictionary type")
        else:
            print("input grouped info cannot be empty")
    
    def import_relationships(self, input_relationships):
        if input_relationships:
            self.relationships = input_relationships
        else:
            print("relationships cannot be empty")
    
    def create_unique_alias(self, label):
        """Creates a unique alias for a node to prevent label overlap in queries."""
        base_alias = label.lower()
        alias = base_alias
        counter = 1
        while alias in self.usable_labels:
            alias = f"{base_alias}{counter}"
            counter += 1
        return alias

    # def add_node(self):
    #     """Implementation to add a node under current context."""
    #     node_label = random.choice(self.node_labels)
    #     possible_props = self.grouped_info[node_label]
    #     property_label, properties_list = random.choice(list(possible_props.items()))
    #     property_value = random.choice(properties_list)
    #     properties_str = f"{property_label}: '{property_value}'"
    #     node = TreeNode(f"Node({node_label} {{{properties_str}}})")
    #     self.current_node.add_child(node)
    #     self.usable_labels.add(alias)
    #     return node
    
    def add_node(self):
        """Adds a node with random properties selected from grouped_info."""
        # node_label = ''
        if self.node_labels:
            node_label = random.choice(self.node_labels) 
            possible_props = self.grouped_info[node_label]
            property_label, properties_list = random.choice(list(possible_props.items()))
            alias = self.create_unique_alias(node_label)
            self.selected_label_alias[alias]=node_label

            property_value = random.choice(properties_list)
            properties_str = f'''{property_label}: "{property_value}"''' if possible_props else ''
            node_value = f"{node_label} {{{properties_str}}}"


            node = Node(f"({alias}:{node_value})")
            # self.current_node.add_child(node)

            # self.nodes.append(node)
            self.usable_labels.add(alias)  # Store label for possible RETURN clause usage
            return node 
        print("No node labels available. Please import grouped info first.")
        return None

    # def add_relationship(self, hop_p=0.5):
    #     """Implementation to add a relationship."""
    #     if random.random() < hop_p and len(self.nodes) > 1:
    #         from_node, to_node = random.sample(self.nodes, 2)
    #         relationship = TreeNode(f"Relationship from {from_node.value} to {to_node.value}")
    #         self.current_node.add_child(relationship)
    #         return relationship
    #     return None
    
    @depth_manager.depth_control
    def add_hop(self,hop_p=0.5):
        """
        Randomly generate hops as condition to relationship based on a customizable possibility;
        the default possibility is 0.5
        """
        current_depth = depth_manager.depth
        hop = random.randint(1,10) #TODO: see if this is reasonable
        upper_hop = hop + random.randint(1,10)
        exact_hop = f"*{hop}"
        ceiling_hop = f"*..{upper_hop}"
        floor_hop = f"*{hop}.."
        interval_hop = f"*{hop}..{upper_hop}"
        hop_choices = [exact_hop, ceiling_hop, floor_hop, interval_hop]
        if random.random() > hop_p and current_depth < depth_manager._max_depth:
            hop_choice = random.choice(hop_choices)
            return hop_choice
        else:
            return ''


    @depth_manager.depth_control
    def add_relationship(self, hop_p=0.5):
        """ Randomly generate a relationship between two nodes """
        current_depth = depth_manager.depth
        rel_type = random.choice(self.relationships)
        if current_depth>=3 and random.random() > 0.5: 
            direction1 = "<-"
            direction2 = "-"
        else:
            direction1 = "-" 
            direction2 = "->"
        hop_result = self.add_hop(hop_p) if self.add_hop(hop_p) else ''
        relationship = Node(f"{direction1} [:{rel_type}{hop_result}] {direction2}")
        # self.current_node.add_child(relationship)
        # return relationship
        return Node("-", [relationship])
        
    # @depth_manager.depth_control
    # def add_condition(self, where_p=0.5):
    #     """
    #     Randomly generate WHERE clause based on a customizable possibility;
    #     the default possibility where_p is 0.5
    #     """
    #     current_depth = depth_manager.depth
    #     if random.random() > where_p and current_depth < depth_manager._max_depth:
    #         # node_label = random.choice(labels)
    #         # label_lower = node_label.lower()
    #         # property_label = random.choice(property_labels)
    #         # node_label = random.choice(self.selected_node_label)
    #         # alias = random.choice
    #         alias, node_label = random.choice(list(self.selected_label_alias.items()))
    #         print(alias, node_label)

    #         possible_properties = self.grouped_info[node_label]
    #         if possible_properties:
    #             property_label, properties_list = random.choice(list(possible_properties.items()))
    #             sample_prop_type = properties_list[0]
    #             value = random.randint(20, 50) if isinstance(sample_prop_type, int) else random.choice(properties_list) 
    #         #TODO: customize the int part

    #             operator = random.choice([">", "<", "=", "<=", ">="]) if isinstance(sample_prop_type, int) else '='
    #             return Clause("WHERE", [Clause(f"{alias}.{property_label} {operator} '{value}'", [])])
    #         return None
    #     return None
    @staticmethod
    def is_relationship(part):
        """
        Determine if the given part of a query is a relationship based on containing "[]"
        Ensures that part is a string before checking.
        """
        # pattern = re.compile(r'\[(.*?)\]')
        pattern = re.compile(r'(?:-\s*\[:.*?\]\s*->|<-\s*\[:.*?\]\s*-)')
        trying = r"-\s*\[:?([A-Za-z0-9_]+)?(\*\d*(\.\.\d*)?)?\]\s*[-<>]?"
        # Ensure part is a string or bytes-like object
        if isinstance(part, str):
            # if pattern.search(part):
            if re.search(trying, part):
                return True
            return False
        print("input has to be str!")
        return

    
    def get_usable_labels(self):
        return list(self.usable_labels)
    
    def add_return(self, return_num=None):
        if return_num:
            random_k = random.randint(1,return_num)
            # choices = random.sample(self.usable_labels, return_num)
        usable_labels = self.get_usable_labels()
        # print(usable_labels)
        random_k = random.randint(1,len(usable_labels))
        choices = random.sample(self.usable_labels, random_k)
        if choices:  # Check if the list is not empty
            return Clause("RETURN", choices)
        return None
    

    def generate_query(self, flag=True, return_num=None, part_num=None, hop_p=0.5, where_p=0.5):
        self.reset_per_gen()
        depth_manager.reset_depth()

        def alternate_functions(flag):
            if flag:
                return self.add_node(), not flag
            else:
                return self.add_relationship(hop_p), not flag

        if part_num is None:
            part_num = random.randint(1, depth_manager._max_depth-2)

        # Keep adding nodes and relationships while depth is within limit
        for _ in range(part_num+1):
            part, flag = alternate_functions(flag)
            if part is None:
                break
            self.parts.append(part)
            self.current_node.add_child(TreeNode(part))

        # Check if the last part is a relationship; if so, add a terminating node
        if self.parts and self.is_relationship(str(self.parts[-1])): #ensure the input part is in string format
            final_node = self.add_node()  # Generate a final node
            if final_node:
                self.parts.append(final_node)
                print("final_node added:", final_node)
                self.current_node.add_child(TreeNode(final_node))


        # Optionally add a WHERE clause if depth is still under max_depth
        # condition = self.add_condition(where_p)
        # if condition:
        #     self.current_node.add_child(TreeNode(f"WHERE {condition}"))

        # Add RETURN clause 
        ret = self.add_return(return_num)
        if ret:
            self.current_node.add_child(TreeNode(ret))

        # print("Final Query Structure:")
        # print(self.current_node)  # Visualize the tree structure of the query
        # self.current_node.reset_level()
        return self.current_node
    
    ### FOR CROSSOVER RETURN ADJUSTMENT
    def extract_node_alias(self, node_value_str):
        pattern = r"^\(([^:]+):"
        match = re.search(pattern, node_value_str)
        if match:
            return match.group(1)  # Returns the first capturing group
        # print("None found")
        return None

    def collect_labels(self, tree):
        """ Recursively collect labels from the tree that are usable in the RETURN clause. """
        if isinstance(tree, TreeNode) and isinstance(tree.children, list):
            for child in tree.children:
                # Extract label from the current node's value and add it to usable labels
                child_value = str(child.value)
                # print(child_value, type(child_value))
                label = self.extract_node_alias(child_value)
                if label:
                    self.usable_labels.add(label)
            # Recursively process each child
                # self.collect_labels(child)
                
    
    def adjust_return(self, tree):
        """ Adjust the RETURN clause based on the labels collected from the tree. """
        if not isinstance(tree, TreeNode):
            raise TypeError("Expected a tree that is TreeNode instance")
        # Clear existing labels and recollect from the new tree structure
        self.usable_labels.clear()
        self.collect_labels(tree)
        
        if self.usable_labels:
            random_k = random.randint(1, len(self.usable_labels))
            choices = random.sample(self.usable_labels, random_k)
            new_return = Clause("RETURN", choices)
            
            if tree.children and isinstance(tree.children[-1], TreeNode) and "RETURN" in str(tree.children[-1].value):
                tree.children[-1] = TreeNode(new_return)  # Replace the last child with the new RETURN clause
            else:
                tree.add_child(TreeNode(new_return))  # Add new if no RETURN exists
            # print("updated return:",tree.children[-1])
            return tree
        else:
            return None




#### Sample Usage

In [385]:
#Sample Usage
qm = QueryManager()
qm.import_grouped_info(test_dict)  # Assuming this is filled
qm.import_relationships(["rel1", "rel2","rel3"])  # Example relationships
depth_manager.set_max_depth(9)
depth_manager.reset_depth()
result = qm.generate_query()

print(result)

ROOT
	(biologicalprocess:BiologicalProcess {regulation_labels: "r"})
	- [:rel2] ->
	(biologicalprocess1:BiologicalProcess {regulation_labels: "m"})
	RETURN biologicalprocess1



since Python 3.9 and will be removed in a subsequent version.
  choices = random.sample(self.usable_labels, random_k)


### Swapping and Crossover function implementation

In [386]:
import random

def swap(query_manager1, query_manager2):
    if not query_manager1.children or not query_manager2.children:
        print("One of the trees does not have children to perform swapping.")
        return

    # Select random subtree indices from both trees
    index1 = random.randint(0, len(query_manager1.children) - 1)
    index2 = random.randint(0, len(query_manager2.children) - 1)

    # Swap the subtrees
    query_manager1.children[index1], query_manager2.children[index2] = \
        query_manager2.children[index2], query_manager1.children[index1]

    print("Swapping completed.")


def crossover(tree1, tree2):
    qm = QueryManager()
    if not tree1.children or not tree2.children:
        print("One of the trees does not have children to perform crossover.")
        return

    # Filter out indices that are not relationships
    node_indices1 = [index for index, child in enumerate(tree1.children) if QueryManager.is_relationship(str(child))==False]
    node_indices2 = [index for index, child in enumerate(tree2.children) if QueryManager.is_relationship(str(child))==False]

    # Check if there are any nodes to perform crossover
    if not node_indices1 or not node_indices2:
        print("No nodes available for crossover in one or both trees.")
        return

    # Select random node indices from the filtered lists
    index1 = random.choice(node_indices1[:-1])
    index2 = random.choice(node_indices2[:-1])

    # Swap the subtrees at these indices
    tree1.children[index1:], tree2.children[index2:] = \
        tree2.children[index2:], tree1.children[index1:]
    # print("tree1 after crossover:", tree1)
    # print("tree2 after crossover:", tree2)
    # print("Before adjust, tree1 and tree2", tree1.children[-1].value, tree2.children[-1].value)
    
    qm = QueryManager()
    tree1 = qm.adjust_return(tree1)
    tree2 = qm.adjust_return(tree2)

    # print("After adjust, tree1 and tree2", '\n', tree1, '\n',tree2)

    print("Crossover and Return clause adjustment completed.")


#### Sample Usage

In [387]:
# Try if a single qm is enough -- YES!
qm = QueryManager()

qm.import_grouped_info(alzkb_nested_dict) 
qm.import_relationships(relationships)
tree1 = qm.generate_query() 
tree2 = qm.generate_query()

crossover(tree1, tree2)

final_node added: (disease:Disease {commonName: "Alzheimer Disease, Familial, 3, with Spastic Paraparesis and Unusual Plaques"})
Crossover and Return clause adjustment completed.


since Python 3.9 and will be removed in a subsequent version.
  choices = random.sample(self.usable_labels, random_k)
since Python 3.9 and will be removed in a subsequent version.
  choices = random.sample(self.usable_labels, random_k)


In [406]:
print(tree1)

ROOT
	(gene:Gene {commonName: "-"})
	<- [:GENEINPATHWAY*2] -
	(bodypart:BodyPart {commonName: "mammary gland"})
	<- [:GENEHASMOLECULARFUNCTION] -
	(disease:Disease {commonName: "Alzheimer Disease, Familial, 3, with Spastic Paraparesis and Unusual Plaques"})
	RETURN disease, gene



In [390]:
print(tree2)

ROOT
	(biologicalprocess:BiologicalProcess {commonName: "cell aging"})
	- [:GENEHASMOLECULARFUNCTION*2..7] ->
	(drug:Drug {commonName: "AMG-222"})
	<- [:GENEREGULATESGENE] -
	(gene:Gene {commonName: "-"})
	- [:GENEASSOCIATESWITHDISEASE] ->
	(biologicalprocess:BiologicalProcess {commonName: "regulation of lymphocyte activation"})
	- [:SYMPTOMMANIFESTATIONOFDISEASE] ->
	(molecularfunction:MolecularFunction {commonName: "inositol monophosphate 1-phosphatase activity"})
	RETURN gene, biologicalprocess, molecularfunction



### Building GP workflow and Connect to Memgraph

In [None]:
import pandas as pd

#### Try processing the bad csv

In [395]:
# pd.read_csv('test_none.csv')
import pandas as pd


def process_csv(input_filepath, output_filepath):
    # Read the CSV file
    df = pd.read_csv(input_filepath, header=None, names=['Type', 'Data'])
    
    # Ensure all data is treated as string
    df['Data'] = df['Data'].astype(str)
    
    # Process the 'Data' to escape internal quotes and encapsulate it as a single string if it contains commas
    df['Data'] = df['Data'].apply(lambda x: '"' + x.replace('"', '""') + '"' if ',' in x else x)

    # Aggregating results for each type into one cell, separated by a semicolon
    # This assumes the 'Type' column correctly identifies different queries
    agg_df = df.groupby('Type')['Data'].agg(lambda x: '; '.join(x)).reset_index()

    # Write the processed data back to a new CSV
    agg_df.to_csv(output_filepath, index=False)


# File paths
input_csv = 'test_none.csv'
output_csv = 'test_modified_output.csv'

# Process the CSV
process_csv(input_csv, output_csv)


In [396]:
pd.read_csv("test_modified_output.csv")

Unnamed: 0,Type,Data
0,"(:Drug {commonName: ""Basiliximab"", nodeID: ""26...",
1,"(:Drug {commonName: ""Muromonab"", nodeID: ""264""...",
2,"(:Gene {commonName: ""TATA-box binding protein""...",[:GENEPARTICIPATESINBIOLOGICALPROCESS]; [:GENE...
3,m,r
4,n,


In [398]:
!chmod +x run_queries.sh

python(7691) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


In [400]:
import pandas as pd
import csv

def process_csv(input_filepath, output_filepath):
    # Open the raw CSV file for reading
    with open(input_filepath, 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        # Initialize a list to hold processed results
        processed_results = []

        # Temporary storage for aggregating related data
        current_query = None
        current_data = []

        for row in reader:
            if not row:
                continue

            # Extract query and data
            if len(row) == 2:
                query, data = row
            elif len(row) > 2:  # Handles cases where commas split the data erroneously
                query = row[0]
                data = ','.join(row[1:])
            else:
                continue  # Skip empty or malformed rows

            # Check if we've moved to a new query
            if current_query is None or current_query == query:
                current_query = query
                current_data.append(data.replace('""', '"').strip('"'))  # Normalize double quotes and strip outer quotes
            else:
                # Concatenate all collected data entries for the previous query
                processed_results.append([current_query, '; '.join(current_data)])
                current_query = query
                current_data = [data.replace('""', '"').strip('"')]

        # Don't forget to add the last batch of collected data
        if current_data:
            processed_results.append([current_query, '; '.join(current_data)])

    # Write processed data to the new CSV file
    with open(output_filepath, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Query', 'Result'])  # Write header
        for result in processed_results:
            writer.writerow(result)

# File paths
input_csv = 'final_result.csv'
output_csv = 'formatted_final_result.csv'

# Process the CSV
process_csv(input_csv, output_csv)


In [405]:
pd.read_csv("final_result_test_format.csv", header=None)

Unnamed: 0,0,1,2,3,4,5
0,"'''""MATCH (gene:Gene {commonName: ""-""})<- [:GE...",Familial,3,"with Spastic Paraparesis and Unusual Plaques""...","gene; ""'''","'''""no_result""'''"
1,MATCH (n) RETURN n LIMIT 2;,"''' ""n""",,,,
2,"(:Drug {commonName: ""Basiliximab"", nodeID: ""26...","nodeID: """"264""""","uri: """"http://jdr.bio/ontologies/alzkb.owl#dr...","xrefCasRN: """"140608-64-6""""","xrefDrugbank: """"DB00075""""})""'''",
3,"'''""MATCH (m:Gene)-[r]->(Gene) RETURN m","r LIMIT 2;""'''","''' ""m""",r,,
4,"(:Gene {commonName: ""TATA-box binding protein""...",[:GENEPARTICIPATESINBIOLOGICALPROCESS],,,,
5,"(:Gene {commonName: ""TATA-box binding protein""...",[:GENEPARTICIPATESINBIOLOGICALPROCESS]''',,,,


In [408]:
!chmod +x single_query_run.sh
!./single_query_run.sh

python(43435) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(43438) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Query 1 executed and output saved to ./outputs/query1.csv
Query 2 executed and output saved to ./outputs/query2.csv
Query 3 executed and output saved to ./outputs/query3.csv
All queries processed.


In [None]:
def convert_csv(folder):
    
