In [528]:
test_dict = {
    'BiologicalProcess': {
        'regulation_labels': 
            'regulation of nervous system development',

        'development_labels': [
            'tube morphogenesis',
            'organ development'
        ]
    },
    'CellComponent': {
        'structure_labels': [
            'mitochondrial membrane',
            'cell cortex'
        ],
        'function_labels': [
            'protein binding',
            'ion channel activity'
        ]
    }
}

### Cook labels and properties
- Grep csv file containing node labels, relationships, property labels, and wanted properties from Memgraph
- Create a nested dictionary with {node_labels:{sub_dictionary of properties}}, the sub_dictionary contains corresponding {property_labels: properties}

In [529]:
import re
import random
import json
import pandas as pd
import copy
import hashlib
import base64
import numpy as np

In [530]:


# Open and load the graph schema json file
with open('/Users/yufeimeng/Desktop/KGCypherGenerator/schema.json', 'r',encoding='utf-8-sig') as file:
    schema = json.load(file)
    
# Extract nodes and edges from the schema
#labels = [node['labels'][0] for node in schema[0]['nodes']]
relationships = [relationship['type'] for relationship in schema[0]['relationships']]


# Get detailed properties from the csv file
common_names = pd.read_csv('/Users/yufeimeng/Desktop/KGCypherGenerator/memgraph-query-results-export.csv', index_col=False)

def group_labels(df, label_col, name_col):
    grouped = df.groupby(label_col)[name_col].apply(list).to_dict()
    return grouped

# Applying the function
grouped_names = group_labels(common_names, 'label', 'commonName')

#***************************************
labels = list(grouped_names.keys())
property_labels= ["commonName"] #will be generalized later

### Create alzkb nested dictionary
- In the current test case, all node labels are used; only commonName properties are selected for all nodes except that geneSymbol for Gene node is also added.

In [531]:
# type(grouped_names)
# grouped_names.keys()
# list(grouped_names.values())[0]

geneSymbol_csv = pd.read_csv('geneSymbol.csv', index_col=False)
# type(geneSymbol_csv)
# type(geneSymbol_csv['g.geneSymbol'])

geneSymbol = list(geneSymbol_csv['g.geneSymbol'])
# geneSymbol

geneSymbol_sub_dict = {}
geneSymbol_sub_dict['geneSymbol'] = geneSymbol


alzkb_nested_dict = {}
for key in grouped_names.keys():
    sub_dict = {}
    if key == 'Gene':
        sub_dict['commonName']= grouped_names[key]
        sub_dict['geneSymbol']= geneSymbol
    else:
        sub_dict['commonName']= grouped_names[key]
    alzkb_nested_dict[key] = sub_dict

# alzkb_nested_dict['Gene'].keys()



In [532]:
class TestDepthManager: #TRY TO MAKE DEPTHMANAGER SINGLETON 
    _max_depth = 5  # Default maximum depth
    _min_depth = 3
    _instance = None

    @classmethod
    def getInstance(cls):
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        if TestDepthManager._instance is not None:
            raise Exception("This is a singleton class. Use 'getInstance()'.")
        self.depth = 0  # Starting depth
        self.depth_record = {}

    @classmethod
    def set_max_depth(cls, depth):
        if depth > cls._min_depth:
            cls._max_depth = depth
        else:
            print("Maximum depth cannot be smaller than the min depth! \n The default max_depth is", cls._max_depth)
    
    def reset_depth(self):
        self.depth = 0
    
    # def reset_depth_record(self):
    #     self.depth_record = {}
    
def depth_control(func):
    def wrapper(*args, **kwargs):
        dm = TestDepthManager.getInstance()
        if dm.depth == dm._max_depth:
            print("Max depth reached")
            return None
        result = func(*args, **kwargs)
        dm.depth += 1  # Increment depth after function call
        return result
    return wrapper

def class_depth_control(cls):
    class WrappedClass(cls):  # Create a new class that wraps the original class
        def __init__(self, *args, **kwargs):
            dm = TestDepthManager.getInstance()
            if dm.depth == dm._max_depth:
                print("Max depth reached")
                return None
            super().__init__(*args, **kwargs)
            dm.depth += 1
    return WrappedClass 

### Nodes

In [533]:

class TreeNode:
    depth = 0
   
    def __init__(self, value):
        self.value = value
        self.children = []
        # self.level = 0

    def add_child(self, node):
        """Add a TreeNode or value as a child."""
        # if not isinstance(node, TreeNode):
            # node = TreeNode(node)  # Ensure all children are TreeNode instances
        self.children.append(node)
        # self.level += 1

    def __str__(self):
        # Use the helper method for generating the string with indentation
        return self._str_recursive(level=0)

    def _str_recursive(self,level):
        # Create the string representation with indentation for current node
        ret = "\t" *level + str(self.value) + "\n"  # Indent based on the current level
        for child in self.children:
            ret += child._str_recursive(level+1)
        return ret

    def __repr__(self):
        return f'<TreeNode {self.value}>'
    
    def get_depth(self):
        pass #because it needs later-defined class type

    def to_querystr(self):
        """
        convert the generate query tree into query string with ; separation to get ready for querying the Memgraph client
        """
        child_compose = ''
        final_query_str = 'MATCH'
        for child in self.children:
            if child.children:
                for grandchild in child.children:
                    child_compose = ' '+ str(grandchild.value)
            final_query_str += ' ' + str(child.value) + child_compose
        final_query_str += ';'
        return final_query_str

        
class Clause(TreeNode):
    def __init__(self, value, children=None):
        super().__init__(value)
        self.children = children if children is not None else []

    def __str__(self):
        if not self.children:
            return str(self.value)
        if self.value == "RETURN":
            return f"{self.value} {', '.join(str(child) for child in self.children)}"
        return f"{self.value} {' '.join(str(child) for child in self.children)}"
    
class Node(TreeNode):
    """
    When called, will add connector to nodes and make nodes Node type
    """ 
    def __init__(self, value, children=None):
        super().__init__(value)
        self.children = children if children is not None else []
 
    def __str__(self):
        if not self.children:
            return str(self.value)
        # if self.value == '-':  
        #     return ' '.join(str(child) for child in self.children)
        return f"{self.value}({', '.join(str(child) for child in self.children)})"

# @class_depth_control
class Relationship(TreeNode):
    """
    When called, will add connector to relationships and make relationships Relationship type
    """
    def __init__(self, value, hop_only=False):
        super().__init__(value)
        # self.children = children if children is not None else []
        self.hop_only = True if hop_only else False
        
    @depth_control
    def __str__(self):
        # if not self.children:
            # return str(self.value)
        return f"{self.value}"
        # return f"{self.value} {' '.join(str(child) for child in self.children)}"

    def calculate_individual_depth(self):
        # This method checks for both the presence of a relationship and additional depth from hops
        base_depth = 1 if self.hop_only == False else 0 # Start with a depth of 1 for the relationship itself
        # Look for hop patterns, each '*' adds one to the depth
        hop_matches = re.findall(r'\*', self.value.value)
        return base_depth + len(hop_matches)  # Add one additional depth for each hop pattern

def get_depth(self):
    # This now maps through children, checking if they are Relationship instances, and sums their depths
    updated_depth = 0
    for child in self.children:
        if isinstance(child, Relationship):
            updated_depth += child.calculate_individual_depth()
        elif isinstance(child,Condition):
            updated_depth += 1
        else:
            updated_depth = updated_depth
    # updated_depth = sum(child.calculate_individual_depth() if isinstance(child, Relationship) else 0 for child in self.children)
    return updated_depth
TreeNode.get_depth = get_depth


class Condition(TreeNode):
    def __init__(self, value, children=None):
        super().__init__(value)
        self.children = children if children is not None else []

    def __str__(self):
        if not self.children:
            return str(self.value)
        return f"{self.value} {' '.join(str(child) for child in self.children)}"


## Testing dm & qm

In [534]:
# class DepthManagerInUse:

class TestQueryManager:
    def __init__(self, dm):
        self.root = TreeNode("ROOT")  # All parts will be children of root
        self.current_node = self.root  # Current node context for adding parts
        self.node_labels = []
        self.relationships = []
        self.grouped_info = {}
        self.usable_labels = set()
        # self.usable_props = set()

        
        self.parts = []
        self.selected_label_alias = {}

        self.query_str = ''
        
        self.final_depth = 0
        # self.depth_manager = DepthManager()
        # self.id = self.generate_id()
        self.observed_depths = set()
        self.dm = dm
       
    
    @staticmethod
    def generate_id(input_object):
        if isinstance(input_object, TreeNode):
            content = str(input_object)  
        elif isinstance(input_object, str):
            content = input_object
        else:
            raise ValueError("Unsupported object type for ID generation")
        hash_digest = hashlib.sha256(content.encode('utf-8')).digest()
        return base64.urlsafe_b64encode(hash_digest).decode('utf-8').rstrip('=')

        

    def reset_per_gen(self):
        self.root = TreeNode("ROOT")
        self.current_node = self.root  # Reset the tree for new generation
        self.parts = []
        self.selected_label_alias = {}
        self.usable_labels.clear()
        # self.usable_props.clear()
        self.final_depth = 0
        self.query_str = ''

    def import_grouped_info(self, input_group):
        if input_group:
            if type(input_group) is dict:
                self.grouped_info = input_group
                self.node_labels = list(self.grouped_info.keys())
                # print("loaded node_labels:",self.node_labels)
            else:
                print("input grouped info need to be dictionary type")
        else:
            print("input grouped info cannot be empty")
    
    def import_relationships(self, input_relationships):
        if input_relationships:
            self.relationships = input_relationships
        else:
            print("relationships cannot be empty")
    
    def create_unique_alias(self, label):
        """Creates a unique alias for a node to prevent label overlap in queries."""
        base_alias = label.lower()
        alias = base_alias
        counter = 1
        while alias in self.usable_labels:
            alias = f"{base_alias}{counter}"
            counter += 1
        return alias
    
    def extract_alias_label(self, node_value_str):
        # Pattern to capture both the alias and the node label
        pattern = r"^\(([^:]+):([^ {]+)"
        match = re.search(pattern, node_value_str)

        if match:
            alias = match.group(1).strip()  # Get the alias, strip any whitespace
            label = match.group(2).strip()  # Get the label, strip any whitespace
            return alias, label
        else:
            # No match found, return None for both
            return None, None

    def add_node(self):
        """Adds a node with random properties selected from grouped_info."""
        # node_label = ''
        if self.node_labels:
            node_label = random.choice(self.node_labels) 
            possible_props = self.grouped_info[node_label]
            property_label, properties_list = random.choice(list(possible_props.items()))
            alias = self.create_unique_alias(node_label)
            self.selected_label_alias[alias]=node_label

            property_value = random.choice(properties_list)
            properties_str = f'''{property_label}: "{property_value}"''' if possible_props else ''
            node_value = f"{node_label} {{{properties_str}}}"


            node = Node(f"({alias}:{node_value})")
            # self.current_node.add_child(node)

            # self.nodes.append(node)
            self.usable_labels.add(alias)  # Store label for possible RETURN clause usage
            return node 
        print("No node labels available. Please import grouped info first.")
        return None

    def add_hop(self):
        """
        Randomly generate hops as condition to relationship based on a customizable possibility;
        the default possibility is 0.5
        """
        current_depth = self.dm.depth
        hop = random.randint(1,10) #TODO: see if this is reasonable
        upper_hop = hop + random.randint(1,10)
        exact_hop = f"*{hop}"
        ceiling_hop = f"*..{upper_hop}"
        floor_hop = f"*{hop}.."
        interval_hop = f"*{hop}..{upper_hop}"
        hop_choices = [exact_hop, ceiling_hop, floor_hop, interval_hop]
        if current_depth < self.dm._max_depth: #random.random() > hop_p and 
            hop_choice = random.choice(hop_choices)
            return hop_choice
        else:
            return ''



    def add_relationship(self, bi_dir_p=0.3, rev_dir_p=0.5, hop_only_p=0.5, hop_p=0.5):
        """ 
        Randomly generate a relationship between two nodes 
        bi_dir: probability of getting a bidirectional direction
        rev_dir_p: probability of getting a reversed direction
        hop_only_p: probability of getting only hops without specific relationships
        hop_p: probability of getting hops in addition to a specific relationship
        """
        current_depth = self.dm.depth
        rel_type = random.choice(self.relationships)
        if random.random() < bi_dir_p:
            direction1 = "-"
            direction2 = "-"
        if current_depth>=3 and random.random() > rev_dir_p: 
            direction1 = "<-"
            direction2 = "-"
        else:
            direction1 = "-" 
            direction2 = "->"
        # if random.random() > hop_p:
        
        hop_result = Relationship(self.add_hop()) if random.random() > hop_p else ''
        if random.random() > hop_only_p and hop_result:
            relationship = Node(f"{direction1} [{hop_result}] {direction2}")
            return Relationship(relationship, hop_only=True)
        else:
            relationship = Node(f"{direction1} [:{rel_type}{hop_result}] {direction2}")
            return Relationship(relationship)
        # self.current_node.add_child(relationship)
        # return relationship
        # return Relationship(relationship, hop_only)
        
    @depth_control
    def add_condition(self, where_p=0.5):
        """
        Randomly generate WHERE clause based on a customizable possibility;
        Will add to a random node as its child
        the default possibility where_p is 0.5;
        currently only accepts and will only generate values that are str type properties
        """
        current_depth = self.dm.depth
        if random.random() > where_p and current_depth < self.dm._max_depth:
            # random_node = random.choice(self.current_node)
            # print("selected ranndom node:", random_node, type(random_node))

            np_children = np.array(self.current_node.children,dtype=object)
            is_node = np.vectorize(lambda x: isinstance(x, Node))
            # Apply the function to the numpy array
            node_checks = is_node(np_children)
            node_idx = np.where(node_checks)
            node_children = np_children[node_idx]
            random_node = random.choice(node_children)

            alias, node_label = self.extract_alias_label(random_node.value)
            # alias, node_label = random.choice(list(self.selected_label_alias.items()))
            # print(alias, node_label)

            # selected_node_label = random.choice(selected_node_labels)
            possible_properties = self.grouped_info[node_label]
            if possible_properties:
                property_label, properties_list = random.choice(list(possible_properties.items()))
                sample_prop_type = properties_list[0]
                # value = random.randint(20, 50) if isinstance(sample_prop_type, int) else random.choice(properties_list) 
                value = random.choice(properties_list) #TODO: generalize to other data type
            #TODO: customize the int part

                operator = random.choice([">", "<", "=", "<=", ">="]) if isinstance(sample_prop_type, int) else '='
                grandchild = Condition("WHERE", [Clause(f"{alias}.{property_label} {operator} '{value}'", [])])
                random_node.add_child(grandchild)
                # return Condition("WHERE", [Clause(f"{alias}.{property_label} {operator} '{value}'", [])])
            raise ValueError("No available properties for the label selected: {node_label}")
        return 
    
    @staticmethod
    def is_relationship(part):
        """
        Determine if the given part of a query is a relationship based on containing "[]"
        Ensures that part is a string before checking.
        """
        # pattern = re.compile(r'\[(.*?)\]')
        trying = r"-\s*\[:?([A-Za-z0-9_]+)?(\*\d*(\.\.\d*)?)?\]\s*[-<>]?"
        # Ensure part is a string or bytes-like object
        if isinstance(part,str):
            # if pattern.search(part):
            if re.search(trying, part):
                return True
            return False
        print("input has to be str!")
        return

    
    def get_usable_labels(self):
        return list(self.usable_labels)
    
    def add_return(self, return_num=None):

        # print("selected_label_alias:", self.selected_label_alias)

        selected_alias = list(self.selected_label_alias.keys())
        selected_node_labels = list(self.selected_label_alias.values())
        
        if return_num:
            random_k = random.randint(1,return_num)
        else:
            random_k = random.randint(1,len(selected_alias))
           
        # print("selected_node_labels:",selected_node_labels)
        # choices = random.sample(self.usable_labels, random_k)
        random_indices = random.sample(range(len(selected_node_labels)), random_k)
        return_list = []
        for i in random_indices:
            current_alias = selected_alias[i]
            current_label = selected_node_labels[i]
            # print("type of current_label:", repr(current_label), type(current_label))
            # print("check if current_label is in self.node_labels", repr(self.node_labels), current_label in self.node_labels)
            current_possible_properties = self.grouped_info[str(current_label).strip()]
            if current_possible_properties:
                property_label = random.choice(list(current_possible_properties.keys()))
                current_return = Clause(f"{current_alias}.{property_label}")
                return_list.append(current_return)

        return Clause("RETURN", return_list)
        # return None
    
    def parts_to_str(self):
        """
        convert the generate query tree into query string with ; separation to get ready for querying the Memgraph client
        """
        final_query_str = 'MATCH'
        for part in self.parts:
            final_query_str = final_query_str + ' ' + str(part)
        final_query_str += ';'
        return final_query_str


    
    ### FOR CROSSOVER RETURN ADJUSTMENT

    # def extract_alias_label(self, node_value_str):
    #     # Pattern to capture both the alias and the node label
    #     pattern = r"^\(([^:]+):([^ {]+)"
    #     match = re.search(pattern, node_value_str)

    #     if match:
    #         alias = match.group(1).strip()  # Get the alias, strip any whitespace
    #         label = match.group(2).strip()  # Get the label, strip any whitespace
    #         return alias, label
    #     else:
    #         # No match found, return None for both
    #         return None, None

    def collect_alias_labels(self, tree):
        """ Recursively collect labels from the tree that are usable in the RETURN clause. """
        if isinstance(tree, TreeNode) and isinstance(tree.children, list):
            for child in tree.children:
                # Extract label from the current node's value and add it to usable labels
                child_value = str(child.value)
                # print(child_value, type(child_value))
                # label = self.extract_node_alias(child_value)
                alias, label = self.extract_alias_label(child_value)
                if alias and label:
                    self.selected_label_alias[alias] = label
                    # self.usable_labels.add(label)
            # Recursively process each child
                # self.collect_labels(child)
                
    
    def adjust_return(self, tree):
        """ Adjust the RETURN clause based on the labels collected from the tree. """
        if not isinstance(tree, TreeNode):
            raise TypeError("Expected a tree that is TreeNode instance")
        # Clear existing labels and recollect from the new tree structure
        # self.usable_labels.clear() 
        self.selected_label_alias = {}
        self.collect_alias_labels(tree)
        
        if self.selected_label_alias:
            # random_k = random.randint(1, len(self.usable_labels))
            # choices = random.sample(self.usable_labels, random_k)
            new_return = self.add_return()
            
            if tree.children and isinstance(tree.children[-1], TreeNode) and "RETURN" in str(tree.children[-1].value):
                tree.children[-1] = new_return  # Replace the last child with the new RETURN clause
            else:
                tree.add_child(new_return)  # Add new if no RETURN exists
            # print("updated return:",tree.children[-1])
            return tree
        else:
            return None
        
    
    def generate_query(self, flag=True, return_num=None, part_num=None, hop_p=0.5, where_p=0.5):
        self.reset_per_gen()
        self.dm.reset_depth()

        def alternate_functions(flag):
            if flag:
                return self.add_node(), not flag
            else:
                return self.add_relationship(hop_p), not flag

        if part_num is None:
            part_num = random.randint(1, self.dm._max_depth-2)

        # Keep adding nodes and relationships while depth is within limit
        for _ in range(part_num+1):
            part, flag = alternate_functions(flag)
            if part is None:
                break
            self.parts.append(part)
            # self.current_node.add_child(TreeNode(part))
            self.current_node.add_child(part)


        if self.parts and self.is_relationship(str(self.parts[-1])): #ensure the input part is in string format
            final_node = self.add_node()  # Generate a final node
            if final_node:
                self.parts.append(final_node)
                print("final_node added:", final_node)
                # self.current_node.add_child(TreeNode(final_node))
                self.current_node.add_child(final_node)


        # Optionally add a WHERE clause to a random node if depth is still under max_depth
        self.add_condition(where_p) 
        # if condition:
        #     self.current_node.add_child(condition)

        # Add RETURN clause 
        ret = self.add_return(return_num)
        if ret:
            self.parts.append(ret)
            # self.current_node.add_child(TreeNode(ret))
            self.current_node.add_child(ret)

        self.query_str = self.parts_to_str()
        self.current_node.depth = self.dm.depth
        return self.current_node, self.query_str #return the treenode type and string type of query



In [535]:
dmTest = TestDepthManager.getInstance()
qmTest = TestQueryManager(dm=dmTest)
qmTest.import_grouped_info(alzkb_nested_dict)  
qmTest.import_relationships(relationships)  
tree, query = qmTest.generate_query()
tree

<TreeNode ROOT>

In [539]:
print(tree)

ROOT
	(gene:Gene {commonName: "-"})
		WHERE
			gene.geneSymbol = 'LOC129936984'
	- [*7] ->
	(molecularfunction:MolecularFunction {commonName: "prostaglandin-D synthase activity"})
	RETURN
		gene.geneSymbol
		molecularfunction.commonName



In [536]:
np_children = np.array(qmTest.current_node.children,dtype=object)
is_node = np.vectorize(lambda x: isinstance(x, Node))

# Apply the function to the numpy array
node_checks = is_node(np_children)
node_idx = np.where(node_checks)
node_children = np_children[node_idx]
random_node = random.choice(node_children)
print(random_node)
alias, label = qmTest.extract_alias_label(random_node.value)
print(alias, label)
print(type(label))
print(qmTest.grouped_info[label])
possible_properties = qmTest.grouped_info[label]
if possible_properties:
    property_label, properties_list = random.choice(list(possible_properties.items()))
    sample_prop_type = properties_list[0]
    # value = random.randint(20, 50) if isinstance(sample_prop_type, int) else random.choice(properties_list) 
    value = random.choice(properties_list) #TODO: generalize to other data type
#TODO: customize the int part

    operator = random.choice([">", "<", "=", "<=", ">="]) if isinstance(sample_prop_type, int) else '='
    result = Condition("WHERE", [Clause(f"{alias}.{property_label} {operator} '{value}'", [])])
    random_node.add_child(result)

print(random_node)
# np_children[type==Node]
# print(node_checks)
# print(np_children)
# type(random.choice(qmTest.current_node.children))

(gene:Gene {commonName: "-"})
gene Gene
<class 'str'>
{'commonName': ['TATA-box binding protein', 'PDS5 cohesin associated factor A', 'PDS5 cohesin associated factor B', 'SEL1L adaptor subunit of SYVN1 ubiquitin ligase', 'carbamoyl-phosphate synthetase 2, aspartate transcarbamylase, and dihydroorotase', 'RecQ mediated genome instability 1', 'fatty acid synthase', 'nuclear receptor subfamily 2 group F member 6', 'methionine adenosyltransferase 2 non-catalytic beta subunit', 'DnaJ heat shock protein family (Hsp40) member C10', 'leucine rich repeat containing 41', 'nucleoporin 62', 'bone gamma-carboxyglutamate protein', 'vesicle associated membrane protein 7', 'transmembrane and coiled-coil domains 6', 'tumor protein p53 inducible protein 3', 'aldo-keto reductase family 7 member A2', 'mutY DNA glycosylase', 'vesicle associated membrane protein 3', 'splA/ryanodine receptor domain and SOCS box containing 3', 'methionyl-tRNA synthetase 2, mitochondrial', 'MAP7 domain containing 1', 'methiony

## Next building workflow:
- function to convert resulting tree nodes into working queries (done)
- write them into test_queries.txt file (done)
- then apply the bash file & convertion function to run the queries & store as readable csv
- write a basic scoring/evaluation function on depth (done)

#### Fitness function basics buildup

#### EA try
- initialize population with cutomizable size and max depth
- all queried through mgconsole and only maintained and added to the population if it returns result
- fitness function to evaluate initial population

Questions
- generally how customizable we want EA to be? and how do we usually achieve such flexibility in adjusting parameters? through functions or...?
- is the current method of initializing population "wise" enough?


In [537]:
import random

class EvolutionaryAlgorithm:
    def __init__(self, qm, depth_manager, population_size, max_depth, max_generation):
        self.population_size = population_size
        self.max_depth = max_depth
        self.tree_population = []
        self.str_population = []
        self.fitness_scores = {}
        self.query_ids = {}
        self.observed_depths = set() #TODO: clear this set after each initializED population
        self.qm = qm
        self.depth_manager = depth_manager
        self.depth_manager.set_max_depth(self.max_depth)
        self.generation = 0
        self.max_generation = max_generation
    
    def fitness_function(self, query:TreeNode):
        """ Evaluates the fitness of a single query. """
        score = 0
        depth = query.get_depth()
        # depth = self.depth_manager.depth_record[TestQueryManager.generate_id(query)]
        # Simulating a result check - replace with actual database check
        results = ["data"] if depth >= 1 else [] #TODO: change this to actually connecting to server
        
        # Basic result-based scoring
        if results:
            score += 10  # Positive score for non-empty results
        else:
            score -= 5  # Negative score for empty results

        # Depth novelty scoring
        if depth and depth >= 0:
            if depth in self.observed_depths:
                score -= 3  # Penalize if depth is not unique
            else:
                score += 5
                #score += 5 * depth  # Reward new unique depths
                self.observed_depths.add(depth)  # Record this new depth
        else:
            print("depth not found or not valid!", query)
        return score

    def evaluate_population(self):
        """ Evaluates the entire population and updates fitness scores. """
        self.generation += 1 
        for query in self.tree_population:
            tree_id = TestQueryManager.generate_id(query)
            self.query_ids[query] = tree_id
            self.fitness_scores[tree_id] = self.fitness_function(query)
        # self.fitness_scores = [self.fitness_function(query) for query in self.tree_population]

    def initialize_population(self): #TODO: change to generate till size is satisfied
        """ Initializes the population with random depth queries. """
        for _ in range(self.population_size):
            # self.depth_manager.set_max_depth(self.max_depth)
            tree, query = self.qm.generate_query()
            self.tree_population.append(tree)
            self.str_population.append(query)

            # print("generated str query and id:", TestQueryManager.generate_id(query), query)
            # print("the depth of current query:", self.depth_manager.depth)

    def tournament_parent_selection(self, k: int = None):
        """
        Selects the fittest individual from a random sample of the population using a tournament selection approach.

        Parameters:
        - k (int, optional): The number of individuals to sample for the tournament. Defaults to half the population size.

        Returns:
        - The fittest individual from the sampled tournament.
        """
        if k is None:
            k = self.population_size // 2
        if k > len(self.tree_population):
            raise ValueError("Sample size k cannot be larger than the population size.")

        # Sample k individuals from the population
        tournament = random.sample(self.tree_population, k)

        # Determine the fittest individual based on the fitness scores
        fittest = max(tournament, key=lambda individual: self.fitness_scores[self.query_ids[individual]])
        return fittest

    def select_parents(self, num_pairs, k):
        """
        Selects pairs of parents for reproduction, ensuring no individual is selected more than once.

        Parameters:
        - num_pairs (int): The number of parent pairs to select.

        Returns:
        - List of tuples, where each tuple contains two parent individuals.
        """
        if num_pairs <= 0:
            raise ValueError("num_pairs must be a positive integer")
        elif num_pairs * 2 > len(self.tree_population):
            raise ValueError("Insufficient population to select the requested number of unique pairs")

        parents = []
        parent_pairs=[]
        # selected_individuals = set()  # Keep track of selected individuals

        while len(parents) < num_pairs * 2: #and len(selected_individuals) < len(self.tree_population):
            parent1 = self.tournament_parent_selection(k)
            parent2 = self.tournament_parent_selection(k)
            while parent1 == parent2:
                parent2 = self.tournament_parent_selection(k)
            parent_pair = (parent1,parent2)
            parents.append(parent1)
            parents.append(parent2)
            parent_pairs.append(parent_pair)
        return parent_pairs


    
    def swap(self, tree1, tree2):
        if not tree1.children or not tree2.children:
            print("One of the trees does not have children to perform swapping.")
            return
        # Select random subtree indices from both trees
        index1 = random.randint(0, len(tree1.children) - 1)
        index2 = random.randint(0, len(tree2.children) - 1)

        tree1_swap = copy.deepcopy(tree1)
        tree2_swap = copy.deepcopy(tree2)

        # Swap the subtrees
        tree1_swap.children[index1], tree2_swap.children[index2] = \
            tree2_swap.children[index2], tree1_swap.children[index1]
        print("Swapping completed.")
        return tree1_swap, tree2_swap

    def one_point_crossover(self, tree1, tree2):
        if not tree1.children or not tree2.children:
            print("One of the trees does not have children to perform crossover.")
            return
        #get indices that are not relationships as possible crossover point
        node_indices1 = [index for index, child in enumerate(tree1.children) if type(child)!= Relationship]
        node_indices2 = [index for index, child in enumerate(tree2.children) if type(child)!= Relationship]

        #check node existence
        if not node_indices1 or not node_indices2:
            print("No nodes available for crossover in one or both trees.")
            return

        #select random node indices from the filtered lists
        index1 = random.choice(node_indices1[:-1])
        index2 = random.choice(node_indices2[:-1])

        #exchange the subtrees at these indices
        tree1_crossover = copy.deepcopy(tree1)
        tree2_crossover = copy.deepcopy(tree2)

        tree1_crossover.children[index1:], tree2_crossover.children[index2:] = \
            tree2_crossover.children[index2:], tree1_crossover.children[index1:]
        
        #adjust RETURN clause based on exchanged trees
        tree1_crossover = self.qm.adjust_return(tree1_crossover)
        tree2_crossover = self.qm.adjust_return(tree2_crossover)

        print("Crossover and Return clause adjustment completed.")
        return tree1_crossover, tree2_crossover
    

    def output_top_queries(self, top_n):
        """
        Outputs the top N queries from the current population based on fitness scores,
        considering depth diversity and query diversity.
        
        Parameters:
        - top_n (int): Number of top queries to return.
        
        Returns:
        - list: Top N queries as per the defined criteria.
        """
        # Sort the population based on fitness scores

        sorted_population = sorted(self.tree_population, key=lambda x: self.fitness_scores[self.query_ids[x]], reverse=True)
        # Implement logic to ensure diversity if needed, example placeholder:
        # diverse_population = self.ensure_diversity(sorted_population, top_n)
        top_queries_with_scores = [(query, self.fitness_scores[self.query_ids[query]]) for query in sorted_population[:top_n]]
        return top_queries_with_scores
        # return diverse_population[:top_n]


    def reset_ea(self):
        # self.depth_manager.reset_depth_record()
        self.observed_depths = set()
        self.tree_population = []
        self.str_population = []
        self.fitness_scores = {}
        self.query_ids = {}
        self.generation=0


### Sample Usage

In [538]:

dmTest = TestDepthManager.getInstance()
qmTest = TestQueryManager(dm=dmTest)
qmTest.import_grouped_info(alzkb_nested_dict)  
qmTest.import_relationships(relationships)  

ea = EvolutionaryAlgorithm(qm=qmTest, depth_manager=dmTest, population_size=10,max_depth=10, max_generation=3)
# print(ea.depth_manager._max_depth)
print("generation:",ea.generation)
ea.initialize_population() 
ea.evaluate_population()
print("generation:",ea.generation)
print("Fitness Scores:", ea.fitness_scores)

parent_pairs = ea.select_parents(num_pairs=5,k=3) #assume 5 pairs of parents --> maintain 10 total in next gen
offspring_list = []
for parent1, parent2 in parent_pairs:
    # print("parent:",parent1.children[-1])
    # print(parent2.children[-1])
    offspring1, offspring2 = ea.one_point_crossover(parent1, parent2)
    # print("offspring:", offspring1.children[-1])
    # print(offspring2.children[-1])
    offspring_list.extend([offspring1, offspring2])

ea.tree_population = offspring_list
ea.evaluate_population()
print("generation:",ea.generation)
print("Fitness Scores:", ea.fitness_scores)
# ea.reset_ea()


# print(ea.depth_manager.depth_record)
result = ea.output_top_queries(top_n=10)

generation: 0
final_node added: (pathway:Pathway {commonName: "Intrinsic Pathway for Apoptosis"})


ValueError: No available properties for the label selected: {node_label}

In [None]:
output_file_path = 'output_queries.txt'
with open(output_file_path, 'w') as file:
    for tuple in result:
        tree = tuple[0]
        querystr = tree.to_querystr()
        file.write(querystr + '\n')
print("All queries have been written to", output_file_path)


All queries have been written to output_queries.txt


## Connect to Memgraph and Execute Queries

#### First half: connect to memgraph console mgconsole
- Need docker installation following notion notes; paste them here later
- But the bash file will handle the querying so no actual terminal operation needed from the user side

#### Second half: query execution
- because of the csv format glich of memgraph, the bash file **single_query_run.s**h** first execute each individual generated query in the **test_queries.txt**
- the results are stored in the outputs folder in fake csv format
- a function converting and aggregating the results into readable, interpretable csv is then applied to obtain a **aggregated_results.csv**
 

In [None]:
!chmod +x single_query_run.sh
!./single_query_run.sh

Query 1 executed and output saved to ./outputs/query1.csv
Query 2 executed and output saved to ./outputs/query2.csv
Query 3 executed and output saved to ./outputs/query3.csv
Failed query: MATCH (symptom:Symptom {commonName: "Neck Pain"}) - [:DRUGTREATSDISEASE] -> (cellularcomponent:CellularComponent {commonName: "MHC class I protein complex"}) - [:BODYPARTUNDEREXPRESSESGENE] -> (drug1:Drug {commonName: "Bentonite"}) WHERE drug.commonName = 'Indoleacetic acid' RETURN cellularcomponent.commonName
Client received query exception: Unbound variable: drug.
Query 4 executed and output saved to ./outputs/query4.csv
Failed query: MATCH (drugclass:DrugClass {commonName: "Neuromuscular Depolarizing Blockade"}) - [*..13] -> (drug:Drug {commonName: "AZD-9164"}) - [:GENEASSOCIATEDWITHCELLULARCOMPONENT] -> (disease:Disease {commonName: "Alzheimer's Disease"}) - [:DRUGTREATSDISEASE] -> (drug1:Drug {commonName: "Bentonite"}) (disease:Disease {commonName: "Alzheimer Disease 7"}) RETURN drugclass.commonN

In [None]:
import pandas as pd
import os

def aggregate_text_files(folder_path, output_file):
    # Initialize a list to store the aggregated data
    aggregated_data = []

    # List all files in the specified folder
    file_list = [f for f in os.listdir(folder_path) if f.endswith('.csv')]  # or change to '.txt' if appropriate

    for file_name in file_list:
        file_path = os.path.join(folder_path, file_name)
        
        # Check if the file is empty
        if os.stat(file_path).st_size == 0:
            aggregated_data.append({"variables_returned": "no_result", "results_returned": "no_result"})
            continue
        
        # Open and read the file as a plain text file
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
        
        # Check again for empty content after reading lines
        if not lines:
            aggregated_data.append({"variables_returned": "no_result", "results_returned": "no_result"})
            continue

        # First line as 'variables_returned'
        variables_returned = lines[0].strip()
        # Join the rest of the lines for 'results_returned'
        if len(lines) > 1:
            results_returned = '; '.join([line.strip().replace('\n', ';') for line in lines[1:]])
        else:
            results_returned = "no_result"
        
        aggregated_data.append({
            "variables_returned": variables_returned,
            "results_returned": results_returned
        })

    # Create a DataFrame from the aggregated data
    result_df = pd.DataFrame(aggregated_data)
    # Save the aggregated data to a CSV file
    result_df.to_csv(output_file, index=False)

# Example usage
folder_path = './outputs'
output_file = './aggregated_results.csv'
aggregate_text_files(folder_path, output_file)


In [None]:
results = pd.read_csv("aggregated_results.csv")
results

Unnamed: 0,variables_returned,results_returned
0,no_result,no_result
1,"""m"";""r""","""(:Gene {commonName: """"TATA-box binding protei..."
2,"""n""","""(:Drug {commonName: """"Basiliximab"""", nodeID: ..."


In [None]:
results["results_returned"][1]

'"(:Gene {commonName: ""TATA-box binding protein"", geneSymbol: ""TBP"", nodeID: ""1"", typeOfGene: ""protein-coding"", uri: ""http://jdr.bio/ontologies/alzkb.owl#gene_tbp"", xrefEnsembl: ""ENSG00000112592"", xrefHGNC: ""11588"", xrefNcbiGene: 6908, xrefOMIM: ""600075""})";"[:GENEPARTICIPATESINBIOLOGICALPROCESS]"; "(:Gene {commonName: ""TATA-box binding protein"", geneSymbol: ""TBP"", nodeID: ""1"", typeOfGene: ""protein-coding"", uri: ""http://jdr.bio/ontologies/alzkb.owl#gene_tbp"", xrefEnsembl: ""ENSG00000112592"", xrefHGNC: ""11588"", xrefNcbiGene: 6908, xrefOMIM: ""600075""})";"[:GENEPARTICIPATESINBIOLOGICALPROCESS]"'