In [1]:
test_dict = {
    'BiologicalProcess': {
        'regulation_labels': 
            'regulation of nervous system development',

        'development_labels': [
            'tube morphogenesis',
            'organ development'
        ]
    },
    'CellComponent': {
        'structure_labels': [
            'mitochondrial membrane',
            'cell cortex'
        ],
        'function_labels': [
            'protein binding',
            'ion channel activity'
        ]
    }
}

## Preparing Inputs

### Cook labels and properties
- Grep csv file containing node labels, relationships, property labels, and wanted properties from Memgraph
- Create a nested dictionary with {node_labels:{sub_dictionary of properties}}, the sub_dictionary contains corresponding {property_labels: properties}

In [2]:
import re
import random
import json
import pandas as pd
import copy
import hashlib
import base64
import numpy as np

In [3]:


# Open and load the graph schema json file
with open('./example/schema.json', 'r',encoding='utf-8-sig') as file:
    schema = json.load(file)
    
# Extract nodes and edges from the schema
#labels = [node['labels'][0] for node in schema[0]['nodes']]
relationships = [relationship['type'] for relationship in schema[0]['relationships']]


# Get detailed properties from the csv file
common_names = pd.read_csv('./example/memgraph-query-results-export.csv', index_col=False)

def group_labels(df, label_col, name_col):
    grouped = df.groupby(label_col)[name_col].apply(list).to_dict()
    return grouped

# Applying the function
grouped_names = group_labels(common_names, 'label', 'commonName')

#***************************************
labels = list(grouped_names.keys())
property_labels= ["commonName"] #will be generalized later

In [7]:
common_names.head()

Unnamed: 0,label,commonName
0,Drug,Basiliximab
1,Drug,Muromonab
2,Drug,Trastuzumab
3,Drug,Rituximab
4,Drug,Ibritumomab tiuxetan


### Create alzkb nested dictionary
- In the current test case, all node labels are used; only commonName properties are selected for all nodes except that geneSymbol for Gene node is also added.

In [4]:
# type(grouped_names)
# grouped_names.keys()
# list(grouped_names.values())[0]

geneSymbol_csv = pd.read_csv('./example/geneSymbol.csv', index_col=False)
# type(geneSymbol_csv)
# type(geneSymbol_csv['g.geneSymbol'])

geneSymbol = list(geneSymbol_csv['g.geneSymbol'])
# geneSymbol

geneSymbol_sub_dict = {}
geneSymbol_sub_dict['geneSymbol'] = geneSymbol


alzkb_nested_dict = {}
for key in grouped_names.keys():
    sub_dict = {}
    if key == 'Gene':
        sub_dict['commonName']= grouped_names[key]
        sub_dict['geneSymbol']= geneSymbol
    else:
        sub_dict['commonName']= grouped_names[key]
    alzkb_nested_dict[key] = sub_dict

# alzkb_nested_dict['Gene'].keys()



In [5]:
with open('alzkb_nested_dict_test.txt', 'w') as test: 
     test.write(json.dumps(alzkb_nested_dict))

## DepthManager, QueryManager, Nodes definitions

### DepthManager

In [5]:
class TestDepthManager: #TRY TO MAKE DEPTHMANAGER SINGLETON 
    _max_depth = 5  # Default maximum depth
    _min_depth = 3
    _instance = None

    @classmethod
    def getInstance(cls):
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        if TestDepthManager._instance is not None:
            raise Exception("This is a singleton class. Use 'getInstance()'.")
        self.depth = 0  # Starting depth
        self.depth_record = {}

    @classmethod
    def set_max_depth(cls, depth):
        if depth > cls._min_depth:
            cls._max_depth = depth
        else:
            print("Maximum depth cannot be smaller than the min depth! \n The default max_depth is", cls._max_depth)
    
    def reset_depth(self):
        self.depth = 0
    
    # def reset_depth_record(self):
    #     self.depth_record = {}
    
def depth_control(func):
    def wrapper(*args, **kwargs):
        dm = TestDepthManager.getInstance()
        if dm.depth == dm._max_depth:
            print("Max depth reached")
            return None
        result = func(*args, **kwargs)
        dm.depth += 1  # Increment depth after function call
        return result
    return wrapper

def class_depth_control(cls):
    class WrappedClass(cls):  # Create a new class that wraps the original class
        def __init__(self, *args, **kwargs):
            dm = TestDepthManager.getInstance()
            if dm.depth == dm._max_depth:
                print("Max depth reached")
                return None
            super().__init__(*args, **kwargs)
            dm.depth += 1
    return WrappedClass 

### Nodes

In [6]:

class TreeNode:
    depth = 0
    score = 1 #defaultly assume tree as garbage query
   
    def __init__(self, value):
        self.value = value
        self.children = []
        # self.level = 0

    def add_child(self, node):
        """Add a TreeNode or value as a child."""
        # if not isinstance(node, TreeNode):
            # node = TreeNode(node)  # Ensure all children are TreeNode instances
        self.children.append(node)
        # self.level += 1

    def __str__(self):
        # Use the helper method for generating the string with indentation
        return self._str_recursive(level=0)

    def _str_recursive(self,level):
        # Create the string representation with indentation for current node
        ret = "\t" *level + str(self.value) + "\n"  # Indent based on the current level
        for child in self.children:
            ret += child._str_recursive(level+1)
        return ret

    def __repr__(self):
        return f'<TreeNode {self.value}>'
    
    def get_depth(self):
        pass #because it needs later-defined class type

    def to_querystr(self):
        """
        convert the generate query tree into query string with ; separation to get ready for querying the Memgraph client
        """
        child_compose = ''
        final_query_str = 'MATCH'
        for child in self.children:
            if child.children:
                for grandchild in child.children:
                    child_compose = ' '+ str(grandchild.value)
            final_query_str += ' ' + str(child.value) + child_compose
        final_query_str += ';'
        return final_query_str

        
class Clause(TreeNode):
    def __init__(self, value, children=None):
        super().__init__(value)
        self.children = children if children is not None else []

    def __str__(self):
        if not self.children:
            return str(self.value)
        if self.value == "RETURN":
            return f"{self.value} {', '.join(str(child) for child in self.children)}"
        return f"{self.value} {' '.join(str(child) for child in self.children)}"
    
class Node(TreeNode):
    """
    When called, will add connector to nodes and make nodes Node type
    """ 
    def __init__(self, value, children=None):
        super().__init__(value)
        self.children = children if children is not None else []
 
    def __str__(self):
        if not self.children:
            return str(self.value)
        # if self.value == '-':  
        #     return ' '.join(str(child) for child in self.children)
        return f"{self.value}({', '.join(str(child) for child in self.children)})"

# @class_depth_control
class Relationship(TreeNode):
    """
    When called, will add connector to relationships and make relationships Relationship type
    """
    def __init__(self, value, hop_only=False):
        super().__init__(value)
        # self.children = children if children is not None else []
        self.hop_only = True if hop_only else False
        
    @depth_control
    def __str__(self):
        # if not self.children:
            # return str(self.value)
        return f"{self.value}"
        # return f"{self.value} {' '.join(str(child) for child in self.children)}"

    def calculate_individual_depth(self):
        # This method checks for both the presence of a relationship and additional depth from hops
        base_depth = 1 if self.hop_only == False else 0 # Start with a depth of 1 for the relationship itself
        # Look for hop patterns, each '*' adds one to the depth
        hop_matches = re.findall(r'\*', self.value.value)
        return base_depth + len(hop_matches)  # Add one additional depth for each hop pattern

def get_depth(self):
    # This now maps through children, checking if they are Relationship instances, and sums their depths
    updated_depth = 0
    for child in self.children:
        if isinstance(child, Relationship):
            updated_depth += child.calculate_individual_depth()
        elif isinstance(child,Condition):
            updated_depth += 1
        else:
            updated_depth = updated_depth
    # updated_depth = sum(child.calculate_individual_depth() if isinstance(child, Relationship) else 0 for child in self.children)
    return updated_depth
TreeNode.get_depth = get_depth


class Condition(TreeNode):
    def __init__(self, value, children=None):
        super().__init__(value)
        self.children = children if children is not None else []

    def __str__(self):
        if not self.children:
            return str(self.value)
        return f"{self.value} {' '.join(str(child) for child in self.children)}"


### QueryManager

In [7]:
# class DepthManagerInUse:

class TestQueryManager:
    def __init__(self, dm):
        self.root = TreeNode("ROOT")  # All parts will be children of root
        self.current_node = self.root  # Current node context for adding parts
        self.node_labels = []
        self.relationships = []
        self.grouped_info = {}
        self.usable_labels = set()
        # self.usable_props = set()

        
        self.parts = []
        self.selected_label_alias = {}

        self.query_str = ''
        
        self.final_depth = 0
        # self.depth_manager = DepthManager()
        # self.id = self.generate_id()
        self.observed_depths = set()
        self.dm = dm
       
    
    @staticmethod
    def generate_id(input_object):
        if isinstance(input_object, TreeNode):
            content = str(input_object)  
        elif isinstance(input_object, str):
            content = input_object
        else:
            raise ValueError("Unsupported object type for ID generation")
        hash_digest = hashlib.sha256(content.encode('utf-8')).digest()
        return base64.urlsafe_b64encode(hash_digest).decode('utf-8').rstrip('=')

        

    def reset_per_gen(self):
        self.root = TreeNode("ROOT")
        self.current_node = self.root  # Reset the tree for new generation
        self.parts = []
        self.selected_label_alias = {}
        self.usable_labels.clear()
        # self.usable_props.clear()
        self.final_depth = 0
        self.query_str = ''

    def import_grouped_info(self, input_group):
        if input_group:
            if type(input_group) is dict:
                self.grouped_info = input_group
                self.node_labels = list(self.grouped_info.keys())
                # print("loaded node_labels:",self.node_labels)
            else:
                print("input grouped info need to be dictionary type")
        else:
            print("input grouped info cannot be empty")
    
    def import_relationships(self, input_relationships):
        if input_relationships:
            self.relationships = input_relationships
        else:
            print("relationships cannot be empty")
    
    def create_unique_alias(self, label):
        """Creates a unique alias for a node to prevent label overlap in queries."""
        base_alias = label.lower()
        alias = base_alias
        counter = 1
        while alias in self.usable_labels:
            alias = f"{base_alias}{counter}"
            counter += 1
        return alias
    
    def extract_alias_label(self, node_value_str):
        # Pattern to capture both the alias and the node label
        pattern = r"^\(([^:]+):([^ {]+)"
        match = re.search(pattern, node_value_str)

        if match:
            alias = match.group(1).strip()  # Get the alias, strip any whitespace
            label = match.group(2).strip()  # Get the label, strip any whitespace
            return alias, label
        else:
            # No match found, return None for both
            return None, None

    def add_node(self):
        """Adds a node with random properties selected from grouped_info."""
        # node_label = ''
        if self.node_labels:
            node_label = random.choice(self.node_labels) 
            possible_props = self.grouped_info[node_label]
            property_label, properties_list = random.choice(list(possible_props.items()))
            alias = self.create_unique_alias(node_label)
            self.selected_label_alias[alias]=node_label

            property_value = random.choice(properties_list)
            properties_str = f'''{property_label}: "{property_value}"''' if possible_props else ''
            node_value = f"{node_label} {{{properties_str}}}"


            node = Node(f"({alias}:{node_value})")
            # self.current_node.add_child(node)

            # self.nodes.append(node)
            self.usable_labels.add(alias)  # Store label for possible RETURN clause usage
            return node 
        print("No node labels available. Please import grouped info first.")
        return None

    def add_hop(self):
        """
        Randomly generate hops as condition to relationship based on a customizable possibility;
        the default possibility is 0.2
        """
        current_depth = self.dm.depth
        hop = random.randint(1,3) #TODO: see if this is reasonable
        upper_hop = hop + random.randint(1,5)
        exact_hop = f"*{hop}"
        ceiling_hop = f"*..{upper_hop}"
        floor_hop = f"*{hop}.."
        interval_hop = f"*{hop}..{upper_hop}"
        hop_choices = [exact_hop, ceiling_hop, floor_hop, interval_hop]
        if current_depth < self.dm._max_depth: #random.random() > hop_p and 
            hop_choice = random.choice(hop_choices)
            return hop_choice
        else:
            return ''



    def add_relationship(self, bi_dir_p=0.3, rev_dir_p=0.5, hop_only_p=0.2, hop_p=0.2):
        """ 
        Randomly generate a relationship between two nodes 
        bi_dir: probability of getting a bidirectional direction
        rev_dir_p: probability of getting a reversed direction
        hop_only_p: probability of getting only hops without specific relationships
        hop_p: probability of getting hops in addition to a specific relationship
        """
        current_depth = self.dm.depth
        rel_type = random.choice(self.relationships)
        if random.random() < bi_dir_p:
            direction1 = "-"
            direction2 = "-"
        if current_depth>=3 and random.random() > rev_dir_p: 
            direction1 = "<-"
            direction2 = "-"
        else:
            direction1 = "-" 
            direction2 = "->"
        # if random.random() > hop_p:
        
        hop_result = Relationship(self.add_hop()) if random.random() > hop_p else ''
        if random.random() > hop_only_p and hop_result:
            relationship = Node(f"{direction1} [{hop_result}] {direction2}")
            return Relationship(relationship, hop_only=True)
        else:
            relationship = Node(f"{direction1} [:{rel_type}{hop_result}] {direction2}")
            return Relationship(relationship)
        # self.current_node.add_child(relationship)
        # return relationship
        # return Relationship(relationship, hop_only)
        
    @depth_control
    def add_condition(self, children_source, where_p=0.5, for_ea=False):
        """
        Randomly generate WHERE clause based on a customizable possibility;
        Will add to a random node as its child (no)
        the default possibility where_p is 0.5;
        currently only accepts and will only generate values that are str type properties
        children_source: list of children. If from tree_population, should be tree.children, or other list type.
        """
        if for_ea == True:
            current_depth = 0 #to make sure as long as WHERE is found in previous query, it will be replaced only by chance and not depth
        else:
            current_depth = self.dm.depth
        if random.random() > where_p and current_depth < self.dm._max_depth:
            
            np_children = np.array(children_source,dtype=object)
            is_node = np.vectorize(lambda x: isinstance(x, Node))
            # Apply the function to the numpy array
            node_checks = is_node(np_children)
            node_idx = np.where(node_checks)
            node_children = np_children[node_idx]
            random_node = random.choice(node_children)

            alias, node_label = self.extract_alias_label(random_node.value)
            # alias, node_label = random.choice(list(self.selected_label_alias.items()))
            # print(alias, node_label)

            # selected_node_label = random.choice(selected_node_labels)
            possible_properties = self.grouped_info[node_label.strip()]
            if possible_properties:
                property_label, properties_list = random.choice(list(possible_properties.items()))
                sample_prop_type = properties_list[0]
                # value = random.randint(20, 50) if isinstance(sample_prop_type, int) else random.choice(properties_list) 
                value = random.choice(properties_list) #TODO: generalize to other data type
            #TODO: customize the int part

                operator = random.choice([">", "<", "=", "<=", ">="]) if isinstance(sample_prop_type, int) else '='
                # grandchild = Condition("WHERE", [Condition(f"{alias}.{property_label} {operator} '{value}'")])
                # random_node.add_child(grandchild)
                # print("Added WHERE clause to node", random_node)
                return Condition("WHERE", [Clause(f'''{alias}.{property_label} {operator} "{value}"''', [])])
            else:
                raise ValueError("No available properties for the label selected:", {node_label})
        return 
    
    @staticmethod
    def is_relationship(part):
        """
        Determine if the given part of a query is a relationship based on containing "[]"
        Ensures that part is a string before checking.
        """
        # pattern = re.compile(r'\[(.*?)\]')
        trying = r"-\s*\[:?([A-Za-z0-9_]+)?(\*\d*(\.\.\d*)?)?\]\s*[-<>]?"
        # Ensure part is a string or bytes-like object
        if isinstance(part,str):
            # if pattern.search(part):
            if re.search(trying, part):
                return True
            else:
                return False
        else:   
            print("input has to be str!")
            return None
        

    
    def get_usable_labels(self):
        return list(self.usable_labels)
    
    def add_return(self, return_num=None):

        # print("selected_label_alias:", self.selected_label_alias)

        selected_alias = list(self.selected_label_alias.keys())
        selected_node_labels = list(self.selected_label_alias.values())
        
        if return_num:
            random_k = random.randint(1,return_num)
        else:
            random_k = random.randint(1,len(selected_alias))
           
        # print("selected_node_labels:",selected_node_labels)
        # choices = random.sample(self.usable_labels, random_k)
        random_indices = random.sample(range(len(selected_node_labels)), random_k)
        return_list = []
        for i in random_indices:
            current_alias = selected_alias[i]
            current_label = selected_node_labels[i]
            # print("type of current_label:", repr(current_label), type(current_label))
            # print("check if current_label is in self.node_labels", repr(self.node_labels), current_label in self.node_labels)
            current_possible_properties = self.grouped_info[str(current_label).strip()]
            if current_possible_properties:
                property_label = random.choice(list(current_possible_properties.keys()))
                current_return = Clause(f"{current_alias}.{property_label}")
                return_list.append(current_return)

        return Clause("RETURN", return_list)
        # return None
    
    def parts_to_str(self):
        """
        convert the generate query tree into query string with ; separation to get ready for querying the Memgraph client
        """
        final_query_str = 'MATCH'
        for part in self.parts:
            final_query_str = final_query_str + ' ' + str(part)
        final_query_str += ';'
        return final_query_str


    
    ### FOR CROSSOVER RETURN ADJUSTMENT

    def collect_alias_labels(self, tree):
        """ Recursively collect labels from the tree that are usable in the RETURN clause. """
        if isinstance(tree, TreeNode) and isinstance(tree.children, list):
            for child in tree.children:
                # Extract label from the current node's value and add it to usable labels
                child_value = str(child.value)
                # print(child_value, type(child_value))
                # label = self.extract_node_alias(child_value)
                alias, label = self.extract_alias_label(child_value)
                if alias and label:
                    self.selected_label_alias[alias] = label
                    # self.usable_labels.add(label)
            # Recursively process each child
                # self.collect_labels(child)
                
    
    def adjust_return(self, tree):
        """ Adjust the RETURN clause based on the labels collected from the tree. """
        if not isinstance(tree, TreeNode):
            raise TypeError("Expected a tree that is TreeNode instance")
        # Clear existing labels and recollect from the new tree structure
        # self.usable_labels.clear() 
        self.selected_label_alias = {}
        self.collect_alias_labels(tree)
        
        if self.selected_label_alias:
            # random_k = random.randint(1, len(self.usable_labels))
            # choices = random.sample(self.usable_labels, random_k)
            new_return = self.add_return()
            
            if tree.children and isinstance(tree.children[-1], TreeNode) and "RETURN" in str(tree.children[-1].value):
                tree.children[-1] = new_return  # Replace the last child with the new RETURN clause
            else:
                tree.add_child(new_return)  # Add new if no RETURN exists
            # print("updated return:",tree.children[-1])
            return tree
        else:
            return None
        
    
    def generate_query(self, flag=True, return_num=None, part_num=None, hop_p=0.5, where_p=0.5):
        self.reset_per_gen()
        self.dm.reset_depth()
        def alternate_functions(flag):
            if flag:
                return self.add_node(), not flag
            else:
                return self.add_relationship(hop_p), not flag
        if part_num is None:
            part_num = random.randint(1, self.dm._max_depth-2)
        # Keep adding nodes and relationships while depth is within limit
        for _ in range(part_num+1):
            part, flag = alternate_functions(flag)
            if part is None:
                break
            self.parts.append(part)
            # self.current_node.add_child(TreeNode(part))
            self.current_node.add_child(part)
        if self.parts and self.is_relationship(str(self.parts[-1]))==True: #ensure the input part is in string format
            final_node = self.add_node()  # Generate a final node
            if final_node:
                self.parts.append(final_node)
                # print("final_node added:", final_node)
                # self.current_node.add_child(TreeNode(final_node))
                self.current_node.add_child(final_node)
        # Optionally add a WHERE clause to a random node if depth is still under max_depth
        condition = self.add_condition(self.current_node.children, where_p) 
        if condition:
            self.parts.append(condition)
            self.current_node.add_child(condition)
           
        # Add RETURN clause 
        ret = self.add_return(return_num)
        if ret:
            self.parts.append(ret)
            self.current_node.add_child(ret)

        self.query_str = self.parts_to_str()
        self.current_node.depth = self.dm.depth
        return self.current_node, self.query_str #return the treenode type and string type of query



## Next building workflow:
- function to convert resulting tree nodes into working queries (done)
- write them into test_queries.txt file (done)
- then apply the bash file & convertion function to run the queries & store as readable csv (done)
- write a basic scoring/evaluation function on depth (done)

#### Fitness function basics buildup

#### EA try
- initialize population with cutomizable size and max depth
- all queried through mgconsole and only maintained and added to the population if it returns result
- fitness function to evaluate initial population

Questions
- Do I need to make sure all initial queries must work?
- generally how customizable we want EA to be? and how do we usually achieve such flexibility in adjusting parameters? through functions or...?
- is the current method of initializing population "wise" enough?
- 


In [8]:
import mgclient
import threading
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor

class DatabaseConn:
    def __init__(self, host, port):
        self.host = host
        self.port = port
        # Create a thread-local data storage
        self.local = threading.local()

    def get_connection(self):
        """
        Retrieve or establish a database connection for the current thread.
        """
        if not hasattr(self.local, 'conn'):
            self.local.conn = mgclient.connect(host=self.host, port=self.port)
            self.local.conn.autocommit = True
        return self.local.conn

    def execute_query(self, query):
        """
        Execute a given query using a thread-specific database connection.
        """
        try:
            conn = self.get_connection()
            cursor = conn.cursor()
            cursor.execute(query)
            results = cursor.fetchall()
            return results
        except Exception as e:
            print(f"Failed to execute query {query}: {str(e)}")
            return e
        finally:
            if 'cursor' in locals():
                cursor.close()

In [9]:
import random
import mgclient
import threading
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
conn = mgclient.connect(host='127.0.0.1', port=7687)
cursor = conn.cursor()

class EvolutionaryAlgorithm:
    def __init__(self, qm, depth_manager, population_size, max_depth, max_generation, mut_rate=0.05):
        self.population_size = population_size
        self.max_depth = max_depth
        self.tree_population = []
        self.str_population = []
        self.fitness_scores = {}
        self.query_ids = {}
        self.observed_depths = set() #TODO: clear this set after each initializED population
        self.qm = qm
        self.depth_manager = depth_manager
        self.depth_manager.set_max_depth(self.max_depth)
        # self.database_conn = database_conn
        self.generation = 0
        self.max_generation = max_generation
        self.mut_rate = mut_rate

        # self.host = host
        # self.port = port

        self.valid_queries = [] #store queries with score = 2 across all generations

            
    def population_to_query_list(self, population):
        return [tree.to_querystr() for tree in population]

        
    def evaluate_population(self, max_workers=5):
        """ Evaluates the entire population and updates fitness scores. """
        self.generation += 1 
        # query_list = self.population_to_query_list(self.tree_population)
        successful_indices, failed_indices = self.run_queries_multithreaded(tree_population=self.tree_population, max_workers=max_workers)
        for index, tree in enumerate(self.tree_population):
            if index in successful_indices:
                tree.score += 1
                self.valid_queries.append(tree)
            elif index in failed_indices:
                tree.score -= 1


    def initialize_population(self): #TODO: change to generate till size is satisfied
        """ Initializes the population with random depth queries. """
        for _ in range(self.population_size):
            # self.depth_manager.set_max_depth(self.max_depth)
            tree, query = self.qm.generate_query()
            self.tree_population.append(tree)
            self.str_population.append(query)
            self.qm.reset_per_gen()

    # def test_initialize_population(self):
    #     """ Initializes the population with random depth queries that return results. """  
    #     # Generate queries until the population size is satisfied
    #     while len(self.tree_population) < self.population_size:
    #         tree, query = self.qm.generate_query()
    #         if self.is_valid_query(cursor, query):
    #             self.tree_population.append(tree)
    #             self.str_population.append(query)
    #         self.qm.reset_per_gen()

    #     cursor.close()
    #     conn.close()

    def is_valid_query(self, cursor, query):
        """ Execute the query and check if it returns any results. """
        try:
            cursor.execute(query)
            results = cursor.fetchall()
            return len(results) > 0  # Return True if there are results
        except Exception as e:
            print(f"Query failed: {e}")
            return False  # Query failed or returned no results


    def tournament_parent_selection(self, k: int = None):
        """
        Selects the fittest individual from a random sample of the population using a tournament selection approach.

        Parameters:
        - k (int, optional): The number of individuals to sample for the tournament. Defaults to half the population size.

        Returns:
        - The fittest individual from the sampled tournament.
        """
        if k is None:
            k = self.population_size // 2
        if k > len(self.tree_population):
            raise ValueError("Sample size k cannot be larger than the population size.")
        tournament = random.sample(self.tree_population, k)
        fittest = max(tournament, key=lambda individual: self.fitness_scores[self.query_ids[individual]])
        return fittest

    def select_parents(self, num_pairs, k):
        """
        Selects pairs of parents for reproduction, ensuring parents within a pair would not repeat.

        Parameters:
        - num_pairs (int): The number of parent pairs to select.

        Returns:
        - List of tuples, where each tuple contains two parent individuals.
        """
        if num_pairs <= 0:
            raise ValueError("num_pairs must be a positive integer")
        elif num_pairs * 2 > len(self.tree_population):
            raise ValueError("Insufficient population to select the requested number of unique pairs")

        parents = []
        parent_pairs=[]
        # selected_individuals = set()  # Keep track of selected individuals

        while len(parents) < num_pairs * 2: #and len(selected_individuals) < len(self.tree_population):
            parent1 = self.tournament_parent_selection(k)
            parent2 = self.tournament_parent_selection(k)
            while parent1 == parent2:
                parent2 = self.tournament_parent_selection(k)
            parent_pair = (parent1,parent2)
            parents.append(parent1)
            parents.append(parent2)
            parent_pairs.append(parent_pair)
        return parent_pairs

    # JUL25: implementing mutation
    def mutate_query(self, tree):
        """
        Randomly mutate either choice based on mut_rate probability below:
        - node label of the query
        - WHERE clause
        And the returned tree query will also have an adjusted RETURN clause
        """
        mutations = ['node_label','condition']
        mut_type = random.choice(mutations)
        if type(tree)==TreeNode:
            if mut_type == 'condition':
                # mutated_condition = self.qm.add_condition(for_ea=True)
                for index, element in enumerate(tree.children):
                    if 'WHERE' in str(element.value):
                        children_source = tree.children[:index] + tree.children[index+1:]
                        tree.children[index] = self.qm.add_condition(children_source, for_ea=True)
                    else:
                        mut_type = 'node_label' #ensure mutation takes place for selected queries without WHERE clause
            if mut_type == 'node_label':
                # mutated_node = self.qm.add_node()
                indices = []
                for index, element in enumerate(tree.children):
                    if type(element)==Node:
                        indices.append(index)
                ind = random.choice(indices)
                tree.children[ind] = self.qm.add_node()
            tree = self.qm.adjust_return(tree)
            return tree
        else:
            raise ValueError("The input tree query has to be TreeNode type!")

    def mutation(self):
        num_to_mutate = int(len(self.tree_population) * self.mut_rate)
        for _ in range(num_to_mutate):
            # Randomly pick an individual to mutate
            individual_index = random.randint(0, len(self.tree_population) - 1)
            print("mutated index:", individual_index)
            # Perform mutation on this individual
            self.tree_population[individual_index] = self.mutate_query(self.tree_population[individual_index])
        return self.tree_population  

    def swap(self, tree1, tree2):
        if not tree1.children or not tree2.children:
            print("One of the trees does not have children to perform swapping.")
            return
        # Select random subtree indices from both trees
        index1 = random.randint(0, len(tree1.children) - 1)
        index2 = random.randint(0, len(tree2.children) - 1)

        tree1_swap = copy.deepcopy(tree1)
        tree2_swap = copy.deepcopy(tree2)

        # Swap the subtrees
        tree1_swap.children[index1], tree2_swap.children[index2] = \
            tree2_swap.children[index2], tree1_swap.children[index1]
        print("Swapping completed.")
        return tree1_swap, tree2_swap

    def one_point_crossover(self, tree1, tree2):
        if not tree1.children or not tree2.children:
            print("One of the trees does not have children to perform crossover.")
            return
        #get indices that are not relationships as possible crossover point
        node_indices1 = [index for index, child in enumerate(tree1.children) if type(child)!= Relationship]
        node_indices2 = [index for index, child in enumerate(tree2.children) if type(child)!= Relationship]

        #check node existence
        if not node_indices1 or not node_indices2:
            print("No nodes available for crossover in one or both trees.")
            return

        #select random node indices from the filtered lists
        index1 = random.choice(node_indices1[:-1])
        index2 = random.choice(node_indices2[:-1])

        #exchange the subtrees at these indices
        tree1_crossover = copy.deepcopy(tree1)
        tree2_crossover = copy.deepcopy(tree2)

        tree1_crossover.children[index1:], tree2_crossover.children[index2:] = \
            tree2_crossover.children[index2:], tree1_crossover.children[index1:]
        
        #adjust RETURN clause based on exchanged trees
        tree1_crossover = self.qm.adjust_return(tree1_crossover)
        tree2_crossover = self.qm.adjust_return(tree2_crossover)

        print("Crossover and Return clause adjustment completed.")
        return tree1_crossover, tree2_crossover
    

    def output_top_queries(self, top_n):
        """
        Outputs the top N queries from the current population based on fitness scores,
        considering depth diversity and query diversity.
        
        Parameters:
        - top_n (int): Number of top queries to return.
        
        Returns:
        - list: Top N queries as per the defined criteria.
        """
        # Sort the population based on fitness scores

        sorted_population = sorted(self.tree_population, key=lambda x: self.fitness_scores[self.query_ids[x]], reverse=True)
        # Implement logic to ensure diversity if needed, example placeholder:
        # diverse_population = self.ensure_diversity(sorted_population, top_n)
        top_queries_with_scores = [(query, self.fitness_scores[self.query_ids[query]]) for query in sorted_population[:top_n]]
        return top_queries_with_scores
        # return diverse_population[:top_n]


    def reset_ea(self):
        # self.depth_manager.reset_depth_record()
        self.observed_depths = set()
        self.tree_population = []
        self.str_population = []
        self.fitness_scores = {}
        self.query_ids = {}
        self.generation=0



#TEST WITH INDEPENDENT FUNCTIONS
# def execute_query(query):
#     # Create thread-local storage for database connections
#     if not hasattr(execute_query, "conn"):
#         execute_query.conn = mgclient.connect(host='127.0.0.1', port=7687)
#         execute_query.conn.autocommit = True
#     try:
#         cursor = execute_query.conn.cursor()
#         cursor.execute(query)
#         results = cursor.fetchall()
#         return results
#     except Exception as e:
#         print(f"Failed to execute query {query}: {str(e)}")
#         return None
#     finally:
#         cursor.close()
# def execute_query(query):
#     # Create thread-local storage for database connections
#     if not hasattr(execute_query, "conn"):
#         execute_query.conn = mgclient.connect(host='127.0.0.1', port=7687)
#         execute_query.conn.autocommit = True
#     try:
#         cursor = execute_query.conn.cursor()
#         cursor.execute(query)
#         results = cursor.fetchall()
#         if not results:
#             # No results found, return a specific value that indicates empty results
#             return 'NoResult'
#         else:
#             return results
#     except Exception as e:
#         # print(f"Failed to execute query {query}: {str(e)}")
#         return None
#     finally:
#         cursor.close()

# def run_queries_multithreaded(query_list, max_workers=5):
#     # Create a ThreadPoolExecutor to manage multiple threads
#     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
#         # Map futures to their corresponding query indices
#         future_to_index = {executor.submit(execute_query, query): i for i, query in enumerate(query_list)}

#         # Create a list to store the indices of successful queries
#         successful_indices = []
#         failed_indices = []

#         # Collect results as they complete
#         for future in concurrent.futures.as_completed(future_to_index):
#             index = future_to_index[future]
#             try:
#                 result = future.result()
#                 if result:
#                     print(f"Query at index {index} returned results.")
#                     successful_indices.append(index)
#                 else:
#                     print(f"Query at index {index} returned no results.")
#             except Exception as e:
#                 print(f"Query execution failed for index {index}: {str(e)}")
#                 failed_indices.append(index)

#         return successful_indices, failed_indices

### Sample Usage

In [10]:
databaseTest = DatabaseConn(host='127.0.0.1',port=7687)

In [11]:
dmTest = TestDepthManager.getInstance()
qmTest = TestQueryManager(dm=dmTest)
qmTest.import_grouped_info(alzkb_nested_dict)  
qmTest.import_relationships(relationships)  


ea = EvolutionaryAlgorithm(qm=qmTest, depth_manager=dmTest, population_size=50,max_depth=4, max_generation=3)
# ea.test_initialize_population() 
print("generation:",ea.generation)
ea.initialize_population() 
# for index, query in enumerate(ea.str_population):
#     print(index, query)
# ea.mutation()
query_list = ea.population_to_query_list(ea.tree_population)

generation: 0


In [12]:
query_list

['MATCH (drugclass:DrugClass {commonName: "Corticosteroid Hormone Receptor Agonists"}) - [:DISEASELOCALIZESTOANATOMY] -> (drugclass1:DrugClass {commonName: "UGT1A4 Inhibitors"}) WHERE drugclass.commonName = "Decreased Histamine Release" RETURN drugclass.commonName;',
 'MATCH (cellularcomponent:CellularComponent {commonName: "nuclear condensin complex"}) - [*1] -> (gene:Gene {geneSymbol: "LOC110121188"}) WHERE gene.geneSymbol = "LOC130065403" RETURN gene.geneSymbol;',
 'MATCH (bodypart:BodyPart {commonName: "autonomic ganglion"}) - [*3..5] -> (symptom:Symptom {commonName: "Ageusia"}) WHERE bodypart.commonName = "sphenoid bone" RETURN bodypart.commonName;',
 'MATCH (bodypart:BodyPart {commonName: "nervous system"}) - [:GENEASSOCIATEDWITHCELLULARCOMPONENT] -> (disease:Disease {commonName: "Non-Alzheimer\'s dementia (e.g., Lewy body dementia, vascular or multi-infarct dementia; mixed dementia; frontotemporal dementia such as Pick\'s disease; and dementia related to stroke, Parkinson\'s or 

## Connect to Memgraph and Execute Queries

#### First half: connect to memgraph console mgconsole
- Need docker installation following notion notes; paste them here later
- But the bash file will handle the querying so no actual terminal operation needed from the user side

#### Second half: query execution
- because of the csv format glich of memgraph, the bash file **single_query_run.s**h** first execute each individual generated query in the **test_queries.txt**
- the results are stored in the outputs folder in fake csv format
- a function converting and aggregating the results into readable, interpretable csv is then applied to obtain a **aggregated_results.csv**
 

In [12]:
"""
1. write 10 queries into a txt file
2. execute
3. write another 10 queries into txt, execute ... till all are executed -- DONE
4. grep results from aggregated csv with indices. --DONE
    if "no_result", no change;
    if error, corresponding query.score = 0
    if return results, corresponding query.score = 2 
"""

'\n1. write 10 queries into a txt file\n2. execute\n3. write another 10 queries into txt, execute ... till all are executed -- DONE\n4. grep results from aggregated csv with indices. --DONE\n    if "no_result", no change;\n    if error, corresponding query.score = 0\n    if return results, corresponding query.score = 2 \n'

In [31]:
import logging
import os
import subprocess
from multiprocessing import Pool
from test_multiprocessing import create_batch, execute_script, aggregate_text_files, merge_csv_files, get_indices
########################################
#TESTING WITH MULTIPROCESSING
# def process_batches(batch_folder):
#     # Find all batch files
#     batch_files = [os.path.join(batch_folder, f) for f in os.listdir(batch_folder) if f.startswith('batch') and f.endswith('.txt')]
#     # Ensure the script is executable
#     subprocess.run(['chmod', '+x', 'test_query_run.sh'])
    
#     # Process each batch in parallel
#     with Pool(processes=os.cpu_count()) as pool:
#         pool.map(execute_script, batch_files)

#     # Aggregate results
#     folder_path = './outputs'
#     output_file = './aggregates/results.csv'
#     aggregate_text_files(folder_path, output_file)

# # Merge CSV files if needed
# merge_csv_files('aggregates', 'multiproc_merged_output.csv')

# # Get indices of successful and failed queries
# get_indices('multiproc_merged_output.csv')


In [None]:
import os
import shutil
import subprocess
from multiprocessing import Pool
from test_multiprocessing import create_batch, process_batch

# create_batch(query_list, batch_size=10)
if __name__ == "__main__":
    batch_folder = './input_batch'
    with Pool(processes=os.cpu_count()) as pool:
        batch_files = [os.path.join(batch_folder, f) for f in os.listdir(batch_folder) if f.startswith('batch')]
        pool.starmap(process_batch, [(batch_file, i) for i, batch_file in enumerate(batch_files)])


Current Batch processed.
Aggregated to ./aggregates/results3.csv
Current Batch processed.
Aggregated to ./aggregates/results4.csv


KeyboardInterrupt: 

Failed query: MATCH (gene:Gene {commonName: "-"}) - [*3..] -> (pathway:Pathway {commonName: "Cargo trafficking to the periciliary membrane"}) WHERE gene.geneSymbol = "LOC129999684" RETURN gene.geneSymbol
Client received query exception: Transaction was asked to abort because of transaction timeout.


Query execution failed, writing 'failed' to ./outputs/1.csv


Failed query: MATCH (symptom:Symptom {commonName: "Auditory Perceptual Disorders"}) - [*1..] -> (gene:Gene {commonName: "-"}) RETURN symptom.commonName
Client received query exception: Transaction was asked to abort because of transaction timeout.


Query execution failed, writing 'failed' to ./outputs/6.csv


Failed query: MATCH (disease:Disease {commonName: "Lewy Body Variant of Alzheimer Disease"}) - [*3..7] -> (pathway:Pathway {commonName: "RUNX3 regulates NOTCH signaling"}) WHERE disease.commonName = "Non-Alzheimer's dementia (e.g., Lewy body dementia, vascular or multi-infarct dementia; mixed dementia; frontotemporal dementia such as Pick's disease; and dementia related to stroke, Parkinson's or Creutzfeldt-Jakob diseases)" RETURN pathway.commonName
Client received query exception: Transaction was asked to abort because of transaction timeout.


Query execution failed, writing 'failed' to ./outputs/10.csv
Current Batch processed.
Current Batch processed.
Current Batch processed.


Failed query: MATCH (gene:Gene {commonName: "-"}) - [*3..] -> (pathway:Pathway {commonName: "Cargo trafficking to the periciliary membrane"}) WHERE gene.geneSymbol = "LOC129999684" RETURN gene.geneSymbol
Client received query exception: Transaction was asked to abort because of transaction timeout.


Query execution failed, writing 'failed' to ./outputs/1.csv


Failed query: MATCH (symptom:Symptom {commonName: "Auditory Perceptual Disorders"}) - [*1..] -> (gene:Gene {commonName: "-"}) RETURN symptom.commonName
Client received query exception: Transaction was asked to abort because of transaction timeout.


Query execution failed, writing 'failed' to ./outputs/6.csv
Current Batch processed.
Current Batch processed.


Failed query: MATCH (disease:Disease {commonName: "Lewy Body Variant of Alzheimer Disease"}) - [*3..7] -> (pathway:Pathway {commonName: "RUNX3 regulates NOTCH signaling"}) WHERE disease.commonName = "Non-Alzheimer's dementia (e.g., Lewy body dementia, vascular or multi-infarct dementia; mixed dementia; frontotemporal dementia such as Pick's disease; and dementia related to stroke, Parkinson's or Creutzfeldt-Jakob diseases)" RETURN pathway.commonName
Client received query exception: Transaction was asked to abort because of transaction timeout.


Query execution failed, writing 'failed' to ./outputs/10.csv
Current Batch processed.


In [42]:
if __name__ == "__main__":
    batch_folder = './input_batch'
    with Pool(processes=os.cpu_count()) as pool:
        batch_files = [os.path.join(batch_folder, f) for f in os.listdir(batch_folder) if f.startswith('batch')]
        pool.starmap(process_batch, [(batch_file, i) for i, batch_file in enumerate(batch_files)])


Process SpawnPoolWorker-98:
Process SpawnPoolWorker-101:
Process SpawnPoolWorker-96:
Process SpawnPoolWorker-102:
Process SpawnPoolWorker-100:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/yufeimeng/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/yufeimeng/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/yufeimeng/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/yufeimeng/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/yufeimeng/opt/anaconda3/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/yufeimeng/opt/anaconda3/lib/python3.9/multiprocessing/queues.py", line 368, in get
    return _ForkingPickler.loa

KeyboardInterrupt: 

In [30]:
if __name__ == "__main__":
    batch_folder = './input_batch'
    suc_ind, fail_ind = process_batches(batch_folder)
    print(f"Successful indices: {suc_ind}")
    print(f"Failed indices: {fail_ind}")

Current Batch processed.
Current Batch processed.
Current Batch processed.
Current Batch processed.


KeyboardInterrupt: 

In [None]:
#changed result at index 5 to be something valid to test later steps
merge_csv_files('aggregates', 'merged_output.csv')


Merged CSV saved as merged_output.csv


In [None]:
#TRY TO CONNECT TO PREVIOUS QUERY
def get_indices(csv_dir):
    # csv_dir_to_examine = 'merged_output.csv'
    merged = pd.read_csv(csv_dir)
    mask = (merged['results_returned'] != 'no_result') & (merged['results_returned'] != 'failed')
    # mask = merged['results_returned'] != 'no_result' or 'failed'
    failed_mask = merged['results_returned'] == 'failed'
    successful_indices = merged.index[mask].tolist()
    failed_indices = merged.index[failed_mask].tolist()
    return successful_indices, failed_indices

In [None]:
#test if failed query can be outputed
!chmod +x test_query_run.sh
!./test_query_run.sh ./input_batch/batch0.txt

Failed query: MATCH (bodypart:BodyPart {commonName: "scrotum"}) - [:CHEMICALBINDSGENE] -> (drug:Drug {commonName: "Hydroxyethylcysteine"}) RETURN r
Client received query exception: Unbound variable: r.
Query execution failed, writing 'failed' to ./outputs/1.csv
Query 2 executed and output saved to ./outputs/2.csv
Query 3 executed and output saved to ./outputs/3.csv
Query 4 executed and output saved to ./outputs/4.csv
Query 5 executed and output saved to ./outputs/5.csv
Query 6 executed and output saved to ./outputs/6.csv
Query 7 executed and output saved to ./outputs/7.csv
Query 8 executed and output saved to ./outputs/8.csv
Query 9 executed and output saved to ./outputs/9.csv
Query 10 executed and output saved to ./outputs/10.csv
All queries processed.


In [None]:
folder_path = './outputs'
output_file = f'./aggregates/results0.csv'
aggregate_text_files(folder_path, output_file)

In [None]:
merge_csv_files('aggregates', 'test_merged_output.csv')

Merged CSV saved as test_merged_output.csv


In [None]:
suc_ind, fail_ind = get_indices('test_merged_output.csv')


In [None]:
print(suc_ind, fail_ind)

[1, 2, 15] [0]


### Trying mgclient with timeout feature for single query as a fatal feature

In [31]:
import mgclient
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
import threading
# conn = mgclient.connect(host='127.0.0.1', port=7687)
# cursor = conn.cursor()
# Define a function to connect to Memgraph and execute a single query
def execute_query(query):
    # Create thread-local storage for database connections
    if not hasattr(execute_query, "conn"):
        execute_query.conn = mgclient.connect(host='127.0.0.1', port=7687)
        execute_query.conn.autocommit = True
    try:
        cursor = execute_query.conn.cursor()
        cursor.execute(query)
        results = cursor.fetchall()
        return results
    except Exception as e:
        print(f"Failed to execute query {query}: {str(e)}")
        return None
    finally:
        cursor.close()

# def run_queries_multithreaded(query_list, max_workers=5):
#     # Create a ThreadPoolExecutor to manage multiple threads
#     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
#         # Map futures to their corresponding query indices
#         future_to_index = {executor.submit(execute_query, query): i for i, query in enumerate(query_list)}

#         # Create a list to store the indices of successful queries
#         successful_indices = []
#         failed_indices = []

#         # Collect results as they complete
#         for future in concurrent.futures.as_completed(future_to_index):
#             index = future_to_index[future]
#             try:
#                 result = future.result()
#                 if result:
#                     print(f"Query at index {index} returned results.")
#                     successful_indices.append(index)
#                 else:
#                     print(f"Query at index {index} returned no results.")
#             except Exception as e:
#                 print(f"Query execution failed for index {index}: {str(e)}")
#                 failed_indices.append(index)

#         return successful_indices, failed_indices

def run_queries_multithreaded(query_list, max_workers=5, timeout=10):
    # Create a ThreadPoolExecutor to manage multiple threads
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map futures to their corresponding query indices
        future_to_index = {executor.submit(execute_query, query): i for i, query in enumerate(query_list)}

        # Create a list to store the indices of successful and failed queries
        successful_indices = []
        failed_indices = []

        # Collect results as they complete
        for future in concurrent.futures.as_completed(future_to_index, timeout=timeout):
            index = future_to_index[future]
            try:
                result = future.result(timeout=timeout)  # Apply timeout for each query result collection
                if result:
                    print(f"Query at index {index} returned results.")
                    successful_indices.append(index)
                # else:
                    # print(f"Query at index {index} returned no results.")
                    # failed_indices.append(index)  # Treat no results as failed (or adjust as necessary)
            except concurrent.futures.TimeoutError:
                print(f"Query at index {index} timed out.")
                failed_indices.append(index)
            except Exception as e:
                print(f"Query execution failed for index {index}: {str(e)}")
                failed_indices.append(index)
        return successful_indices, failed_indices


In [32]:
query_list

['MATCH (drug:Drug {commonName: "Indium In-111 oxyquinoline"}) - [*..5] -> (pathway:Pathway {commonName: "Human Complement System"}) RETURN pathway.commonName;',
 'MATCH (molecularfunction:MolecularFunction {commonName: "thyroid hormone receptor coactivator activity"}) - [*1] -> (drug:Drug {commonName: "Bean"}) RETURN drug.commonName;',
 'MATCH (symptom:Symptom {commonName: "Overweight"}) - [*..3] -> (symptom1:Symptom {commonName: "Vocal Cord Paralysis"}) WHERE symptom.commonName = "Angina, Unstable" RETURN symptom1.commonName;',
 'MATCH (disease:Disease {commonName: "Alzheimer disease, familial, type 3"}) - [*1] -> (drugclass:DrugClass {commonName: "Osmotic Activity"}) WHERE drugclass.commonName = "Nucleic Acid Synthesis Inhibitors" RETURN drugclass.commonName;',
 'MATCH (cellularcomponent:CellularComponent {commonName: "cytosolic large ribosomal subunit"}) - [:DRUGCAUSESEFFECT] -> (pathway:Pathway {commonName: "NADE modulates death signalling"}) RETURN pathway.commonName;',
 'MATCH (

In [None]:
# Example usage
if __name__ == "__main__":
    # queries = [
    #     "MATCH (n) RETURN n LIMIT 1",
    #     "MATCH (n:Person) WHERE n.name = 'Alice' RETURN n",
    #     "MATCH (n) RETURN n"
    # ]
    
    successful_query_indices, failed_indices = run_queries_multithreaded(query_list)
    print("Successful query indices:", successful_query_indices)
    print("failed_indices:", failed_indices)


KeyboardInterrupt: 

### Single_query_run bash file archive

### Archive-Multithreading/For loop single threading

In [None]:
 # def run_queries_multithreaded(self,query_list, execution, max_workers=5):
    #     # Create a ThreadPoolExecutor to manage multiple threads
    #     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    #         # Map futures to their corresponding query indices
    #         future_to_index = {executor.submit(execution, query): i for i, query in enumerate(query_list)}

    #         # Create a list to store the indices of successful queries
    #         successful_indices = []
    #         failed_indices = []

    #         # Collect results as they complete
    #         for future in concurrent.futures.as_completed(future_to_index):
    #             index = future_to_index[future]
    #             try:
    #                 result = future.result()
    #                 if result:
    #                     print(f"Query at index {index} returned results.")
    #                     successful_indices.append(index)
    #                 else:
    #                     print(f"Query at index {index} returned no results.")
    #             except Exception as e:
    #                 print(f"Query execution failed for index {index}: {str(e)}")
    #                 failed_indices.append(index)

    #         return successful_indices, failed_indices

    # Function to run multiple queries using multithreading
    # def run_queries_multithreaded(self, tree_population, max_workers=10):
    #     # Using ThreadPoolExecutor to manage multiple threads
    #     with ThreadPoolExecutor(max_workers=max_workers) as executor:
    #         query_list = self.population_to_query_list(tree_population)
    #         # Submit all queries to the executor
    #         future_to_query = {executor.submit(self.execute_query, query): query for query in query_list}
    #         # Collect results as they complete
    #         for future in concurrent.futures.as_completed(future_to_query):
    #             query = future_to_query[future]
    #             try:
    #                 result = future.result()
    #                 if result:
    #                     print(f"Query: {query} returned results")
    #                     self.valid_queries.append(query)
                        
    #                 else:
                        
    #                     # print(f"Query: {query} returned no results")
    #             except Exception as e:
                    
                    # print(f"Query execution failed for {query}: {str(e)}")

    # def run_queries_multithreaded(self, query_list, max_workers=5):
    #     """ 
    #     Us multithreading to run queries on the Memgraph client.
    #     Returns indices corresponding to successful queries and garbage queries for later use.
    #     """
    #     # Create a ThreadPoolExecutor to manage multiple threads
    #     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    #         # Map futures to their corresponding query indices
 #         future_to_index = {executor.submit(self.database_conn.execute_query, query): i for i, query in enumerate(query_list)}
    #         # Create a list to store the indices of successful queries
    #         successful_indices = []
    #         failed_indices = []
    #         # Collect results as they complete
    #         for future in concurrent.futures.as_completed(future_to_index):
    #             index = future_to_index[future]
    #             try:
    #                 result = future.result()
    #                 if result:
    #                     print(f"Query at index {index} returned results.")
    #                     successful_indices.append(index)
    #                 else:
    #                     # garbage_indices.append(index)
    #                     print(f"Query at index {index} returned no results.")
    #             except Exception as e:
    #                 print(f"Query execution failed for index {index}: {str(e)}")
    #                 failed_indices.append(index)
    #         return successful_indices, failed_indices
        
    # def run_queries_multithreaded(self, tree_population, max_workers=5):
    #     """ 
    #     Us multithreading to run queries on the Memgraph client.
    #     Returns indices corresponding to successful queries and garbage queries for later use.
    #     """
    #     query_list = self.population_to_query_list(tree_population)
    #     # Create a ThreadPoolExecutor to manage multiple threads
    #     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    #         future_to_index = {executor.submit(self.database_conn.execute_query, query): i for i, query in enumerate(query_list)}
    #         return self.process_futures(future_to_index)
            # # Map futures to their corresponding query indices
            # future_to_index = {executor.submit(self.database_conn.execute_query, query): i for i, query in enumerate(query_list)}
            # # Create a list to store the indices of successful queries
            # successful_indices = []
            # failed_indices = []
            # # Collect results as they complete
            # for future in concurrent.futures.as_completed(future_to_index):
            #     index = future_to_index[future]
            #     try:
            #         result = future.result()
            #         if result:
            #             print(f"Query at index {index} returned results.")
            #             successful_indices.append(index)
            #         else:
            #             print(f"Query at index {index} returned no results.")
            #     except Exception as e:
            #         print(f"Query execution failed for index {index}: {str(e)}")
            #         failed_indices.append(index)
            # return successful_indices, failed_indices
        
    # def process_futures(self, future_to_index):
    #     successful_indices = []
    #     failed_indices = []
    #     for future in concurrent.futures.as_completed(future_to_index):
    #         index = future_to_index[future]
    #         try:
    #             result = future.result(timeout=10)  # Add timeout to avoid hanging indefinitely
    #             if result:
    #                 successful_indices.append(index)
    #             # else:
    #             #     failed_indices.append(index)
    #         except Exception as e:
    #             print(f"Query execution failed for index {index}: {str(e)}")
    #             failed_indices.append(index)

   
    #     return successful_indices, failed_indices

In [None]:
test = query_list[:5]
test

['MATCH (drugclass:DrugClass {commonName: "Hematologic Activity Alteration"}) - [:GENEREGULATESGENE] -> (gene:Gene {commonName: "-"}) RETURN drugclass.commonName;',
 'MATCH (biologicalprocess:BiologicalProcess {commonName: "regulation of bone trabecula formation"}) - [:GENEASSOCIATESWITHDISEASE] -> (pathway:Pathway {commonName: "Methotrexate Pathway, Pharmacokinetics"}) WHERE pathway.commonName = "Esomeprazole Action Pathway" RETURN biologicalprocess.commonName;',
 'MATCH (drugclass:DrugClass {commonName: "Genitourinary Arterial Vasodilation"}) - [:GENECOVARIESWITHGENE] -> (symptom:Symptom {commonName: "Polydipsia, Psychogenic"}) WHERE drugclass.commonName = "Increased Blood Pressure" RETURN symptom.commonName;',
 'MATCH (biologicalprocess:BiologicalProcess {commonName: "blastoderm segmentation"}) - [*7..10] -> (disease:Disease {commonName: "Familial Alzheimer-like prion disease"}) RETURN biologicalprocess.commonName;',
 'MATCH (gene:Gene {commonName: "-"}) - [*..18] -> (drugclass:Drug

In [None]:
def split_query_list(query_list, size=5):
    """
    Splits a list of queries into sublists, each containing a specific number of queries.

    Parameters:
    - query_list (list): The original list of queries.
    - size (int): The number of queries each sublist should contain.

    Returns:
    - list of lists: A list where each element is a sublist containing 'size' number of queries.
    """
    return [query_list[i:i + size] for i in range(0, len(query_list), size)]

splits = split_query_list(query_list, size=5)

In [None]:
successful_indices = []
failed_indices = []

for sublist in splits:
    for index, query in enumerate(sublist):
    # print(index)
        result = execute_query(query)
        if result:
            if result == 'NoResult':
                pass
            else:
                successful_indices.append(index)
        else:
            failed_indices.append(index)


In [None]:
# try without multithreading
successful_indices = []
failed_indices = []


for index, query in enumerate(test):
    # print(index)
    result = execute_query(query)
    if result:
        if result == 'NoResult':
            pass
        else:
            successful_indices.append(index)
    else:
        failed_indices.append(index)

    # try:
    #     result = execute_query(query)
    #     if result:
    #         print(f"Query at index {index} returned results.")
    #         successful_indices.append(index)
    #     else:
    #         print(f"Query at index {index} returned no results.")
    # except Exception as e:
    #     print(f"Query execution failed for index {index}: {str(e)}")
    #     failed_indices.append(index)

In [None]:
successful_indices
failed_indices

[]

In [None]:
if __name__ == "__main__":
    successful_query_indices, failed_indices = run_queries_multithreaded(query_list)
    print("Successful query indices:", successful_query_indices)
    print("failed_indices:", failed_indices)

Query at index 2 returned results.
Query at index 3 returned results.
Query at index 4 returned results.
Query at index 0 returned results.
Query at index 1 returned results.
Query at index 5 returned results.
Query at index 7 returned results.
Query at index 8 returned results.
Query at index 9 returned results.
Query at index 6 returned results.
Query at index 11 returned results.
Query at index 12 returned results.
Query at index 15 returned results.
Query at index 14 returned results.
Query at index 16 returned results.
Query at index 10 returned results.
Query at index 13 returned results.
Query at index 17 returned results.
Query at index 18 returned results.
Query at index 19 returned results.
Successful query indices: [2, 3, 4, 0, 1, 5, 7, 8, 9, 6, 11, 12, 15, 14, 16, 10, 13, 17, 18, 19]
failed_indices: []


----------

In [None]:
#TEST if test_3 can run
import mgclient

def execute_queries_as_batch(conn, queries):
    cursor = conn.cursor()
    try:
        # cursor.execute("BEGIN;")
        for query in queries:
            cursor.execute(query)
        # cursor.execute("COMMIT;")
    except Exception as e:
        print("An error occurred:", str(e))
        # cursor.execute("ROLLBACK;")  # Rollback in case of an error
    finally:
        cursor.close()  # Ensure the cursor is closed after operation

# Connection setup
conn = mgclient.connect(host='127.0.0.1', port=7687)
conn.autocommit = False  # Turn off autocommit for manual transaction management

# # List of queries to execute as part of a single batch
# queries = [
#     "CREATE (n:Person {name: 'Alice'})",
#     "MATCH (n:Person) WHERE n.name = 'Alice' RETURN n",
#     "CREATE (n:Person {name: 'Bob'})"
# ]

# Execute as batch
execute_queries_as_batch(conn, test_list)
conn.close()


NameError: name 'test_list' is not defined

In [None]:
#TRYING MULTITHREADING
import mgclient
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
import threading
# conn = mgclient.connect(host='127.0.0.1', port=7687)
# cursor = conn.cursor()
# Define a function to connect to Memgraph and execute a single query
def execute_query(query):
    # Create thread-local storage for database connections
    if not hasattr(execute_query, "conn"):
        execute_query.conn = mgclient.connect(host='127.0.0.1', port=7687)
        execute_query.conn.autocommit = True
    try:
        cursor = execute_query.conn.cursor()
        cursor.execute(query)
        results = cursor.fetchall()
        return results
    except Exception as e:
        print(f"Failed to execute query {query}: {str(e)}")
        return None
    finally:
        cursor.close()

def run_queries_multithreaded(query_list, max_workers=5):
    # Create a ThreadPoolExecutor to manage multiple threads
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map futures to their corresponding query indices
        future_to_index = {executor.submit(execute_query, query): i for i, query in enumerate(query_list)}

        # Create a list to store the indices of successful queries
        successful_indices = []
        failed_indices = []

        # Collect results as they complete
        for future in concurrent.futures.as_completed(future_to_index):
            index = future_to_index[future]
            try:
                result = future.result()
                if result:
                    print(f"Query at index {index} returned results.")
                    successful_indices.append(index)
                else:
                    print(f"Query at index {index} returned no results.")
            except Exception as e:
                print(f"Query execution failed for index {index}: {str(e)}")
                failed_indices.append(index)

        return successful_indices, failed_indices

# Example usage
if __name__ == "__main__":
    # queries = [
    #     "MATCH (n) RETURN n LIMIT 1",
    #     "MATCH (n:Person) WHERE n.name = 'Alice' RETURN n",
    #     "MATCH (n) RETURN n"
    # ]
    successful_query_indices, failed_indices = run_queries_multithreaded(test_list)
    print("Successful query indices:", successful_query_indices)
    print("failed_indices:", failed_indices)

# # Example usage
# if __name__ == "__main__":
#     # Example list of queries
#     queries = [
#         "MATCH (n) RETURN n LIMIT 1",
#         "MATCH (n:Person) WHERE n.name = 'Alice' RETURN n",
#         '''MATCH (cellularcomponent:CellularComponent {commonName: "cortical cytoskeleton"}) - [:CHEMICALDECREASESEXPRESSION] -> (cellularcomponent1:CellularComponent {commonName: "ubiquitin conjugating enzyme complex"}) RETURN cellularcomponent.commonName, cellularcomponent1.commonName;'''
#         # Add more queries as needed
#     ]

    # run_queries_multithreaded(queries, max_workers=5)


NameError: name 'test_list' is not defined

In [None]:
ea = EvolutionaryAlgorithm(qm=qmTest, depth_manager=dmTest, population_size=10,max_depth=10, max_generation=3)
# print(ea.depth_manager._max_depth)
print("generation:",ea.generation)
ea.initialize_population() 
ea.evaluate_population()
print("generation:",ea.generation)
print("Fitness Scores:", ea.fitness_scores)

parent_pairs = ea.select_parents(num_pairs=5,k=3) #assume 5 pairs of parents --> maintain 10 total in next gen
offspring_list = []
for parent1, parent2 in parent_pairs:
    # print("parent:",parent1.children[-1])
    # print(parent2.children[-1])
    offspring1, offspring2 = ea.one_point_crossover(parent1, parent2)
    # print("offspring:", offspring1.children[-1])
    # print(offspring2.children[-1])
    offspring_list.extend([offspring1, offspring2])

ea.tree_population = offspring_list
ea.evaluate_population()
print("generation:",ea.generation)
print("Fitness Scores:", ea.fitness_scores)
# ea.reset_ea()


# print(ea.depth_manager.depth_record)
result = ea.output_top_queries(top_n=10)

In [None]:
output_file_path = 'output_queries.txt'
with open(output_file_path, 'w') as file:
    for tuple in result:
        tree = tuple[0]
        querystr = tree.to_querystr()
        file.write(querystr + '\n')
print("All queries have been written to", output_file_path)


All queries have been written to output_queries.txt


In [19]:
output_file_path = 'output_queries.txt'
with open(output_file_path, 'w') as file:
    counter = 0
    for query in query_list:
        if counter >= 10:
            break  # Stop writing if the counter reaches 10
        file.write(query + '\n')  # Write the query to the file
        counter += 1 
print("10 queries have been written to", output_file_path)

10 queries have been written to output_queries.txt


In [22]:
!chmod +x single_query_run.sh
!./single_query_run.sh

Query 1 executed and output saved to ./outputs/query1.csv
Query 2 executed and output saved to ./outputs/query2.csv
Query 3 executed and output saved to ./outputs/query3.csv
Query 4 executed and output saved to ./outputs/query4.csv
Query 5 executed and output saved to ./outputs/query5.csv
Query 6 executed and output saved to ./outputs/query6.csv
Query 7 executed and output saved to ./outputs/query7.csv
Query 8 executed and output saved to ./outputs/query8.csv
Query 9 executed and output saved to ./outputs/query9.csv
Query 10 executed and output saved to ./outputs/query10.csv
All queries processed.


In [17]:
import pandas as pd
import os

def aggregate_text_files(folder_path, output_file):
    # Initialize a list to store the aggregated data
    aggregated_data = []

    # List all files in the specified folder
    file_list = [f for f in os.listdir(folder_path) if f.endswith('.csv')]  # or change to '.txt' if appropriate

    for file_name in file_list:
        file_path = os.path.join(folder_path, file_name)
        
        # Check if the file is empty
        if os.stat(file_path).st_size == 0:
            aggregated_data.append({"variables_returned": "no_result", "results_returned": "no_result"})
            continue
        
        # Open and read the file as a plain text file
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
        
        # Check again for empty content after reading lines
        if not lines:
            aggregated_data.append({"variables_returned": "no_result", "results_returned": "no_result"})
            continue

        # First line as 'variables_returned'
        variables_returned = lines[0].strip()
        # Join the rest of the lines for 'results_returned'
        if len(lines) > 1:
            results_returned = '; '.join([line.strip().replace('\n', ';') for line in lines[1:]])
        else:
            results_returned = "no_result"
        
        aggregated_data.append({
            "variables_returned": variables_returned,
            "results_returned": results_returned
        })

    # Create a DataFrame from the aggregated data
    result_df = pd.DataFrame(aggregated_data)
    # Save the aggregated data to a CSV file
    result_df.to_csv(output_file, index=False)

# Example usage
folder_path = './outputs'
output_file = './aggregated_results.csv'
aggregate_text_files(folder_path, output_file)


In [18]:
results = pd.read_csv("aggregated_results.csv")
results

Unnamed: 0,variables_returned,results_returned
0,no_result,no_result
1,no_result,no_result
2,no_result,no_result
3,no_result,no_result
4,no_result,no_result
5,no_result,no_result
6,no_result,no_result
7,no_result,no_result
8,no_result,no_result
9,no_result,no_result


In [None]:
results["results_returned"][1]

'"(:Gene {commonName: ""TATA-box binding protein"", geneSymbol: ""TBP"", nodeID: ""1"", typeOfGene: ""protein-coding"", uri: ""http://jdr.bio/ontologies/alzkb.owl#gene_tbp"", xrefEnsembl: ""ENSG00000112592"", xrefHGNC: ""11588"", xrefNcbiGene: 6908, xrefOMIM: ""600075""})";"[:GENEPARTICIPATESINBIOLOGICALPROCESS]"; "(:Gene {commonName: ""TATA-box binding protein"", geneSymbol: ""TBP"", nodeID: ""1"", typeOfGene: ""protein-coding"", uri: ""http://jdr.bio/ontologies/alzkb.owl#gene_tbp"", xrefEnsembl: ""ENSG00000112592"", xrefHGNC: ""11588"", xrefNcbiGene: 6908, xrefOMIM: ""600075""})";"[:GENEPARTICIPATESINBIOLOGICALPROCESS]"'

In [None]:
#TRING mgclient
import mgclient
conn = mgclient.connect(host='127.0.0.1', port=7687)
cursor = conn.cursor()

In [None]:
def test_connection(conn):
    if conn is not None:
        try:
            cursor = conn.cursor()
            # Example test query to fetch 10 nodes
            cursor.execute("MATCH (n) RETURN n LIMIT 10")
            results = cursor.fetchall()
            if results:
                print("Connection test successful, received data:", results)
            else:
                print("Connection test successful, no data received")
        except Exception as e:
            print(f"Failed to execute test query: {e}")
        finally:
            cursor.close()


In [None]:
test_connection(conn)

Connection test successful, received data: [(<mgclient.Node(id=0, labels={'Drug'}, properties={'commonName': 'Basiliximab', 'nodeID': '263', 'uri': 'http://jdr.bio/ontologies/alzkb.owl#drug_db00074', 'xrefCasRN': '179045-86-4', 'xrefDrugbank': 'DB00074'}) at 0x7feffadcb150>,), (<mgclient.Node(id=1, labels={'Drug'}, properties={'commonName': 'Muromonab', 'nodeID': '264', 'uri': 'http://jdr.bio/ontologies/alzkb.owl#drug_db00075', 'xrefCasRN': '140608-64-6', 'xrefDrugbank': 'DB00075'}) at 0x7ff00a34e2d0>,), (<mgclient.Node(id=2, labels={'Drug'}, properties={'commonName': 'Trastuzumab', 'nodeID': '265', 'uri': 'http://jdr.bio/ontologies/alzkb.owl#drug_db00072', 'xrefCasRN': '180288-69-1', 'xrefDrugbank': 'DB00072'}) at 0x7feffb7d3c00>,), (<mgclient.Node(id=3, labels={'Drug'}, properties={'commonName': 'Rituximab', 'nodeID': '266', 'uri': 'http://jdr.bio/ontologies/alzkb.owl#drug_db00073', 'xrefCasRN': '174722-31-7', 'xrefDrugbank': 'DB00073'}) at 0x7feffb7d3810>,), (<mgclient.Node(id=4, la

In [None]:


    def initialize_population(self):
        """ Initializes the population with random depth queries that return results. """
        # Connect to Memgraph database
        conn = connect(host='localhost', port=7687)
        cursor = conn.cursor()
        
        # Generate queries until the population size is satisfied
        while len(self.tree_population) < self.population_size:
            tree, query = self.qm.generate_query()
            if self.is_valid_query(cursor, query):
                self.tree_population.append(tree)
                self.str_population.append(query)
            self.qm.reset_per_gen()

        # Close the database connection
        cursor.close()
        conn.close()

    def is_valid_query(self, cursor, query):
        """ Execute the query and check if it returns any results. """
        try:
            cursor.execute(query)
            results = cursor.fetchall()
            return len(results) > 0  # Return True if there are results
        except Exception as e:
            print(f"Query failed: {e}")
            return False  # Query failed or returned no results

# Example usage:
# Assuming 'qm' is an instance of QueryManager initialized properly
ea = EvolutionaryAlgorithm(qm)
ea.initialize_population()
