### Next steps
- make sure the bad_commands are complete (as for now ok)
- get right nodes, edges, properties from alzkb (done)
- write a function to output filtered, updated queries into a dataframe/csv (done)
- write a function to quest neo4j as second filter
- Try with different depth to see after filter + query, what's the filter rate for each depth range

### Import necessary packages; define customized nodes, relationships, properties

In [58]:
import re
import random
import json

# Open and load the graph schema json file
with open('schema.json', 'r',encoding='utf-8-sig') as file:
    schema = json.load(file)
    
# Extract nodes and edges from the schema
nodes = [node['labels'][0] for node in schema[0]['nodes']]
edges = [relationship['type'] for relationship in schema[0]['relationships']]

# Example input
relationships = ['changerel1', 'changerel2', 'changerel3']
properties = ['changeprop1','changeprop2','changeprop3']
property_dict = {'GENEINPATHWAY':'geneinpathwayprop',
                 'BiologicalProcess': ['bioprocessprop1', 'bioprocessprop2'],
                 'Gene':['geneprop1','geneprop2']}
bad_commands = ['CREATE','DELETE','MERGE','REMOVE','SET','FINISH','FOREACH','LOAD CSV','SKIP',
                 'LIT\d+'] #Need to double check to see if this is complete
#'UNWIND'

In [59]:
#def load_node_properties( !!can do this in the future

#load properties for each type to form a dictionary
node_properties = {}

#load shcema_properties to process and generate properties dictionary
with open('schema_properties.json', 'r',encoding='utf-8-sig') as file:
    schema_props = json.load(file)

label = ''
for i in range(len(schema_props)):
    #print(i)
    label = ''
    #print(schema_props[i])
    current = schema_props[i]
    label = current['label']
    props = current['sample_node']['properties'].keys()
    props = list(props)
    node_properties[label] = props

node_properties
        

{'Gene': ['typeOfGene',
  'commonName',
  'xrefOMIM',
  'xrefHGNC',
  'xrefEnsembl',
  'geneSymbol',
  'uri',
  'xrefNcbiGene'],
 'DrugClass': ['commonName', 'xrefNciThesaurus', 'uri'],
 'Drug': ['commonName', 'xrefDrugbank', 'xrefCasRN', 'uri'],
 'Disease': ['commonName', 'xrefUmlsCUI', 'xrefDiseaseOntology', 'uri'],
 'Datatype': ['uri'],
 'Pathway': ['commonName', 'sourceDatabase', 'pathwayId', 'uri'],
 'BiologicalProcess': ['commonName', 'xrefGeneOntology', 'uri'],
 'MolecularFunction': ['commonName', 'xrefGeneOntology', 'uri'],
 'CellularComponent': ['commonName', 'xrefGeneOntology', 'uri'],
 'Symptom': ['commonName', 'xrefMeSH', 'uri'],
 'BodyPart': ['commonName', 'uri', 'xrefUberon']}

In [60]:
# For Alzkb practical use, just keep typeOfGene and commonName as properties
useful_properties = {'Gene': ['typeOfGene',
  'commonName'],
 'DrugClass': ['commonName'],
 'Drug': ['commonName'],
 'Disease': ['commonName'],
 #'Datatype': '',
 'Pathway': ['commonName'],
 'BiologicalProcess': ['commonName'],
 'MolecularFunction': ['commonName'],
 'CellularComponent': ['commonName'],
 'Symptom': ['commonName'],
 'BodyPart': ['commonName']}


### Define first filter function to filter out queries with modification functions

In [61]:
def filter(query, unwanted_commands):
    """
    Filter out queries that contain modification and/or other unwanted commands
    """
    pattern = r"\b(" + "|".join(unwanted_commands) + r")\b"
    
    # Use re.search to find any of the unwanted commands in the query
    if re.search(pattern, query, re.IGNORECASE):
        return True  # Unwanted command found, return True
    else:
        return False  # No unwanted command found, return False



### Define replacement functions to convert queries

In [62]:
def replace_properties(query, properties_dict):
    """
    Replace class properties in the content with random properties from a provided dictionary.
    If no matching class name is found in the dictionary, delete the ".prop".

    Args:
    content (str): The original content string containing Cypher-like queries.
    properties_dict (dict): A dictionary mapping class names to lists of new properties.

    Returns:
    str: The content with replaced properties.
    """
    def replacement_function(match):
        class_name = match.group(1)  # The class name from the regex match
        if class_name in properties_dict and properties_dict[class_name]:
            # Select a random new property from the list for the matched class
            random_property = random.choice(properties_dict[class_name])
            return f"{class_name}.{random_property}"
        else:
            # Also remove the .prop if no matches found
            return class_name

    # Pattern to find 'ClassName.propX' where ClassName is any of the keys in the dictionary
    # We capture the class name and the propX part to allow precise replacement
    pattern = rf"(\b{'|'.join(properties_dict.keys())})\.prop\d+"
    pattern_general = rf"(\w+)\.prop\d+"


    def conditional_replacement(match):
        full_match = match.group(0)
        class_name = match.group(1)

        # Check if the class name matches pattern1 (is in useful_properties)
        if re.fullmatch(pattern, full_match):
            # If it matches pattern1, return the match unchanged
            return full_match
        else:
            # If it does not match pattern1, remove '.propX'
            return class_name

    # Remove .prop from labels without properties, then update to customized properties for other labels
    removed_content = re.sub(pattern_general, conditional_replacement, query)
    updated_content = re.sub(pattern, replacement_function, removed_content)

    return updated_content
    

def replace_query(query, cust_rels, cust_props):
    # Function to replace 'reltype' with a random relationship
    def replace_reltype(match):
        return random.choice(cust_rels)
    # Replace each 'reltype' occurrence with a different random relationship
    query = re.sub(r"reltype\d+", replace_reltype, query)
    # Replace each 'prop' occurrence with a different random property
    query = replace_properties(query,cust_props)
    return query


#### Sample Replacement and Update

In [63]:
eg_file_path = './tests/test_5'
with open(eg_file_path, 'r') as file:
    content_5 = file.read()
    print(content_5)
print(replace_query(content_5,edges, useful_properties))

 MERGE (:label8{}) SET BODYPARTOVEREXPRESSESGENE.prop6 = GENEASSOCIATESWITHDISEASE MERGE path_0 = (node_6{})-[:reltype5]->() MERGE path_0 = () ; 
 MERGE (:label8{}) SET BODYPARTOVEREXPRESSESGENE = GENEASSOCIATESWITHDISEASE MERGE path_0 = (node_6{})-[:DISEASELOCALIZESTOANATOMY]->() MERGE path_0 = () ; 


In [64]:
#Test
pattern = rf"(\b{'|'.join(useful_properties.keys())})\.prop\d+"
pattern2 = rf"(\w+)\.prop\d+"
print(content_5)
sub = re.sub(pattern, "repl", content_5)
print (sub)
sub2 = re.sub(pattern2, "repl", content_5)
print (sub2)

matches_pattern1 = set(re.findall(pattern, content_5))
matches_pattern2 = set(re.findall(pattern2, content_5))

# Subtract matches of pattern1 from matches of pattern2
unique_pattern2_matches = matches_pattern2 - matches_pattern1

# Print the unique matches to pattern2
print("Matches for pattern2 but not pattern1:", unique_pattern2_matches)

 MERGE (:label8{}) SET BODYPARTOVEREXPRESSESGENE.prop6 = GENEASSOCIATESWITHDISEASE MERGE path_0 = (node_6{})-[:reltype5]->() MERGE path_0 = () ; 
 MERGE (:label8{}) SET BODYPARTOVEREXPRESSESGENE.prop6 = GENEASSOCIATESWITHDISEASE MERGE path_0 = (node_6{})-[:reltype5]->() MERGE path_0 = () ; 
 MERGE (:label8{}) SET repl = GENEASSOCIATESWITHDISEASE MERGE path_0 = (node_6{})-[:reltype5]->() MERGE path_0 = () ; 
Matches for pattern2 but not pattern1: {'BODYPARTOVEREXPRESSESGENE'}


### Automate & Process all files

In [65]:
"""
Read each generated query file with primary filtering and replacement.
Output: updated_query filtered and replaced with customized nodes/edges/properties
"""
import os
#import pandas as pd

filtered_query = []
updated_query = []


# Set the path to the directory containing the files
tests_path = './tests'

# Loop through each file in the directory
for filename in os.listdir(tests_path):
    # Check if the filename follows the expected pattern
    if filename.startswith('test_'):
        # Create the full path to the file
        file_path = os.path.join(tests_path, filename)
        #print(os.path.isfile(file_path))
        
    # Open and read the file
    with open(file_path, 'r') as file:
        current_query = file.read()
        # print(content, filter(content,bad_commands))
        # print("\n---\n")
        if filter(current_query,bad_commands)==False: #when the query doesn't contain unwanted commands
            filtered_query += [current_query]
            
#filtered_query_data = pd.concat(filtered_query)
#print(filtered_query)


for query in filtered_query:
    updated_query += [replace_query(query, edges, useful_properties)]
updated_query

[' UNWIND [0, 9] AS unwind_var RETURN DISTINCT * ORDER BY DRUGINCLASS ASC LIMIT 92 ; ',
 ' MATCH ()-[]->()<-->(), ({})<-[]-({})<-->({})-[]-({})<-->({}), path_0 = (), path_0 = () WHERE Gene.typeOfGene <> TRUE AND [ ] > FALSE AND DrugClass.commonName < FALSE AND CHEMICALBINDSGENE <> NULL RETURN DISTINCT GENEASSOCIATEDWITHCELLULARCOMPONENT ; ',
 ' MATCH path_0 = ()<-[*..]-({}), (), path_1 = () WHERE 3 <> [ ] AND [ ] < NULL RETURN DRUGTREATSDISEASE ORDER BY NULL LIMIT 7 ',
 ' RETURN DISTINCT CellularComponent ORDER BY GENEHASMOLECULARFUNCTION LIMIT 0 ',
 ' UNWIND [0, 9, 2] AS unwind_var  WITH DISTINCT Drug  var, GENEREGULATESGENE  var ORDER BY CellularComponent ASCENDING LIMIT 5 WHERE .9015 > .0 UNWIND [0] AS unwind_var UNWIND [0] AS unwind_var RETURN DISTINCT * , NULL  var , NULL , TRUE ORDER BY NULL ASCENDING ; ',
 ' MATCH path_0 = ({})--() WHERE Gene.commonName < FALSE UNWIND [0, 8] AS unwind_var RETURN DISTINCT * ; ',
 ' UNWIND [0] AS unwind_var RETURN * ORDER BY Drug.commonName, GENEI

### Output final filtered queries

In [66]:
# Will add second filter after Cypher works
import pandas as pd

final = pd.DataFrame(updated_query)
final
final.to_csv('workable_queries.csv',header=False, index=False)

### TODO: Second filter through Cypher Memgraph

In [67]:
import pandas as pd
import numpy as np
from gqlalchemy import Memgraph
from gqlalchemy import match
from gqlalchemy.query_builders.memgraph_query_builder import Operator

In [68]:
# Make a connection to the database
#memgraph = Memgraph(host='44.231.174.230', port=7687)
memgraph = Memgraph(host='alzkb.ai', port=7687)

In [69]:
results = memgraph.execute_and_fetch(query)

# Print the first member
print(list(results)[0]['result'])

DatabaseError: Unbound variable: Gene.

In [77]:
# Try
test = memgraph.execute_and_fetch(
"""
MATCH path_0 = ()-->(), path_0 = () RETURN path_0 LIMIT 8
""")
#print(test)
print(list(test)[0])
printtest[0])


{'path_0': <Path nodes=[<Node id=23 labels={'Drug'} properties={'nodeID': '327', 'uri': 'http://jdr.bio/ontologies/alzkb.owl#drug_db00093', 'xrefDrugbank': 'DB00093', 'xrefCasRN': '56-59-7', 'commonName': 'Felypressin'}>, <Node id=40026 labels={'Gene'} properties={'xrefHGNC': '895', 'xrefNcbiGene': 552, 'commonName': 'arginine vasopressin receptor 1A', 'nodeID': '3880', 'xrefEnsembl': 'ENSG00000166148', 'uri': 'http://jdr.bio/ontologies/alzkb.owl#gene_avpr1a', 'xrefOMIM': '600821', 'geneSymbol': 'AVPR1A', 'typeOfGene': 'protein-coding'}>] relationships=[<Relationship id=0 start_node_id=23 end_node_id=40026 nodes=(23, 40026) type=CHEMICALBINDSGENE properties={}>]>}




TypeError: 'generator' object is not subscriptable

In [None]:
query = memgraph.execute_and_fetch(
"""
RETURN DISTINCT * ORDER BY BiologicalProcess ASCENDING, DrugClass
""")
node_stats = pd.DataFrame(query)

In [None]:
updated_query

[' RETURN DISTINCT * ',
 " UNWIND [0] AS unwind_var MATCH ({})<--()-->()<--({})<--({})-->({}), path_0 = () RETURN * ORDER BY path_0.prop5 DESC, Gene.typeOfGene, 'LIT5' DESC, GENEREGULATESGENE ASC, BiologicalProcess, BodyPart.commonName DESCENDING ; "]

In [13]:
# Try with exported csv
import pandas as pd
node_props = pd.read_csv('memgraph-query-results-export.csv', index_col=None)
node_props.head()

Unnamed: 0,label,name
0,Drug,Basiliximab
1,Drug,Muromonab
2,Drug,Trastuzumab
3,Drug,Rituximab
4,Drug,Ibritumomab tiuxetan


KeyError: 'a'