# Load Ontological Concepts

This notebook reads the ontology file ('data/Ontology.xlsx') in Excel format and converts it into a nodes file ('data/Onto_nodes.tsv') and an edges file ('data/Onto_edges.tsv') in tab-separated values (TSV) format for loading into a Neo4j database.

## Import Libraries

In [91]:
import pandas as pd
# Display DataFrames in full: None removes the limit on shown columns, rows and cell width.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

import os 
import configparser
import csv

In [92]:
# install or import Neo4j GraphDataScience library
try: 
  from graphdatascience import GraphDataScience
  print('Successfully imported GraphDataScience')
except ModuleNotFoundError:
  !pip3 install graphdatascience
  from graphdatascience import GraphDataScience
  print('installed and imported GraphDataScience')

Successfully imported GraphDataScience


# Custom Functions

In [93]:
# function adapted from Neo4j GDS Demo Notebook (h/t Zach B.) mainly used for connecting to Neo4j database from Python
def read_neo4j_properties(NEO4J_PROPERTIES_FILE: str=None) -> tuple:
  '''Parses Neo4j database or Aura connection details from provided .ini filepath.
  Requirements:
    configparser

  Args:
    NEO4J_PROPERTIES_FILE: path to a .ini file

  Returns:
    A (HOST, USERNAME, PASSWORD) tuple of strings:
      HOST: link to Neo4j or Aura host
      USERNAME: login username
      PASSWORD: login password

  Raises:
    KeyError: the file exists but has no [NEO4J] section or lacks one of the
      HOST / USERNAME / PASSWORD keys.

  Note: The .ini file should use the following syntax
    [NEO4J]
    PASSWORD=<password>
    USERNAME=<username>
    HOST=<host uri>

  If no path is passed, or the path does not exist, the function prints and
  returns the defaults:
    HOST = 'neo4j://localhost'
    USERNAME = 'neo4j'
    PASSWORD = 'password'
  '''

  if NEO4J_PROPERTIES_FILE is not None and os.path.exists(NEO4J_PROPERTIES_FILE):
    # RawConfigParser does no %-interpolation, so passwords containing '%' are read verbatim
    config = configparser.RawConfigParser()
    config.read(NEO4J_PROPERTIES_FILE)
    HOST = config['NEO4J']['HOST']
    USERNAME = config['NEO4J']['USERNAME']
    PASSWORD = config['NEO4J']['PASSWORD']
    # credentials read from the file are deliberately not echoed
    print('Using HOST, USERNAME, PASSWORD from .ini file')
  else:
    print('Could not find database properties file, using defaults:')
    HOST = 'neo4j://localhost'
    USERNAME = 'neo4j'
    PASSWORD = 'password'
    print(f'HOST: {HOST} \nUSERNAME: {USERNAME} \nPASSWORD: {PASSWORD}')

  return HOST, USERNAME, PASSWORD

In [94]:
# function to convert ontology in Excel format into Onto_nodes.tsv and Onto_edges.tsv
def onto_excel_to_neo4j(onto_excel_file_path, onto_nodes_file_path, onto_edges_file_path):
    '''Converts an ontology spreadsheet into node and edge TSV files for Neo4j.

    Every spreadsheet row describes one path through a class hierarchy of up to
    four levels. Each non-blank class becomes a node line (id, name). Each
    non-blank class at level 2-4 also becomes an edge line pointing from that
    class to the class one level up in the same row. The relation "is_a" is
    written as SUB_CLASS_OF; any other relation is written upper-cased.
    Duplicate node and edge lines are written once, in first-seen order.

    Args:
      onto_excel_file_path: path of the Excel workbook to read. Expected columns:
        Level_<n>_Class and Level_<n>_Class_Id for n = 1..4, plus
        Level_1_2_Relation, Level_2_3_Relation and Level_3_4_Relation.
      onto_nodes_file_path: path of the nodes file to write (header: id, name).
      onto_edges_file_path: path of the edges file to write
        (header: source, relation, target).

    Raises:
      KeyError: a required column is missing from the spreadsheet.
    '''
    # Blank cells become '' so that "class is absent" can be tested by truthiness.
    df = pd.read_excel(onto_excel_file_path).fillna('')

    # dicts are used as insertion-ordered sets: O(1) de-duplication, stable output order
    nodes = {}
    edges = {}

    for _, row in df.iterrows():
        for level in range(1, 5):
            class_name = row[f'Level_{level}_Class']
            class_id = row[f'Level_{level}_Class_Id']
            if not class_name:
                continue

            nodes[f'{class_id}\t{class_name}\n'] = None

            if level == 1:
                # top-level classes have no parent, hence no edge
                continue

            parent_id = row[f'Level_{level - 1}_Class_Id']
            relation = row[f'Level_{level - 1}_{level}_Relation']
            # NOTE(review): a blank relation or blank parent id is written through
            # unchanged, exactly as before - confirm whether such rows should be skipped.
            relation_type = 'SUB_CLASS_OF' if relation == 'is_a' else relation.upper()
            edges[f'{class_id}\t{relation_type}\t{parent_id}\n'] = None

    # Written as UTF-8 explicitly: pandas.read_csv assumes UTF-8 when these files are
    # read back, whereas open() would otherwise use the platform's locale encoding.
    with open(onto_nodes_file_path, 'w', encoding='utf-8') as nodes_file:
        nodes_file.write('id\tname\n')
        nodes_file.writelines(nodes)

    with open(onto_edges_file_path, 'w', encoding='utf-8') as edges_file:
        edges_file.write('source\trelation\ttarget\n')
        edges_file.writelines(edges)

In [95]:
# Generate the nodes and edges TSV files from the Excel ontology, ready for loading into Neo4j
onto_excel_to_neo4j(
    onto_excel_file_path='data/Ontology.xlsx',
    onto_nodes_file_path='data/Onto_nodes.tsv',
    onto_edges_file_path='data/Onto_edges.tsv',
)

# Connect to Neo4j DB
It is recommended to store authentication credentials in a separate file and read them into the notebook as variables. This code assumes the file is stored in a local `auth` directory.

In [96]:
# path of the local .ini file that holds the Neo4j connection details
NEO4J_PROPERTIES_FILE = 'auth/immerse_kg_auth.ini'
# unpack host, username and password (defaults are returned if the file is absent)
HOST, USERNAME, PASSWORD = read_neo4j_properties(NEO4J_PROPERTIES_FILE)

Using HOST, USERNAME, PASSWORD from .ini file


In [None]:
# Open a GraphDataScience client against HOST using basic (username, password) auth.
# aura_ds=False: the target is not treated as an AuraDS instance.
gds = GraphDataScience(HOST, auth=(USERNAME, PASSWORD), aura_ds=False)

In [98]:
# Confirm the connection works by asking for the GDS version (displayed as the cell output)
gds.version()

'2.5.0'

# Clean up ontological concepts from the database

In [99]:
# Run this cell if loading the ontological concepts failed part-way, or to reload them from scratch:
# it deletes every OntologicalConcept node together with all relationships attached to it.
gds.run_cypher('''
                MATCH (n:OntologicalConcept)
                DETACH DELETE n
                ''')

In [100]:
# Read Onto_nodes.tsv (tab-separated, first line is the header); missing values become the string 'NA'
onto_nodes = pd.read_csv('data/Onto_nodes.tsv', sep='\t', header=0).fillna('NA') 
# Preview the first rows as a list of {column: value} records
onto_nodes.head().to_dict(orient='records')

[{'id': 'C1', 'name': 'metabolic pathways'},
 {'id': 'C2', 'name': 'cell cycle pathway G0/G1/M/G2'},
 {'id': 'C3', 'name': 'substrates'},
 {'id': 'C4', 'name': 'nucleotide or nucleoside'},
 {'id': 'C5', 'name': 'amino acid'}]

In [101]:
# Read Onto_edges.tsv (tab-separated, first line is the header); missing values become the string 'NA'
onto_edges = pd.read_csv('data/Onto_edges.tsv', sep='\t', header=0).fillna('NA') 
# Preview the first rows as a list of {column: value} records
onto_edges.head().to_dict(orient='records')

[{'source': 'C2', 'relation': 'SUB_CLASS_OF', 'target': 'C1'},
 {'source': 'C3', 'relation': 'IS_OBJECT_OF_CATALYST', 'target': 'C1'},
 {'source': 'C3', 'relation': 'SUB_CLASS_OF', 'target': 'C1'},
 {'source': 'C4', 'relation': 'SUB_CLASS_OF', 'target': 'C3'},
 {'source': 'C5', 'relation': 'SUB_CLASS_OF', 'target': 'C3'}]

In [102]:
# Create a uniqueness constraint on OntologicalConcept.id.
# IF NOT EXISTS makes the statement safe to re-run.
gds.run_cypher('''CREATE CONSTRAINT ontological_concept IF NOT EXISTS FOR (c:OntologicalConcept) REQUIRE c.id IS UNIQUE''')

In [103]:
# List the database constraints to verify that the ontological_concept constraint now exists.
gds.run_cypher('''SHOW CONSTRAINTS''')

Unnamed: 0,id,name,type,entityType,labelsOrTypes,properties,ownedIndex
0,2,ontological_concept,UNIQUENESS,NODE,[OntologicalConcept],[id],ontological_concept


# Load Ontology

In [104]:
# Load ontological concept nodes into Neo4j database.
# Nodes are merged on `id` alone (the property covered by the uniqueness constraint);
# `name` is set on create and refreshed on match. Merging on {id, name} would try to
# create a second node with the same id whenever a concept was renamed, which the
# constraint rejects.
gds.run_cypher('''
               UNWIND $node_list AS node
               CALL apoc.merge.node(["OntologicalConcept"], {id: node.id}, {name: node.name}, {name: node.name})
               YIELD node AS n
               RETURN n
              ''', {'node_list': onto_nodes.to_dict('records')})


Unnamed: 0,n
0,"(name, id)"
1,"(name, id)"
2,"(name, id)"
3,"(name, id)"
4,"(name, id)"
5,"(name, id)"
6,"(name, id)"
7,"(name, id)"
8,"(name, id)"
9,"(name, id)"


In [105]:
# Load ontological concept edges into Neo4j database.
# Both endpoints are matched with the OntologicalConcept label so that the lookup can use
# the index backing the uniqueness constraint on id, and so that nodes of other kinds
# that happen to carry the same id value are never linked by mistake.
gds.run_cypher('''
                UNWIND $edge_list AS edge
                MATCH (s:OntologicalConcept {id: edge.source})
                MATCH (t:OntologicalConcept {id: edge.target})
                CALL apoc.merge.relationship(s, edge.relation, {}, {}, t, {})
                YIELD rel
                RETURN rel;
              ''', {'edge_list': onto_edges.to_dict('records')})

Unnamed: 0,rel
0,()
1,()
2,()
3,()
4,()
5,()
6,()
7,()
8,()
9,()


In [106]:
# Rename subClassOf relationships to SUB_CLASS_OF: for every subClassOf relationship between
# any two nodes, MERGE a SUB_CLASS_OF relationship between the same pair, then delete the original.
# NOTE(review): the edges file written by onto_excel_to_neo4j already uses SUB_CLASS_OF, so this
# presumably targets relationships loaded by some other means - confirm it is still needed.
gds.run_cypher('''
                MATCH (n)-[rel:subClassOf]->(m)
                MERGE (n)-[:SUB_CLASS_OF]->(m)
                DELETE rel
                ''')

In [107]:
# Create a full-text search index over the `name` property of OntologicalConcept nodes.
# IF NOT EXISTS makes the statement safe to re-run.
gds.run_cypher('''CREATE FULLTEXT INDEX onto_concept_search IF NOT EXISTS FOR (n:OntologicalConcept) ON EACH [n.name]''')

In [108]:
# (Optional) Export the whole database, including indexes, as Cypher statements
# in cypher-shell format to the named file (requires the APOC export procedures).
gds.run_cypher('''CALL apoc.export.cypher.all('kg_export_after_load_ontology.cypher',{format:'cypher-shell'})''')

Unnamed: 0,file,batches,source,format,nodes,relationships,properties,time,rows,batchSize,cypherStatements,nodeStatements,relationshipStatements,schemaStatements,cleanupStatements
0,kg_export_after_load_ontology.cypher,1,"database: nodes(66), rels(51)",cypher,66,51,132,35,117,20000,,,,,
