# Load Dictionary Concepts

This notebook reads in the dictionary concept file ("data/Concepts.obo") in OBO format and convert it into a tab separated values file ("data/Concepts.tsv") for loading into Neo4j database.

## Import Libraries

In [157]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

import os 
import configparser
import csv

In [158]:
# install or import Neo4j GraphDataScience library
try: 
  from graphdatascience import GraphDataScience
  print('Successfully imported GraphDataScience')
except ModuleNotFoundError:
  !pip3 install graphdatascience
  from graphdatascience import GraphDataScience
  print('installed and imported GraphDataScience')

Successfully imported GraphDataScience


# Custom Functions

In [159]:
# function adapted from Neo4j GDS Fraud Demo Notebook (h/t Zach B.) mainly used for connecting to Neo4j database from Python
def read_neo4j_properties(NEO4J_PROPERTIES_FILE: str=None) -> str:
  '''Parses Neo4j database or Aura connection details from provided .ini filepath.
  Requirements:
    configparser

  Args:
    NEO4J_PROPERTIES_FILE: path to a .ini file
  
  Returns:
    HOST: link to Neo4j or Aura host 
    USERNAME: login username
    PASSWORD: login password 

  Note: The .ini file should use the following syntax
    [NEO4J]
    PASSWORD=<password>
    USERNAME=<database name>
    HOST=<host uri>

  If no path is passed, the function will return the defaults:
    HOST = 'neo4j://localhost'
    USERNAME = 'neo4j'
    PASSWORD = 'password'
  '''

  if NEO4J_PROPERTIES_FILE is not None and os.path.exists(NEO4J_PROPERTIES_FILE):
      config = configparser.RawConfigParser()
      config.read(NEO4J_PROPERTIES_FILE)
      HOST = config['NEO4J']['HOST']
      USERNAME = config['NEO4J']['USERNAME']
      PASSWORD = config['NEO4J']['PASSWORD']
      print('Using HOST, USERNAME, PASSWORD from .ini file')
      return HOST, USERNAME, PASSWORD
  else:
      print('Could not find database properties file, using defaults:')
      HOST = 'neo4j://localhost'
      USERNAME = 'neo4j'
      PASSWORD = 'password'
      print(f'HOST: {HOST} \nUSERHAME: {USERNAME} \nPASSWORD: {PASSWORD}')
      return HOST, USERNAME, PASSWORD 

In [160]:
def replace_substring_from_beginning(string, substring, replacement):
  """
  Replaces a substring starting from the beginning of a string.

  Args:
      string: The original string.
      substring: The substring to be replaced.
      replacement: The replacement string.

  Returns:
      The modified string with the substring replaced.
  """

  if not substring:
    return string  # Avoid replacing empty substrings

  if string.startswith(substring):
    return replacement + string[len(substring):]
  else:
    return string

In [161]:
# function to convert concepts.obo into a TSV file
def obo_to_tsv(concepts_obo_file_path, concepts_tsv_file_path):

    terms = []
    record = []
    alter_names = []
    # Open the file in read mode
    with open(concepts_obo_file_path, "r") as obo_file:
      # Loop through each line

      for line in obo_file:
        if line == "[Term]":
            record = []
            alter_names = []
        elif line.startswith("ConceptName: "):
            record.append(replace_substring_from_beginning(line.strip(), "ConceptName: ", ""))
        elif line.startswith("Id: "):
            record.append(replace_substring_from_beginning(line.strip(), "Id: ", ""))
        elif line.startswith("CanonicalName: "):
            record.append(replace_substring_from_beginning(line.strip(), "CanonicalName: ", ""))
        elif line.startswith("Type: "):
            record.append(replace_substring_from_beginning(line.strip(), "Type: ", ""))
        elif line.startswith("TypeId: "):
            record.append(replace_substring_from_beginning(line.strip(), "TypeId: ", ""))
        elif line.startswith("AlternativeNames: "):
            alter_names.append(replace_substring_from_beginning(line, "AlternativeNames: ", "").strip())
        elif len(record) > 0 and line == "\n":
            if len(alter_names):
                record.append("; ".join(alter_names))
            terms.append("\t".join(record)+"\n")
            record = []
            alter_names = []
        

    obo_file.close()
    # open TSV file for writing
    with open(concepts_tsv_file_path, "w") as tsv_file:
        # write to file
        term_header = f"concept_name\tid\tcanonical_name\ttype\ttype_id\talter_names\n"
        tsv_file.write(term_header)
        for term in terms:
            tsv_file.write(term)
    tsv_file.close()

In [162]:
# Convert dictionary concepts in OBO format into TSV format.
obo_to_tsv("data/Concepts.obo", "data/Concepts.tsv")

# Connect to Neo4j DB
It is recommended to store authentication credentials in a separate file and read them in to the notebook as variables. This code assumes the files are stored in a local auth directory.

In [163]:
# get authentication credentials from local auth file
NEO4J_PROPERTIES_FILE = 'auth/immerse_kg_auth.ini'
HOST, USERNAME, PASSWORD = read_neo4j_properties(NEO4J_PROPERTIES_FILE=NEO4J_PROPERTIES_FILE)

Using HOST, USERNAME, PASSWORD from .ini file


In [None]:
# connect to neo4j instance 
gds = GraphDataScience(HOST, auth=(USERNAME, PASSWORD), aura_ds=False)

In [165]:
# confirm connection with GDS version 
gds.version()

'2.5.0'

# Cleanup Dictionary Concepts

In [166]:
# if the step of loading dictionary concepts failed or want to reload dictionary concepts, clear out the dictonary concepts from the database
gds.run_cypher('''
                MATCH (n:DictionaryConcept)
                DETACH DELETE n
                ''')

In [167]:
# read concepts.tsv file
dictionary_concept_nodes = pd.read_csv('data/Concepts.tsv', sep='\t', header=0).fillna('NA') 
dictionary_concept_nodes.head().to_dict(orient='records')

[{'concept_name': '(fut8) gene knockout cho-s_COID438040',
  'id': 'COID438040',
  'canonical_name': '(fut8) gene knockout cho-s',
  'type': 'chinese hamster ovary (CHO)',
  'type_id': 'C58',
  'alter_names': 'fut8 gene knockout cho-s'},
 {'concept_name': '1-ag4-1_COID898401',
  'id': 'COID898401',
  'canonical_name': '1-ag4-1',
  'type': 'hybridoma',
  'type_id': 'C57',
  'alter_names': 'NA'},
 {'concept_name': '2-f-peracetyl fucose_COID847827',
  'id': 'COID847827',
  'canonical_name': '2-f-peracetyl fucose',
  'type': 'sugar (subset of carbon source)',
  'type_id': 'C27',
  'alter_names': 'NA'},
 {'concept_name': 'FA2G1_COID847899',
  'id': 'COID847899',
  'canonical_name': 'FA2G1',
  'type': 'sugar (subset of carbon source)',
  'type_id': 'C27',
  'alter_names': 'NA'},
 {'concept_name': 'FUC8_COID847828',
  'id': 'COID847828',
  'canonical_name': 'FUC8',
  'type': 'sugar (subset of carbon source)',
  'type_id': 'C27',
  'alter_names': 'NA'}]

In [168]:
# Create contraint on Concept nodes
gds.run_cypher('''CREATE CONSTRAINT dictionary_concept IF NOT EXISTS FOR (c:DictionaryConcept) REQUIRE c.id IS UNIQUE''')

In [169]:
# Check the constraint created in the step above.
gds.run_cypher('''SHOW CONSTRAINTS''')

Unnamed: 0,id,name,type,entityType,labelsOrTypes,properties,ownedIndex
0,5,dictionary_concept,UNIQUENESS,NODE,[DictionaryConcept],[id],dictionary_concept
1,2,ontological_concept,UNIQUENESS,NODE,[OntologicalConcept],[id],ontological_concept


# Load Dictionary Concept

In [170]:
# Load DictionaryConcept nodes into Neo4j database
gds.run_cypher('''
               UNWIND $node_list AS node
               CALL apoc.merge.node (["DictionaryConcept"], {id: node.id, concept_name: node.concept_name, canonical_name: node.canonical_name, type: node.type, type_id: node.type_id, alter_names: node.alter_names })
               YIELD node as n 
               RETURN n
              ''', {'node_list': dictionary_concept_nodes.to_dict('records')})


Unnamed: 0,n
0,"(concept_name, alter_names, canonical_name, type_id, id, type)"
1,"(concept_name, alter_names, canonical_name, type_id, id, type)"
2,"(concept_name, alter_names, canonical_name, type_id, id, type)"
3,"(concept_name, alter_names, canonical_name, type_id, id, type)"
4,"(concept_name, alter_names, canonical_name, type_id, id, type)"
5,"(concept_name, alter_names, canonical_name, type_id, id, type)"
6,"(concept_name, alter_names, canonical_name, type_id, id, type)"
7,"(concept_name, alter_names, canonical_name, type_id, id, type)"
8,"(concept_name, alter_names, canonical_name, type_id, id, type)"
9,"(concept_name, alter_names, canonical_name, type_id, id, type)"


In [171]:
# Remove empty alter_names property from DictionaryConcept nodes
gds.run_cypher('''
                match(n:DictionaryCConcept { alter_names: "NA"}) 
                remove n.alter_names
                ''')

In [172]:
# link DictionaryConcept nodes to OntologicalConcept by ID
gds.run_cypher('''
                MATCH(d:DictionaryConcept), (o:OntologicalConcept)
                WHERE d.type_id = o.id
                MERGE (d)-[:IS_INSTANCE_OF]->(o)
                ''')

In [173]:
# Create fulltext index on DictionaryConcept nodes
gds.run_cypher('''CREATE FULLTEXT INDEX dict_concept_search IF NOT EXISTS FOR (n:DictionaryConcept) ON EACH [n.name]''')

In [174]:
# (optional) Exports the whole database incl. indexes as cypher statements to the provided file
gds.run_cypher('''CALL apoc.export.cypher.all('kg_export_after_load_dictionary_concepts.cypher',{format:'cypher-shell'})''')

Unnamed: 0,file,batches,source,format,nodes,relationships,properties,time,rows,batchSize,cypherStatements,nodeStatements,relationshipStatements,schemaStatements,cleanupStatements
0,kg_export_after_load_dictionary_concepts.cypher,1,"database: nodes(704), rels(683)",cypher,704,683,3960,61,1387,20000,,,,,
