# Get Book Chapter Metadata

This notebook gets metadata for BookChapter from a Excel file ('data/BookChapter_Metadata.xlsx'), create BookChapter node and link the TM nodes to it.

## Import Libraries

In [25]:
#import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# import matplotlib.pyplot as plt 
# import seaborn as sns
# sns.set_style('darkgrid')
# sns.set_palette("colorblind")
# sns.set(rc={'figure.figsize':(11,6)})

import os 
import configparser
import csv
from collections import defaultdict
from metapub import PubMedFetcher
fetch = PubMedFetcher()

In [26]:
# install or import Neo4j GraphDataScience library
try: 
  from graphdatascience import GraphDataScience
  print('Successfully imported GraphDataScience')
except ModuleNotFoundError:
  !pip3 install graphdatascience
  from graphdatascience import GraphDataScience
  print('installed and imported GraphDataScience')

Successfully imported GraphDataScience


# Custom Functions

In [27]:
# function adapted from Neo4j GDS Fraud Demo Notebook (h/t Zach B.)
def read_neo4j_properties(NEO4J_PROPERTIES_FILE: str=None) -> str:
  '''Parses Neo4j database or Aura connection details from provided .ini filepath.
  Requirements:
    configparser

  Args:
    NEO4J_PROPERTIES_FILE: path to a .ini file
  
  Returns:
    HOST: link to Neo4j or Aura host 
    USERNAME: login username
    PASSWORD: login password 

  Note: The .ini file should use the following syntax
    [NEO4J]
    PASSWORD=<password>
    USERNAME=<database name>
    HOST=<host uri>

  If no path is passed, the function will return the defaults:
    HOST = 'neo4j://localhost'
    USERNAME = 'neo4j'
    PASSWORD = 'password'
  '''

  if NEO4J_PROPERTIES_FILE is not None and os.path.exists(NEO4J_PROPERTIES_FILE):
      config = configparser.RawConfigParser()
      config.read(NEO4J_PROPERTIES_FILE)
      HOST = config['NEO4J']['HOST']
      USERNAME = config['NEO4J']['USERNAME']
      PASSWORD = config['NEO4J']['PASSWORD']
      print('Using HOST, USERNAME, PASSWORD from .ini file')
      return HOST, USERNAME, PASSWORD
  else:
      print('Could not find database properties file, using defaults:')
      HOST = 'neo4j://localhost'
      USERNAME = 'neo4j'
      PASSWORD = 'password'
      print(f'HOST: {HOST} \nUSERHAME: {USERNAME} \nPASSWORD: {PASSWORD}')
      return HOST, USERNAME, PASSWORD 

# Connect to Neo4j DB
It is recommended to store authentication credentials in a separate file and read them in to the notebook as variables. This code assumes the files are stored in a local auth directory.

In [28]:
# get authentication credentials from local auth file
NEO4J_PROPERTIES_FILE = 'auth/immerse_kg_auth.ini'
HOST, USERNAME, PASSWORD = read_neo4j_properties(NEO4J_PROPERTIES_FILE=NEO4J_PROPERTIES_FILE)

Using HOST, USERNAME, PASSWORD from .ini file


In [29]:
# connect to neo4j instance 
gds = GraphDataScience(HOST, auth=(USERNAME, PASSWORD), aura_ds=False)

In [None]:
# confirm connection with GDS version 
gds.version()

# Clean up BookChapter node

In [31]:
# if necessary, clean up the BookChapter node and their relationships
gds.run_cypher('''
                MATCH (n:BookChapter)
                DETACH DELETE n
                ''')

# Create BookChapter Metadata

In [32]:
book_chapter_meta = pd.read_excel('data/BookChapter_Metadata.xlsx').fillna('')

In [33]:
book_chapter_meta

Unnamed: 0,isbn,title,authors,year,publisher,link
0,978-1-4987-6285-4,Cell Culture Bioprocess Engineering (Second Edition),Wei-Shou Hu,2020,CRC PressTaylor & Francis Group,https://www.routledge.com/Cell-Culture-Bioprocess-Engineering-Second-Edition/Hu/p/book/9781498762854


In [34]:
# Create BookChapter nodes
gds.run_cypher('''
               UNWIND $book_chapter_meta AS node
               CALL apoc.merge.node (["BookChapter"], {isbn: node.isbn, title: node.title, authors: node.authors, year: toInteger(node.year), publisher: node.publisher, link: node.link})
               YIELD node as n 
               RETURN n
              ''', {'book_chapter_meta': book_chapter_meta.to_dict('records')})

Unnamed: 0,n
0,"(year, isbn, link, publisher, title, authors)"


In [35]:
# Link Textming nodes to BookChapter node
gds.run_cypher('''
    MATCH (b:BookChapter), (t:TextMining)
    WHERE t.isbn is not null and b.isbn = t.isbn
    MERGE (t)-[:IS_MENTIONED_IN]->(b)
    ''')

2024-04-19 23:55:42 UD-Q3J7G7FQ7J-D root[35495] INFO {'severity': 'INFORMATION', 'description': 'If a part of a query contains multiple disconnected patterns, this will build a cartesian product between all those parts. This may produce a large amount of data and slow down query processing. While occasionally intended, it may often be possible to reformulate the query that avoids the use of this cross product, perhaps by adding a relationship between the different parts or by using OPTIONAL MATCH (identifier is: (t))', 'code': 'Neo.ClientNotification.Statement.CartesianProduct', 'position': {'column': 1, 'offset': 5, 'line': 2}, 'title': 'This query builds a cartesian product between disconnected patterns.', 'category': 'PERFORMANCE'}


In [43]:
# Create contraint on Concept nodes
gds.run_cypher('''CREATE CONSTRAINT article IF NOT EXISTS FOR (b:BookChapter) REQUIRE b.isbn IS UNIQUE''')

Check existing constraints

In [44]:
gds.run_cypher('''SHOW CONSTRAINTS''')

Unnamed: 0,id,name,type,entityType,labelsOrTypes,properties,ownedIndex,propertyType
0,11,article,UNIQUENESS,NODE,[BookChapter],[isbn],article,
1,5,dictionary_concept,UNIQUENESS,NODE,[DictionaryConcept],[id],dictionary_concept,
2,2,ontological_concept,UNIQUENESS,NODE,[OntologicalConcept],[id],ontological_concept,
3,8,text_mining,UNIQUENESS,NODE,[TextMining],[id],text_mining,


In [45]:
# Create fulltext index on BookChapter nodes
gds.run_cypher('''CREATE FULLTEXT INDEX article_search IF NOT EXISTS FOR (b:BookChapter) ON EACH [b.isbm, b.title, b.authors, b.year, b.publisher]''')

In [46]:
# (optional) exports the whole database incl. indexes as cypher statements to the provided file
gds.run_cypher('''CALL apoc.export.cypher.all('kg_export_after_load_bookchapter.cypher',{format:'cypher-shell'})''')

Unnamed: 0,file,batches,source,format,nodes,relationships,properties,time,rows,batchSize,cypherStatements,nodeStatements,relationshipStatements,schemaStatements,cleanupStatements
0,kg_export_after_load_article.cypher,1,"database: nodes(3418), rels(5926)",cypher,3418,5926,48212,377,9344,20000,,,,,
