## Creating the graph in Neo4j

In [1]:
!pip install neo4j
!pip install tqdm



In [2]:
from neo4j import GraphDatabase
import time
from tqdm import tqdm
import pandas as pd

In [3]:
# Load the merged relations CSV produced by the Spark NLP relation extractor;
# change the path to point at your copy of the file. The Cypher in
# add_ners_rels reads the columns chunk1, entity1, chunk2, entity2 and
# relation from every row.
relation_df = pd.read_csv('final_relations_wiki_mimic.csv')

In [4]:
class Neo4jConnection:
    """Small convenience wrapper around a Neo4j driver.

    The driver is created in the constructor. If that fails, the error is
    printed and the instance is left without a driver, in which case
    ``query`` refuses to run.
    """

    def __init__(self, uri, user, pwd):
        """Remember the connection settings and try to create the driver.

        Args:
            uri: Address handed to ``GraphDatabase.driver``.
            user: User name, first element of the ``auth`` tuple.
            pwd: Password, second element of the ``auth`` tuple.
        """
        self.__uri, self.__user, self.__pwd = uri, user, pwd
        self.__driver = None

        try:
            credentials = (self.__user, self.__pwd)
            self.__driver = GraphDatabase.driver(self.__uri, auth=credentials)
        except Exception as err:
            # Failures are reported, not raised; the driver simply stays None.
            print("Failed to create the driver:", err)

    def close(self):
        """Close the underlying driver if one was created."""
        if self.__driver is None:
            return
        self.__driver.close()

    def query(self, query, parameters=None, db=None):
        """Run ``query`` in a fresh session and return its records as a list.

        Args:
            query: Query text handed to ``session.run``.
            parameters: Optional query parameters handed to ``session.run``.
            db: Optional database name. When None the session is opened
                without a ``database`` argument.

        Returns:
            A list built from the result of ``session.run``, or None when
            opening the session or running the query raised (the error is
            printed instead of propagated).

        Raises:
            AssertionError: If no driver was created.
        """
        assert self.__driver is not None, "Driver not initialized!"

        # Only pass ``database`` when the caller asked for a specific one.
        session_options = {} if db is None else {"database": db}
        sess = None
        records = None

        try:
            sess = self.__driver.session(**session_options)
            records = list(sess.run(query, parameters))
        except Exception as err:
            print("Query failed:", err)
        finally:
            # The session is closed on both the success and the failure path.
            if sess is not None:
                sess.close()
        return records

In [5]:
def update_data(query, rows, batch_size=10000, connection=None):
    """Send ``rows`` to Neo4j in consecutive batches and tally the results.

    Each batch is sliced out of ``rows``, converted with
    ``to_dict('records')`` and passed to the query as the ``rows`` parameter.
    The query is expected to return one record with a ``total`` field, which
    is added to a running sum. A progress dict is printed after every batch.

    Args:
        query: Cypher text that consumes a ``$rows`` parameter and returns
            ``total``.
        rows: Sliceable table exposing ``to_dict('records')`` on its slices
            (the notebook passes a pandas DataFrame).
        batch_size: Maximum number of rows sent per query; must be positive.
        connection: Object with a ``query(query, parameters=...)`` method.
            Defaults to the notebook-level ``conn``.

    Returns:
        ``{"total": ..., "batches": ..., "time": ...}`` for the last batch,
        or None when ``rows`` is empty.

    Raises:
        ValueError: If ``batch_size`` is not positive (the batch window would
            never advance).
        RuntimeError: If a batch query yields no result, e.g. because the
            connection reported a failure by returning None.
    """
    if batch_size <= 0:
        raise ValueError(
            "batch_size must be a positive integer, got {!r}".format(batch_size)
        )

    n_rows = len(rows)
    if n_rows == 0:
        # Nothing to send; None is what callers have always received here.
        return None

    if connection is None:
        # Fall back to the connection created at notebook level.
        connection = conn

    total = 0
    result = None
    start = time.time()

    for batch, offset in enumerate(range(0, n_rows, batch_size), start=1):
        records = rows[offset:offset + batch_size].to_dict('records')
        res = connection.query(query, parameters={'rows': records})
        if not res:
            # Surface a failed batch explicitly instead of failing on res[0].
            raise RuntimeError(
                "Batch {} returned no result; {} rows were confirmed before "
                "the failure".format(batch, total)
            )
        total += res[0]['total']
        result = {"total": total, "batches": batch, "time": time.time() - start}
        print(result)

    return result

In [6]:
def add_ners_rels(rows, batch_size=10000):
    """Load NER nodes and the relationships between them as a batch job.

    For every row the Cypher below merges one ``NER`` node named after
    ``chunk1`` and one named after ``chunk2`` (setting ``type`` from
    ``entity1`` / ``entity2`` only when the node is created), then calls
    ``apoc.create.relationship`` to link them with a relationship whose type
    is the row's ``relation`` value. The query reports a row count as
    ``total``.

    Args:
        rows: Table of relations forwarded to ``update_data``.
        batch_size: Number of rows sent per query.

    Returns:
        Whatever ``update_data`` returns for this load.
    """
    cypher = '''
    //chunk1 NERs
    UNWIND $rows as row
    MERGE(n1:NER{name:row.chunk1}) ON CREATE SET n1.type=row.entity1
    
    //chunk2 NERs
    MERGE(n2:NER{name:row.chunk2}) ON CREATE SET n2.type=row.entity2

    //connect NERs
    WITH row, n1, n2
    CALL apoc.create.relationship(n1, toString(row.relation), {}, n2) YIELD rel

    WITH n1
    MATCH (n1)
    RETURN count(*) as total  
    '''
    return update_data(cypher, rows, batch_size=batch_size)
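# Alternative to the APOC call in the query above (not used): one fixed LINKS
# relationship type that stores the relation name as a property.
#     WITH row, n1, n2
#     MERGE (n1)-[:LINKS{relation:row.relation}]->(n2)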

In [7]:
# Connection settings for the target Neo4j instance (use the values shown in
# Neo4j). Each one can be supplied through an environment variable so that
# credentials need not be edited into the notebook; the fallbacks are the
# local defaults this notebook has always used.
import os

uri = os.environ.get('NEO4J_URI', 'bolt://127.0.0.1:7687')
pwd = os.environ.get('NEO4J_PASSWORD', 'graph')
user = os.environ.get('NEO4J_USER', 'neo4j')

# Notebook-level connection; update_data reads it through the name `conn`.
conn = Neo4jConnection(uri=uri, user=user, pwd=pwd)

In [8]:
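# Alternative (disabled): connect to a remote Neo4j instance instead of the
# local one.
# NOTE(review): the password below looks like a real credential; confirm it
# has been rotated before sharing this notebook.
# uri = 'bolt://54.147.79.121:7687'
# pwd = 'laws-alibis-qualifier'
# user= 'neo4j'

# conn = Neo4jConnection(uri=uri, user=user , pwd=pwd)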

In [9]:
# Start from an empty graph: match every node and delete it together with its
# relationships. Destructive; do not run against a database you need to keep.
delete_all_nodes = 'MATCH (n) DETACH DELETE n;'
conn.query(delete_all_nodes)

[]

In [10]:
# Declare NER.name unique before loading; the load query merges NER nodes by
# name. IF NOT EXISTS makes re-running this cell harmless.
# NOTE(review): `ON ... ASSERT` is the older constraint syntax; newer Neo4j
# releases expect `FOR ... REQUIRE`. Confirm against the server version.
const_ners = 'CREATE CONSTRAINT ners IF NOT EXISTS ON (n:NER) ASSERT n.name IS UNIQUE'
conn.query(const_ners)

[]

In [11]:
# Load every row of relation_df into Neo4j with the default batch size; one
# progress dict (running total, batch count, elapsed seconds) is printed per
# batch.
add_ners_rels(relation_df)

{'total': 10000, 'batches': 1, 'time': 2.692194938659668}
{'total': 20000, 'batches': 2, 'time': 4.36225962638855}
{'total': 30000, 'batches': 3, 'time': 5.931192874908447}
{'total': 40000, 'batches': 4, 'time': 7.404002904891968}
{'total': 50000, 'batches': 5, 'time': 8.546263933181763}
{'total': 60000, 'batches': 6, 'time': 9.533070802688599}
{'total': 70000, 'batches': 7, 'time': 10.535047054290771}
{'total': 80000, 'batches': 8, 'time': 11.466240644454956}
{'total': 90000, 'batches': 9, 'time': 12.52093768119812}
{'total': 100000, 'batches': 10, 'time': 13.469653367996216}
{'total': 110000, 'batches': 11, 'time': 14.406498670578003}
{'total': 120000, 'batches': 12, 'time': 15.30834698677063}
{'total': 130000, 'batches': 13, 'time': 16.188628911972046}
{'total': 140000, 'batches': 14, 'time': 17.1149959564209}
{'total': 150000, 'batches': 15, 'time': 17.98625111579895}
{'total': 160000, 'batches': 16, 'time': 18.78773069381714}
{'total': 170000, 'batches': 17, 'time': 19.64006829261

{'total': 572771, 'batches': 58, 'time': 55.91869616508484}