In [None]:
import json
import os
import time

from py2neo import Graph, Node, Relationship

In [None]:
# Connection settings can be overridden through environment variables so that
# credentials do not have to be edited into the notebook. The fallbacks are the
# values this notebook has always used, so behaviour is unchanged when the
# variables are not set.
# For a local instance, e.g.: NEO4J_URI=bolt://localhost:7687 NEO4J_PASSWORD=password
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://neo4j-quanta-two:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "myneo")

graph = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Read the store statistics once and reuse them for both numbers.
primitive_counts = graph.database.primitive_counts
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    primitive_counts['NumberOfNodeIdsInUse'],
    primitive_counts['NumberOfRelationshipIdsInUse']))

In [None]:
# Uniqueness constraint on Author2.name, so each author name maps to a single node
query = "CREATE CONSTRAINT ON (a:Author2) ASSERT a.name IS UNIQUE;"

print("Creating uniqueness constraint (and also index) on Author names...", end=" ", flush=True)
graph.run(query).evaluate()
print("Done.")

In [None]:
# Add COAUTHOR relationships between authors of the same Quanta on the existing graph.
# The work is driven by apoc.periodic.iterate with the settings in the config map
# below (batchSize 10000, iterateList true, parallel false):
#   - first statement: select Quanta nodes with more than one incoming AUTHORED relationship;
#   - second statement: collect that Quanta's authors, pair them up via a double UNWIND,
#     and keep only pairs with id(first) < id(second), so each unordered pair (and no
#     self-pair) is merged once per Quanta.
# r.strength is set to 1 when the relationship is created and incremented on every later match.
# NOTE(review): re-running this cell looks like it would increment existing strengths
# again — confirm before re-executing on a graph that already has COAUTHOR edges.
query = """
CALL apoc.periodic.iterate(
"MATCH (q:Quanta) WHERE size((q)<-[:AUTHORED]-())>1 RETURN q",
"WITH [(q)<-[:AUTHORED]-(a) | a] as coAuthors
 UNWIND coAuthors as first
 UNWIND coAuthors as second
 WITH first, second
 WHERE id(first) < id(second)
 MERGE (first)-[r:COAUTHOR]-(second)
  ON CREATE SET r.strength = 1
  ON MATCH SET r.strength = r.strength + 1",
{batchSize:10000, iterateList:true, parallel:false});
"""
# The cursor returned by graph.run is the cell's last expression.
graph.run(query)

OLD QUERIES

In [None]:
# Create coauthor relationships - runs on the existing graph.
# Each unordered author pair is visited once per Quanta: the id(a) < id(b) filter
# drops the mirrored (b, a) row that a plain `a <> b` would also produce, which
# previously made the undirected MERGE increment r.strength twice per shared paper.

# Kept because a later cell reads start_time for its "Finished all imports" message.
start_time = time.time()

print("Adding coauthor relationships...")
query = """
MATCH (q:Quanta)
MATCH (q)<-[:AUTHORED]-(a:Author)
MATCH (q)<-[:AUTHORED]-(b:Author) WHERE id(a) < id(b)
MERGE (a)-[r:COAUTHOR]-(b)
ON CREATE SET r.strength = 1
ON MATCH SET r.strength = r.strength + 1
"""

query_start_time = time.time()
graph.run(query)
print("Done in ({:.2f} seconds)...".format((time.time()-query_start_time)))

In [None]:
# Import authors as nodes and create coauthor relationships.
# For every *.txt file found under local_data_dir, Neo4j is asked to load the
# same-named file from neo4j_data_dir via apoc.load.json. Per record it merges a
# temporary Quanta node, merges the Author2 nodes, links co-authors, and finally
# deletes the temporary Quanta node again.
import glob, os, time

local_data_dir = '/tmp/data/mag/test' #'/tmp/data/mag/test'
neo4j_data_dir = '/import/mag/test'

# The file URL is built as 'file://' + directory + file name, so the directory
# must end in exactly one '/'. Without this, '/import/mag/test' + 'x.txt'
# became '/import/mag/testx.txt'.
neo4j_dir_prefix = neo4j_data_dir.rstrip('/') + '/'

start_time = time.time()
for _, _, files in os.walk(local_data_dir):
    # NOTE(review): os.walk also descends into sub-directories, but the URL is
    # always built from the top-level neo4j_data_dir — confirm the data is flat.
    for file in sorted(files):
        if file.endswith('.txt'):

            print("Importing {}...".format(file), end=" ", flush=True)
            # Doubled braces are literal braces for Cypher; the two {} slots
            # receive the directory prefix and the file name.
            query = """
            CALL apoc.periodic.iterate(
            "CALL apoc.load.json('file://{}{}') YIELD value AS q RETURN q"
            ,
            "MERGE (p:Quanta {{name:q.title}})
            WITH q, p
            UNWIND q.authors as author 
            MERGE (a:Author2 {{name:author.name}})
            MERGE (a)-[:COCREATOR]->(p)
            WITH p,a
            MATCH (b:Author2)-[]->(p) where a <> b
            MERGE (a)-[r:COAUTHOR]-(b)
            ON CREATE SET r.strength2 = 1
            ON MATCH SET r.strength2 = r.strength2 + 1
            WITH p DETACH DELETE p;"
            , 
            {{batchSize:10000, iterateList:true, parallel:false}});
            """.format(neo4j_dir_prefix, file)

            query_start_time = time.time()
            graph.run(query).evaluate()
            print("Done in ({:.2f} minutes)...".format((time.time()-query_start_time)/60))

print("Finished all imports in {:.2f} hours.".format((time.time()-start_time)/60/60))

In [None]:
# Appears to be a standalone variant of the per-record statement used by the
# file import cell, pointed at the Quanta2 label.
# NOTE(review): as written this cell does not look runnable — confirm before use:
#   - the text uses doubled braces ({{ }}) but, unlike the import cell, is never
#     passed through str.format, so they reach Neo4j literally;
#   - `q` is referenced in `WITH q, p` without being bound anywhere in this query;
#   - `start_time` (used by the last print) is not set in this cell and must come
#     from a previously executed cell.
query = """
MERGE (p:Quanta2)
WITH q, p
UNWIND q.authors as author 
MERGE (a:Author2 {{name:author.name}})
MERGE (a)-[:COCREATOR]->(p)
WITH p,a
MATCH (b:Author2)-[]->(p) where a <> b
MERGE (a)-[r:COAUTHOR]-(b)
ON CREATE SET r.strength2 = 1
ON MATCH SET r.strength2 = r.strength2 + 1
WITH p DETACH DELETE p
"""

# Time only the query execution itself.
query_start_time = time.time()
graph.run(query).evaluate()
print("Done in ({:.2f} minutes)...".format((time.time()-query_start_time)/60))

# Elapsed time since whichever cell last assigned start_time.
print("Finished all imports in {:.2f} hours.".format((time.time()-start_time)/60/60))

In [None]:
# Import authors as nodes and create coauthor relationships (work in progress).
# Walks local_data_dir and, for each *.txt file, asks Neo4j to load the
# same-named file from neo4j_data_dir with apoc.load.json.
import glob, os, time

local_data_dir = '/work/ImportData/CoauthorshipNatureGraph/' #'/tmp/data/mag/test'
neo4j_data_dir = '/work/ImportData/CoauthorshipNatureGraph/'

# Ensure exactly one '/' between the directory and the file name in the URL.
neo4j_dir_prefix = neo4j_data_dir.rstrip('/') + '/'

start_time = time.time()
for _, _, files in os.walk(local_data_dir):
    for file in sorted(files):
        if file.endswith('.txt'):

            print("Importing {}...".format(file), end=" ", flush=True)

            # NOTE(review): apoc.periodic.iterate is called here with only the
            # iterate statement; the action statement and config map that the
            # other import cell passes appear to be missing — confirm and
            # complete before relying on this cell.
            query = """
            CALL apoc.periodic.iterate(
            "CALL apoc.load.json('file://{}{}') YIELD value AS q RETURN q");
            """.format(neo4j_dir_prefix, file)

            query_start_time = time.time()
            graph.run(query).evaluate()
            print("Done in ({:.2f} minutes)...".format((time.time()-query_start_time)/60))

print("Finished all imports in {:.2f} hours.".format((time.time()-start_time)/60/60))

In [None]:
def query_to_df(query, graph):
    """Run a Cypher query and return its result as a DataFrame.

    Prints a progress message before the query starts and the elapsed
    wall-clock time (in minutes) once the result has been materialised.

    Args:
        query: Cypher statement to execute.
        graph: Object exposing ``run(query)`` whose result offers
            ``to_data_frame()`` (here: the py2neo ``Graph``).

    Returns:
        Whatever ``graph.run(query).to_data_frame()`` returns.
    """
    # Flush so the message is visible while the query is still running,
    # matching the other progress prints in this notebook.
    print("Starting query...", end=" ", flush=True)
    query_start_time = time.time()
    df = graph.run(query).to_data_frame()
    print("Done ({:.2f} minutes).".format((time.time()-query_start_time)/60))
    return df

In [None]:
# Fetch all Quanta2 nodes into a DataFrame; the bare name on the last line renders it
query = "MATCH (n:Quanta2) RETURN n"
data = query_to_df(query=query, graph=graph)

data

In [None]:
# Small hand-written paper set (title, author dicts, venue) used as test data.
d_list = [
    {"title": "title1", "authors": [{"name": "name1"}], "venue": "Nature"},
    {"title": "title2", "authors": [{"name": "name1"}, {"name": "name2"}], "venue": "Nature"},
    {"title": "title3", "authors": [{"name": "name1"}, {"name": "name2"}, {"name": "name3"}], "venue": "Nature"},
    {"title": "title4", "authors": [{"name": "name2"}, {"name": "name3"}, {"name": "name4"}], "venue": "Nature"},
    {"title": "title5", "authors": [{"name": "name5"}, {"name": "name6"}, {"name": "name7"}], "venue": "Nature"},
    {"title": "title6", "authors": [{"name": "name1"}, {"name": "name2"}, {"name": "name3"}], "venue": "PLOS"},
    {"title": "title7", "authors": [{"name": "name3"}, {"name": "name8"}], "venue": "Cell"},
    {"title": "title8", "authors": [{"name": "name4"}, {"name": "name9"}, {"name": "name10"}], "venue": "Cell"},
]

# One Quanta2 node per paper; the author dicts are flattened to a list of names.
for paper in d_list:
    author_names = [author["name"] for author in paper["authors"]]
    node = Node(
        "Quanta2",
        title=paper["title"],
        authors=author_names,
        venue=paper["venue"],
    )
    graph.create(node)

In [None]:
# Calls py2neo's Graph.delete_all() on the connected database.
# NOTE(review): presumably this wipes every node and relationship in that
# database — run deliberately, never as part of "Run All".
graph.delete_all()