In [1]:
import time
import json
from py2neo import Graph, Node, Relationship

In [2]:
# Connect to Neo4j and report the size of the database.
#
# Connection settings are read from the environment so the notebook can be
# pointed at another instance without editing code, e.g. for a local database:
#   NEO4J_URI=bolt://localhost:7687 NEO4J_USER=neo4j NEO4J_PASSWORD=password
# The fallbacks are the values this notebook previously hard-coded, so behavior
# is unchanged when the variables are not set.
# TODO(review): drop the password fallback once NEO4J_PASSWORD is set everywhere,
# so the credential no longer lives in the notebook.
import os

neo4j_uri = os.environ.get("NEO4J_URI", "bolt://neo4j-quanta:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "myneo")

graph = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Read the counts once and reuse them for both figures instead of evaluating
# the property twice.
primitive_counts = graph.database.primitive_counts
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    primitive_counts['NumberOfNodeIdsInUse'],
    primitive_counts['NumberOfRelationshipIdsInUse']))

Connected to graph database with 278,432,359 nodes and 1,817,035,911 relationships!


In [None]:
# Author names must be unique: submit a uniqueness constraint on Author.name
# (the progress message notes that this also yields an index).
query = "CREATE CONSTRAINT ON (a:Author) ASSERT a.name IS UNIQUE;"

# Progress text stays on one line; "Done." is appended after the call returns.
print("Creating uniqueness constraint (and also index) on Author names...", end=" ", flush=True)
# The value returned by .evaluate() is not used.
graph.run(query).evaluate()
print("Done.")

In [5]:
# Add coauthor relationships on existing graph
#
# This cell only builds and prints an apoc.periodic.iterate call; it does not
# execute it, because the graph.run line at the bottom is commented out.
#   * First statement: selects Quanta nodes that have more than one incoming
#     AUTHORED relationship and whose venue is one of the five listed venues.
#   * Second statement: for each selected q, collects its authors, forms pairs
#     and keeps only those with id(first) < id(second) so each unordered pair
#     appears once per q, MERGEs a COAUTHOR relationship between the pair
#     (pattern written without direction) and increments r.strength2,
#     starting at 1 when the property is not yet set.
#   * Options passed to APOC: batchSize 10000, iterateList true, parallel false.
# Note that the counter property here is strength2, not strength.
query = """
CALL apoc.periodic.iterate(
"MATCH (q:Quanta) 
 WHERE 
     size((q)<-[:AUTHORED]-())>1 AND
     q.venue IN ['Cell',
     'Nature',
     'Nature Biotechnology',
     'Proceedings of the National Academy of Sciences of the United States of America',
     'Science']
 RETURN q",
"WITH [(q)<-[:AUTHORED]-(a) | a] as coAuthors
 UNWIND coAuthors as first
 UNWIND coAuthors as second
 WITH first, second
 WHERE id(first) < id(second)
 MERGE (first)-[r:COAUTHOR]-(second)
 SET r.strength2 = CASE WHEN r.strength2 IS NULL THEN 1 ELSE r.strength2 + 1 END",
{batchSize:10000, iterateList:true, parallel:false});
"""
# Show the generated statement for inspection.
print(query)
# Execution is intentionally disabled; uncomment the next line to run the query.
# graph.run(query)


CALL apoc.periodic.iterate(
"MATCH (q:Quanta) 
 WHERE 
     size((q)<-[:AUTHORED]-())>1 AND
     q.venue IN ['Cell',
     'Nature',
     'Nature Biotechnology',
     'Proceedings of the National Academy of Sciences of the United States of America',
     'Science']
 RETURN q",
"WITH [(q)<-[:AUTHORED]-(a) | a] as coAuthors
 UNWIND coAuthors as first
 UNWIND coAuthors as second
 WITH first, second
 WHERE id(first) < id(second)
 MERGE (first)-[r:COAUTHOR]-(second)
 SET r.strength2 = CASE WHEN r.strength2 IS NULL THEN 1 ELSE r.strength2 + 1 END",
{batchSize:10000, iterateList:true, parallel:false});



OLD QUERIES

In [None]:
# Create coauthor relationships - runs on existing graph.
#
# For every Quanta node, each pair of its authors gets a COAUTHOR relationship
# whose `strength` property counts the papers the pair shares.
#
# The two AUTHORED matches produce every author pair in both orders, (a, b) and
# (b, a). Because the MERGE pattern has no direction, both orders resolve to the
# same relationship, so filtering with `a <> b` alone incremented `strength`
# twice per shared paper. Ordering the pair by internal node id keeps exactly
# one row per unordered pair, so each paper is counted once.
import time

print("Adding coauthor relationships...")
query = """
MATCH (q:Quanta)
MATCH (q)<-[:AUTHORED]-(a:Author)
MATCH (q)<-[:AUTHORED]-(b:Author) WHERE id(a) < id(b)
MERGE (a)-[r:COAUTHOR]-(b)
ON CREATE SET r.strength = 1
ON MATCH SET r.strength = r.strength + 1
"""

# Time only the query itself.
query_start_time = time.time()
graph.run(query)#.evaluate()
print("Done in ({:.2f} seconds)...".format((time.time()-query_start_time)))
#print("Done in ({:.2f} minutes)...".format((time.time()-query_start_time)/60))

In [None]:
# Import authors as nodes and create coauthor relationships
#
# Walks local_data_dir for *.txt files and, for each one, asks Neo4j to load the
# file of the same relative path under neo4j_data_dir via apoc.load.json, in
# batches driven by apoc.periodic.iterate. Per loaded record q the inner
# statement:
#   * MERGEs a Quanta node keyed on q.title,
#   * MERGEs an Author node per entry of q.authors (keyed on name) and links it
#     to the Quanta node with COCREATOR,
#   * MERGEs a COAUTHOR relationship to every other Author that has an outgoing
#     relationship to that Quanta node, counting occurrences in r.strength,
#   * finally DETACH DELETEs the Quanta node.
import glob, os, time

local_data_dir = '/tmp/data/mag/'
neo4j_data_dir = '/import/mag/'

start_time = time.time()
for root, dirs, files in os.walk(local_data_dir):
    # os.walk descends into subdirectories; sorting in place makes the
    # traversal order deterministic.
    dirs.sort()
    for file in sorted(files):
        if file.endswith('.txt'):
            # Keep the directory part relative to local_data_dir. Previously only
            # the bare file name was used, so a file found in a subdirectory was
            # requested from the wrong location on the Neo4j side. For files
            # directly inside local_data_dir this is just the file name.
            # Assumes neo4j_data_dir mirrors the layout of local_data_dir.
            rel_path = os.path.relpath(os.path.join(root, file), local_data_dir).replace(os.sep, '/')

            print("Importing {}...".format(rel_path), end=" ", flush=True)
            query = """
            CALL apoc.periodic.iterate(
            "CALL apoc.load.json('file://{}{}') YIELD value AS q RETURN q"
            ,
            "MERGE (p:Quanta {{name:q.title}})
            WITH q, p
            UNWIND q.authors as author 
            MERGE (a:Author {{name:author.name}})
            MERGE (a)-[:COCREATOR]->(p)
            WITH p,a
            MATCH (b:Author)-[]->(p) where a <> b
            MERGE (a)-[r:COAUTHOR]-(b)
            ON CREATE SET r.strength = 1
            ON MATCH SET r.strength = r.strength + 1
            WITH p DETACH DELETE p;"
            , 
            {{batchSize:10000, iterateList:true, parallel:false}});
            """.format(neo4j_data_dir, rel_path)

            # NOTE(review): the result of the call is read with .evaluate() and
            # discarded. apoc.periodic.iterate appears to report failed batches in
            # its result row rather than raising - confirm, and consider checking it.
            query_start_time = time.time()
            graph.run(query).evaluate()
            print("Done in ({:.2f} minutes)...".format((time.time()-query_start_time)/60))

print("Finished all imports in {:.2f} hours.".format((time.time()-start_time)/60/60))