In [None]:
import json
import os
import time

from py2neo import Graph, Node, Relationship

# Connect to the Neo4j instance over Bolt.
# The connection settings can be overridden with the NEO4J_URI, NEO4J_USER and
# NEO4J_PASSWORD environment variables so that real credentials do not have to
# be stored in the notebook; the fallbacks are the notebook's original values.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://neo4j-coauthor:7687")
neo4j_auth = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "myneo"),
)
graph = Graph(neo4j_uri, auth=neo4j_auth)

# Print the store's "ids in use" counters as a quick check that the connection
# works. The counters are fetched once and reused for both numbers.
primitive_counts = graph.database.primitive_counts
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    primitive_counts['NumberOfNodeIdsInUse'],
    primitive_counts['NumberOfRelationshipIdsInUse']))

In [None]:
# Allow at most one :Author node per name. As the progress message notes, the
# uniqueness constraint also provides an index on Author.name, which is the
# property the import's MERGE on :Author matches on.
query = "CREATE CONSTRAINT ON (a:Author) ASSERT a.name IS UNIQUE;"
print(
    "Creating uniqueness constraint (and also index) on Author names...",
    end=" ",
    flush=True,
)
graph.run(query).evaluate()
print("Done.")

In [None]:

# *****************************************************
# ********* TODO: THIS NEEDS TO BE CHANGED !! *********
# *****************************************************
# NOTE(review): the banner above presumably refers to the two directory
# settings below, which are specific to one environment - confirm before running.

# Bulk import: every *.txt file found under `local_data_dir` is loaded into
# Neo4j through APOC. The files are only *listed* via `local_data_dir`; Neo4j
# itself reads them from `neo4j_data_dir`, so both settings are assumed to
# point at the same directory tree as seen from the notebook and from the
# database server respectively.
local_data_dir = '/tmp/data/mag/'
neo4j_data_dir = '/import/mag/'

import glob, os, time

start_time = time.time()
imported_file_count = 0

for root, dirs, files in os.walk(local_data_dir):
    dirs.sort()  # visit sub-directories in a deterministic order
    for file in sorted(files):
        if file.endswith('.txt'):

            # os.walk also descends into sub-directories, so the URL handed to
            # Neo4j must carry the path relative to the data root, not just the
            # bare file name. For a flat directory this equals the file name.
            rel_path = os.path.relpath(os.path.join(root, file), local_data_dir)
            rel_path = rel_path.replace(os.sep, '/')

            # Iterative query (more efficient).
            # apoc.periodic.iterate runs the first statement to stream every
            # JSON object of the file as `q`, then applies the second statement
            # to those objects in batches of 10000 (not in parallel). For each
            # object and each of its authors the second statement:
            #   * MERGEs an :Author by name and a :Quanta by id (the remaining
            #     paper properties are only set when the :Quanta is created),
            #   * MERGEs an AUTHORED relationship flagged with whether this
            #     author is the first / last entry of q.authors,
            #   * when the author has an `org`, MERGEs the :Organization and an
            #     AFFILIATED_WITH relationship.
            # NOTE(review): `UNWIND q.id AS id` binds `id`, which is not used
            # afterwards - it looks redundant; confirm before removing.
            print("Importing {}...".format(rel_path), end=" ", flush=True)
            query = """
            CALL apoc.periodic.iterate(
            "CALL apoc.load.json('file://{}{}') YIELD value AS q RETURN q"
            ,
            "UNWIND q.id AS id 
            UNWIND q.authors as author
            MERGE (a:Author {{name:author.name}})
            MERGE (i:Quanta {{id:q.id}}) ON CREATE SET 
                i.refs=q.references, 
                i.year=q.year, 
                i.title=q.title, 
                i.fos=q.fos, 
                i.url=q.url, 
                i.lang=q.lang, 
                i.keywords=q.keywords, 
                i.n_citation=q.n_citation, 
                i.pdf=q.pdf, 
                i.publisher=q.publisher,
                i.venue=q.venue
            WITH 
                i,
                a,
                author,
                head(q.authors)=author AS first_author, 
                last(q.authors)=author AS last_author
            MERGE (a)-[:AUTHORED {{is_first_author: first_author, is_last_author: last_author}}]->(i)
            WITH a, author
            WHERE author.org is not null
            MERGE (o:Organization {{name:author.org}}) 
            MERGE (a)-[:AFFILIATED_WITH]->(o)"
            , 
            {{batchSize:10000, iterateList:true, parallel:false}});
            """.format(neo4j_data_dir, rel_path)

            query_start_time = time.time()
            # apoc.periodic.iterate reports batch failures in its result row
            # instead of raising, so read the row and surface any failures
            # rather than discarding it.
            result_rows = graph.run(query).data()
            stats = result_rows[0] if result_rows else {}
            failed_batches = stats.get('failedBatches') or 0
            if failed_batches:
                print("WARNING: {} of {} batches failed: {}".format(
                    failed_batches, stats.get('batches'), stats.get('errorMessages')),
                    end=" ", flush=True)
            imported_file_count += 1
            print("Done in ({:.2f} minutes)...".format((time.time()-query_start_time)/60))

if imported_file_count == 0:
    # Make an empty or missing data directory visible instead of silently
    # reporting a finished import.
    print("No .txt files found under {}.".format(local_data_dir))

print("Finished all imports in {:.2f} hours.".format((time.time()-start_time)/60/60))