In [None]:
import json
import os
import time

from py2neo import Graph, Node, Relationship

# Connect to the co-author graph database.
# The connection settings can be overridden through environment variables so that
# credentials do not have to be edited into the notebook; the fallbacks are the
# values this notebook used before, so an unconfigured environment behaves the same.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://neo4j-coauthor:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "myneo")

graph = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Look the store counters up once and reuse them for both numbers.
primitive_counts = graph.database.primitive_counts
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    primitive_counts['NumberOfNodeIdsInUse'],
    primitive_counts['NumberOfRelationshipIdsInUse']))

In [None]:
# Enforce a single :Author node per distinct name.
# The progress message is flushed without a newline so that "Done." lands on the
# same output line once the statement has returned.
print(
    "Creating uniqueness constraint (and also index) on Author names...",
    end=" ",
    flush=True,
)
query = "CREATE CONSTRAINT ON (a:Author) ASSERT a.name IS UNIQUE;"
constraint_cursor = graph.run(query)
constraint_cursor.evaluate()
print("Done.")

In [None]:
# Import authors as nodes.
#
# For every '.txt' file found under local_data_dir, ask the Neo4j server to load the
# same file from neo4j_data_dir with apoc.load.json and MERGE one :Author node per
# entry of each record's `authors` list (batched through apoc.periodic.iterate).
# NOTE(review): assumes neo4j_data_dir is the server-side view of local_data_dir
# (same files, same layout) -- confirm against the container volume mounts.
local_data_dir = '/tmp/data/mag/'
neo4j_data_dir = '/import/mag/'

import glob, os, time

start_time = time.time()

for root, _, files in os.walk(local_data_dir):
    for file in sorted(files):
        if not file.endswith('.txt'):
            continue

        # os.walk also descends into sub-directories; keep that part of the path so
        # the server is not pointed at a top-level file that does not exist.
        rel_path = os.path.relpath(os.path.join(root, file), local_data_dir).replace(os.sep, '/')

        print("Importing {}...".format(rel_path), end=" ", flush=True)
        query = """
        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('file://{}{}') YIELD value AS q RETURN q"
        ,
        "UNWIND q.id AS id
        FOREACH (author_one IN q.authors |
            MERGE (a:Author {{name:author_one.name}}));"
        , {{batchSize:10000, iterateList:true, parallel:false}});
        """.format(neo4j_data_dir, rel_path)

        query_start_time = time.time()
        # apoc.periodic.iterate does not raise when a batch fails; it reports failures
        # in its result row, so read the whole row instead of only the first column.
        rows = graph.run(query).data()
        summary = rows[0] if rows else {}
        print("Done in ({:.2f} minutes)...".format((time.time()-query_start_time)/60))

        # Best-effort import: surface failures but keep going with the next file.
        if summary.get('failedOperations'):
            print("WARNING: {} operations failed while importing {}: {}".format(
                summary['failedOperations'], rel_path, summary.get('errorMessages')))

print("Finished all imports in {:.2f} hours.".format((time.time()-start_time)/60/60))

In [None]:

# Import co-authorships as COAUTHOR relationships between :Author nodes.
#
# For every '.txt' file found under local_data_dir, the Neo4j server loads the same
# file from neo4j_data_dir and, for each record, links every unordered pair of
# distinct author names with one undirected-matched COAUTHOR relationship whose
# `count` property is set to 1 on creation and incremented on every later match.
#
# NOTE(review): this replaces a draft query that could not run (a CASE expression
# wrapping a MERGE clause, and a relationship pattern without a type). The meaning
# of `count` (one increment per record id and author pair) is an interpretation of
# that draft -- confirm it is the intended metric.
# NOTE(review): the increment is not idempotent; running this cell twice over the
# same files doubles every count.
# NOTE(review): assumes neo4j_data_dir is the server-side view of local_data_dir
# (same files, same layout) -- confirm against the container volume mounts.

local_data_dir = '/tmp/data/mag/'
neo4j_data_dir = '/import/mag/'

import glob, os, time

start_time = time.time()

for root, _, files in os.walk(local_data_dir):
    for file in sorted(files):
        if not file.endswith('.txt'):
            continue

        # os.walk also descends into sub-directories; keep that part of the path so
        # the server is not pointed at a top-level file that does not exist.
        rel_path = os.path.relpath(os.path.join(root, file), local_data_dir).replace(os.sep, '/')

        print("Importing {}...".format(rel_path), end=" ", flush=True)
        # Action statement, step by step:
        #   * collect the non-null author names of the record (MERGE rejects null keys);
        #   * expand to all ordered pairs, then keep name_one < name_two so each
        #     unordered pair is seen once and an author is never paired with itself;
        #   * DISTINCT on (id, pair) so a name listed twice in one record counts once;
        #   * MERGE both authors and the relationship, counting via ON CREATE / ON MATCH
        #     (a `count` value inside the MERGE pattern would only ever match count = 1).
        query = """
        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('file://{}{}') YIELD value AS q RETURN q"
        ,
        "UNWIND q.id AS id
        WITH id, [author IN q.authors WHERE author.name IS NOT NULL | author.name] AS names
        UNWIND names AS name_one
        UNWIND names AS name_two
        WITH DISTINCT id, name_one, name_two
        WHERE name_one < name_two
        MERGE (a:Author {{name:name_one}})
        MERGE (b:Author {{name:name_two}})
        MERGE (a)-[r:COAUTHOR]-(b)
        ON CREATE SET r.count = 1
        ON MATCH SET r.count = r.count + 1"
        , {{batchSize:10000, iterateList:true, parallel:false}});
        """.format(neo4j_data_dir, rel_path)

        query_start_time = time.time()
        # apoc.periodic.iterate does not raise when a batch fails; it reports failures
        # in its result row, so read the whole row instead of only the first column.
        rows = graph.run(query).data()
        summary = rows[0] if rows else {}
        print("Done in ({:.2f} minutes)...".format((time.time()-query_start_time)/60))

        # Best-effort import: surface failures but keep going with the next file.
        if summary.get('failedOperations'):
            print("WARNING: {} operations failed while importing {}: {}".format(
                summary['failedOperations'], rel_path, summary.get('errorMessages')))

print("Finished all imports in {:.2f} hours.".format((time.time()-start_time)/60/60))





