In [3]:
import json
from py2neo import Graph
from py2neo.data import Node, Relationship

# Open a Bolt connection to the Neo4j server and clear the database so the
# cells below start from an empty graph.
# WARNING: delete_all() is destructive; it runs every time this cell is executed.
bolt_uri = "bolt://neo4j:7687"
graph = Graph(bolt_uri)
graph.delete_all()

In [2]:
# Create nodes
#
# Every '.txt' file found under `local_data_dir` is loaded (as JSON, through
# APOC) into :Quanta nodes. Neo4j reads the files from `neo4j_data_dir`.
# NOTE(review): this assumes `neo4j_data_dir` mirrors `local_data_dir`
# (e.g. the same volume mounted in both containers) - confirm.

local_data_dir = '/tmp/data/test/'
neo4j_data_dir = '/import/test/'

import glob, os


def relative_import_path(local_root, directory, filename):
    """Return the path of `filename` relative to `local_root`, with '/' separators.

    `directory` is the folder (as yielded by os.walk) that contains `filename`.
    For a file sitting directly in `local_root` the result is just the file
    name; for a nested file it keeps the sub-directory part, so the URL handed
    to Neo4j points at the right file.
    """
    full_path = os.path.join(directory, filename)
    return os.path.relpath(full_path, local_root).replace(os.sep, '/')


def build_import_query(neo4j_dir, rel_path, batch_size=10000):
    """Build the Cypher statement that imports one JSON file as :Quanta nodes.

    The statement uses apoc.periodic.iterate so the MERGE work is committed in
    batches of `batch_size` rows instead of one large transaction.

    neo4j_dir  -- directory prefix as seen by the Neo4j server (ends with '/').
    rel_path   -- file path relative to that directory.
    batch_size -- rows per committed batch (default 10000).

    NOTE(review): `UNWIND q.id AS id` presumably just skips records that have
    no id (UNWIND of null yields no rows); `id` itself is not used - confirm.
    """
    return """
            CALL apoc.periodic.iterate(
            "CALL apoc.load.json('file://{}{}')
                YIELD value AS q UNWIND q.id AS id RETURN q",
            "MERGE (i:Quanta {{id:q.id}})
                ON CREATE SET i.refs=q.references,
                i.title=q.title, i.fos=q.fos, i.url=q.url, i.lang=q.lang",
            {{batchSize:{}, iterateList:true, parallel:false}});
            """.format(neo4j_dir, rel_path, batch_size)


for root, _, files in os.walk(local_data_dir):
    for file in files:
        if not file.endswith('.txt'):
            continue

        # os.walk descends into sub-directories, so the import URL must keep
        # the sub-directory part rather than only the bare file name.
        rel_path = relative_import_path(local_data_dir, root, file)
        query = build_import_query(neo4j_data_dir, rel_path)

        print("Importing {}...".format(rel_path), end=" ", flush=True)
        print(query)

        # apoc.periodic.iterate does not raise when a batch fails; failures are
        # only reported in the result row, so inspect it before claiming success.
        summary = graph.run(query).data()
        failed = sum((row.get("failedOperations") or 0) for row in summary)
        if failed:
            raise RuntimeError(
                "Import of {} had {} failed operations: {}".format(
                    rel_path, failed, [row.get("errorMessages") for row in summary]
                )
            )
        print("Done.")

Importing mag_10entries.txt... 
            CALL apoc.periodic.iterate(
            "CALL apoc.load.json('file:///import/test/mag_10entries.txt') 
                YIELD value AS q UNWIND q.id AS id RETURN q", 
            "MERGE (i:Quanta {id:q.id})
                ON CREATE SET i.abstract=q.abstract, i.refs=q.references, 
                i.title=q.title, i.fos=q.fos, i.url=q.url, i.lang=q.lang", 
            {batchSize:5000, iterateList:true, parallel:false});
            
Done.
Importing aminer_10entries.txt... 
            CALL apoc.periodic.iterate(
            "CALL apoc.load.json('file:///import/test/aminer_10entries.txt') 
                YIELD value AS q UNWIND q.id AS id RETURN q", 
            "MERGE (i:Quanta {id:q.id})
                ON CREATE SET i.abstract=q.abstract, i.refs=q.references, 
                i.title=q.title, i.fos=q.fos, i.url=q.url, i.lang=q.lang", 
            {batchSize:5000, iterateList:true, parallel:false});
            
Done.


In [89]:
# Enforce that at most one :Quanta node exists per `id` value.
# NOTE(review): the import cell MERGEs on :Quanta(id); creating this constraint
# before importing would presumably let that MERGE use the backing index -
# consider running this cell first.
query = "CREATE CONSTRAINT ON (n:Quanta) ASSERT n.id IS UNIQUE;"
graph.run(query).evaluate()

In [78]:
# Add all citations as relationships between Quanta
#
# A paper `b` cites paper `a` when `a.id` appears in `b.refs`; each such pair
# becomes a (b)-[:CITES]->(a) relationship.
#
# All variants below use MERGE rather than CREATE so that re-running this cell
# does not add duplicate :CITES relationships. The batched variants run with
# parallel:false: creating a relationship locks both end nodes, and batches
# that touch the same nodes in parallel can fail on lock conflicts, which
# apoc.periodic.iterate reports as failed batches instead of raising.
CITATION_QUERIES = {
    # Simple but slow: a single transaction for the whole graph.
    "simple": """
MATCH (a:Quanta), (b:Quanta)
WHERE a.id IN b.refs
MERGE (b)-[:CITES]->(a)
""",

    # One row per cited paper, carrying the list of papers that cite it
    # (not benchmarked).
    "grouped": """
CALL apoc.periodic.iterate(
   "MATCH (a:Quanta), (a2:Quanta) WHERE a.id IN a2.refs
    WITH a, COLLECT(a2) as b
    RETURN a, b",
   "UNWIND b AS a2
    MERGE (a2)-[:CITES]->(a)",
    {batchSize:5000, parallel:false, iterateList:true})
""",

    # One row per (cited, citing) pair (also not benchmarked).
    # No ID(a) < ID(b) filter here: citations are directed, so ordering the
    # pair by internal node id would silently drop every citation whose cited
    # node happens to have been created after the citing node.
    "batched": """
CALL apoc.periodic.iterate(
   "MATCH (a:Quanta), (b:Quanta) WHERE a.id IN b.refs RETURN a, b",
   "MERGE (b)-[:CITES]->(a)",
    {batchSize:10000, parallel:false, iterateList:true})
""",
}

query = CITATION_QUERIES["batched"]

# The batched variants return one summary row; the "simple" variant returns no
# rows, in which case there is nothing to check.
summary = graph.run(query).data()
failed = sum((row.get("failedOperations") or 0) for row in summary)
if failed:
    raise RuntimeError(
        "Creating citations had {} failed operations: {}".format(
            failed, [row.get("errorMessages") for row in summary]
        )
    )
summary
