In [7]:
import json
from py2neo import Graph
from py2neo.data import Node, Relationship

# TODO: get authentication working. The server currently runs with
# NEO4J_AUTH=none, so the connection below is opened without credentials.
graph = Graph("bolt://neo4j:7687")

# Uncomment to wipe the database before a fresh import.
#graph.delete_all()

# Read the node and relationship ID counters and report them as a quick
# "are we connected, and how big is the store" sanity check.
n_nodes, n_relationships = (
    graph.database.primitive_counts[counter_name]
    for counter_name in ('NumberOfNodeIdsInUse', 'NumberOfRelationshipIdsInUse')
)
connected_message = "Connected to graph database with {:,} nodes and {:,} relationships!"
print(connected_message.format(n_nodes, n_relationships))

Connected to graph database with 1,000,000 nodes and 443 relationships!


In [3]:
# Allow at most one Quanta node per id. The progress message is printed
# without a newline (and flushed) so that "Done." lands on the same line once
# the statement has returned.
print("Creating uniqueness constraint (and also index) on Quanta id's...", end=" ", flush=True)
query = "CREATE CONSTRAINT ON (n:Quanta) ASSERT n.id IS UNIQUE;"
constraint_cursor = graph.run(query)
constraint_cursor.evaluate()
print("Done.")

Creating uniqueness constraint (and also index) on Quanta id's... Done.


In [5]:
# Create nodes: MERGE one Quanta node per JSON record in every .txt file found
# under the data directory.

import glob, os

# The data directory as seen by this kernel (used to list the files) and as
# seen by the Neo4j server (used in the file:// URLs handed to APOC).
# NOTE(review): presumably two mounts of the same host directory -- confirm.
local_data_dir = '/tmp/data/test'
neo4j_data_dir = '/import/test'

# Set to True to print each generated query instead of sending it to Neo4j.
dry_run = False


def neo4j_file_url(neo4j_dir, rel_dir, filename):
    """Build the file:// URL of a data file as seen by the Neo4j server.

    Args:
        neo4j_dir: Data directory on the server, with or without a trailing '/'.
        rel_dir: Directory of the file relative to the data root, in local OS
            notation ('.' or '' for the root itself).
        filename: Bare file name.

    Returns:
        A URL such as 'file:///import/test/mag_papers_0.txt'.
    """
    # Joining explicitly with '/' keeps the URL valid whether or not the
    # configured directory ends in a slash.
    parts = [neo4j_dir.rstrip('/')]
    if rel_dir not in ('', os.curdir):
        parts.extend(rel_dir.split(os.sep))
    parts.append(filename)
    return "file://" + "/".join(parts)


def build_import_query(file_url):
    """Return the batched Cypher statement that imports one JSON file.

    The first inner statement streams the records of the file; `UNWIND q.id`
    yields no row for a record whose id is null, so such records are skipped.
    The second inner statement MERGEs a Quanta node on the id and sets its
    properties only when the node is newly created, which makes re-importing
    the same file harmless.

    Args:
        file_url: file:// URL of the JSON file as seen by the Neo4j server.
            It is spliced into a single-quoted Cypher string, so it must not
            contain a single quote.

    Returns:
        The Cypher query text.
    """
    return """
    CALL apoc.periodic.iterate(
    "CALL apoc.load.json('{url}') YIELD value AS q UNWIND q.id AS id RETURN q",
    "MERGE (i:Quanta {{id:q.id}}) ON CREATE SET
    i.refs=q.references, i.year=q.year, i.title=q.title, i.fos=q.fos, i.url=q.url,
    i.lang=q.lang, i.keywords=q.keywords, i.n_citation=q.n_citation, i.pdf=q.pdf,
    i.publisher=q.publisher",
    {{batchSize:50000, iterateList:true, parallel:false}});
    """.format(url=file_url)


for root, _, files in os.walk(local_data_dir):
    # os.walk also visits subdirectories; carry the relative path over so the
    # server-side URL points at the matching subdirectory.
    rel_dir = os.path.relpath(root, local_data_dir)
    for file in sorted(files):
        if not file.endswith('.txt'):
            continue

        print("Importing {}...".format(file), end=" ", flush=True)
        query = build_import_query(neo4j_file_url(neo4j_data_dir, rel_dir, file))

        if dry_run:
            print(query)
            print("Skipped (dry run).")
            continue

        # apoc.periodic.iterate reports failed batches in its result row
        # instead of raising, so inspect the row before claiming success.
        rows = graph.run(query).data()
        failed = rows[0].get('failedOperations', 0) if rows else 0
        if failed:
            raise RuntimeError(
                "{:,} operations failed while importing {}: {}".format(
                    failed, file, rows[0].get('errorMessages')))
        print("Done.")

Importing mag_papers_0.txt... Done.
Importing mag_papers_1.txt... Done.
Importing mag_papers_10.txt... Done.
Importing mag_papers_11.txt... Done.
Importing mag_papers_12.txt... Done.
Importing mag_papers_13.txt... Done.
Importing mag_papers_14.txt... Done.
Importing mag_papers_15.txt... Done.
Importing mag_papers_16.txt... Done.
Importing mag_papers_17.txt... Done.
Importing mag_papers_18.txt... Done.
Importing mag_papers_19.txt... Done.
Importing mag_papers_2.txt... Done.
Importing mag_papers_3.txt... Done.
Importing mag_papers_4.txt... Done.
Importing mag_papers_5.txt... Done.
Importing mag_papers_6.txt... Done.
Importing mag_papers_7.txt... Done.
Importing mag_papers_8.txt... Done.
Importing mag_papers_9.txt... Done.


In [6]:
# Add all citations as relationships between Quanta

# # Simple but slow
# query = """
# MATCH (a:Quanta), (b:Quanta)
# WHERE a.id IN b.refs
# CREATE (b)-[:CITES]->(a)
# """

# # Faster but more complex (not benchmarked though)
# query = """
# CALL apoc.periodic.iterate(
#    "MATCH (a:Quanta), (a2:Quanta) WHERE a.id IN a2.refs
#     WITH a, COLLECT(a2) as b
#     RETURN a, b",
#    "UNWIND b AS a2
#     CREATE (a2)-[:CITES]->(a)",
#     {batchSize:5000, parallel:true,iterateList:true}
# """

# # Faster but simple (also not benchmarked)
# query = """
# CALL apoc.periodic.iterate(
#    "MATCH (a:Quanta), (b:Quanta) WHERE ID(a) < ID(b) AND a.id IN b.refs RETURN a, b",
#    "CREATE (b)-[:CITES]->(a)",
#     {batchSize:10000, parallel:true,iterateList:true});
# """

# # Take advantage of indexing performed by constraints
# print("Adding citations...", end=" ", flush=True)
# query = """
# MATCH (b:Quanta)
# UNWIND b.refs AS ref
# MATCH (a:Quanta)
# WHERE a.id = ref
# CREATE (b)-[:CITES]->(a);
# """

# # Take advantage of indexing and also run in batches
# query = """
# CALL apoc.periodic.iterate(
# "MATCH (b:Quanta) 
#  UNWIND b.refs AS ref 
#  MATCH (a:Quanta) 
#  WHERE a.id = ref
#  RETURN a, b",
# "MERGE (b)-[:CITES]->(a)",
#  {batchSize:20000, parallel:false,iterateList:true});
# """

# Fastest variant tried: keep the driving statement cheap (it only streams
# (b, ref) pairs) and do the lookup of the cited node by id inside the batched
# statement. Batches are committed one after another (parallel:false).
# MERGE rather than CREATE keeps the cell safe to re-run: an existing CITES
# relationship between the same two nodes is reused instead of duplicated.
print("Adding citation relationships...", end=" ", flush=True)
query = """
CALL apoc.periodic.iterate(
"MATCH (b:Quanta) UNWIND b.refs AS ref RETURN b, ref",
"MATCH (a:Quanta {id: ref}) MERGE (b)-[:CITES]->(a)",
{batchSize:10000, parallel:false, retries:3})
"""
# apoc.periodic.iterate reports failed batches in its result row instead of
# raising, so inspect the row before claiming success.
rows = graph.run(query).data()
failed = rows[0].get('failedOperations', 0) if rows else 0
if failed:
    raise RuntimeError(
        "{:,} operations failed while adding citation relationships: {}".format(
            failed, rows[0].get('errorMessages')))
print("Done.")


Adding citation relationships... Done.


In [9]:
# Report the node and relationship ID counters again, now that the import and
# citation cells have run.
created_message = "Created graph database with {:,} nodes and {:,} relationships!"
n_nodes = graph.database.primitive_counts['NumberOfNodeIdsInUse']
n_relationships = graph.database.primitive_counts['NumberOfRelationshipIdsInUse']
print(created_message.format(n_nodes, n_relationships))

Created graph database with 10,000,000 nodes and 1,940,000 relationships!
