In [None]:
import json
import os
import time

from py2neo import Graph, Node, Relationship

In [None]:
# Connect to the Neo4j instance that holds the co-author graph.
# The connection settings can be overridden with the NEO4J_URI, NEO4J_USER and
# NEO4J_PASSWORD environment variables so that real credentials do not have to
# be stored in the notebook; the fallbacks are the original development values.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://neo4j-coauthor:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "myneo")

graph = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Read the counts mapping once instead of accessing the property twice.
primitive_counts = graph.database.primitive_counts
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    primitive_counts['NumberOfNodeIdsInUse'],
    primitive_counts['NumberOfRelationshipIdsInUse']))

In [None]:
# Require Author names to be unique. As the progress message says, creating
# the constraint also gives Author.name an index.
query = "CREATE CONSTRAINT ON (a:Author) ASSERT a.name IS UNIQUE;"

print(
    "Creating uniqueness constraint (and also index) on Author names...",
    end=" ",
    flush=True,
)
constraint_cursor = graph.run(query)
constraint_cursor.evaluate()  # consume the result before reporting completion
print("Done.")

In [None]:
# Import authors as nodes
#
# For every .txt file found under local_data_dir, run one
# apoc.periodic.iterate call that loads the file's JSON records and, per
# record: merges a temporary Quanta node keyed by title, links every listed
# author to it, merges a COAUTHOR relationship (strength 1 on create,
# incremented on match) between authors linked to the same Quanta, and finally
# deletes the Quanta node.
import glob, os, time

local_data_dir = '/tmp/data/mag/'   # where this notebook sees the input files
neo4j_data_dir = '/import/mag/'     # directory prefix used in the URL handed to apoc.load.json

start_time = time.time()
failed_files = []
for root, dirs, files in os.walk(local_data_dir):
    dirs.sort()  # os.walk honours in-place edits of dirs; keeps traversal order deterministic
    for file in sorted(files):
        if file.endswith('.txt'):
            # os.walk descends into subdirectories, so pass Neo4j the path
            # relative to the data root rather than the bare file name
            # (identical to the file name for files directly in the root).
            rel_path = os.path.relpath(os.path.join(root, file), local_data_dir)
            rel_path = rel_path.replace(os.sep, '/')

            print("Importing {}...".format(rel_path), end=" ", flush=True)
            # Doubled braces are literal Cypher braces escaped for str.format.
            query = """
            CALL apoc.periodic.iterate(
            "CALL apoc.load.json('file://{}{}') YIELD value AS q RETURN q"
            ,
            "MERGE (p:Quanta {{name:q.title}})
            WITH q, p
            UNWIND q.authors as author 
            MERGE (a:Author {{name:author.name}})
            MERGE (a)-[:COCREATOR]->(p)
            WITH p,a
            MATCH (b:Author) where (b)-[]->(p) and a <> b
            MERGE (a)-[r:COAUTHOR]-(b)
            ON CREATE SET r.strength = 1
            ON MATCH SET r.strength = r.strength + 1
            WITH p DETACH DELETE p;"
            , 
            {{batchSize:10000, iterateList:true, parallel:false}});
            """.format(neo4j_data_dir, rel_path)

            query_start_time = time.time()
            # apoc.periodic.iterate reports failed batches in its result row
            # instead of raising, so inspect the row rather than assuming success.
            rows = graph.run(query).data()
            summary = rows[0] if rows else {}
            minutes = (time.time()-query_start_time)/60
            failed_batches = summary.get('failedBatches') or 0
            if failed_batches:
                failed_files.append(rel_path)
                print("FAILED: {} of {} batches failed after {:.2f} minutes: {}".format(
                    failed_batches, summary.get('batches'), minutes,
                    summary.get('errorMessages')))
            else:
                print("Done in ({:.2f} minutes)...".format(minutes))

print("Finished all imports in {:.2f} hours.".format((time.time()-start_time)/60/60))
if failed_files:
    print("Files with failed batches: {}".format(", ".join(failed_files)))

In [None]:
# Old query: the newer per-file import query is the same statement without the
# `MATCH (p)<-[]-(a)` line that this version still contains.
# Creates a co-author graph by creating a Quanta node per record, linking each
# listed author to it, merging COAUTHOR relationships between all authors that
# have a relationship to that Quanta, and then deleting the Quanta node.
# NOTE(review): this cell is still live code and runs on "Run All". It loads
# from an absolute path (what looks like a developer's home directory) and uses
# the relationship type CO_CREATOR where the per-file import query uses
# COCREATOR -- confirm whether this cell should be kept.
query = """
CALL apoc.load.json('file:/Users/timholdsworth/Documents/simple-fake-data.txt') 
YIELD value AS q
MERGE (p:Quanta {name:q.title})
WITH q, p
UNWIND q.authors as author 
MERGE (a:Author {name:author.name})
MERGE (a)-[:CO_CREATOR]->(p)
WITH p,a
MATCH (p)<-[]-(a)
MATCH (b:Author) where (b)-[]->(p) and a <> b
MERGE (a)-[r:COAUTHOR]-(b)
ON CREATE SET r.strength = 1
ON MATCH SET r.strength = r.strength + 1
WITH p DETACH DELETE p
"""

graph.run(query)

In [None]:
# Divide all relationship scores by two to account for double counting in the
# first query.
#
# The pattern must be directed. An undirected pattern (a)-[r]-(b) matches every
# relationship twice, once per orientation, so SET ran twice per relationship
# and, because `/` on integers is integer division, a strength of 2 became
# 2 -> 1 -> 0. Every stored relationship has exactly one direction, so the
# directed pattern visits each COAUTHOR relationship exactly once.
#
# NOTE: this cell is not idempotent -- every execution halves the strengths
# again, so run it only once after the import. Odd strengths are truncated.
query = """
MATCH (:Author)-[r:COAUTHOR]->(:Author)
SET r.strength = r.strength / 2
"""

graph.run(query)