In [None]:
from py2neo import Graph, Node, Relationship

import os

# Connection settings are read from the environment so that credentials do not
# have to live in the notebook. The defaults reproduce the previously
# hard-coded local development values, so an unconfigured machine behaves
# exactly as before.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "password")

graph = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Report how many node / relationship ids the database currently has in use,
# as a quick sanity check that the connection works.
n_nodes = graph.database.primitive_counts['NumberOfNodeIdsInUse']
n_relationships = graph.database.primitive_counts['NumberOfRelationshipIdsInUse']
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    n_nodes, n_relationships))

In [None]:
# Schema setup: uniqueness constraints (which also create an index) followed by
# plain property indexes on Quanta. Each entry pairs the progress message that
# is printed with the Cypher statement that is run, so a message can no longer
# drift out of sync with its statement.
schema_statements = [
    # Constrain one id per Quanta
    ("Creating uniqueness constraint (and also index) on Quanta id's...",
     """CREATE CONSTRAINT ON (n:Quanta) ASSERT n.id IS UNIQUE;"""),
    # Constrain one name per author
    ("Creating uniqueness constraint (and also index) on Author names...",
     """CREATE CONSTRAINT ON (a:Author) ASSERT a.name IS UNIQUE;"""),
    # Constrain one name per organization
    ("Creating uniqueness constraint (and also index) on Organization names...",
     """CREATE CONSTRAINT ON (o:Organization) ASSERT o.name IS UNIQUE;"""),
    # Add index for year of publication
    ("Creating index for publication year...",
     """CREATE INDEX ON :Quanta(year);"""),
    # Add index for language
    ("Creating index for language...",
     """CREATE INDEX ON :Quanta(lang);"""),
    # Add index for field of study
    ("Creating index for fos...",
     """CREATE INDEX ON :Quanta(fos);"""),
    # Add index for keywords
    ("Creating index for keywords...",
     """CREATE INDEX ON :Quanta(keywords);"""),
    # Add index for title
    ("Creating index for title...",
     """CREATE INDEX ON :Quanta(title);"""),
    # Add index for venue
    ("Creating index for venue...",
     """CREATE INDEX ON :Quanta(venue);"""),
    # Add index for document type
    ("Creating index for doctype...",
     """CREATE INDEX ON :Quanta(doctype);"""),
]

# Run the statements one at a time, in the order listed above.
for message, query in schema_statements:
    print(message, end=" ", flush=True)
    graph.run(query).evaluate()
    print("Done.")

In [None]:
# Import venue nodes
# `time` is imported here because this cell calls time.time(); the notebook's
# only other import of it sits in a later cell, so a fresh-kernel run would
# otherwise fail with NameError.
import time

# NOTE(review): absolute, machine-specific path -- adjust for your environment.
file_dir = 'file:/Users/timholdsworth/code/scaling-science/notebooks/data/mag_venues.txt.first100.txt'
print("Importing {}...".format(file_dir), end=" ", flush=True)

# The path is interpolated from file_dir so the progress message and the query
# always refer to the same file. Literal Cypher braces are doubled for
# str.format. The first APOC statement streams JSON records as `q`; the second
# merges one Venue node per record, in batches of 10000.
query = """
CALL apoc.periodic.iterate(
"CALL apoc.load.json('{}') YIELD value AS q RETURN q",
"UNWIND q.JournalId AS venue_id 
MERGE (v:Venue {{id:venue_id, name:q.DisplayName}})", 
{{batchSize:10000, iterateList:true, parallel:false}})
""".format(file_dir)
query_start_time = time.time()
graph.run(query).evaluate()
print("Done in ({:.2f} minutes)...".format((time.time()-query_start_time)/60))



In [None]:
# Import author nodes
# `time` is imported here because this cell calls time.time(); the notebook's
# only other import of it sits in a later cell, so a fresh-kernel run would
# otherwise fail with NameError.
import time

# Points at the authors sample file that the query loads. Previously this
# variable held the venues path, so the progress message named the wrong file.
# NOTE(review): absolute, machine-specific path -- adjust for your environment.
file_dir = 'file:/Users/timholdsworth/code/scaling-science/notebooks/data/mag_authors_5.txt.first100.txt'
print("Importing {}...".format(file_dir), end=" ", flush=True)

# The path is interpolated from file_dir so the message and the query cannot
# diverge. Literal Cypher braces are doubled for str.format.
# NOTE(review): this MERGE matches on both id and name, while the notebook also
# declares Author.name unique; two records sharing a name but not an id would
# presumably hit that constraint -- confirm the intended merge key.
query = """
CALL apoc.periodic.iterate(
"CALL apoc.load.json('{}') YIELD value AS q RETURN q",
"UNWIND q.id AS author_id 
MERGE (a:Author {{id:author_id, name:q.name}})", 
{{batchSize:10000, iterateList:true, parallel:false}})
""".format(file_dir)

query_start_time = time.time()
graph.run(query).evaluate()
print("Done in ({:.2f} minutes)...".format((time.time()-query_start_time)/60))

In [None]:
# Bulk import: one batched APOC query is run per sample file found under
# local_data_dir. For every paper record `q` the query
#   * merges an Author node (keyed by name) for each entry of q.authors,
#   * merges the Quanta node (keyed by id), setting its properties on creation,
#   * merges an AUTHORED relationship whose is_first_author / is_last_author
#     flags say whether the author equals the head / last entry of q.authors,
#   * merges an Organization node plus an AFFILIATED_WITH relationship when
#     author.org is not null.

#local_data_dir = '/tmp/data/mag/'
local_data_dir = '/Users/timholdsworth/code/scaling-science/notebooks/data'
neo4j_data_dir = '/import/mag/'

import glob, os, time

# Query template; the two `{}` slots receive neo4j_data_dir and the file name.
# Doubled braces are literal Cypher braces escaped for str.format.
import_query_template = """
            CALL apoc.periodic.iterate(
            "CALL apoc.load.json('file://{}{}') YIELD value AS q RETURN q"
            ,
            "UNWIND q.id AS id 
            UNWIND q.authors as author
            MERGE (a:Author {{name:author.name}})
            MERGE (i:Quanta {{id:q.id}}) ON CREATE SET 
                i.refs=q.references, 
                i.year=q.year, 
                i.title=q.title, 
                i.fos=q.fos, 
                i.url=q.url, 
                i.lang=q.lang, 
                i.keywords=q.keywords, 
                i.n_citation=q.n_citation, 
                i.pdf=q.pdf, 
                i.publisher=q.publisher,
                i.venue=q.venue
            WITH 
                i,
                a,
                author,
                head(q.authors)=author AS first_author, 
                last(q.authors)=author AS last_author
            MERGE (a)-[:AUTHORED {{is_first_author: first_author, is_last_author: last_author}}]->(i)
            WITH a, author
            WHERE author.org is not null
            MERGE (o:Organization {{name:author.org}}) 
            MERGE (a)-[:AFFILIATED_WITH]->(o)"
            , 
            {{batchSize:10000, iterateList:true, parallel:false}});
            """

start_time = time.time()

# File names are discovered on the local filesystem, but the load URL is built
# from neo4j_data_dir -- presumably the same files are visible to the Neo4j
# server under that directory (TODO confirm).
for _, _, files in os.walk(local_data_dir):
    for file in sorted(files):
        # Only the sample files (names ending in '100.txt') are imported.
        if not file.endswith('100.txt'):
            continue

        print("Importing {}...".format(file), end=" ", flush=True)
        query = import_query_template.format(neo4j_data_dir, file)

        query_start_time = time.time()
        graph.run(query).evaluate()
        elapsed_minutes = (time.time() - query_start_time) / 60
        print("Done in ({:.2f} minutes)...".format(elapsed_minutes))

total_hours = (time.time() - start_time) / 60 / 60
print("Finished all imports in {:.2f} hours.".format(total_hours))

In [None]:
# Merge Author nodes (keyed by name) for every entry of q.authors in one file.
# NOTE(review): `file` and `neo4j_data_dir` are not defined in this cell; they
# are whatever earlier cells left in the kernel (`file` being the last value
# of the import loop variable) -- confirm which file is intended here.

# Query template; the two `{}` slots receive neo4j_data_dir and the file name.
# Doubled braces are literal Cypher braces escaped for str.format.
author_query_template = """
CALL apoc.periodic.iterate(
"CALL apoc.load.json('file://{}{}') YIELD value AS q RETURN q",
"UNWIND q.id AS id 
UNWIND q.authors as author
MERGE (a:Author {{name:author.name}})"

, 
{{batchSize:10000, iterateList:true, parallel:false}});
"""

print("Importing {}...".format(file), end=" ", flush=True)
query = author_query_template.format(neo4j_data_dir, file)

# Time the query and report the duration in minutes.
query_start_time = time.time()
graph.run(query).evaluate()
elapsed_minutes = (time.time() - query_start_time) / 60
print("Done in ({:.2f} minutes)...".format(elapsed_minutes))

In [None]:
# Import paper nodes

In [None]:
# Import author nodes

In [None]:
# Import paper nodes

In [None]:
# Import venue edges

In [None]:
# Import paper edges

In [None]:
# Add coauthor relationships 