In [None]:
import os
import time

from py2neo import Graph, Node, Relationship

# Connect to the local Neo4j instance over Bolt. The password is taken from the
# NEO4J_PASSWORD environment variable when it is set, so real credentials do not
# have to be stored in the notebook; the previous literal is kept as the fallback.
neo4j_password = os.environ.get('NEO4J_PASSWORD', 'password')
graph = Graph("bolt://localhost:7687", auth=('neo4j', neo4j_password))

# Print the store's node / relationship id counts as a quick connection check.
n_nodes = graph.database.primitive_counts['NumberOfNodeIdsInUse']
n_relationships = graph.database.primitive_counts['NumberOfRelationshipIdsInUse']
print("Connected to graph database with {:,} nodes and {:,} relationships!".format
     (n_nodes, n_relationships))

In [None]:
# Schema setup: uniqueness constraints (which also create an index) followed by
# plain property indexes on Quanta. Each entry pairs the progress message with
# the Cypher statement it announces, so the two cannot drift apart.
schema_statements = [
    # Constrain one id per Quanta
    ("Creating uniqueness constraint (and also index) on Quanta id's...",
     """CREATE CONSTRAINT ON (n:Quanta) ASSERT n.id IS UNIQUE;"""),
    # Constrain one name per author
    ("Creating uniqueness constraint (and also index) on Author names...",
     """CREATE CONSTRAINT ON (a:Author) ASSERT a.name IS UNIQUE;"""),
    # Constrain one name per organization
    ("Creating uniqueness constraint (and also index) on Organization names...",
     """CREATE CONSTRAINT ON (o:Organization) ASSERT o.name IS UNIQUE;"""),
    # Add index for year of publication
    ("Creating index for publication year...",
     """CREATE INDEX ON :Quanta(year);"""),
    # Add index for language
    ("Creating index for language...",
     """CREATE INDEX ON :Quanta(lang);"""),
    # Add index for field of study
    ("Creating index for fos...",
     """CREATE INDEX ON :Quanta(fos);"""),
    # Add index for keywords
    ("Creating index for keywords...",
     """CREATE INDEX ON :Quanta(keywords);"""),
    # Add index for title
    ("Creating index for title...",
     """CREATE INDEX ON :Quanta(title);"""),
    # Add index for venue
    ("Creating index for venue...",
     """CREATE INDEX ON :Quanta(venue);"""),
    # Add index for document type
    ("Creating index for doctype...",
     """CREATE INDEX ON :Quanta(doctype);"""),
]

# Run the statements one at a time, in the order listed above.
for message, query in schema_statements:
    print(message, end=" ", flush=True)
    graph.run(query).evaluate()
    print("Done.")

In [None]:
# Import venue nodes
# NOTE(review): absolute, user-specific path -- consider a configurable data directory.
file_dir = 'file:/Users/timholdsworth/code/scaling-science/notebooks/data/mag_venues.txt.first100.txt'
print("Importing {}...".format(file_dir), end=" ", flush=True)

# Outer statement streams JSON records from the file; the inner statement runs
# per batch and creates one Venue per record. UNWIND on q.JournalId produces no
# row when the value is null, so records without a JournalId are skipped.
query = """
CALL apoc.periodic.iterate(
"CALL apoc.load.json('{}') YIELD value AS q RETURN q",
"UNWIND q.JournalId AS venue_id
CREATE (v:Venue {{id:venue_id, name:q.DisplayName}})",
{{batchSize:10000, iterateList:true, parallel:false}})
""".format(file_dir)

query_start_time = time.time()
# apoc.periodic.iterate reports failed batches in its result row rather than
# raising, so keep the row instead of discarding it.
rows = graph.run(query).data()
summary = rows[0] if rows else {}
print("Done in ({:.3f} minutes)...".format((time.time()-query_start_time)/60))
if summary.get('failedOperations'):
    print("WARNING: {:,} operations failed: {}".format(
        summary['failedOperations'], summary.get('errorMessages')))

# TODO: Do we need a merge statement here on venues, or are all the elements in this list distinct journals?
# As written, CREATE adds a new Venue for every record each time this cell runs.


In [None]:
# Import quanta nodes
# NOTE(review): absolute, user-specific path -- consider a configurable data directory.
file_dir = 'file:/Users/timholdsworth/code/scaling-science/notebooks/data/mag_papers_10.txt.first100.txt'
print("Importing {}...".format(file_dir), end=" ", flush=True)

# Outer statement streams JSON records from the file; the inner statement runs
# per batch and creates one Quanta per record with id, title, citation count
# and year. Records whose id is null are skipped by the UNWIND.
query = """
CALL apoc.periodic.iterate(
"CALL apoc.load.json('{}') YIELD value AS q RETURN q",
"UNWIND q.id as id
CREATE (p:Quanta {{id:id, title:q.title, citations:q.n_citation, year:q.year}})",
{{batchSize:10000, iterateList:true, parallel:false}})
""".format(file_dir)

query_start_time = time.time()
# apoc.periodic.iterate reports failed batches in its result row rather than
# raising, so keep the row instead of discarding it.
rows = graph.run(query).data()
summary = rows[0] if rows else {}
print("Done in ({:.3f} minutes)...".format((time.time()-query_start_time)/60))
if summary.get('failedOperations'):
    print("WARNING: {:,} operations failed: {}".format(
        summary['failedOperations'], summary.get('errorMessages')))

# TODO: Do we need a merge statement here on papers, or are all the elements in this list distinct papers?
# With a uniqueness constraint on Quanta id in place, a repeated id would make the CREATE fail.


In [None]:
# Import author nodes
# NOTE(review): absolute, user-specific path -- consider a configurable data directory.
file_dir = 'file:/Users/timholdsworth/code/scaling-science/notebooks/data/mag_authors_5.txt.first100.txt'
print("Importing {}...".format(file_dir), end=" ", flush=True)

# For every author record: create the Author, link it to a Tag node per entry
# in q.tags, and link it to its Organization when q.org is present.
#
# Tags are attached with FOREACH instead of UNWIND: UNWIND over a null or empty
# q.tags yields no rows, which previously dropped the author from the rest of
# the statement, so authors without tags never got their AFFILIATED_WITH link.
# Tag entries without a `t` value are skipped because MERGE cannot match on a
# null property.
query = """
CALL apoc.periodic.iterate(
"CALL apoc.load.json('{}') YIELD value AS q RETURN q",
"UNWIND q.id as id
CREATE (a:Author {{id:id, name:q.name, citations:q.n_citation, publications:q.n_pubs}})
FOREACH (tag IN [x IN coalesce(q.tags, []) WHERE x.t IS NOT NULL] |
    MERGE (t:Tag {{name: tag.t}})
    CREATE (a)-[:HAS_TAGS]->(t))
WITH q, a
WHERE q.org is not null
MERGE (o:Organization {{name: q.org}})
CREATE (a)-[:AFFILIATED_WITH]->(o)",
{{batchSize:10000, iterateList:true, parallel:false}})
""".format(file_dir)

query_start_time = time.time()
# apoc.periodic.iterate reports failed batches in its result row rather than
# raising, so keep the row instead of discarding it.
rows = graph.run(query).data()
summary = rows[0] if rows else {}
print("Done in ({:.3f} minutes)...".format((time.time()-query_start_time)/60))
if summary.get('failedOperations'):
    print("WARNING: {:,} operations failed: {}".format(
        summary['failedOperations'], summary.get('errorMessages')))

# TODO: Do we need a merge statement here on authors, or are all the elements in this list distinct authors?
# With a uniqueness constraint on Author name in place, two records sharing a name would make the CREATE fail.


In [None]:
# Import Quanta nodes with their full property set
# NOTE(review): absolute, user-specific path -- consider a configurable data directory.
file_dir = 'file:/Users/timholdsworth/code/scaling-science/notebooks/data/mag_papers_10.txt.first100.txt'
print("Importing {}...".format(file_dir), end=" ", flush=True)

# The MERGE statement below used to sit as bare Cypher after the Python code,
# which made the whole cell a syntax error, while the query itself created
# Author nodes from the papers file. It is now the per-batch statement.
# MERGE matches on id, and ON CREATE SET only fills properties for nodes created
# here: Quanta nodes that already exist are matched and left unchanged.
query = """
CALL apoc.periodic.iterate(
"CALL apoc.load.json('{}') YIELD value AS q RETURN q",
"UNWIND q.id AS id
MERGE (i:Quanta {{id:id}}) ON CREATE SET
    i.refs=q.references,
    i.year=q.year,
    i.title=q.title,
    i.fos=q.fos,
    i.url=q.url,
    i.lang=q.lang,
    i.keywords=q.keywords,
    i.n_citation=q.n_citation,
    i.pdf=q.pdf,
    i.publisher=q.publisher,
    i.venue=q.venue",
{{batchSize:10000, iterateList:true, parallel:false}})
""".format(file_dir)

query_start_time = time.time()
# apoc.periodic.iterate reports failed batches in its result row rather than
# raising, so keep the row instead of discarding it.
rows = graph.run(query).data()
summary = rows[0] if rows else {}
print("Done in ({:.2f} minutes)...".format((time.time()-query_start_time)/60))
if summary.get('failedOperations'):
    print("WARNING: {:,} operations failed: {}".format(
        summary['failedOperations'], summary.get('errorMessages')))