In [None]:
from py2neo import Graph, Node, Relationship
import glob, os, time

# Connection settings are read from the environment when present, so credentials do not
# have to be edited into the notebook. The fallbacks are the values this notebook was
# originally written with, so it keeps working unchanged when nothing is set.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "password")

graph = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Print the store's node / relationship id counts as a quick connectivity check.
primitive_counts = graph.database.primitive_counts
n_nodes = primitive_counts['NumberOfNodeIdsInUse']
n_relationships = primitive_counts['NumberOfRelationshipIdsInUse']
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    n_nodes, n_relationships))

In [None]:
# Wipe the graph: delete every node; DETACH also removes each node's relationships.
query = "MATCH (b) DETACH DELETE b"
graph.run(query)

In [None]:
# Schema setup for the graph: uniqueness constraints (each of which also provides an
# index on the constrained property) followed by plain property indexes on Quanta.
# Every entry pairs a human-readable description, used only for the progress message,
# with the Cypher statement to run. Keeping them in one table means the message always
# names the property that is actually being indexed.
schema_statements = [
    # One id per Quanta
    ("uniqueness constraint (and also index) on Quanta id's",
     """CREATE CONSTRAINT ON (n:Quanta) ASSERT n.id IS UNIQUE;"""),
    # One name per author
    ("uniqueness constraint (and also index) on Author names",
     """CREATE CONSTRAINT ON (a:Author) ASSERT a.name IS UNIQUE;"""),
    # One name per organization
    ("uniqueness constraint (and also index) on Organization names",
     """CREATE CONSTRAINT ON (o:Organization) ASSERT o.name IS UNIQUE;"""),
    # Plain indexes on Quanta properties
    ("index for publication year", """CREATE INDEX ON :Quanta(year);"""),
    ("index for language", """CREATE INDEX ON :Quanta(lang);"""),
    ("index for fos", """CREATE INDEX ON :Quanta(fos);"""),
    ("index for keywords", """CREATE INDEX ON :Quanta(keywords);"""),
    ("index for title", """CREATE INDEX ON :Quanta(title);"""),
    ("index for venue", """CREATE INDEX ON :Quanta(venue);"""),
    ("index for doctype", """CREATE INDEX ON :Quanta(doctype);"""),
]

for description, query in schema_statements:
    # Print the description first (flushed, no newline) so a slow statement is visible.
    print("Creating {}...".format(description), end=" ", flush=True)
    graph.run(query).evaluate()
    print("Done.")

In [None]:
# Make sure every COAUTHOR relationship exists for the top-journal papers
import glob, os, time

local_data_dir = '/tmp/data/mag/'
neo4j_data_dir = '/import/mag/'

# Cypher template; the two {} slots receive the Neo4j-side directory and the file name.
# For each JSON record q it looks up the Quanta whose `name` equals q.title, pairs every
# listed author a with every other Author b that AUTHORED that paper, MERGEs a COAUTHOR
# relationship between them while maintaining r.strength2, and finally DETACH DELETEs
# the matched paper node.
# NOTE(review): ON MATCH reads r.strength while writing r.strength2, and the paper is
# matched on `name` although the import cells store `title` - confirm both are intended.
coauthor_template = """
            CALL apoc.periodic.iterate(
            "CALL apoc.load.json('file://{}{}') YIELD value AS q RETURN q"
            ,
            "MATCH (p:Quanta {{name:q.title}})
            WITH q, p
            UNWIND q.authors as author 
            MATCH (a:Author {{name:author.name}})
            MATCH (b:Author)-[:AUTHORED]->(p) where a <> b
            MERGE (a)-[r:COAUTHOR]-(b)
            ON CREATE SET r.strength2 = 1
            ON MATCH SET r.strength2 = r.strength + 1
            WITH p DETACH DELETE p;"
            , 
            {{batchSize:10000, iterateList:true, parallel:false}});
            """

start_time = time.time()
for _, _, files in os.walk(local_data_dir):
    for file in sorted(files):
        # Only the .txt dumps are import files; skip everything else.
        if not file.endswith('.txt'):
            continue

        print("Importing {}...".format(file), end=" ", flush=True)
        query = coauthor_template.format(neo4j_data_dir, file)

        query_start_time = time.time()
        graph.run(query).evaluate()
        elapsed_minutes = (time.time() - query_start_time) / 60
        print("Done in ({:.2f} minutes)...".format(elapsed_minutes))

total_hours = (time.time() - start_time) / 60 / 60
print("Finished all imports in {:.2f} hours.".format(total_hours))

In [None]:
# Import venue nodes
# Walks local_data_dir for .txt files and, for each one, asks Neo4j (via APOC) to load
# the same file name from neo4j_data_dir and create one Venue node per JSON record.
import glob, os, time

local_data_dir = '/tmp/data/mag/'
neo4j_data_dir = '/import/mag/'

start_time = time.time()
for _,_ , files in os.walk(local_data_dir):
    for file in sorted(files):
        if file.endswith('.txt'):
            # Log the file that is actually being loaded; `file_dir` is not defined in
            # this cell and would raise NameError on a fresh kernel.
            print("Importing {}...".format(file), end=" ", flush=True)
            # apoc.periodic.iterate(driving statement, per-record statement, config):
            # the first statement streams JSON records as q, the second creates a Venue.
            query = """
            CALL apoc.periodic.iterate(
                "CALL apoc.load.json('file://{}{}') YIELD value AS q RETURN q",
                "CREATE (v:Venue {{id:q.venue_id, name:q.DisplayName, journal_id: q.JournalId}})", 
            {{batchSize:10000, iterateList:true, parallel:true}})
            """.format(neo4j_data_dir, file)

            query_start_time = time.time()
            graph.run(query).evaluate()
            print("Done in ({:.2f} minutes)...".format((time.time()-query_start_time)/60))

print("Finished all work in {:.2f} hours.".format((time.time()-start_time)/60/60))

In [None]:
# Ensure a Year node exists for each value produced by range(1850, 2020); MERGE means
# re-running the cell does not create duplicates.
query = (
    "\n"
    "UNWIND range(1850, 2020) as yr\n"
    "MERGE (y:Year {value: yr})\n"
)

query_start_time = time.time()
graph.run(query).evaluate()
elapsed_minutes = (time.time() - query_start_time) / 60
print("Done in ({:.3f} minutes)...".format(elapsed_minutes))

In [None]:
# Import quanta nodes and add relationship with year and venue
# Walks local_data_dir for .txt files and, for each one, asks Neo4j (via APOC) to load
# the same file name from neo4j_data_dir. Each JSON record becomes a Quanta node that is
# linked with PUBLISHED_IN to the Year node matching q.year and to the Venue node whose
# name equals venue.raw.
import glob, os, time

local_data_dir = '/tmp/data/mag/'
neo4j_data_dir = '/import/mag/'

start_time = time.time()
for _,_ , files in os.walk(local_data_dir):
    for file in sorted(files):
        if file.endswith('.txt'):
            # Log the file that is actually being loaded; `file_dir` is not defined in
            # this cell and would raise NameError on a fresh kernel.
            print("Importing {}...".format(file), end=" ", flush=True)
            # apoc.periodic.iterate(driving statement, per-record statement, config):
            # the comma after the first statement is required to separate the two
            # string arguments - without it the Cypher does not parse.
            query = """
            CALL apoc.periodic.iterate(
                "CALL apoc.load.json('file://{}{}') YIELD value AS q RETURN q",
                "UNWIND q.id as id
                CREATE (p:Quanta {{id:id, title:q.title, citations:q.n_citation, year:q.year}})
                WITH q, p
                MATCH (y:Year {{value: q.year}})
                CREATE (p)-[:PUBLISHED_IN]->(y)
                WITH q, p
                UNWIND q.venue as venue
                MATCH (v:Venue {{name: venue.raw }})
                CREATE (p)-[:PUBLISHED_IN]->(v)",
            {{batchSize:10000, iterateList:true, parallel:true}})
            """.format(neo4j_data_dir, file)

            query_start_time = time.time()
            graph.run(query).evaluate()
            print("Done in ({:.3f} minutes)...".format((time.time()-query_start_time)/60))

print("Finished all work in {:.2f} hours.".format((time.time()-start_time)/60/60))

In [None]:
# Import author nodes, tags, and organizations
# Create relationships b/w authors & quanta, authors & tags, and authors & orgs
# Walks local_data_dir for .txt files and, for each one, asks Neo4j (via APOC) to load
# the same file name from neo4j_data_dir. Each JSON record becomes an Author node with
# AUTHORED (to Quanta, carrying rank), HAS_TAGS (to Tag) and AFFILIATED_WITH (to
# Organization, only when q.org is present) relationships.
import glob, os, time

local_data_dir = '/tmp/data/mag/'
neo4j_data_dir = '/import/mag/'

start_time = time.time()
for _,_ , files in os.walk(local_data_dir):
    for file in sorted(files):
        if file.endswith('.txt'):
            # Log the file that is actually being loaded; `file_dir` is not defined in
            # this cell and would raise NameError on a fresh kernel.
            print("Importing {}...".format(file), end=" ", flush=True)
            # apoc.periodic.iterate(driving statement, per-record statement, config):
            # the comma after the first statement is required to separate the two
            # string arguments - without it the Cypher does not parse.
            query = """
            CALL apoc.periodic.iterate(
                "CALL apoc.load.json('file://{}{}') YIELD value AS q RETURN q",
                "CREATE (a:Author {{id:q.id, name:q.name, citations:q.n_citation, publications:q.n_pubs}})
                WITH q, a
                UNWIND q.pubs as pubs
                MERGE (p:Quanta {{id:pubs.i}})
                CREATE (a)-[r:AUTHORED {{rank: pubs.r}}]->(p)
                WITH DISTINCT q, a
                UNWIND q.tags as tags
                MERGE (t:Tag {{name: tags.t}})
                CREATE (a)-[:HAS_TAGS]->(t)
                WITH DISTINCT q, a
                WHERE q.org is not null
                MERGE (o:Organization {{name: q.org}})
                CREATE (a)-[:AFFILIATED_WITH]->(o)",
            {{batchSize:10000, iterateList:true, parallel:true}})
            """.format(neo4j_data_dir, file)

            query_start_time = time.time()
            graph.run(query).evaluate()
            print("Done in ({:.3f} minutes)...".format((time.time()-query_start_time)/60))

print("Finished all work in {:.2f} hours.".format((time.time()-start_time)/60/60))


In [None]:
# Link authors who share a paper with COAUTHOR relationships
# Driving statement: every Quanta that has more than one AUTHORED relationship coming in.
papers_with_coauthors = "MATCH (q:Quanta) WHERE size((q)<-[:AUTHORED]-()) > 1 RETURN q"
# Per-paper statement: visit each author pair once (id(first) < id(second)), MERGE their
# COAUTHOR relationship, and either start its strength at 1 or add 1 to it.
link_author_pairs = """WITH [(q)<-[:AUTHORED]-(a) | a] as coAuthors
    UNWIND coAuthors as first
    UNWIND coAuthors as second
    WITH first, second
    WHERE id(first) < id(second)
    MERGE (first)-[r:COAUTHOR]-(second)
    SET r.strength = CASE WHEN r.strength IS NULL THEN 1 ELSE r.strength + 1 END"""
batch_config = "{batchSize:10000, iterateList:true, parallel:true}"
query = '\nCALL apoc.periodic.iterate(\n    "{}",\n    "{}",\n{});\n'.format(
    papers_with_coauthors, link_author_pairs, batch_config)

start_time = time.time()
query_start_time = time.time()
graph.run(query).evaluate()

elapsed_minutes = (time.time() - query_start_time) / 60
print("Done in ({:.3f} minutes)...".format(elapsed_minutes))

# Scripts that are easy for Tim to run

In [None]:
# Wipe the graph: delete every node; DETACH also removes each node's relationships.
query = "MATCH (b) DETACH DELETE b"
graph.run(query)

In [None]:
# Load Venue nodes from the local sample file
file_dir = 'file:/Users/timholdsworth/code/scaling-science/notebooks/data/mag_venues.txt.first100.txt'
print("Importing {}...".format(file_dir), end=" ", flush=True)

# Driving statement: stream each JSON record of the file as q.
load_records = "CALL apoc.load.json('{}') YIELD value AS q RETURN q".format(file_dir)
# Per-record statement: one Venue node per record.
create_venue = "CREATE (v:Venue {id:q.venue_id, name:q.DisplayName, journal_id: q.JournalId})"
batch_config = "{batchSize:10000, iterateList:true, parallel:true}"
query = '\nCALL apoc.periodic.iterate(\n    "{}",\n    "{}", \n{})\n'.format(
    load_records, create_venue, batch_config)

query_start_time = time.time()
graph.run(query).evaluate()
elapsed_minutes = (time.time() - query_start_time) / 60
print("Done in ({:.3f} minutes)...".format(elapsed_minutes))

In [None]:
# Ensure a Year node exists for each value produced by range(1850, 2020); MERGE means
# re-running the cell does not create duplicates.
query = (
    "\n"
    "UNWIND range(1850, 2020) as yr\n"
    "MERGE (y:Year {value: yr})\n"
)

query_start_time = time.time()
graph.run(query).evaluate()
elapsed_minutes = (time.time() - query_start_time) / 60
print("Done in ({:.3f} minutes)...".format(elapsed_minutes))

In [None]:
# Load Quanta nodes from the local sample file and link each to its Year and Venue
file_dir = 'file:/Users/timholdsworth/code/scaling-science/notebooks/data/mag_papers_10.txt.first100.txt'
print("Importing {}...".format(file_dir), end=" ", flush=True)

# Driving statement: stream each JSON record of the file as q.
load_records = "CALL apoc.load.json('{}') YIELD value AS q RETURN q".format(file_dir)
# Per-record statement: create the Quanta, then attach PUBLISHED_IN to the Year whose
# value equals q.year and to the Venue whose name equals venue.raw.
create_paper = """UNWIND q.id as id
    CREATE (p:Quanta {id:id, title:q.title, citations:q.n_citation, year:q.year})
    WITH q, p
    MATCH (y:Year {value: q.year})
    CREATE (p)-[:PUBLISHED_IN]->(y)
    WITH q, p
    UNWIND q.venue as venue
    MATCH (v:Venue {name: venue.raw })
    CREATE (p)-[:PUBLISHED_IN]->(v)"""
batch_config = "{batchSize:10000, iterateList:true, parallel:true}"
query = '\nCALL apoc.periodic.iterate(\n    "{}",\n    "{}",\n{})\n'.format(
    load_records, create_paper, batch_config)

query_start_time = time.time()
graph.run(query).evaluate()
elapsed_minutes = (time.time() - query_start_time) / 60
print("Done in ({:.3f} minutes)...".format(elapsed_minutes))

In [None]:
# Load Author nodes from the local sample file, together with their tags and
# organizations, and relate each author to papers, tags and organization.

file_dir = 'file:/Users/timholdsworth/code/scaling-science/notebooks/data/mag_authors_5.txt.first100.txt'
print("Importing {}...".format(file_dir), end=" ", flush=True)

# Driving statement: stream each JSON record of the file as q.
load_records = "CALL apoc.load.json('{}') YIELD value AS q RETURN q".format(file_dir)
# Per-record statement: create the Author, then AUTHORED (with rank) to each paper in
# q.pubs, HAS_TAGS to each tag in q.tags, and AFFILIATED_WITH when q.org is not null.
create_author = """CREATE (a:Author {id:q.id, name:q.name, citations:q.n_citation, publications:q.n_pubs})
    WITH q, a
    UNWIND q.pubs as pubs
    MERGE (p:Quanta {id:pubs.i})
    CREATE (a)-[r:AUTHORED {rank: pubs.r}]->(p)
    WITH DISTINCT q, a
    UNWIND q.tags as tags
    MERGE (t:Tag {name: tags.t})
    CREATE (a)-[:HAS_TAGS]->(t)
    WITH DISTINCT q, a
    WHERE q.org is not null
    MERGE (o:Organization {name: q.org})
    CREATE (a)-[:AFFILIATED_WITH]->(o)"""
batch_config = "{batchSize:10000, iterateList:true, parallel:true}"
query = '\nCALL apoc.periodic.iterate(\n    "{}",\n    "{}",\n{})\n'.format(
    load_records, create_author, batch_config)

query_start_time = time.time()
graph.run(query).evaluate()
elapsed_minutes = (time.time() - query_start_time) / 60
print("Done in ({:.3f} minutes)...".format(elapsed_minutes))

In [None]:
# Link authors who share a paper with COAUTHOR relationships
# Driving statement: every Quanta that has more than one AUTHORED relationship coming in.
papers_with_coauthors = "MATCH (q:Quanta) WHERE size((q)<-[:AUTHORED]-()) > 1 RETURN q"
# Per-paper statement: visit each author pair once (id(first) < id(second)), MERGE their
# COAUTHOR relationship, and either start its strength at 1 or add 1 to it.
link_author_pairs = """WITH [(q)<-[:AUTHORED]-(a) | a] as coAuthors
    UNWIND coAuthors as first
    UNWIND coAuthors as second
    WITH first, second
    WHERE id(first) < id(second)
    MERGE (first)-[r:COAUTHOR]-(second)
    SET r.strength = CASE WHEN r.strength IS NULL THEN 1 ELSE r.strength + 1 END"""
batch_config = "{batchSize:10000, iterateList:true, parallel:true}"
query = '\nCALL apoc.periodic.iterate(\n    "{}",\n    "{}",\n{});\n'.format(
    papers_with_coauthors, link_author_pairs, batch_config)

query_start_time = time.time()
graph.run(query).evaluate()
elapsed_minutes = (time.time() - query_start_time) / 60
print("Done in ({:.3f} minutes)...".format(elapsed_minutes))