In [None]:
from py2neo import Graph, Node, Relationship
import glob, os, time

# Connection settings are read from the environment when present so that the
# URI and credentials do not have to live in the notebook; the fallbacks are
# the values this notebook used before.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://neo4j-top42:7687")
neo4j_auth = (os.environ.get("NEO4J_USER", "neo4j"),
              os.environ.get("NEO4J_PASSWORD", "myneo"))
graph = Graph(neo4j_uri, auth=neo4j_auth)

# Read the counts once and reuse them for both figures.
primitive_counts = graph.database.primitive_counts
n_nodes = primitive_counts['NumberOfNodeIdsInUse']
n_relationships = primitive_counts['NumberOfRelationshipIdsInUse']
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    n_nodes, n_relationships))

In [None]:
# graph.delete_all()

## Setup Schema

### Add uniqueness constraints

In [None]:
# Each entry pairs a Cypher node pattern with the property expression that
# must be unique for nodes matching that pattern.
contraint_node_property_pairs = [
    ('n:Quanta', 'n.id'),
    ('a:Author', 'a.id'),
    ('o:Organization', 'o.name'),
    ('v:Venue', 'v.id'),
]
constraint_template = "CREATE CONSTRAINT ON ({}) ASSERT {} IS UNIQUE;"
for node_pattern, unique_property in contraint_node_property_pairs:
    query = constraint_template.format(node_pattern, unique_property)
    # Echo the statement before sending it to the database.
    print(query)
    graph.run(query)

### Create indices

In [None]:
# Indexed properties must use the property names that the import queries in
# this notebook actually write: Quanta nodes get `language` and `docType`
# (not `lang` / `doctype`), and Year nodes are keyed by `value` (not `year`).
# NOTE(review): the quanta import in this notebook does not set a `venue`
# property on :Quanta; that index is kept as-is -- confirm whether it is needed.
indices_to_create = [':Quanta(year)', ':Quanta(language)', ':Quanta(keywords)',
                     ':Quanta(title)', ':Quanta(venue)', ':Quanta(docType)',
                     ':Venue(name)', ':Year(value)']
for index in indices_to_create:
    query = "CREATE INDEX ON {};".format(index)
    print(query)
    graph.run(query)

## Import Data

### Set up the data directory and query helper

In [None]:
# Base `file:` URL that the import cells prepend to each data file name
# before handing it to apoc.load.json. Change this single value to point
# the notebook at a different copy of the data.
data_dir = 'file:/import/magtwo/'
print("Data directory set to `{}`.".format(data_dir))

def run_query(query, graph, print_query=False, run_query=True, print_only=False):
    """Optionally print and/or execute a Cypher query, reporting elapsed time.

    Parameters
    ----------
    query : str
        Cypher text to print and/or execute.
    graph : object
        Anything exposing ``run(query)``; in this notebook, the py2neo graph.
    print_query : bool
        Print the query text before doing anything else.
    run_query : bool
        Execute the query. (This parameter shadows the function name inside
        the body only; the name is kept for backward compatibility.)
    print_only : bool
        Shortcut that forces ``print_query=True`` and ``run_query=False``.

    Returns
    -------
    list or None
        The fully consumed result rows (``cursor.data()``) when the query was
        executed and the returned cursor offers ``data()``; otherwise ``None``.
    """
    if print_only:
        print_query = True
        run_query = False
    if print_query:
        print(query)
    if not run_query:
        # Nothing was executed, so there is no timing or result to report.
        return None

    start_time = time.time()
    cursor = graph.run(query)
    # Consume the result inside the timed region so the reported duration
    # covers the whole query and the rows can be inspected afterwards.
    fetch_rows = getattr(cursor, "data", None)
    rows = fetch_rows() if callable(fetch_rows) else None
    minutes_elapsed = (time.time() - start_time) / 60
    print("Query completed in {:.2f} minutes.".format(minutes_elapsed))

    # apoc.periodic.iterate reports batch failures in its result row instead
    # of raising, so surface them here rather than dropping them silently.
    for row in rows or []:
        if isinstance(row, dict) and (row.get("failedOperations") or row.get("errorMessages")):
            print("WARNING: query reported {} failed operations; errors: {}".format(
                row.get("failedOperations"), row.get("errorMessages")))
    return rows

### Import venues as nodes

In [None]:
# Load every venue record from the venues file and create one :Venue node per
# record, in batches of 10,000 via apoc.periodic.iterate.
venues_url = data_dir + 'mag_venues.txt'
query = """ 
CALL apoc.periodic.iterate(
"CALL apoc.load.json('{}') YIELD value AS q RETURN q",
"CREATE (v:Venue {{id:q.id, journalId:q.JournalId, conferenceId:q.ConferenceId,
    name:q.DisplayName, normalizedName:q.NormalizedName}})", 
{{batchSize:10000, iterateList:true, parallel:true}});
""".format(venues_url)

run_query(query=query, graph=graph)

### Create year nodes

In [None]:
# Ensure a :Year node exists for each value produced by range(1800, 2020);
# MERGE means re-running this cell does not add duplicate nodes.
query = """
UNWIND range(1800, 2020) as yr
MERGE (y:Year {value: yr})
"""
run_query(query=query, graph=graph)

### Import quanta and add relationships to year and venue nodes

In [None]:
# Load the paper shards mag_papers_nature_0.txt .. mag_papers_nature_10.txt.
# For each record: create the :Quanta node, link it to its :Year node when one
# exists, then link it to every :Venue whose name equals the record's raw venue.
#
# * The year link uses OPTIONAL MATCH + FOREACH: a plain MATCH would drop the
#   row when no :Year node matches (null or out-of-range year), and the venue
#   link for that paper would then be skipped silently.
# * parallel is false because the batches create relationships to shared
#   :Year / :Venue nodes; concurrent batches would contend for locks on those
#   nodes, and apoc.periodic.iterate reports such failures only in its result.
for i in range(11):
    file_name = 'mag_papers_nature_{}.txt'.format(i)
    query = """
    CALL apoc.periodic.iterate(
    "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
    "UNWIND q.id as id
    CREATE (p:Quanta {{id:id, title:q.title, year:q.year, keywords:q.keywords,
        numCitations:q.n_citation, docType:q.doc_type, language:q.lang,
        publisher:q.publisher, doi:q.doi, pdf:q.pdf, abstract:q.abstract}})
    WITH q, p
    OPTIONAL MATCH (y:Year {{value: q.year}})
    FOREACH (ignored IN CASE WHEN y IS NULL THEN [] ELSE [1] END |
        CREATE (p)-[:PUBLISHED_IN_YEAR]->(y))
    WITH q, p
    UNWIND q.venue as venue
    MATCH (v:Venue {{name: venue.raw }})
    CREATE (p)-[:PUBLISHED_IN_VENUE]->(v)",
    {{batchSize:10000, iterateList:true, parallel:false}})
    """.format(data_dir + file_name)
    run_query(query, graph)
    




### Import Author, Tag, and Organization nodes and their relationships (including AUTHORED links to quanta)

In [None]:
# Load the author shards mag_authors_0.txt .. mag_authors_12.txt.
# For each record: create the :Author node, attach its tags and organization,
# then link it to every already-imported :Quanta it authored.
#
# * Tags and organization are written with FOREACH, and the publication links
#   come last. UNWIND over an empty/null list or a MATCH with no hit removes
#   the row, so with the publication step first an author with no matching
#   :Quanta (or no tags) would never get tags or an organization.
# * parallel is false because batches MERGE shared :Tag / :Organization nodes
#   and create relationships to shared :Quanta nodes; concurrent batches would
#   contend for those locks, and apoc.periodic.iterate reports such failures
#   only in its result.
# NOTE(review): the tag weight is read from `weight` while the tag name is
# read from the short key `t` -- confirm the weight key against the data files.
for i in range(13):
    file_name = 'mag_authors_{}.txt'.format(i)

    query = """
    CALL apoc.periodic.iterate(
    "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
    "CREATE (a:Author {{id:q.id, name:q.name, normalizedName:q.normalized_name,
        position:q.position, numCitations:q.n_citation, numPublications:q.n_pubs,
        hIndex:q.h_index}})
    WITH q, a
    FOREACH (tag IN coalesce(q.tags, []) |
        MERGE (t:Tag {{name: tag.t}})
        CREATE (a)-[:HAS_TAG {{weight: tag.weight}}]->(t))
    FOREACH (orgName IN CASE WHEN q.org IS NULL THEN [] ELSE [q.org] END |
        MERGE (o:Organization {{name: orgName}})
        CREATE (a)-[:AFFILIATED_WITH]->(o))
    WITH q, a
    UNWIND q.pubs as pub
    MATCH (p:Quanta {{id:pub.i}})
    CREATE (a)-[:AUTHORED {{rank: pub.r}}]->(p)",
    {{batchSize:10000, iterateList:true, parallel:false}})
    """.format(data_dir + file_name)

    run_query(query, graph)
    

### Add Coauthor Relationships

In [None]:
# For every :Quanta with more than one author, connect each unordered pair of
# its authors with a :COAUTHOR relationship and count shared papers in
# `strength`. `id(first) < id(second)` visits each pair once per paper.
#
# parallel is false because different papers share authors: concurrent batches
# would MERGE relationships on the same :Author nodes and contend for their
# locks, and apoc.periodic.iterate reports such failures only in its result.
# NOTE: re-running this cell increments `strength` again for every pair.
query = """
CALL apoc.periodic.iterate(
    "MATCH (q:Quanta) WHERE size((q)<-[:AUTHORED]-()) > 1 RETURN q",
    "WITH [(q)<-[:AUTHORED]-(a) | a] as coAuthors
    UNWIND coAuthors as first
    UNWIND coAuthors as second
    WITH first, second
    WHERE id(first) < id(second)
    MERGE (first)-[r:COAUTHOR]-(second)
    SET r.strength = coalesce(r.strength, 0) + 1",
{batchSize:10000, iterateList:true, parallel:false});
"""

run_query(query, graph, print_only=False)