In [None]:
from py2neo import Graph, Node, Relationship
import glob, os, time
 
# Connection settings can be overridden through environment variables so that
# credentials do not have to be edited into the notebook; the defaults keep the
# previous behaviour. Example for a local database:
#   NEO4J_URI=bolt://localhost:7687 NEO4J_USER=neo4j NEO4J_PASSWORD=password
neo4j_uri = os.environ.get('NEO4J_URI', 'bolt://neo4j-magtwo:7687')
neo4j_user = os.environ.get('NEO4J_USER', 'neo4j')
neo4j_password = os.environ.get('NEO4J_PASSWORD', 'myneo')
graph = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Read the primitive counts once and reuse them for both numbers instead of
# asking the database twice.
primitive_counts = graph.database.primitive_counts
n_nodes = primitive_counts['NumberOfNodeIdsInUse']
n_relationships = primitive_counts['NumberOfRelationshipIdsInUse']
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    n_nodes, n_relationships))

In [None]:
# WARNING: destructive. py2neo documents Graph.delete_all() as deleting all
# nodes and relationships from the graph; only run this cell when the database
# is meant to be rebuilt from scratch.
graph.delete_all()

## Setup Schema

### Add uniqueness constraints

In [None]:
# Each pair is (node pattern, property) for one uniqueness constraint.
# Labels are case-sensitive in Neo4j, so they have to match the labels the
# import queries create: tag nodes are created as `Tag`, hence `t:Tag` here
# (a constraint on `tag` would never apply to them).
# NOTE(review): a uniqueness constraint on Author.normalizedName rejects
# distinct authors that share a normalized name -- confirm this is intended.
contraint_node_property_pairs = [('n:Quanta', 'n.id'),
                                 ('a:Author', 'a.normalizedName'),
                                 ('t:Tag', 't.name'),
                                 ('a:Author', 'a.id'),
                                 ('o:Organization', 'o.name'),
                                 ('v:Venue', 'v.id')]
for n, p in contraint_node_property_pairs:
    query = "CREATE CONSTRAINT ON ({}) ASSERT {} IS UNIQUE;".format(n, p)
    print(query)
    graph.run(query)

### Create indices

In [None]:
# Property keys are case-sensitive and must match what the import queries store
# on the nodes: Quanta nodes are created with `language` and `docType`
# (not `lang` / `doctype`), so those are the keys that get indexed.
# NOTE(review): no import query visible in this notebook sets a `venue`
# property on Quanta (venues are linked through PUBLISHED_IN_VENUE), so that
# index looks unused; it is kept unchanged -- confirm whether it is needed.
indices_to_create = [':Quanta(year)', ':Quanta(language)', ':Quanta(keywords)',
                     ':Quanta(title)', ':Quanta(venue)', ':Quanta(docType)',
                     ':Venue(name)', ':Year(value)']
for index in indices_to_create:
    query = "CREATE INDEX ON {};".format(index)
    print(query)
    graph.run(query)

## Import Data

### Set up directories and the query helper

In [None]:
# Base `file:` URL that is prepended to every file name handed to
# apoc.load.json in the import queries.
data_dir = 'file:/import/magtwo/'

# Input files: a single venues file plus numbered shards of papers (0-10)
# and authors (0-12).
venues_file = 'mag_venues.txt'
papers_files = list(map('mag_papers_{}.txt'.format, range(11)))
authors_files = list(map('mag_authors_{}.txt'.format, range(13)))

print("Data directory set to `{}`.".format(data_dir))

def run_query(query, graph, print_query=False, run_query=True, print_only=False):
    """Optionally print and/or execute a Cypher query, reporting the run time.

    Args:
        query: Cypher statement to print and/or execute.
        graph: Object exposing ``run(query)`` (in this notebook a py2neo Graph).
        print_query: If true, echo the query text before doing anything else.
        run_query: If true, execute the query through ``graph.run``.
        print_only: Dry-run shortcut; forces ``print_query=True`` and
            ``run_query=False``.

    Returns:
        None. The value returned by ``graph.run`` is discarded.
        NOTE(review): apoc.periodic.iterate is documented to report failed
        batches in its result row rather than raising, so such failures are
        not visible through this helper -- consider inspecting the result.
    """
    if print_only:
        print_query, run_query = True, False
    if print_query:
        print(query)
    if not run_query:
        # Nothing was executed, so a "completed in N seconds" line would be
        # misleading; say so explicitly instead.
        print("Query not executed.")
        return
    start_time = time.time()
    graph.run(query)
    seconds_elapsed = time.time() - start_time
    print("Query completed in {:.2f} seconds.".format(seconds_elapsed))

### Import venues as nodes


In [None]:
# Create one Venue node per record of the venues file. The records are read
# with apoc.load.json and written in batches of 10,000 by
# apoc.periodic.iterate.
venues_url = data_dir + venues_file
query = """ 
CALL apoc.periodic.iterate(
"CALL apoc.load.json('{}') YIELD value AS q RETURN q",
"CREATE (v:Venue {{id:q.id, journalId:q.JournalId, conferenceId:q.ConferenceId,
    name:q.DisplayName, normalizedName:q.NormalizedName}})", 
{{batchSize:10000, iterateList:true, parallel:true}});
""".format(venues_url)

run_query(query=query, graph=graph)

### Create year nodes

In [None]:
# One Year node for every value produced by range(1750, 2020); MERGE is used
# so existing Year nodes with the same value are matched rather than recreated.
query = "\n".join([
    "",
    "UNWIND range(1750, 2020) as yr",
    "MERGE (y:Year {value: yr})",
    "",
])
run_query(query, graph)

### Import quanta, authors, and add relationships to author and venue nodes

In [None]:
# Import papers as Quanta nodes and link each one to its year, venue and
# authors.
#
# Year and venue are looked up with OPTIONAL MATCH + collect(): with a plain
# MATCH, a paper whose year or venue has no matching node produced no row, so
# the rest of the statement was skipped and the paper silently ended up with
# no author relationships. collect() also folds several venue matches back
# into one row, so authors are linked exactly once per paper.
# (q.venue is treated as a map with a `raw` field, as the venue-filtered
# import cell in this notebook does.)
#
# Authors are MERGEd on `id` alone and the name is set on creation. Author.id
# carries a uniqueness constraint, so merging on {id, name} fails whenever an
# existing author's stored name differs from the name in the paper record.
# Author entries without an id are skipped because MERGE cannot use a null
# property value.
for file_name in papers_files:
    query = """
    CALL apoc.periodic.iterate(
    "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
    "UNWIND q.id as id
    CREATE (p:Quanta {{id:id, title:q.title, year:q.year, keywords:q.keywords,
        numCitations:q.n_citation, docType:q.doc_type, language:q.lang,
        publisher:q.publisher, doi:q.doi, pdf:q.pdf, abstract:q.abstract}})
    WITH p, q.year AS year, q.venue.raw AS venueName, q.authors AS authors
    OPTIONAL MATCH (y:Year {{value: year}})
    WITH p, venueName, authors, collect(y) AS years
    FOREACH (yr IN years | CREATE (p)-[:PUBLISHED_IN_YEAR]->(yr))
    WITH p, venueName, authors
    OPTIONAL MATCH (v:Venue {{name: venueName}})
    WITH p, authors, collect(v) AS venues
    FOREACH (ven IN venues | CREATE (p)-[:PUBLISHED_IN_VENUE]->(ven))
    WITH p, authors
    UNWIND range(0, size(authors)-1) as i
    WITH p, i, authors[i] AS author
    WHERE author.id IS NOT NULL
    MERGE (a:Author {{id: author.id}})
        ON CREATE SET a.name = author.name
    CREATE (a)-[:AUTHORED {{rank:i}}]->(p)",
    {{batchSize:10000, iterateList:true, parallel:false}});
    """.format(data_dir + file_name)
    run_query(query, graph, print_only=False)

### Import quanta from certain venues and add relationship to venue node

In [None]:
# Variant of the papers import that does not use apoc.periodic.iterate: each
# papers file is loaded by a single statement, and only records whose
# venue.raw equals "Nature" are kept. For every such record a Quanta node is
# created; the query then MATCHes the Year node for q.year and the Venue nodes
# named venue.raw and creates the PUBLISHED_IN_YEAR / PUBLISHED_IN_VENUE
# relationships. Because these are plain MATCH clauses, a record without a
# matching Year never reaches the venue step. The query returns the paper
# title and venue, with LIMIT 100 on the returned rows.
# NOTE(review): this creates Quanta nodes with the same ids as the full import
# cell, so running both would presumably violate the Quanta.id uniqueness
# constraint -- confirm this cell is meant as a standalone alternative.
# NOTE(review): how LIMIT interacts with the CREATE clauses depends on the
# Neo4j version -- confirm whether it is only meant to cap the output.
for file_name in papers_files:
    query = """
    CALL apoc.load.json('{}') YIELD value AS q 
    WHERE q.venue.raw="Nature"
    CREATE (p:Quanta {{id:q.id, title:q.title, year:q.year, keywords:q.keywords,
        numCitations:q.n_citation, docType:q.doc_type, language:q.lang, 
        publisher:q.publisher, doi:q.doi, pdf:q.pdf, abstract:q.abstract}})
    WITH q, p
    MATCH (y:Year {{value: q.year}})
    CREATE (p)-[:PUBLISHED_IN_YEAR]->(y)
    WITH q, p
    UNWIND q.venue as venue
    MATCH (v:Venue {{name: venue.raw}})
    CREATE (p)-[:PUBLISHED_IN_VENUE]->(v)
    RETURN p.title, q.venue.raw
    LIMIT 100
    """.format(data_dir + file_name)
    run_query(query, graph)
    

### Import Author, Tag, and Organization nodes and relationships between them

In [None]:
# Import authors and link them to the papers (Quanta) they wrote.
#
# * The SET list must not end with `)`: the stray parenthesis made the whole
#   statement a Cypher syntax error.
# * The rank is written to `r.rank`, i.e. onto the AUTHORED relationship and
#   under the same key the papers import uses (`rrank` is not a valid target).
# * Authors are MERGEd on `id` only and `name` is SET afterwards: Author.id has
#   a uniqueness constraint, so merging on {id, name} fails when an author with
#   that id already exists under a slightly different name.
# * parallel:false: every batch MERGEs relationships onto Quanta nodes that
#   other batches touch as well; run in parallel, such writes can deadlock and
#   the failed batches are not reported by run_query.
# NOTE(review): SET a.normalizedName will hit the Author.normalizedName
# uniqueness constraint for authors sharing a normalized name -- confirm.
for file_name in authors_files:
    query = """
    CALL apoc.periodic.iterate(
    "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
    "MERGE (a:Author {{id:q.id}})
        SET a.name=q.name, a.normalizedName=q.normalized_name, a.position=q.position,
            a.numCitations=q.n_citation, a.numPublications=q.n_pubs, a.hIndex=q.h_index
    WITH q, a
    UNWIND q.pubs as pubs
    MATCH (p:Quanta {{id:pubs.i}})
    MERGE (a)-[r:AUTHORED]->(p)
    SET r.rank=pubs.r",
    {{batchSize:10000, iterateList:true, parallel:false}});
    """.format(data_dir + file_name)
    run_query(query, graph)

In [None]:
# Add one Organization node per distinct `org` value in the author files.
#
# * The label is `Organization` (capitalised): labels are case-sensitive, and
#   both the uniqueness constraint and the author->organization cell use
#   `Organization`, so nodes labelled `organization` would never be found.
# * Records without an org are skipped, because MERGE cannot be given a null
#   property value.
# * parallel:false: many authors share an organization, so parallel batches
#   would MERGE the same names concurrently and contend on the constraint.
for file_name in authors_files:
    query = """
    CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "WITH q WHERE q.org IS NOT NULL
        MERGE (o:Organization {{name: q.org}})",
        {{batchSize:10000, iterateList:true, parallel:false}});
    """.format(data_dir + file_name)
    run_query(query, graph)

In [None]:
# Add AFFILIATED_WITH relationships between authors and their organization.
#
# parallel:false: batches create relationships on Organization nodes that are
# shared across batches; done in parallel, those writes can deadlock and the
# failed batches would be dropped without run_query reporting them.
# NOTE(review): authors are matched by normalizedName although the records also
# carry an id; matching on `id` would be unambiguous -- confirm before changing.
# NOTE(review): CREATE adds a new relationship on every run of this cell.
for file_name in authors_files:
    query = """
    CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "MATCH (a:Author {{normalizedName: q.normalized_name}})
        MATCH (o:Organization {{name: q.org}})
        WHERE q.org is not null
        CREATE (a)-[:AFFILIATED_WITH]->(o)",
        {{batchSize:10000, iterateList:true, parallel:false}});
    """.format(data_dir + file_name)
    run_query(query, graph)

In [None]:
# Add one Tag node per distinct tag name found in the author files.
#
# parallel:false: the same tag names occur in many batches, so parallel MERGEs
# would race on identical names (duplicate nodes without a Tag.name constraint,
# lock contention and possible deadlocks with one).
for file_name in authors_files:
    query = """
    CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "UNWIND q.tags as tags
        MERGE (t:Tag {{name: tags.t}})",
        {{batchSize:10000, iterateList:true, parallel:false}})
    """.format(data_dir + file_name)
    run_query(query, graph)

In [None]:
# Add HAS_TAG relationships between authors and their tags.
#
# parallel:false: batches create relationships on Tag nodes that are shared
# across batches; done in parallel, those writes can deadlock and the failed
# batches would be dropped without run_query reporting them.
# NOTE(review): the tag name is read from the abbreviated key `tags.t` while
# the weight is read from `tags.weight`; confirm the weight key against the
# data files (if the data uses a short key, weight is stored as null).
# NOTE(review): authors are matched by normalizedName rather than id -- confirm.
for file_name in authors_files:
    query = """
    CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "UNWIND q.tags as tags
        MATCH (a:Author {{normalizedName: q.normalized_name}})
        MATCH (t:Tag {{name: tags.t}})
        CREATE (a)-[:HAS_TAG {{weight: tags.weight}}]->(t)",
        {{batchSize:10000, iterateList:true, parallel:false}});
    """.format(data_dir + file_name)
    run_query(query, graph)

### Import Citation Data from MAGv1 Data

In [None]:
# Create CITES relationships between papers from the MAGv1 files.
# TODO: match on ids instead of titles; the MAGv1 ids first have to be
# reconciled with the `id` property of the existing Quanta nodes.
# NOTE(review): `v1_papers_files` is not defined in the visible part of this
# notebook; it has to be assigned (a list of MAGv1 file names under data_dir)
# before this cell can run.
#
# * `WITH a, q` keeps the source record in scope; with `WITH a` alone the
#   following `UNWIND q.refs` referred to an undefined variable.
# * run_query is called inside the loop so every file is imported, not only
#   the query built for the last file.
# * parallel:false: batches create relationships on Quanta nodes shared across
#   batches, which can deadlock when run in parallel.
for file_name in v1_papers_files:
    query = """
    CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "MATCH (a:Quanta {{title: q.title}})
        WITH a, q
        UNWIND q.refs as ref
        MATCH (b:Quanta {{title: ref}})
        CREATE (a)-[:CITES]->(b)",
        {{batchSize:10000, iterateList:true, parallel:false}});
    """.format(data_dir + file_name)
    run_query(query, graph)

### Add Coauthor Relationships

In [None]:
# Connect every pair of authors that share a paper with a COAUTHOR
# relationship whose `strength` counts their joint papers.
# `id(first) < id(second)` keeps each unordered pair once per paper.
#
# parallel:false: the same author pair shows up in many batches and each batch
# reads and then increments r.strength; run in parallel, batches race on the
# same relationships (deadlocks, dropped batches) and the counts come out
# too low without any error surfacing through run_query.
# NOTE(review): the statement increments on every run, so re-running this cell
# inflates `strength`; remove existing COAUTHOR relationships before a re-run.
query = """
CALL apoc.periodic.iterate(
    "MATCH (q:Quanta) WHERE size((q)<-[:AUTHORED]-()) > 1 RETURN q",
    "WITH [(q)<-[:AUTHORED]-(a) | a] as coAuthors
    UNWIND coAuthors as first
    UNWIND coAuthors as second
    WITH first, second
    WHERE id(first) < id(second)
    MERGE (first)-[r:COAUTHOR]-(second)
    SET r.strength = CASE WHEN r.strength IS NULL THEN 1 ELSE r.strength + 1 END",
{batchSize:10000, iterateList:true, parallel:false});
"""
run_query(query, graph, print_only=False)