In [1]:
from py2neo import Graph, Node, Relationship
import glob, os, time
 
graph = Graph('bolt://neo4j-magtwo:7687', auth=('neo4j','myneo'))
#graph = Graph('bolt://localhost:7687', auth=('neo4j', 'password'))

n_nodes = graph.database.primitive_counts['NumberOfNodeIdsInUse']
n_relationships = graph.database.primitive_counts['NumberOfRelationshipIdsInUse']
print("Connected to graph database with {:,} nodes and {:,} relationships!".format
     (n_nodes, n_relationships))

Connected to graph database with 0 nodes and 0 relationships!


In [14]:
graph.delete_all()

## Setup Schema

### Add uniqueness constraints

In [2]:
contraint_node_property_pairs = [('n:Quanta', 'n.id'),
                                 ('t:Tag', 't.name'),
                                 ('a:Author', 'a.id'),
                                 ('o:Organization', 'o.name'),
                                 ('v:Venue', 'v.id')]
for n, p in contraint_node_property_pairs:
    query = "CREATE CONSTRAINT ON ({}) ASSERT {} IS UNIQUE;".format(n,p)
    print(query)
    graph.run(query)

CREATE CONSTRAINT ON (n:Quanta) ASSERT n.id IS UNIQUE;
CREATE CONSTRAINT ON (a:Author) ASSERT a.normalizedName IS UNIQUE;
CREATE CONSTRAINT ON (t:tag) ASSERT t.name IS UNIQUE;
CREATE CONSTRAINT ON (a:Author) ASSERT a.id IS UNIQUE;
CREATE CONSTRAINT ON (o:Organization) ASSERT o.name IS UNIQUE;
CREATE CONSTRAINT ON (v:Venue) ASSERT v.id IS UNIQUE;


### Create indices

In [3]:
indices_to_create = [':Quanta(year)', ':Quanta(lang)', ':Quanta(keywords)', 
                     ':Quanta(title)', ':Quanta(venue)', ':Quanta(doctype)',
                     ':Venue(name)', ':Year(value)',
                     ':Author(name)', ':Author(normalizedName)']
for index in indices_to_create:
    query = "CREATE INDEX ON {};".format(index)
    print(query)
    graph.run(query)

CREATE INDEX ON :Quanta(year);
CREATE INDEX ON :Quanta(lang);
CREATE INDEX ON :Quanta(keywords);
CREATE INDEX ON :Quanta(title);
CREATE INDEX ON :Quanta(venue);
CREATE INDEX ON :Quanta(doctype);
CREATE INDEX ON :Venue(name);
CREATE INDEX ON :Year(value);


## Import Data

### Setup directories etc

In [4]:
# Set data directory. In Neo4j 3.5+, the directory is automatically set to /import
data_dir = '' 
#data_dir = 'file:/Users/timholdsworth/code/scaling-science/notebooks/data/'

venues_file = data_dir + 'magtwo/mag_venues.txt'
papers_files = [data_dir + 'magtwo/mag_papers_{}.txt'.format(i) for i in range(11)]
authors_files = [data_dir + 'magtwo/mag_authors_{}.txt'.format(i) for i in range(13)]

v1_papers_files = ['magone/mag_papers_{}.txt'.format(i) for i in range(167)]
v1_papers_files_clean = ['magone/mag_papers_{}.txt'.format(i) for i in range(167)]

print("Data directory set to `{}`.".format(data_dir))

def run_query(query, graph, print_query=False, run_query=True, print_only=False):
    if print_only: 
        print_query = True
        run_query = False
    start_time = time.time()
    if print_query:
        print(query)
    if run_query:
        graph.run(query)
    end_time = time.time()
    seconds_elapsed = end_time-start_time
    minutes_elapsed = (end_time-start_time)/60
    print("Query completed in {:.2f} seconds.".format(seconds_elapsed))

Data directory set to `file:/import/`.


### Import venues as nodes


In [5]:
query = """ 
CALL apoc.periodic.iterate(
"CALL apoc.load.json('{}') YIELD value AS q RETURN q",
"CREATE (v:Venue {{id:q.id, journalId:q.JournalId, conferenceId:q.ConferenceId,
    name:q.DisplayName, normalizedName:q.NormalizedName}})", 
{{batchSize:10000, iterateList:true, parallel:true}});
""".format(venues_file)

run_query(query, graph)

ClientError: Procedure Not Found: There is no procedure with the name `apoc.periodic.iterate` registered for this database instance. Please ensure you've spelled the procedure name correctly and that the procedure is properly deployed.

### Create year nodes

In [6]:
query = """
UNWIND range(1800, 2020) as yr
MERGE (y:Year {value: yr})
"""
run_query(query, graph)

Query completed in 0.30 seconds.


### Import quanta

In [76]:
#for file_name in papers_files:
for file_name in papers_files:
    print('Importing {}'.format(file_name))
    query = """
    CALL apoc.periodic.iterate(
    "CALL apoc.load.json('{}') YIELD value AS q",
    "CREATE (p:Quanta {{id:q.id, title:q.title, year:q.year, keywords:q.keywords,
        numCitations:q.n_citation, docType:q.doc_type, language:q.lang, 
        publisher:q.publisher, doi:q.doi, pdf:q.pdf, abstract:q.abstract}})",
    {{batchSize:10000, iterateList:true, parallel:true}})
    """.format(file_name)
    run_query(query, graph)
    
# CREATE (p:Quanta {{id:q.id, title:q.title, year:q.year, keywords:q.keywords,
#        numCitations:q.n_citation, docType:q.doc_type, language:q.lang, 
#        publisher:q.publisher, doi:q.doi, pdf:q.pdf, abstract:q.abstract

#MERGE (p:Quanta {{id:q.id}}) 
#        ON CREATE SET title=q.title, year=q.year, keywords=q.keywords,
#        numCitations=q.n_citation, docType=q.doc_type, language=q.lang, 
#        publisher=q.publisher, doi=q.doi, pdf=q.pdf, abstract=q.abstract

Importing file:/import/magtwo/mag_papers_1.txt
Query completed in 236.50 seconds.


### Create relationships between quanta and year

In [10]:
for file_name in papers_files:
    query = """
    CALL apoc.periodic.iterate(
    "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
    "MATCH (p:Quanta {{id:q.id}})
    WITH p
    MATCH (y:Year {{value: p.year}})
    CREATE (p)-[:PUBLISHED_IN_YEAR]->(y)",
    {{batchSize:10000, iterateList:true, parallel:true}})
    """.format(file_name)
    run_query(query, graph)
    
# Changing q to p in value:p.year because year value on the node should be same
# as the year value on corresponding node in the map

Query completed in 999.94 seconds.
Query completed in 700.60 seconds.
Query completed in 837.66 seconds.
Query completed in 999.41 seconds.
Query completed in 855.87 seconds.
Query completed in 906.90 seconds.
Query completed in 772.89 seconds.
Query completed in 905.58 seconds.
Query completed in 45.62 seconds.


### Create relationships between Quanta and Venue

In [8]:
for file_name in papers_files:
    query = """
    CALL apoc.periodic.iterate(
    "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
    "MATCH (p:Quanta {{id:q.id}})
    WITH q, p
    UNWIND q.venue as venue
    MATCH (v:Venue {{name: venue.raw}})
    WITH p, v
    CREATE (p)-[:PUBLISHED_IN_VENUE]->(v)",
    {{batchSize:10000, iterateList:true, parallel:true}})
    """.format(file_name)
    run_query(query, graph)
    

KeyboardInterrupt: 

Exception ignored in: 'neo4j.bolt._io.ChunkedInputBuffer.receive'
KeyboardInterrupt


ServiceUnavailable: Failed to read from closed connection Address(host='192.168.0.2', port=7687)

### Import Authors

In [None]:
# updated version (experimental)
for file_name in authors_files: 
    query = """
    CALL apoc.periodic.iterate(
    "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
    "MERGE (a:Author {{id:q.id, name:q.name}})
    SET a.normalizedName=q.normalized_name, a.position=q.position, a.lastAffiliation=q.org,
            a.numCitations=q.n_citation, a.numPublications=q.n_pubs, a.hIndex=q.h_index",
    {{batchSize:10000, iterateList:true, parallel:true}});
    """.format(file_name)
    run_query(query, graph)

In [None]:
## USE TO UPDATE STUFF THAT WAS WRONG

for file_name in authors_files: 
    query = """
    CALL apoc.periodic.iterate(
    "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
    "MATCH (a:Author {{id:q.id}})
    SET a.lastAffiliation=q.org, a.name:q.name, a.normalizedName=q.normalized_name",
    {{batchSize:10000, iterateList:true, parallel:true}});
    """.format(file_name)
    run_query(query, graph)

### Create relationships between authors and quanta

In [None]:
# updated version (experimental)
for file_name in authors_files: 
    query = """
    CALL apoc.periodic.iterate(
    "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
    "MATCH (a:Author {{id:q.id}})
    WITH q, a
    UNWIND q.pubs as pubs
    MATCH (p:Quanta {{id:pubs.i}})
    MERGE (a)-[r:AUTHORED {{order:pubs.r}}]->(p)",
    {{batchSize:10000, iterateList:true, parallel:true}});
    """.format(file_name)
    run_query(query, graph)

### Import Organizations

In [None]:
# Add organization nodes
for file_name in authors_files: 
    query = """
    CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "MERGE (o:Organization {{name: q.org}})",
        {{batchSize:10000, iterateList:true, parallel:true}});
    """.format(data_dir + file_name)
    run_query(query, graph)

### Add relationships between Authors and Org

In [None]:
# Add relationships between authors and organizations
for file_name in authors_files: 
    query = """
    CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "MATCH (a:Author {{id: q.id}})
        WITH a
        WHERE q.org is not null
        MATCH (o:Organization {{name: q.org}})
        CREATE (a)-[:AFFILIATED_WITH]->(o)",
        {{batchSize:10000, iterateList:true, parallel:true}});
    """.format(data_dir + file_name)
    run_query(query, graph)

### Import Tags

In [None]:
# Add tags as nodes
for file_name in authors_files: 
    query = """
    CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "UNWIND q.tags as tags
        MERGE (t:Tag {{name: tags.t}})",
        {{batchSize:10000, iterateList:true, parallel:true}})
    """.format(data_dir + file_name)
    run_query(query, graph)

### Add relationships between Authors and Tags

In [None]:
# Add relationships between authors and tags
for file_name in authors_files: 
    query = """
    CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "UNWIND q.tags as tags
        MATCH (a:Author {{id: q.id}})
        MATCH (t:Tag {{name: tags.t}})
        CREATE (a)-[:HAS_TAG {{weight: tags.weight}}]->(t)",
        {{batchSize:10000, iterateList:true, parallel:true}});
    """.format(data_dir + file_name)
    run_query(query, graph)

### Import citation data from MAGv1

In [None]:
# Add relationships between authors and tags
for file_name in authors_files: 
    query = """
    CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "UNWIND q.tags as tags
        MATCH (a:Author {{normalizedName: q.normalized_name}})
        MATCH (t:Tag {{name: tags.t}})
        CREATE (a)-[:HAS_TAG {{weight: tags.weight}}]->(t)",
        {{batchSize:10000, iterateList:true, parallel:true}});
    """.format(data_dir + file_name)
    run_query(query, graph)

### Add coauthor relationships

In [None]:
query = """
CALL apoc.periodic.iterate(
    "MATCH (q:Quanta) WHERE size((q)<-[:AUTHORED]-()) > 1 RETURN q",
    "WITH [(q)<-[:AUTHORED]-(a) | a] as coAuthors
    UNWIND coAuthors as first
    UNWIND coAuthors as second
    WITH first, second
    WHERE id(first) < id(second)
    MERGE (first)-[r:COAUTHOR]-(second)
    SET r.strength = CASE WHEN r.strength IS NULL THEN 1 ELSE r.strength + 1 END",
{batchSize:10000, iterateList:true, parallel:true});
"""
run_query(query, graph, print_only=False)

# OLD IMPORTS BELOW HERE

### Import quanta

In [8]:
#for file_name in papers_files:
for file_name in papers_files:
    print('Importing {}'.format(file_name))
    query = """
    CALL apoc.periodic.iterate(
    "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
    "MERGE (p:Quanta {{id:q.id, title:q.title, year:q.year, keywords:q.keywords,
        numCitations:q.n_citation, docType:q.doc_type, language:q.lang, 
        publisher:q.publisher, doi:q.doi, pdf:q.pdf, abstract:q.abstract}})",
    {{batchSize:10000, iterateList:true, parallel:true}})
    """.format(file_name)
    run_query(query, graph)
    

Importing file:/import/magtwo/mag_papers_1.txt
Query completed in 327.92 seconds.


### Delete relationships between Quanta and Venue

In [7]:
query = """
MATCH (:Quanta)-[r:PUBLISHED_IN_VENUE]-(:Venue) 
DELETE r
"""
run_query(query, graph)

Query completed in 0.24 seconds.


### Import quanta, authors, and add relationships to author and venue nodes

In [6]:
for file_name in papers_files:
    query = """
    CALL apoc.periodic.iterate(
    "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
    "UNWIND q.id as id
    CREATE (p:Quanta {{id:id, title:q.title, year:q.year, keywords:q.keywords,
        numCitations:q.n_citation, docType:q.doc_type, language:q.lang, 
        publisher:q.publisher, doi:q.doi, pdf:q.pdf, abstract:q.abstract}})
    WITH q, p
    MATCH (y:Year {{value: q.year}})
    CREATE (p)-[:PUBLISHED_IN_YEAR]->(y)
    WITH q, p
    UNWIND q.venue as venue
    MATCH (v:Venue {{name: venue.raw }})
    CREATE (p)-[:PUBLISHED_IN_VENUE]->(v)
    WITH q, p
    UNWIND range(0, size(q.authors)-1) as i
    MERGE (a:Author {{id:q.authors[i].id, name:q.authors[i].name}})
    CREATE (a)-[:AUTHORED {{rank:i}}]->(p)",
    {{batchSize:10000, iterateList:true, parallel:false}});
    """.format(file_name)
    run_query(query, graph, print_only=False)

KeyboardInterrupt: 

Exception ignored in: 'neo4j.bolt._io.ChunkedInputBuffer.receive'
KeyboardInterrupt


ServiceUnavailable: Failed to read from closed connection Address(host='192.168.0.2', port=7687)

### Import Author, Tag, and Organization nodes and relationships between them

In [None]:
# Import authors and make relationships between authors and quanta
for file_name in authors_files: 
    query = """
    CALL apoc.periodic.iterate(
    "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
    "MERGE (a:Author {{id:q.id, name:q.name}})
        SET a.normalizedName=q.normalized_name, a.position=q.position, 
            a.numCitations=q.n_citation, a.numPublications=q.n_pubs, a.hIndex=q.h_index)
    WITH q, a
    UNWIND q.pubs as pubs
    MATCH (p:Quanta {{id:pubs.i}})
    MERGE (a)-[r:AUTHORED]->(p)
    SET  rrank=pubs.r",
    {{batchSize:10000, iterateList:true, parallel:true}});
    """.format(data_dir + file_name)
    run_query(query, graph)

In [None]:
# Add organization nodes
for file_name in authors_files: 
    query = """
    CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "CREATE (o:Organization {{name: q.org}})",
        {{batchSize:10000, iterateList:true, parallel:true}});
    """.format(data_dir + file_name)
    run_query(query, graph)

In [None]:
# Add relationships between authors and organizations
for file_name in authors_files: 
    query = """
    CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "MATCH (a:Author {{normalizedName: q.normalized_name}})
        MATCH (o:Organization {{name: q.org}})
        WHERE q.org is not null
        CREATE (a)-[:AFFILIATED_WITH]->(o)",
        {{batchSize:10000, iterateList:true, parallel:true}});
    """.format(data_dir + file_name)
    run_query(query, graph)

In [None]:
# Add tags as nodes
for file_name in authors_files: 
    query = """
    CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "UNWIND q.tags as tags
        MERGE (t:Tag {{name: tags.t}})",
        {{batchSize:10000, iterateList:true, parallel:true}})
    """.format(data_dir + file_name)
    run_query(query, graph)

In [None]:
# Add relationships between authors and tags
for file_name in authors_files: 
    query = """
    CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "UNWIND q.tags as tags
        MATCH (a:Author {{normalizedName: q.normalized_name}})
        MATCH (t:Tag {{name: tags.t}})
        CREATE (a)-[:HAS_TAG {{weight: tags.weight}}]->(t)",
        {{batchSize:10000, iterateList:true, parallel:true}});
    """.format(data_dir + file_name)
    run_query(query, graph)

### Import Citation Data from MAGv1 Data

In [None]:
# Edit this so that it matches the ids from magv1 with the id properties on existing 
for file_name in v1_papers_files:
    query = """
    CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "MATCH (a:Quanta {{title: q.title}})
        WITH a
        UNWIND q.refs as ref
        MATCH (b:Quanta {{title: ref}})
        CREATE (a)-[:CITES]->(b)",
        {{batchSize:10000, iterateList:true, parallel:true}});
    """.format(data_dir + file_name)
    
run_query(query, graph)

### Add Coauthor Relationships

In [None]:
query = """
CALL apoc.periodic.iterate(
    "MATCH (q:Quanta) WHERE size((q)<-[:AUTHORED]-()) > 1 RETURN q",
    "WITH [(q)<-[:AUTHORED]-(a) | a] as coAuthors
    UNWIND coAuthors as first
    UNWIND coAuthors as second
    WITH first, second
    WHERE id(first) < id(second)
    MERGE (first)-[r:COAUTHOR]-(second)
    SET r.strength = CASE WHEN r.strength IS NULL THEN 1 ELSE r.strength + 1 END",
{batchSize:10000, iterateList:true, parallel:true});
"""
run_query(query, graph, print_only=False)