# Setup

## Connect to graph

In [1]:
from py2neo import Graph, Node, Relationship
import glob, os, time
 
# Connection settings can be overridden through environment variables so that
# credentials do not have to be edited into the notebook. The fallbacks keep the
# notebook's previous behaviour when the variables are unset.
# NOTE(review): the fallback password is still stored in plain text here;
# consider removing it once NEO4J_PASSWORD is set in the environment.
neo4j_uri = os.environ.get('NEO4J_URI', 'bolt://dev:7687')
neo4j_user = os.environ.get('NEO4J_USER', 'neo4j')
neo4j_password = os.environ.get('NEO4J_PASSWORD', 'myneo')
graph = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Report the store's node/relationship id counts as a quick connection check.
n_nodes = graph.database.primitive_counts['NumberOfNodeIdsInUse']
n_relationships = graph.database.primitive_counts['NumberOfRelationshipIdsInUse']
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    n_nodes, n_relationships))

Connected to graph database with 74,155,268 nodes and 0 relationships!


## Setup workspace

In [2]:
# Neo4j 3.5+ resolves file names against its own /import directory, so the
# Neo4j-side prefix stays empty.
data_dir = ''
# Prefix under which this notebook globs the data files to count them.
scipy_data_dir = '/tmp/data/'

print("Neo4j data directory set to `{}`.".format(data_dir))
print("SciPy data directory set to `{}`.".format(scipy_data_dir))

# ---- MAG files ----
# Each numbered file set is counted with glob under `scipy_data_dir`; the
# Neo4j-side names are then generated from the indices 0..count-1.
mag_venues_file = data_dir + 'magtwo/mag_venues.txt'

n_mag_papers_files = len(glob.glob(scipy_data_dir + 'magtwo/mag_papers_*.txt'))
mag_papers_files = [
    data_dir + 'magtwo/mag_papers_{}.txt'.format(file_no)
    for file_no in range(n_mag_papers_files)
]

n_mag_authors_files = len(glob.glob(scipy_data_dir + 'magtwo/mag_authors_*.txt'))
mag_authors_files = [
    data_dir + 'magtwo/mag_authors_{}.txt'.format(file_no)
    for file_no in range(n_mag_authors_files)
]

n_magv1_papers_files = len(glob.glob(scipy_data_dir + 'magone/mag_papers_*.txt'))
mag_v1_papers_files = [
    data_dir + 'magone/mag_papers_{}.txt'.format(file_no)
    for file_no in range(n_magv1_papers_files)
]

print("\nMAG files:")
print("\tVenue files: {}".format(1))
print("\tPapers files: {}".format(n_mag_papers_files))
print("\tAuthors files: {}".format(n_mag_authors_files))
print("\tMAGv1 papers files: {}".format(n_magv1_papers_files))

# ---- AMiner files ----
aminer_venues_file = data_dir + 'aminertwo/aminer_venues.txt'

n_aminer_papers_files = len(glob.glob(scipy_data_dir + 'aminertwo/aminer_papers_*.txt'))
aminer_papers_files = [
    data_dir + 'aminertwo/aminer_papers_{}.txt'.format(file_no)
    for file_no in range(n_aminer_papers_files)
]

n_aminer_authors_files = len(glob.glob(scipy_data_dir + 'aminertwo/aminer_authors_*.txt'))
aminer_authors_files = [
    data_dir + 'aminertwo/aminer_authors_{}.txt'.format(file_no)
    for file_no in range(n_aminer_authors_files)
]

n_aminer_v1_papers_files = len(glob.glob(scipy_data_dir + 'aminerone/aminer_papers_*.txt'))
aminer_v1_papers_files = [
    data_dir + 'aminerone/aminer_papers_{}.txt'.format(file_no)
    for file_no in range(n_aminer_v1_papers_files)
]

print("\nAMiner files:")
print("\tVenue files: {}".format(1))
print("\tPapers files: {}".format(n_aminer_papers_files))
print("\tAuthors files: {}".format(n_aminer_authors_files))
print("\tAMinerv1 papers files: {}".format(n_aminer_v1_papers_files))

# ---- MAG <-> AMiner linking pairs ----
venue_linking_pairs = 'magtwo/venue_linking_pairs.txt'
paper_linking_pairs = 'magtwo/paper_linking_pairs.txt'
author_linking_pairs = 'magtwo/author_linking_pairs.txt'

def run_query(query, graph, print_query=False, run_query=True, print_only=False):
    """Optionally print and/or execute a Cypher query.

    Args:
        query: Cypher statement text.
        graph: object exposing ``run(query)``; the notebook passes its py2neo
            ``Graph``.
        print_query: when true, print the query text before running it.
        run_query: when true, execute the query through ``graph.run``.
        print_only: when true, print the query and skip execution, overriding
            the two flags above.

    Returns:
        float: wall-clock seconds spent printing and/or executing the query.
    """
    if print_only:
        print_query, run_query = True, False

    start_time = time.time()
    if print_query:
        print(query)
    if run_query:
        graph.run(query)
    # Hand the timing back to the caller instead of computing and dropping it.
    return time.time() - start_time

Neo4j data directory set to ``.
SciPy data directory set to `/tmp/data/`.

MAG files:
	Venue files: 1
	Papers files: 11
	Authors files: 13
	MAGv1 papers files: 167

AMiner files:
	Venue files: 1
	Papers files: 15
	Authors files: 20
	AMinerv1 papers files: 155


# Import functions

## Constraints and indices

In [3]:
def add_constraints(print_only=False):
    """Issue one uniqueness-constraint statement per (node pattern, property) pair.

    Args:
        print_only: forwarded to ``run_query``; when true the statements are
            printed instead of executed.
    """
    statement = "CREATE CONSTRAINT ON ({}) ASSERT {} IS UNIQUE;"
    unique_properties = (
        ('n:Quanta', 'n.id'),
        ('n:Quanta', 'n.idv1'),
        ('a:Author', 'a.id'),
        ('v:Venue', 'v.id'),
    )
    for node_pattern, prop in unique_properties:
        run_query(statement.format(node_pattern, prop), graph, print_only=print_only)
        
def add_indices(print_only=False):
    """Issue a ``CREATE INDEX ON`` statement for each label(property) target below.

    Args:
        print_only: forwarded to ``run_query``; when true the statements are
            printed instead of executed.
    """
    index_targets = (
        ':Quanta(cleanName)',
        ':Quanta(doi)',
        ':Venue(cleanName)',
        ':Venue(matchingId)',
        ':Year(value)',
        ':Author(cleanName)',
        ':Organization(cleanName)',
    )
    for target in index_targets:
        run_query("CREATE INDEX ON {};".format(target), graph, print_only=print_only)

## (:Year)

In [4]:
def add_years(min_year, max_year, print_only=False):
    """Merge a (:Year {value}) node for every value of Cypher's ``range(min_year, max_year)``.

    Args:
        min_year: first argument passed to ``range``.
        max_year: second argument passed to ``range``.
        print_only: forwarded to ``run_query``; when true the statement is
            printed instead of executed.
    """
    year_template = """
    UNWIND range({}, {}) as yr
    MERGE (y:Year {{value: yr}})
    """
    run_query(year_template.format(min_year, max_year), graph, print_only=print_only)

## (:Venue)

In [5]:
def add_venues(venues_file, print_only=False):
    """Create one (:Venue) node per record that ``apoc.load.json`` yields from ``venues_file``.

    The node's ``matchingId`` is the first non-null of the record's
    ``JournalId`` and ``ConferenceId``; ``cleanName`` is the display name
    passed through ``apoc.text.clean``.

    Args:
        venues_file: path handed to ``apoc.load.json``.
        print_only: forwarded to ``run_query``; when true the statement is
            printed instead of executed.
    """
    venue_template = """ 
    CALL apoc.load.json('{}') YIELD value AS q
    CREATE (v:Venue)
        SET v.id = q.id,
            v.journalId = q.JournalId, 
            v.conferenceId = q.ConferenceId, 
            v.matchingId = coalesce(q.JournalId, q.ConferenceId),
            v.name = q.DisplayName, 
            v.normalizedName = q.NormalizedName,
            v.cleanName = apoc.text.clean(q.DisplayName);
    """
    cypher = venue_template.format(venues_file)
    run_query(cypher, graph, print_only=print_only)

## (:Quanta)

In [34]:
def add_quanta(papers_files, print_only=False):
    """Import paper records as (:Quanta) nodes, one batched APOC job per file.

    Each record is merged on its ``id``, so every paper maps to its own node
    and re-importing a file updates existing nodes instead of duplicating
    them. Records without an ``id``, and records whose title is longer than
    1000 characters, are skipped. A missing title also fails the filter,
    because the size comparison evaluates to null.

    Args:
        papers_files: iterable of paths handed to ``apoc.load.json``.
        print_only: forwarded to ``run_query``; when true the statements are
            printed instead of executed.
    """
    # The MERGE must carry the `id` key: a bare `MERGE (p:Quanta)` matches every
    # existing Quanta node and would overwrite all of them with each record.
    query_template = """
        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "WHERE q IS NOT NULL AND q.id IS NOT NULL AND size(q.title) <= 1000
         MERGE (p:Quanta {{id: q.id}})
            SET p.title = q.title,
                p.cleanName = apoc.text.clean(q.title),
                p.year = q.year,
                p.numCitations = q.n_citation,
                p.docType = q.doc_type,
                p.language = q.lang,
                p.publisher = q.publisher,
                p.doi = q.doi,
                p.pdf = q.pdf,
                p.abstract = q.abstract;",
        {{batchSize:10000, iterateList:true, parallel:false}});
        """
    for file_name in papers_files:
        print('Importing {}'.format(file_name))
        run_query(query_template.format(file_name), graph, print_only=print_only)

## (:Quanta {idv1})

In [7]:
def merge_quanta_on_cleanname(print_only=False):
    """Merge (:Quanta) nodes that share the same ``cleanName``.

    Nodes having a ``cleanName`` are grouped by it; every group of two or more
    nodes is passed to ``apoc.refactor.mergeNodes`` with
    ``properties:'discard'`` and ``mergeRels:true``.

    Args:
        print_only: forwarded to ``run_query``; when true the statement is
            printed instead of executed.
    """
    dedupe_query = """
    CALL apoc.periodic.iterate(
        "MATCH (q:Quanta) 
         WHERE EXISTS(q.cleanName)
         WITH 
             q.cleanName as cleanName, 
             COLLECT(q) as nodeList, 
             count(*) as count
         WHERE count>=2
         RETURN nodeList",
        "CALL apoc.refactor.mergeNodes(nodeList, 
         {properties:'discard', mergeRels:true})
         YIELD node
         RETURN 'none'",
        {batchSize:10000, iterateList:true, parallel:true});
    """
    run_query(dedupe_query, graph, print_only=print_only)

In [8]:
def add_v1_ids(v1_papers_files, print_only=False):
    """Copy v1 paper ids onto existing (:Quanta) nodes as ``idv1``.

    For every record in each file, the Quanta node whose ``cleanName`` equals
    the cleaned record title gets ``idv1`` set to the record's ``id``.

    Args:
        v1_papers_files: iterable of paths handed to ``apoc.load.json``. They
            are used as given, as the other import functions do; the notebook's
            file lists already include the data-directory prefix.
        print_only: forwarded to ``run_query``; when true the statements are
            printed instead of executed.
    """
    query_template = """
        CALL apoc.periodic.iterate(
            "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
            "MATCH (p:Quanta {{cleanName: apoc.text.clean(q.title)}})
             SET p.idv1 = q.id",
            {{batchSize:10000, iterateList:true, parallel:true}});
        """
    for file_name in v1_papers_files:
        # No extra prefix here: prepending the data directory again would
        # double it whenever it is non-empty.
        run_query(query_template.format(file_name), graph, print_only=print_only)

## (:Quanta)-[:PUBLISHED_IN_YEAR]->(:Year)

In [9]:
def add_quanta_year_edges(papers_files, print_only=False):
    """Create (:Quanta)-[:PUBLISHED_IN_YEAR]->(:Year) relationships from paper files.

    For each record, the Quanta node with the record's ``id`` is linked to the
    Year node whose ``value`` equals the record's ``year``.

    Args:
        papers_files: iterable of paths handed to ``apoc.load.json``.
        print_only: forwarded to ``run_query``; when true the statements are
            printed instead of executed.
    """
    year_edge_template = """
        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "MATCH (p:Quanta {{id:q.id}})
         MATCH (y:Year {{value: q.year}})
         CREATE (p)-[:PUBLISHED_IN_YEAR]->(y)",
        {{batchSize:10000, iterateList:true, parallel:false}})
        """
    for papers_path in papers_files:
        cypher = year_edge_template.format(papers_path)
        run_query(cypher, graph, print_only=print_only)

## (:Quanta)-[:PUBLISHED_IN_VENUE]->(:Venue)

In [10]:
def add_quanta_venue_edges(papers_files, print_only=False):
    """Create (:Quanta)-[:PUBLISHED_IN_VENUE]->(:Venue) relationships from paper files.

    Two statements are run per file:
      1. records with a ``venue.id`` are linked to the existing Venue whose
         ``matchingId`` equals it;
      2. records without a ``venue.id`` but with a ``venue.raw`` name are
         linked to a Venue merged on the cleaned raw name.

    Args:
        papers_files: iterable of paths handed to ``apoc.load.json``.
        print_only: forwarded to ``run_query``; when true the statements are
            printed instead of executed.
    """
    # Pass 1: venues that come with an id.
    known_venue_template = """
        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') 
         YIELD value AS q 
         RETURN q",
        "WHERE exists(q.venue.id)
         MATCH (p:Quanta {{id:q.id}} )
         MATCH (v:Venue {{matchingId:q.venue.id}} )
         CREATE (p)-[:PUBLISHED_IN_VENUE]->(v)",
        {{batchSize:5000, iterateList:true, parallel:false}})        
        """
    # Pass 2: venues without an id are MERGEd on their cleaned raw name, since
    # such a venue may not be in the graph yet.
    raw_venue_template = """
        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') 
         YIELD value AS q 
         RETURN q",
        "WHERE not(exists(q.venue.id)) AND exists(q.venue.raw) 
         MATCH (p:Quanta {{id:q.id}} )
         MERGE (v:Venue {{cleanName:apoc.text.clean(q.venue.raw)}} )
         CREATE (p)-[:PUBLISHED_IN_VENUE]->(v)",
        {{batchSize:5000, iterateList:true, parallel:false}})        
        """
    for papers_path in papers_files:
        print('Processing {}'.format(papers_path))
        for template in (known_venue_template, raw_venue_template):
            run_query(template.format(papers_path), graph, print_only=print_only)

## (:Author)

In [11]:
def add_authors(authors_files, print_only=False):
    """Merge an (:Author) node per record id and set its properties from the record.

    Args:
        authors_files: iterable of paths handed to ``apoc.load.json``.
        print_only: forwarded to ``run_query``; when true the statements are
            printed instead of executed.
    """
    author_template = """
        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "MERGE (a:Author {{id:q.id}})
        SET a.name = q.name, 
            a.normalizedName = q.normalized_name, 
            a.cleanName = apoc.text.clean(q.normalized_name),
            a.position = q.position, 
            a.lastAffiliation = q.org,
            a.numCitations = q.n_citation, 
            a.numPublications = q.n_pubs, 
            a.hIndex=q.h_index",
        {{batchSize:10000, iterateList:true, parallel:true}});
        """
    for authors_path in authors_files:
        print('Importing {}'.format(authors_path))
        cypher = author_template.format(authors_path)
        run_query(cypher, graph, print_only=print_only)

## (:Author)-[:AUTHORED]->(:Quanta)

In [12]:
def add_author_quanta_edges(authors_files, print_only=False):
    """Merge (:Author)-[:AUTHORED]->(:Quanta) relationships from author files.

    Each entry of a record's ``pubs`` list is matched to a Quanta node by its
    ``i`` field; when the relationship is created its ``order`` is set from
    the entry's ``r`` field.

    Args:
        authors_files: iterable of paths handed to ``apoc.load.json``.
        print_only: forwarded to ``run_query``; when true the statements are
            printed instead of executed.
    """
    authored_template = """
        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "MATCH (a:Author {{id:q.id}})
        UNWIND q.pubs as pub
        MATCH (p:Quanta {{id:pub.i}})
        MERGE (a)-[r:AUTHORED]->(p)
        ON CREATE SET r.order = CASE WHEN pub.r IS NULL THEN NULL ELSE pub.r END",
        {{batchSize:10000, iterateList:true, parallel:false}});
        """
    for authors_path in authors_files:
        print('Creating authored relationships for {}'.format(authors_path))
        cypher = authored_template.format(authors_path)
        run_query(cypher, graph, print_only=print_only)

## (:Organization)

In [13]:
def add_organizations(authors_files, print_only=False):
    """Merge an (:Organization) node for every non-null ``org`` in the author files.

    Organizations are merged on the cleaned name (``apoc.text.clean(q.org)``);
    the raw name is stored as ``name`` when the node is first created.

    Args:
        authors_files: iterable of paths handed to ``apoc.load.json``. They
            are used as given, as ``add_authors`` does; the notebook's file
            lists already include the data-directory prefix.
        print_only: forwarded to ``run_query``; when true the statements are
            printed instead of executed.
    """
    # The apoc.text.clean(...) call needs its closing parenthesis before the
    # map's closing brace; without it the statement is not valid Cypher.
    query_template = """
        CALL apoc.periodic.iterate(
            "CALL apoc.load.json('{}')
             YIELD value AS q
             RETURN q",
            "WHERE q.org is not null
             MERGE (o:Organization {{cleanName: apoc.text.clean(q.org)}})
             ON CREATE SET o.name = q.org",
            {{batchSize:10000, iterateList:true, parallel:false}});
        """
    for file_name in authors_files:
        run_query(query_template.format(file_name), graph, print_only=print_only)

## (:Author)-[:AFFILIATED_WITH]->(:Organization)

In [14]:
def add_author_organization_edges(authors_files, print_only=False):
    """Merge (:Author)-[:AFFILIATED_WITH]->(:Organization) relationships from author files.

    For every record with a non-null ``org``, the Author with the record's
    ``id`` is linked to the Organization whose ``cleanName`` equals the
    cleaned ``org``.

    Args:
        authors_files: iterable of paths handed to ``apoc.load.json``. They
            are used as given, as ``add_authors`` does; the notebook's file
            lists already include the data-directory prefix.
        print_only: forwarded to ``run_query``; when true the statements are
            printed instead of executed.
    """
    query_template = """
        CALL apoc.periodic.iterate(
            "CALL apoc.load.json('{}')
             YIELD value AS q
             RETURN q",
            "WHERE q.org IS NOT NULL
             MATCH (a:Author {{id: q.id}})
             MATCH (o:Organization {{cleanName: apoc.text.clean(q.org)}})
             MERGE (a)-[:AFFILIATED_WITH]->(o)",
            {{batchSize:10000, iterateList:true, parallel:false}});
        """
    for file_name in authors_files:
        # No extra prefix here: prepending the data directory again would
        # double it whenever it is non-empty.
        run_query(query_template.format(file_name), graph, print_only=print_only)

## (:Tag)

In [15]:
def add_tags(authors_files, print_only=False):
    """Merge a (:Tag) node for every entry of each author's ``tags`` list.

    Tags are merged on the cleaned ``t`` field of the entry; the raw value is
    stored as ``name`` when the node is first created.

    Args:
        authors_files: iterable of paths handed to ``apoc.load.json``. They
            are used as given, as ``add_authors`` does; the notebook's file
            lists already include the data-directory prefix.
        print_only: forwarded to ``run_query``; when true the statements are
            printed instead of executed.
    """
    # Batches run sequentially (parallel:false): this notebook defines no
    # uniqueness constraint on :Tag, and concurrent MERGEs on an unconstrained
    # property can each create the "same" tag.
    query_template = """
        CALL apoc.periodic.iterate(
            "CALL apoc.load.json('{}')
             YIELD value AS q
             RETURN q",
            "WHERE exists(q.tags)
             UNWIND q.tags as tag
             MERGE (t:Tag {{cleanName: apoc.text.clean(tag.t)}})
             ON CREATE SET t.name = tag.t",
            {{batchSize:10000, iterateList:true, parallel:false}})
        """
    for file_name in authors_files:
        run_query(query_template.format(file_name), graph, print_only=print_only)

## (:Author)-[:HAS_TAG]->(:Tag)

In [16]:
def add_author_tag_edges(authors_files, print_only=False):
    """Merge (:Author)-[:HAS_TAG]->(:Tag) relationships from author files.

    For every record with a non-null ``tags`` list, the Author with the
    record's ``id`` is linked to each Tag whose ``cleanName`` equals the
    cleaned ``t`` field of a list entry.

    Args:
        authors_files: iterable of paths handed to ``apoc.load.json``.
        print_only: forwarded to ``run_query``; when true the statements are
            printed instead of executed.
    """
    tag_edge_template = """
        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') 
         YIELD value AS q 
         RETURN q",
        "WHERE q.tags IS NOT NULL 
         MATCH (a:Author {{id:q.id}})
         UNWIND q.tags as tag
         MATCH (t:Tag {{cleanName:apoc.text.clean(tag.t)}})
         MERGE (a)-[:HAS_TAG]->(t)",
        {{batchSize:10000, iterateList:true, parallel:false}});
        """
    for authors_path in authors_files:
        print("Processing {}...".format(authors_path))
        cypher = tag_edge_template.format(authors_path)
        run_query(cypher, graph, print_only=print_only)

## (:Quanta)-[:HAS_TAG]->(:Tag)

In [17]:
def add_mag_quanta_tag_edges(papers_files, print_only=False):
    """Merge (:Quanta)-[:HAS_TAG]->(:Tag) relationships from paper files.

    For every record with a non-null ``keywords`` list, the matched Quanta
    node is linked to each Tag whose ``cleanName`` equals the cleaned ``t``
    field of a keyword entry.

    Args:
        papers_files: iterable of paths handed to ``apoc.load.json``.
        print_only: forwarded to ``run_query``; when true the statements are
            printed instead of executed.
    """
    # The node variable is `p`, not `q`: `q` is already bound to the JSON
    # record, and Cypher rejects re-declaring it as a node.
    # NOTE(review): the node is matched with its `id` equal to the record's
    # `idv1` field, exactly as originally written. add_quanta_quanta_edges
    # matches the other way round ({idv1: q.id}) - confirm which direction
    # these files need. Also confirm keyword entries are maps with a `t` field.
    query_template = """
        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}')
         YIELD value AS q
         RETURN q",
        "WHERE q.keywords IS NOT NULL
         MATCH (p:Quanta {{id:q.idv1}})
         UNWIND q.keywords as tag
         MATCH (t:Tag {{cleanName:apoc.text.clean(tag.t)}})
         MERGE (p)-[:HAS_TAG]->(t)",
        {{batchSize:10000, iterateList:true, parallel:false}});
        """
    # Iterate the function's own argument (the loop used an undefined name).
    for file_name in papers_files:
        print("Processing {}...".format(file_name))
        run_query(query_template.format(file_name), graph, print_only=print_only)

## (:Quanta)-[:CITES]->(:Quanta)

In [18]:
def add_quanta_quanta_edges(v1_papers_files, print_only=False):
    """Merge (:Quanta)-[:CITES]->(:Quanta) relationships from v1 paper files.

    For every record, the citing paper is the Quanta node whose ``idv1``
    equals the record's ``id``; each entry of the record's ``references``
    list is resolved to a cited Quanta node and linked with CITES.

    Args:
        v1_papers_files: iterable of paths handed to ``apoc.load.json``. They
            are used as given, as the other import functions do; the
            notebook's file lists already include the data-directory prefix.
        print_only: forwarded to ``run_query``; when true the statements are
            printed instead of executed.
    """
    # The cited node must be bound before the MERGE: merging a pattern with an
    # unbound end node creates a new blank node instead of linking to a paper.
    # NOTE(review): assumes each `references` entry is an id in the same
    # scheme as the record's own `id` (i.e. a v1 id) - confirm against the data.
    query_template = """
        CALL apoc.periodic.iterate(
            "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
            "MATCH (p:Quanta {{idv1: q.id}})
            UNWIND q.references as ref
            MATCH (b:Quanta {{idv1: ref}})
            MERGE (p)-[:CITES]->(b)",
            {{batchSize:10000, iterateList:true, parallel:false}});
        """
    for file_name in v1_papers_files:
        print("Processing {}...".format(file_name))
        run_query(query_template.format(file_name), graph, print_only=print_only)

## (:Author)-[:COAUTHOR]-(:Author)

In [19]:
def add_author_author_edges(print_only=False):
    """Merge (:Author)-[:COAUTHOR]-(:Author) relationships and count shared papers.

    For every Quanta node with more than one AUTHORED relationship, each pair
    of its authors (kept once, via ``id(first) < id(second)``) gets an
    undirected COAUTHOR relationship merged. Its ``strength`` is set to 1 the
    first time and incremented by 1 on every later match.

    Args:
        print_only: forwarded to ``run_query``; when true the statement is
            printed instead of executed.
    """
    coauthor_query = """
    CALL apoc.periodic.iterate(
        "MATCH (q:Quanta) WHERE size((q)<-[:AUTHORED]-()) > 1 RETURN q",
        "WITH [(q)<-[:AUTHORED]-(a) | a] as coAuthors
        UNWIND coAuthors as first
        UNWIND coAuthors as second
        WITH first, second
        WHERE id(first) < id(second)
        MERGE (first)-[r:COAUTHOR]-(second)
        SET r.strength = CASE WHEN r.strength IS NULL THEN 1 ELSE r.strength + 1 END",
    {batchSize:10000, iterateList:true, parallel:false});
    """
    run_query(coauthor_query, graph, print_only=print_only)

## Merge nodes that occur in both MAG and AMiner databases

In [20]:
def merge_mag_aminer_venues(print_only=False,
                            linking_pairs_file='magtwo/venue_linking_pairs.txt'):
    """Merge (:Venue) node pairs listed in a MAG/AMiner linking-pairs file.

    Each record supplies ``mid`` and ``aid``; the two Venue nodes carrying
    those ids are combined with ``apoc.refactor.mergeNodes`` using
    ``properties:'discard'`` and ``mergeRels:true``.

    Args:
        print_only: forwarded to ``run_query``; when true the statement is
            printed instead of executed.
        linking_pairs_file: path handed to ``apoc.load.json``; defaults to the
            path that was previously hard-coded in the query.
    """
    # The template is passed through str.format, so every literal Cypher brace
    # is doubled. mergeNodes takes a plain list of nodes, hence `[m, a]`.
    # Batches run sequentially because merging relationships touches the
    # neighbouring nodes, which parallel batches could lock against each other.
    query = """
    CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}')
         YIELD value as q
         RETURN q",
        "MATCH (m:Venue {{id:q.mid}})
         MATCH (a:Venue {{id:q.aid}})
         CALL apoc.refactor.mergeNodes([m, a],
             {{properties:'discard', mergeRels:true}}) YIELD node
         RETURN 'none'",
        {{batchSize:10000, iterateList:true, parallel:false}});
    """.format(linking_pairs_file)
    run_query(query, graph, print_only=print_only)

In [21]:
def merge_mag_aminer_authors(print_only=False,
                             linking_pairs_file='magtwo/author_linking_pairs.txt'):
    """Merge (:Author) node pairs listed in a MAG/AMiner linking-pairs file.

    Each record supplies ``mid`` and ``aid``; the two Author nodes carrying
    those ids are combined with ``apoc.refactor.mergeNodes`` using
    ``properties:'discard'`` and ``mergeRels:true``.

    Args:
        print_only: forwarded to ``run_query``; when true the statement is
            printed instead of executed.
        linking_pairs_file: path handed to ``apoc.load.json``; defaults to the
            path that was previously hard-coded in the query.
    """
    # The template is passed through str.format, so every literal Cypher brace
    # is doubled. mergeNodes takes a plain list of nodes, hence `[m, a]`.
    # Batches run sequentially because merging relationships touches the
    # neighbouring nodes, which parallel batches could lock against each other.
    query = """
    CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}')
         YIELD value as q
         RETURN q",
        "MATCH (m:Author {{id:q.mid}})
         MATCH (a:Author {{id:q.aid}})
         CALL apoc.refactor.mergeNodes([m, a],
             {{properties:'discard', mergeRels:true}}) YIELD node
         RETURN 'none'",
        {{batchSize:10000, iterateList:true, parallel:false}});
    """.format(linking_pairs_file)
    run_query(query, graph, print_only=print_only)

In [22]:
def merge_mag_aminer_papers(print_only=False,
                            linking_pairs_file='magtwo/paper_linking_pairs.txt'):
    """Merge (:Quanta) node pairs listed in a MAG/AMiner linking-pairs file.

    Each record supplies ``mid`` and ``aid``; the two Quanta nodes carrying
    those ids are combined with ``apoc.refactor.mergeNodes`` using
    ``properties:'discard'`` and ``mergeRels:true``.

    Args:
        print_only: forwarded to ``run_query``; when true the statement is
            printed instead of executed.
        linking_pairs_file: path handed to ``apoc.load.json``; defaults to the
            path that was previously hard-coded in the query.
    """
    # The template is passed through str.format, so every literal Cypher brace
    # is doubled. mergeNodes takes a plain list of nodes, hence `[m, a]`.
    # Batches run sequentially because merging relationships touches the
    # neighbouring nodes, which parallel batches could lock against each other.
    query = """
    CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}')
         YIELD value as q
         RETURN q",
        "MATCH (m:Quanta {{id:q.mid}})
         MATCH (a:Quanta {{id:q.aid}})
         CALL apoc.refactor.mergeNodes([m, a],
             {{properties:'discard', mergeRels:true}}) YIELD node
         RETURN 'none'",
        {{batchSize:10000, iterateList:true, parallel:false}});
    """.format(linking_pairs_file)
    run_query(query, graph, print_only=print_only)

# Run Import

## Setup database

In [5]:
# Create the uniqueness constraints and the indexes, then merge a (:Year) node
# for each value of range(1700, 2020).
add_constraints(print_only=False)
add_indices(print_only=False)
add_years(1700, 2020)

CREATE CONSTRAINT ON (n:Quanta) ASSERT n.id IS UNIQUE;
CREATE CONSTRAINT ON (n:Quanta) ASSERT n.idv1 IS UNIQUE;
CREATE CONSTRAINT ON (a:Author) ASSERT a.id IS UNIQUE;
CREATE CONSTRAINT ON (v:Venue) ASSERT v.id IS UNIQUE;
CREATE INDEX ON :Quanta(cleanName);
CREATE INDEX ON :Quanta(doi);
CREATE INDEX ON :Venue(cleanName);
CREATE INDEX ON :Venue(matchingId);
CREATE INDEX ON :Year(value);
CREATE INDEX ON :Author(cleanName);
CREATE INDEX ON :Organization(cleanName);


## Import MAG

In [24]:
# Load the MAG venue file as (:Venue) nodes.
add_venues(mag_venues_file)

In [35]:
# Import the MAG paper files as (:Quanta) nodes, one APOC job per file.
add_quanta(mag_papers_files, print_only=False)

Importing magtwo/mag_papers_0.txt

        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('magtwo/mag_papers_0.txt') YIELD value AS q RETURN q",
        "WHERE q IS NOT NULL AND size(q.title) <= 1000
         CREATE (p:Quanta)
            SET p.id = q.id, 
                p.title = q.title, 
                p.cleanName = apoc.text.clean(q.title),
                p.year = q.year, 
                p.numCitations = q.n_citation, 
                p.docType = q.doc_type, 
                p.language = q.lang, 
                p.publisher = q.publisher, 
                p.doi = q.doi, 
                p.pdf = q.pdf, 
                p.abstract = q.abstract;",
        {batchSize:5000, iterateList:true, parallel:false});
        
Importing magtwo/mag_papers_1.txt

        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('magtwo/mag_papers_1.txt') YIELD value AS q RETURN q",
        "WHERE q IS NOT NULL AND size(q.title) <= 1000
         CREATE (p:Quanta)
            SET p.id =

In [33]:
# Merge Quanta nodes that share a cleanName, then attach v1 ids (idv1) to
# Quanta nodes by matching cleaned titles from the MAG v1 paper files.
merge_quanta_on_cleanname(print_only=False)
add_v1_ids(mag_v1_papers_files, print_only=False)


    CALL apoc.periodic.iterate(
        "MATCH (q:Quanta) 
         WHERE EXISTS(q.cleanName)
         WITH 
             q.cleanName as cleanName, 
             COLLECT(q) as nodeList, 
             count(*) as count
         WHERE count>=2
         RETURN nodeList",
        "CALL apoc.refactor.mergeNodes(nodeList, 
         {properties:'discard', mergeRels:true})
         YIELD node
         RETURN 'none'",
        {batchSize:10000, iterateList:true, parallel:true});
    


In [36]:
# Dry run: print_only=True prints the PUBLISHED_IN_YEAR queries without executing them.
add_quanta_year_edges(mag_papers_files, print_only=True)


        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('mag_first1m.txt') YIELD value AS q RETURN q",
        "MATCH (p:Quanta {id:q.id})
         MATCH (y:Year {value: p.year})
         CREATE (p)-[:PUBLISHED_IN_YEAR]->(y)",
        {batchSize:10000, iterateList:true, parallel:false})
        


In [38]:
# Dry run: print_only=True prints the PUBLISHED_IN_VENUE queries without executing them.
add_quanta_venue_edges(mag_papers_files, print_only=True)
# TODO: need to re-add

Processing mag_first1m.txt

        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('mag_first1m.txt') 
         YIELD value AS q 
         RETURN q",
        "WHERE exists(q.venue.id)
         MATCH (p:Quanta {id:q.id} )
         MATCH (v:Venue {matchingId:q.venue.id} )
         CREATE (p)-[:PUBLISHED_IN_VENUE]->(v)",
        {batchSize:5000, iterateList:true, parallel:false})        
        

        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('mag_first1m.txt') 
         YIELD value AS q 
         RETURN q",
        "WHERE not(exists(q.venue.id)) AND exists(q.venue.raw) 
         MATCH (p:Quanta {id:q.id} )
         MERGE (v:Venue {cleanName:apoc.text.clean(q.venue.raw)} )
         CREATE (p)-[:PUBLISHED_IN_VENUE]->(v)",
        {batchSize:5000, iterateList:true, parallel:false})        
        


In [25]:
# Dry run: print_only=True prints the (:Author) import queries without executing them.
add_authors(mag_authors_files, print_only=True)

Importing magtwo/mag_authors_0.txt

        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('magtwo/mag_authors_0.txt') YIELD value AS q RETURN q",
        "MERGE (a:Author {id:q.id})
        SET a.name = q.name, 
            a.normalizedName = q.normalized_name, 
            a.cleanName = apoc.text.clean(q.normalized_name),
            a.position = q.position, 
            a.lastAffiliation = q.org,
            a.numCitations = q.n_citation, 
            a.numPublications = q.n_pubs, 
            a.hIndex=q.h_index",
        {batchSize:10000, iterateList:true, parallel:true});
        
Importing magtwo/mag_authors_1.txt

        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('magtwo/mag_authors_1.txt') YIELD value AS q RETURN q",
        "MERGE (a:Author {id:q.id})
        SET a.name = q.name, 
            a.normalizedName = q.normalized_name, 
            a.cleanName = apoc.text.clean(q.normalized_name),
            a.position = q.position, 
            a.lastAffil

In [26]:
# Dry run: print_only=True prints the AUTHORED relationship queries without executing them.
add_author_quanta_edges(mag_authors_files, print_only=True)

Creating authored relationships for magtwo/mag_authors_0.txt

        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('magtwo/mag_authors_0.txt') YIELD value AS q RETURN q",
        "MATCH (a:Author {id:q.id})
        UNWIND q.pubs as pub
        MATCH (p:Quanta {id:pub.i})
        MERGE (a)-[r:AUTHORED]->(p)
        ON CREATE SET r.order = CASE WHEN pub.r IS NULL THEN NULL ELSE pub.r END",
        {batchSize:10000, iterateList:true, parallel:false});
        
Creating authored relationships for magtwo/mag_authors_1.txt

        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('magtwo/mag_authors_1.txt') YIELD value AS q RETURN q",
        "MATCH (a:Author {id:q.id})
        UNWIND q.pubs as pub
        MATCH (p:Quanta {id:pub.i})
        MERGE (a)-[r:AUTHORED]->(p)
        ON CREATE SET r.order = CASE WHEN pub.r IS NULL THEN NULL ELSE pub.r END",
        {batchSize:10000, iterateList:true, parallel:false});
        
Creating authored relationships for magtwo/mag_autho

In [None]:
# Merge (:Organization) nodes from the `org` field of the MAG author files.
add_organizations(mag_authors_files)

In [None]:
# Link MAG authors to their organizations with AFFILIATED_WITH relationships.
add_author_organization_edges(mag_authors_files)

In [None]:
# Merge (:Tag) nodes from the `tags` lists of the MAG author files.
add_tags(mag_authors_files)

In [None]:
# Link MAG authors to their tags with HAS_TAG relationships.
add_author_tag_edges(mag_authors_files)

In [None]:
# Add CITES relationships from the `references` lists of the MAG v1 paper files.
add_quanta_quanta_edges(mag_v1_papers_files)

## Import AMiner

In [18]:
# Load the AMiner venue file as (:Venue) nodes.
add_venues(aminer_venues_file)

Query completed in 5.50 seconds.


In [None]:
# Import the AMiner paper files as (:Quanta) nodes, one APOC job per file.
add_quanta(aminer_papers_files)

In [None]:
# Attach v1 ids (idv1) to Quanta nodes by matching cleaned titles from the AMiner v1 paper files.
add_v1_ids(aminer_v1_papers_files)

In [None]:
# Link AMiner papers to their (:Year) nodes with PUBLISHED_IN_YEAR relationships.
add_quanta_year_edges(aminer_papers_files)

In [None]:
# Link AMiner papers to their venues with PUBLISHED_IN_VENUE relationships.
add_quanta_venue_edges(aminer_papers_files)
# TODO: need to re-add

In [None]:
# Merge (:Author) nodes from the AMiner author files.
add_authors(aminer_authors_files)

In [None]:
# Link AMiner authors to their papers with AUTHORED relationships.
add_author_quanta_edges(aminer_authors_files)

In [None]:
# Merge (:Organization) nodes from the `org` field of the AMiner author files.
add_organizations(aminer_authors_files)

In [None]:
# Link AMiner authors to their organizations with AFFILIATED_WITH relationships.
add_author_organization_edges(aminer_authors_files)

In [None]:
# Merge (:Tag) nodes from the `tags` lists of the AMiner author files.
add_tags(aminer_authors_files)

In [None]:
# Link AMiner authors to their tags with HAS_TAG relationships.
add_author_tag_edges(aminer_authors_files)
# Status note: adding

In [None]:
# Add CITES relationships from the `references` lists of the AMiner v1 paper files.
add_quanta_quanta_edges(aminer_v1_papers_files)

## Further additions and modifications

In [None]:
# Derive COAUTHOR relationships between authors who share a paper.
add_author_author_edges()

In [None]:
# Merge venues that appear in both MAG and AMiner. Called without positional
# arguments because the function's first positional parameter is `print_only`:
# a path string passed there is truthy and would only print the query. The
# function reads 'magtwo/venue_linking_pairs.txt' on its own.
merge_mag_aminer_venues()
# Not done

In [None]:
# Merge papers that appear in both MAG and AMiner. Called without positional
# arguments because the function's first positional parameter is `print_only`:
# a path string passed there is truthy and would only print the query. The
# function reads 'magtwo/paper_linking_pairs.txt' on its own.
merge_mag_aminer_papers()
# Not done

In [None]:
# Merge authors that appear in both MAG and AMiner. Called without positional
# arguments because the function's first positional parameter is `print_only`:
# a path string passed there is truthy and would only print the query. The
# function reads 'magtwo/author_linking_pairs.txt' on its own.
merge_mag_aminer_authors()
# Not done

In [None]:
# TODO: merge duplicate venues, organizations, and authors?

# TODO: merge duplicate citations