# Setup

## Connect to graph

In [None]:
from py2neo import Graph, Node, Relationship
import glob, os, time
 
# Connection settings. Each value can be overridden through an environment
# variable so that credentials do not have to be edited into the notebook;
# the fallbacks reproduce the connection that was previously hard-coded here.
# (Former local alternative: bolt://localhost:7687 with user 'neo4j'.)
NEO4J_URI = os.environ.get('NEO4J_URI', 'bolt://neo4j-magtwo:7687')
NEO4J_USER = os.environ.get('NEO4J_USER', 'neo4j')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD', 'myneo')

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Read the store statistics once and reuse them for both counters.
primitive_counts = graph.database.primitive_counts
n_nodes = primitive_counts['NumberOfNodeIdsInUse']
n_relationships = primitive_counts['NumberOfRelationshipIdsInUse']
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    n_nodes, n_relationships))

## Setup workspace

In [None]:
# In Neo4j 3.5+, the directory is automatically set to /import
data_dir = ''
scipy_data_dir = '/tmp/data/'

print("Neo4j data directory set to `{}`.".format(data_dir))
print("SciPy data directory set to `{}`.".format(scipy_data_dir))


def _numbered_files(glob_pattern, name_template):
    """Count the chunk files matching ``glob_pattern`` and list their load paths.

    The count comes from globbing under ``scipy_data_dir``; the returned paths
    are ``data_dir + name_template.format(i)`` for ``i`` in ``range(count)``,
    i.e. the chunks are taken to be numbered 0..count-1.

    Returns:
        ``(count, paths)``.
    """
    count = len(glob.glob(scipy_data_dir + glob_pattern))
    paths = [data_dir + name_template.format(idx) for idx in range(count)]
    return count, paths


# MAG files
mag_venues_file = data_dir + 'magtwo/mag_venues.txt'
n_mag_papers_files, mag_papers_files = _numbered_files(
    'magtwo/mag_papers_*.txt', 'magtwo/mag_papers_{}.txt')
n_mag_authors_files, mag_authors_files = _numbered_files(
    'magtwo/mag_authors_*.txt', 'magtwo/mag_authors_{}.txt')
n_magv1_papers_files, mag_v1_papers_files = _numbered_files(
    'magone/mag_papers_*.txt', 'magone/mag_papers_{}.txt')

print("\nMAG files:")
print("\tVenue files: {}".format(1))
print("\tPapers files: {}".format(n_mag_papers_files))
print("\tAuthors files: {}".format(n_mag_authors_files))
print("\tMAGv1 papers files: {}".format(n_magv1_papers_files))

# AMiner files
aminer_venues_file = data_dir + 'aminertwo/aminer_venues.txt'
n_aminer_papers_files, aminer_papers_files = _numbered_files(
    'aminertwo/aminer_papers_*.txt', 'aminertwo/aminer_papers_{}.txt')
n_aminer_authors_files, aminer_authors_files = _numbered_files(
    'aminertwo/aminer_authors_*.txt', 'aminertwo/aminer_authors_{}.txt')
n_aminer_v1_papers_files, aminer_v1_papers_files = _numbered_files(
    'aminerone/aminer_papers_*.txt', 'aminerone/aminer_papers_{}.txt')

print("\nAMiner files:")
print("\tVenue files: {}".format(1))
print("\tPapers files: {}".format(n_aminer_papers_files))
print("\tAuthors files: {}".format(n_aminer_authors_files))
print("\tAMinerv1 papers files: {}".format(n_aminer_v1_papers_files))

# Linking pairs (`paper_linkes` keeps its existing spelling so that any code
# referring to it by that name continues to work).
venue_links = 'venue_links'
paper_linkes = ''
author_links = ''

def run_query(query, graph, print_query=False, run_query=True, print_only=False):
    """Optionally print and/or execute a Cypher query and report how long it took.

    Args:
        query: Cypher text to send.
        graph: Object exposing ``run(query)`` (in this notebook, the py2neo ``Graph``).
        print_query: Print the query text before anything else happens.
        run_query: Execute the query. (This parameter shadows the function's
            own name, but only inside this body.)
        print_only: Shortcut for ``print_query=True, run_query=False``.

    Returns:
        Whatever ``graph.run(query)`` returned, so the caller can inspect it
        (e.g. the summary row of ``apoc.periodic.iterate``); ``None`` when the
        query was not executed.
    """
    if print_only:
        print_query, run_query = True, False
    if print_query:
        print(query)
    if not run_query:
        # Nothing was sent, so do not claim that a query "completed".
        return None

    start_time = time.time()
    result = graph.run(query)
    seconds_elapsed = time.time() - start_time
    print("Query completed in {:.2f} seconds.".format(seconds_elapsed))
    return result

# Define import functions

## Constraints and indices

In [None]:
def add_constraints():
    """Issue one ``CREATE CONSTRAINT ... IS UNIQUE`` statement per (pattern, property) pair.

    Every statement is printed and executed through ``run_query`` against the
    module-level ``graph``.
    """
    statement = "CREATE CONSTRAINT ON ({}) ASSERT {} IS UNIQUE;"
    unique_properties = (
        ('n:Quanta', 'n.id'),
        ('t:Tag', 't.name'),
        ('a:Author', 'a.id'),
        ('o:Organization', 'o.name'),
        ('v:Venue', 'v.id'),
        ('v:Venue', 'v.name'),
    )
    for node_pattern, unique_property in unique_properties:
        run_query(statement.format(node_pattern, unique_property), graph, print_query=True)
        
def add_indices():
    """Issue one ``CREATE INDEX ON :Label(property)`` statement per indexed property.

    Property names are case-sensitive and must match what the loaders write:
    ``add_quanta`` stores the language as ``language`` and the document type
    as ``docType``, so those are the names indexed here.

    Every statement is printed and executed through ``run_query`` against the
    module-level ``graph``.
    """
    indices = [':Quanta(year)',
               ':Quanta(language)',
               ':Quanta(keywords)',
               ':Quanta(title)',
               # NOTE(review): no loader in this notebook sets a `venue`
               # property on :Quanta; confirm whether this index is needed.
               ':Quanta(venue)',
               ':Quanta(docType)',
               ':Quanta(idv1)',
               ':Venue(normalizedName)',
               ':Year(value)',
               ':Author(name)',
               ':Author(normalizedName)']

    for index in indices:
        query = "CREATE INDEX ON {};".format(index)
        run_query(query, graph, print_query=True)

## (:Year)

In [None]:
def add_years(min_year, max_year):
    """MERGE one ``(:Year {value})`` node for every value of Cypher ``range(min_year, max_year)``.

    Args:
        min_year: First argument interpolated into ``range(...)``.
        max_year: Second argument interpolated into ``range(...)``.
    """
    year_bounds = (min_year, max_year)
    cypher_template = """
    UNWIND range({}, {}) as yr
    MERGE (y:Year {{value: yr}})
    """
    run_query(cypher_template.format(*year_bounds), graph)

## (:Venue)

In [None]:
def add_venues(venues_file):
    """MERGE one ``:Venue`` node per record of ``venues_file`` and set its attributes.

    The file is read with ``apoc.load.json``; each record ``q`` is merged on
    ``q.id`` and then gets ``journalId``, ``conferenceId``, ``name`` and
    ``normalizedName`` from ``q.JournalId``, ``q.ConferenceId``,
    ``q.DisplayName`` and ``q.NormalizedName``.

    Args:
        venues_file: Path handed to ``apoc.load.json``.
    """
    # SET assigns with `=` (the `key:value` form is only valid inside a map
    # literal), and the statement ends right after the last assignment.
    query = """
    CALL apoc.periodic.iterate(
    "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
    "MERGE (v:Venue {{id:q.id}})
    SET v.journalId = q.JournalId, v.conferenceId = q.ConferenceId,
        v.name = q.DisplayName, v.normalizedName = q.NormalizedName",
    {{batchSize:10000, iterateList:true, parallel:true}});
    """.format(venues_file)
    run_query(query, graph, print_query=True)

## (:Quanta)

In [None]:
def add_quanta(papers_files):
    """CREATE one ``:Quanta`` node per paper record in each of ``papers_files``.

    Each file is read with ``apoc.load.json`` and processed in batches by
    ``apoc.periodic.iterate``. Record fields are mapped onto node properties
    as written in the query (``n_citation`` -> ``numCitations``,
    ``doc_type`` -> ``docType``, ``lang`` -> ``language``, ...).

    Args:
        papers_files: Iterable of paths handed to ``apoc.load.json``.
    """
    for file_name in papers_files:
        print('Importing {}'.format(file_name))
        # The outer statement must hand `q` to the inner one, hence the
        # explicit `RETURN q` (as in every other loader in this notebook).
        query = """
        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "CREATE (p:Quanta {{id:q.id, title:q.title, year:q.year, keywords:q.keywords,
            numCitations:q.n_citation, docType:q.doc_type, language:q.lang,
            publisher:q.publisher, doi:q.doi, pdf:q.pdf, abstract:q.abstract}})",
        {{batchSize:10000, iterateList:true, parallel:true}})
        """.format(file_name)
        run_query(query, graph)

## (:Quanta)-[:PUBLISHED_IN_YEAR]->(:Year)

In [None]:
def add_quanta_year_edges(papers_files):
    """CREATE ``(:Quanta)-[:PUBLISHED_IN_YEAR]->(:Year)`` for the papers in ``papers_files``.

    For every record the paper is looked up by ``id`` and linked to the
    ``:Year`` node whose ``value`` equals the paper's stored ``year``.
    Papers or years that do not exist produce no relationship. The
    relationship is CREATEd, so running this twice on the same file
    duplicates the edges.

    Args:
        papers_files: Iterable of paths handed to ``apoc.load.json``.
    """
    for file_name in papers_files:
        print('Creating year relationships for {}'.format(file_name))
        # parallel:false — almost every batch attaches relationships to the
        # same few :Year nodes, and concurrent batches writing to the same
        # node can deadlock and be dropped. The other relationship loaders in
        # this notebook already run sequentially.
        query = """
        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "MATCH (p:Quanta {{id:q.id}})
        WITH p
        MATCH (y:Year {{value: p.year}})
        CREATE (p)-[:PUBLISHED_IN_YEAR]->(y)",
        {{batchSize:10000, iterateList:true, parallel:false}})
        """.format(file_name)
        run_query(query, graph)

## (:Quanta)-[:PUBLISHED_IN_VENUE]->(:Venue)

In [None]:
def add_quanta_venue_edges(papers_files):
    """Link each paper in ``papers_files`` to the ``:Venue`` named by its ``venue.raw``.

    The paper is looked up by ``id``; the venue is MERGEd on ``name`` (so it
    is created when missing) and a ``PUBLISHED_IN_VENUE`` relationship is
    CREATEd from the paper to it. Records without a ``venue.raw`` value are
    skipped.

    Args:
        papers_files: Iterable of paths handed to ``apoc.load.json``.
    """
    for file_name in papers_files:
        print('Importing {}'.format(file_name))
        # * MERGE cannot use a null property value, so rows without
        #   `venue.raw` are filtered out instead of failing their batch.
        # * parallel:false — many papers share a venue, and concurrent batches
        #   MERGE-ing / linking the same :Venue can deadlock and be dropped.
        query = """
        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "MATCH (p:Quanta {{id:q.id}})
        WITH q, p
        UNWIND q.venue as venue
        WITH distinct q, p, venue
        WHERE venue.raw IS NOT NULL
        MERGE (v:Venue {{name: venue.raw}})
        WITH p, v
        CREATE (p)-[:PUBLISHED_IN_VENUE]->(v)",
        {{batchSize:10000, iterateList:true, parallel:false}})
        """.format(file_name)
        run_query(query, graph)

## (:Author)

In [None]:
def add_authors(authors_files):
    """MERGE one ``:Author`` node per record in each of ``authors_files``.

    Authors are merged on ``id`` alone; ``name`` and the remaining attributes
    (``normalizedName``, ``position``, ``lastAffiliation``, ``numCitations``,
    ``numPublications``, ``hIndex``) are then SET from the record.

    Args:
        authors_files: Iterable of paths handed to ``apoc.load.json``.
    """
    for file_name in authors_files:
        print('Importing {}'.format(file_name))
        # Merging on {id, name} together would error on records whose name is
        # null (MERGE rejects null property values) and would try to create a
        # second node when an id reappears with a different spelling of the
        # name. `id` is the identifying key, so it is the only MERGE key.
        query = """
        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "MERGE (a:Author {{id:q.id}})
        SET a.name=q.name, a.normalizedName=q.normalized_name, a.position=q.position,
                a.lastAffiliation=q.org, a.numCitations=q.n_citation,
                a.numPublications=q.n_pubs, a.hIndex=q.h_index",
        {{batchSize:10000, iterateList:true, parallel:true}});
        """.format(file_name)
        run_query(query, graph)

## (:Author)-[:AUTHORED]->(:Quanta)

In [None]:
def add_author_quanta_edges(authors_files):
    """MERGE ``(:Author)-[:AUTHORED]->(:Quanta)`` from the ``pubs`` list of each author record.

    For every author record the author is looked up by ``id``; each entry of
    ``q.pubs`` is matched to the paper whose ``id`` equals ``pubs.i`` and the
    two are linked. When the relationship is newly created its ``order`` is
    set to ``pubs.r`` (left unset when ``pubs.r`` is null). Authors or papers
    that do not exist produce no relationship.

    Args:
        authors_files: Iterable of paths handed to ``apoc.load.json``.
    """
    for file_name in authors_files:
        print('Creating authored relationships for {}'.format(file_name))
        query = """
        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "MATCH (a:Author {{id:q.id}})
        WITH q, a
        UNWIND q.pubs as pubs
        MATCH (p:Quanta {{id:pubs.i}})
        MERGE (a)-[r:AUTHORED]->(p)
        ON CREATE SET r.order = pubs.r",
        {{batchSize:10000, iterateList:true, parallel:false}});
        """.format(file_name)
        run_query(query, graph)

## (:Organization)

In [None]:
def add_organizations(authors_files):
    """MERGE one ``:Organization`` node for every non-null ``org`` in ``authors_files``.

    Args:
        authors_files: Iterable of paths handed to ``apoc.load.json``. The
            paths are used as given; the lists built in the workspace cell
            already include ``data_dir``.
    """
    for file_name in authors_files:
        # The filter is attached to an explicit `WITH q` so that the inner
        # statement is valid on its own rather than relying on whatever
        # clause apoc.periodic.iterate puts in front of it.
        query = """
        CALL apoc.periodic.iterate(
            "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
            "WITH q
            WHERE q.org IS NOT NULL
            MERGE (o:Organization {{name: q.org}})",
            {{batchSize:10000, iterateList:true, parallel:false}});
        """.format(file_name)
        run_query(query, graph)

## (:Author)-[:AFFILIATED_WITH]->(:Organization)

In [None]:
def add_author_organization_edges(authors_files):
    """MERGE ``(:Author)-[:AFFILIATED_WITH]->(:Organization)`` for each author record.

    The author is looked up by ``id`` and the organization by ``name = q.org``;
    a record whose author or organization is not found (which includes a null
    ``q.org``, since a null never equals a stored name) produces no
    relationship.

    Args:
        authors_files: Iterable of paths handed to ``apoc.load.json``. The
            paths are used as given; the lists built in the workspace cell
            already include ``data_dir``.
    """
    for file_name in authors_files:
        query = """
        CALL apoc.periodic.iterate(
            "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
            "MATCH (a:Author {{id: q.id}})
            MATCH (o:Organization {{name: q.org}})
            MERGE (a)-[:AFFILIATED_WITH]->(o)",
            {{batchSize:10000, iterateList:true, parallel:false}});
        """.format(file_name)
        run_query(query, graph)

## (:Tag)

In [None]:
def add_tags(authors_files):
    """MERGE one ``:Tag`` node for every non-null ``tags[].t`` value in ``authors_files``.

    Args:
        authors_files: Iterable of paths handed to ``apoc.load.json``. The
            paths are used as given; the lists built in the workspace cell
            already include ``data_dir``.
    """
    for file_name in authors_files:
        # * MERGE cannot use a null property value, so entries without `t`
        #   are filtered out instead of failing their batch.
        # * parallel:false, as in add_organizations — the same tag names recur
        #   across batches, and concurrent MERGEs on one key can conflict.
        query = """
        CALL apoc.periodic.iterate(
            "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
            "UNWIND q.tags as tag
            WITH tag
            WHERE tag.t IS NOT NULL
            MERGE (t:Tag {{name: tag.t}})",
            {{batchSize:10000, iterateList:true, parallel:false}})
        """.format(file_name)
        run_query(query, graph)

## (:Author)-[:HAS_TAG]->(:Tag)

In [None]:
def add_author_tag_edges(authors_files):
    """MERGE ``(:Author)-[:HAS_TAG]->(:Tag)`` from the ``tags`` list of each author record.

    The author is looked up by ``id`` and each tag by ``name = tag.t``; tags
    or authors that do not exist produce no relationship. A record with no
    ``tags`` is unwound as a single null entry, which matches no tag.

    Args:
        authors_files: Iterable of paths handed to ``apoc.load.json``.
    """
    for file_name in authors_files:
        print("Processing {}...".format(file_name))
        # Literal Cypher braces must be doubled: the template goes through
        # str.format, which would otherwise read `{id:q.id}` / `{name:tag.t}`
        # as replacement fields and raise before any query is sent.
        query = """
        CALL apoc.periodic.iterate(
        "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
        "MATCH (a:Author {{id:q.id}})
         UNWIND COALESCE(q.tags, [null]) as tag
         MATCH (t:Tag {{name:tag.t}})
         MERGE (a)-[r:HAS_TAG]->(t)",
        {{batchSize:10000, iterateList:true, parallel:false}});
        """.format(file_name)
        run_query(query, graph)

## (:Quanta {idv1})

In [None]:
def add_v1_ids(v1_papers_files):
    """Copy the v1 record ``id`` onto existing ``:Quanta`` nodes as ``idv1``.

    Nodes are matched by exact ``title`` equality with the v1 record, so every
    ``:Quanta`` sharing that title receives the same ``idv1``.

    Args:
        v1_papers_files: Iterable of paths handed to ``apoc.load.json``. The
            paths are used as given; the lists built in the workspace cell
            already include ``data_dir``.
    """
    for file_name in v1_papers_files:
        query = """
        CALL apoc.periodic.iterate(
            "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
            "MATCH (p:Quanta {{title: q.title}})
             SET p.idv1 = q.id",
            {{batchSize:10000, iterateList:true, parallel:false}});
        """.format(file_name)
        run_query(query, graph)

## (:Quanta)-[:CITES]->(:Quanta)

In [None]:
def add_quanta_quanta_edges(v1_papers_files):
    """CREATE ``(:Quanta)-[:CITES]->(:Quanta)`` from the ``references`` of each v1 record.

    Both ends are resolved through ``idv1``: the citing paper by the record's
    ``id`` and each cited paper by an entry of ``q.references``. References
    that resolve to no node produce no relationship. The relationship is
    CREATEd, so running this twice on the same file duplicates the edges.

    Args:
        v1_papers_files: Iterable of paths handed to ``apoc.load.json``. The
            paths are used as given; the lists built in the workspace cell
            already include ``data_dir``.
    """
    for file_name in v1_papers_files:
        print("Processing {}...".format(file_name))
        query = """
        CALL apoc.periodic.iterate(
            "CALL apoc.load.json('{}') YIELD value AS q RETURN q",
            "MATCH (p:Quanta {{idv1: q.id}})
            UNWIND q.references as ref
            WITH p, ref
            MATCH (b:Quanta {{idv1: ref}})
            CREATE (p)-[:CITES]->(b)",
            {{batchSize:10000, iterateList:true, parallel:false}});
        """.format(file_name)
        run_query(query, graph)

In [None]:
def add_author_author_edges():
    """MERGE an undirected ``COAUTHOR`` relationship between the authors of each paper.

    Papers with more than one incoming ``AUTHORED`` relationship are selected;
    for each, every pair of its authors (ordered by internal node id so a pair
    is visited once) is linked, and ``strength`` is set to 1 on first sight or
    incremented otherwise. Re-running therefore increments ``strength`` again.
    """
    paper_selector = "MATCH (q:Quanta) WHERE size((q)<-[:AUTHORED]-()) > 1 RETURN q"
    pair_linker = """WITH [(q)<-[:AUTHORED]-(a) | a] as coAuthors
        UNWIND coAuthors as first
        UNWIND coAuthors as second
        WITH first, second
        WHERE id(first) < id(second)
        MERGE (first)-[r:COAUTHOR]-(second)
        SET r.strength = CASE WHEN r.strength IS NULL THEN 1 ELSE r.strength + 1 END"""
    iterate_options = "{batchSize:10000, iterateList:true, parallel:false}"

    # Assemble the same text as a single triple-quoted literal would give.
    coauthor_query = (
        "\n    CALL apoc.periodic.iterate(\n"
        "        \"" + paper_selector + "\",\n"
        "        \"" + pair_linker + "\",\n"
        "    " + iterate_options + ");\n"
        "    "
    )
    run_query(coauthor_query, graph)

# Run Import

## Setup database

In [None]:
# Create the uniqueness constraints before any data is loaded.
add_constraints()

In [None]:
# Create the property indexes used for lookups.
add_indices()

In [None]:
# Create the (:Year) nodes that papers are later linked to.
add_years(1700, 2020)

## Import MAG

In [None]:
# MAG: load (:Venue) nodes from the venues file.
add_venues(mag_venues_file)

In [None]:
# MAG: create one (:Quanta) node per paper record.
add_quanta(mag_papers_files)

In [None]:
# MAG: link papers to their publication year.
add_quanta_year_edges(mag_papers_files)

In [None]:
# MAG: link papers to their venue (venues are merged by name).
add_quanta_venue_edges(mag_papers_files)

In [None]:
# MAG: load (:Author) nodes.
add_authors(mag_authors_files)

In [None]:
# MAG: link authors to the papers listed in their `pubs`.
add_author_quanta_edges(mag_authors_files)

In [None]:
# MAG: create (:Organization) nodes from the authors' `org` field.
add_organizations(mag_authors_files)

In [None]:
# MAG: link authors to their organization.
add_author_organization_edges(mag_authors_files)

In [None]:
# MAG: create (:Tag) nodes from the authors' `tags`.
add_tags(mag_authors_files)

In [None]:
# MAG: link authors to their tags.
add_author_tag_edges(mag_authors_files)

In [None]:
# MAG: copy v1 ids onto papers (matched by title); needed before citation edges.
add_v1_ids(mag_v1_papers_files)

In [None]:
# MAG: create citation edges, resolved through `idv1`.
add_quanta_quanta_edges(mag_v1_papers_files)

## Import AMiner

In [None]:
# AMiner: load (:Venue) nodes from the venues file.
add_venues(aminer_venues_file)

In [None]:
# AMiner: create one (:Quanta) node per paper record.
add_quanta(aminer_papers_files)

In [None]:
# AMiner: link papers to their publication year.
add_quanta_year_edges(aminer_papers_files)
# Run status note: added.

In [None]:
# AMiner: link papers to their venue (venues are merged by name).
add_quanta_venue_edges(aminer_papers_files)
# Run status note: added.

In [None]:
# AMiner: load (:Author) nodes.
add_authors(aminer_authors_files)
# Run status note: added.

In [None]:
# AMiner: link authors to the papers listed in their `pubs`.
add_author_quanta_edges(aminer_authors_files)
# Run status note: added.

In [None]:
# AMiner: create (:Organization) nodes from the authors' `org` field.
add_organizations(aminer_authors_files)
# Run status note: added.

In [None]:
# AMiner: link authors to their organization.
add_author_organization_edges(aminer_authors_files)
# Run status note: added.

In [None]:
# AMiner: create (:Tag) nodes from the authors' `tags`.
add_tags(aminer_authors_files)
# Run status note: added.

In [None]:
# AMiner: link authors to their tags.
add_author_tag_edges(aminer_authors_files)
# Run status note: NOT added — this step still has to be completed for AMiner.

In [None]:
# AMiner: copy v1 ids onto papers (matched by title); needed before citation edges.
add_v1_ids(aminer_v1_papers_files)
# Run status note: added.

In [None]:
# AMiner: create citation edges, resolved through `idv1`.
add_quanta_quanta_edges(aminer_v1_papers_files)

## Further additions

In [None]:
# Derive COAUTHOR relationships from the AUTHORED edges loaded above.
# Each run increments `strength` again, so run this once per import.
add_author_author_edges()