In [None]:
# Download and install Python packages needed for this Jupyter Notebook

!pip install neo4j arxiv

In [None]:
# This imports the Python packages needed for this Jupyter Notebook 

# Note: 'ast', 'json' and 'os' are part of the Python Standard Library.
# They ship with every standard Python installation, so they do not
# need to be installed separately.

import arxiv
import json 
from neo4j import GraphDatabase
import os
import ast


In [None]:
# This block connects the Jupyter Notebook to your Neo4j Database.
# Note: Your Neo4j Database must be running and accepting connections.
# Note: The defaults below are for connecting to a local instance of Neo4j.
# More information on interfacing with Neo4j from Python can be found at
# https://neo4j.com/docs/python-manual/current/connect/
#
# Connection settings are read from the NEO4J_URI, NEO4J_USERNAME and
# NEO4J_PASSWORD environment variables so that real credentials do not have
# to be saved in the notebook. When a variable is not set, the local-instance
# default shown below is used.

uri = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
username = os.environ.get('NEO4J_USERNAME', 'neo4j')
password = os.environ.get('NEO4J_PASSWORD', 'password')
driver = GraphDatabase.driver(uri, auth=(username, password))

In [None]:
# This block creates an index on the id property of the Concept, Work and
# Author node labels to speed up data import and data queries.
# Each statement uses IF NOT EXISTS, so re-running this cell is harmless.
# A statement that is commented out indicates a node type that is not present
# in the imported data set.

#driver.execute_query('CREATE INDEX Institutions IF NOT EXISTS FOR \
#    (i:Institutions) ON (i.id)')

# Adjacent string literals are concatenated by Python; the pieces are split
# so that the resulting Cypher text is exactly what was sent before.
index_statements = (
    'CREATE INDEX Concept '
    '    IF NOT EXISTS FOR (i:Concept) ON (i.id)',
    'CREATE INDEX Work_ID '
    '    IF NOT EXISTS FOR (i:Work) ON (i.id)',
    'CREATE INDEX Author '
    '    IF NOT EXISTS FOR (i:Author) ON (i.id)',
)

for index_statement in index_statements:
    record, summary, keys = driver.execute_query(index_statement)


In [None]:
# This block retrieves all articles from arXiv related to the keyword entered
# by the user. Each result is merged into Neo4j as a Work node keyed on its
# arXiv entry id, so re-running the cell updates existing Work nodes instead
# of creating duplicates.

query_input = input('Please enter a keyword to retrieve related articles: ')

search = arxiv.Search(
    query=query_input,
    # No upper bound: every matching article is requested.
    # Use a small integer (e.g. max_results=10) for a quick trial run.
    max_results=float('inf'),
    sort_by=arxiv.SortCriterion.SubmittedDate,
)

# Values are passed as query parameters (never concatenated into the Cypher
# text). coalesce(..., '') stores an empty string when arXiv supplies no value.
# authorships is stored as the comma-split text of the Python author list; the
# author import cell parses names out of that text, so keep the two in sync.
work_merge_query = """
MERGE (w:Work {id: $id})
SET w.source = 'arXiv',
    w.id = coalesce($id, ''),
    w.display_name = coalesce($display_name, ''),
    w.doi = coalesce($doi, ''),
    w.publication_date = coalesce($publication_date, ''),
    w.updated_date = coalesce($updated_date, ''),
    w.url = coalesce($url, ''),
    w.abstract = coalesce($abstract, ''),
    w.authorships = coalesce(split($authorships, ','), ''),
    w.concepts = coalesce($concepts, ''),
    w.source_title = coalesce($source_title, ''),
    w.authors_comment = coalesce($authors_comment, ''),
    w.locations = coalesce($locations, ''),
    w.primary_concept = coalesce($primary_concept, '')
"""

total_results_processed = 0
total_node_count = 0

# Client.results(search) is used because Search.results() is deprecated in
# current releases of the arxiv package.
for result in arxiv.Client().results(search):
    record, summary, keys = driver.execute_query(
        work_merge_query,
        id=result.entry_id,
        display_name=result.title,
        doi=result.doi,
        publication_date=result.published,
        updated_date=result.updated,
        url=result.pdf_url,
        abstract=result.summary,
        authorships=str(result.authors).replace('[', '').replace(']', ''),
        concepts=result.categories,
        source_title=result.journal_ref,
        authors_comment=result.comment,
        locations=str(result.links),
        primary_concept=result.primary_category,
    )
    total_results_processed += 1
    # MERGE only creates a node when the Work is not already in the database,
    # so count what the server reports as created rather than loop iterations.
    total_node_count += summary.counters.nodes_created

    # Note: Need to add a "biblio" property with the entries below
    # "biblio": "{"volume":"95","issue":"1","first_page":"71","last_page":"97"}",
    # Note: summary.counters also provides
    # labels_added and properties_set information

print("Total number of arXiv results processed: " + str(total_results_processed))
print("Total number of Work nodes created: " + str(total_node_count))

In [None]:
# This block reads the concepts property stored on each Work node and
# creates a Concept node for each unique concept.
# An ASSOC_CONCEPT relationship is created linking Work and Concept nodes.
# The work is done through apoc.periodic.iterate (requires the APOC plugin),
# which processes the Work nodes in batches of 200, sequentially.

concept_node_creation = """
CALL apoc.periodic.iterate(
    "MATCH (w:Work) RETURN w",
    "WITH w.concepts AS concepts, w
     UNWIND concepts AS concept
     MERGE (c:Concept {id: concept})
     SET c.source = 'arXiv'
     MERGE (c)<-[:ASSOC_CONCEPT]-(w)",
    {batchSize: 200, parallel: false}
)
"""

# Print line provides the cypher syntax executed within neo4j
#print(concept_node_creation)

record, summary, keys = driver.execute_query(concept_node_creation)

# apoc.periodic.iterate returns a single row describing the whole run.
iterate_result = record[0]

# The 'batch' column provides the total batches committed and failed, along
# with any errors. It is read by name rather than by position so the lookup
# does not depend on the procedure's column order.
print(iterate_result['batch'])

# The updates are applied inside the procedure's own batched transactions, so
# they are reported in the procedure's 'updateStatistics' column (nodes
# created, relationships created, properties set, labels added, ...) rather
# than in the outer query's summary counters.
print(iterate_result['updateStatistics'])

print("Concept import complete")

In [None]:
# This block reads the authorships property stored on each Work node and
# creates an Author node for each unique author name.
# The author name is assigned to display_name, and a unique identifier
# (UUID) is generated once, when the Author node is first created.
# A WROTE relationship is created linking Author and Work nodes.
#
# Each authorships entry is text of the form  arxiv.Result.Author('Name') ;
# substring(..., 21) skips the 21-character prefix  arxiv.Result.Author('
# and split(..., "'")[0] keeps the text up to the next single quote.
# NOTE(review): a name that itself contains an apostrophe is presumably
# truncated by this parsing - confirm against the stored authorships values.

author_node_creation = """
MATCH (w:Work)
WITH w.authorships AS authors, w
UNWIND authors AS author
MERGE (a:Author {display_name: split(substring(trim(author), 21), "'")[0]})
ON CREATE SET a.id = randomUUID()
SET a.source = 'arXiv'
MERGE (a)-[:WROTE]->(w)
"""
# ON CREATE SET keeps an author's id stable: a plain SET would assign a new
# UUID every time the same author is matched again (for another Work or on a
# re-run of this cell). The source value uses the same 'arXiv' spelling as
# the Work and Concept nodes.

# Print line provides the cypher syntax executed within neo4j
#print(author_node_creation)

record, summary, keys = driver.execute_query(author_node_creation)

# This print statement provides information on
# labels_added, relationships_created, nodes_created and properties_set
print(summary.counters)

print("Author import complete")