In [None]:
# Download and install Python packages needed for this Jupyter Notebook

!pip install neo4j semanticscholar

In [None]:
# This imports the Python packages needed for this Jupyter Notebook 

# Note: 'json' and 'os' are part of the Python Standard Library.
# They ship with every Python installation, so they do not need
# to be installed separately.

from semanticscholar import SemanticScholar
import json 
from neo4j import GraphDatabase
import os

In [None]:
# This block connects the Jupyter Notebook to your Neo4j Database.
# Note: Your Neo4j Database must be running and accepting connections.
# Note: The default values below are for a local instance of Neo4j.
# To keep real credentials out of the notebook, set the NEO4J_URI,
# NEO4J_USERNAME and NEO4J_PASSWORD environment variables before starting
# Jupyter; each one overrides the matching default below.
# More information on connecting with the Python driver can be found at
# https://neo4j.com/docs/python-manual/current/connect/

uri = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
username = os.environ.get('NEO4J_USERNAME', 'neo4j')
password = os.environ.get('NEO4J_PASSWORD', 'password')
driver = GraphDatabase.driver(uri, auth=(username, password))

# Check the connection now so that an unreachable database or a wrong
# password is reported by this cell instead of by a later query.
driver.verify_connectivity()

In [None]:
# This block is to enter your S2 key if you have received one.
# The key is read from the S2_API_KEY environment variable when it is set,
# so it does not have to be saved in the notebook; otherwise replace the
# placeholder text below with your key.
s2_api_key = os.environ.get('S2_API_KEY', 'Enter your S2 Key from Semantic Scholar Here')

In [None]:
# Create an index on the `id` property of every node label used by this
# notebook; the indexes greatly speed up data import and data queries.
# An entry that is commented out is for a Node type that is not present in
# the imported data set.

index_targets = (
    # (index name, node label)
    #('Institutions', 'Institutions'),
    ('Concept', 'Concept'),
    ('Work_ID', 'Work'),
    ('Author', 'Author'),
)

for index_name, node_label in index_targets:
    # IF NOT EXISTS lets this cell be re-run against a database that
    # already has the index.
    record, summary, keys = driver.execute_query(
        f'CREATE INDEX {index_name}     IF NOT EXISTS FOR (i:{node_label}) ON (i.id)'
    )

In [None]:

# Ask for a keyword and search Semantic Scholar for matching papers.
# Note: All unauthenticated users share a limit of 5,000 requests per 5 minutes
# https://www.semanticscholar.org/product/api

query_input = input('Please enter a keyword to retrieve related articles: ')
print("Retrieving results. This may take a minute")

sch = SemanticScholar()

# Uncomment the line below if you have a S2 API key from Semantic Scholar
# This will greatly increase the speed of results returned
#sch = SemanticScholar(api_key=s2_api_key)

# Fields requested for every paper. Each field is listed exactly once; the
# import loop that follows reads every one of them from each result.
paper_fields = [
    'paperId', 'journal', 'abstract', 'authors', 'citationCount',
    'citations', 'externalIds', 'fieldsOfStudy',
    'influentialCitationCount', 'isOpenAccess', 'openAccessPdf',
    'publicationDate', 'publicationTypes', 'publicationVenue',
    'references', 's2FieldsOfStudy', 'title', 'url', 'year',
]

results = sch.search_paper(query_input, fields=paper_fields, limit=5)

print(f'A total of {results.total} articles will be retrieved and imported.')
print("These articles will be processed in batches")

# Import every search result into Neo4j as a Work node.
# MERGE matches on the paper id, so importing the same paper again updates
# the existing node instead of adding a second one.
# coalesce() stores an empty string for any value that is missing.
# Nested values (references, citations, authors, ...) are stored in their
# string form; the later cells of this notebook parse those strings.

total_node_count = 0  # number of results imported so far

for result in results:
    #Uncomment the line below to view the data for each result returned
    #print(result)

    # The progress message is 1-based: it names the article being imported now
    print("Retrieving and importing articles " + str(total_node_count + 1))
    record, summary, keys = driver.execute_query(
        "MERGE (w:Work {id: $id}) SET \
        w.source = \'Semantic Scholar\', \
        w.pass = 1, \
        w.id = coalesce($id, \'\'), \
        w.cited_by_count = coalesce($cited_by_count, \'\'), \
        w.display_name = coalesce($display_name, \'\'), \
        w.publication_date = coalesce($publication_date, \'\'), \
        w.publication_year = coalesce($publication_year, \'\'), \
        w.title = coalesce($title,  \'\'),\
        w.type = coalesce($type, \'\'), \
        w.url = coalesce($url,  \'\'),\
        w.best_oa_location = coalesce($best_oa_location, \'\'), \
        w.referenced_works = coalesce($referenced_works, \'\'), \
        w.abstract = coalesce($abstract, \'\'), \
        w.authorships = coalesce(split($authorships, \',\'), \'\'), \
        w.ids = coalesce($ids, \'\'), \
        w.open_access = coalesce($open_access, \'\'), \
        w.concepts = coalesce($concepts, \'\'), \
        w.locations = coalesce($locations, \'\'), \
        w.publication_venue = coalesce($publication_venue, \'\'), \
        w.citations = coalesce($citations, \'\'), \
        w.influential_citations = coalesce($influential_citations, \'\'),\
        w.s2_fields = coalesce($s2_fields, \'\') \
        ",
        id=result.paperId,
        cited_by_count=result.citationCount,
        display_name=result.title,
        publication_date=result.publicationDate,
        publication_year=result.year,
        title=result.title,
        type=result.publicationTypes,
        url=result.url,
        best_oa_location=str(result.openAccessPdf),
        referenced_works=str(result.references),
        abstract=result.abstract,
        # Square brackets are stripped before the text is sent; the query
        # then splits it on ',' and stores the pieces as a list
        authorships=str(result.authors).replace('[', '').replace(']', ''),
        ids=str(result.externalIds),
        open_access=result.isOpenAccess,
        concepts=result.fieldsOfStudy,
        locations=str(result.journal),
        publication_venue=str(result.publicationVenue),
        citations=str(result.citations).replace('[', '').replace(']', ''),
        influential_citations=result.influentialCitationCount,
        s2_fields=str(result.s2FieldsOfStudy),
    )
    total_node_count += 1

# The counter starts at 0, so this is exactly the number of results that
# were imported (a MERGE may have matched a node that already existed).
print("Total number of Work nodes created: " + str(total_node_count))



In [None]:
# This block reads the "referenced_works" property of every existing Work
# node and creates a REFERENCED_WORK relationship from that Work to each
# work it references.
# If the referenced work does not exist it is created using the
# id retrieved from the list of "referenced_works".
# These works are identified with a 3 in the "pass" property to
# facilitate future processing.
# Note: the query calls apoc.text.regexGroups, so the APOC plugin must be
# available in the Neo4j database.
#
# How the Cypher works:
#  - the regular expression pulls every {'paperId': '...', 'title': '...'}
#    entry out of the stored text
#  - the split() calls separate the id from the title
#  - substring(..., size(...) -2) drops the two characters '} that close
#    each entry, so the title is stored without a trailing quote
#  - the braces are written as \\{ and \\} so that Python keeps the
#    backslash in the query text without an invalid-escape-sequence warning

referenced_node_creation = "MATCH (w:Work) \
    WITH apoc.text.regexGroups(w.referenced_works,   \
    \"\\{'paperId': '[^']*', 'title': '[^']*'\\}\")  \
    AS works,w UNWIND works AS stage1  \
    UNWIND stage1 as stage2 WITH split(stage2, \"', '\") AS stage3,w \
    WITH split(stage3[0], \": '\")[1] AS id,   \
    substring(split(stage3[1], \": '\")[1], 0, \
    size(split(stage3[1], \": '\")[1]) -2) \
    AS title,w MERGE (z:Work {id: id}) SET z.pass =  \
    CASE WHEN any (x in z.pass WHERE x = 1)   \
    THEN z.pass ELSE 3 END \
    SET z.display_name = CASE WHEN any (x in z.display_name WHERE x = title)  \
    THEN z.display_name ELSE title END \
    WITH id, z, w  \
    MERGE (z)<-[:REFERENCED_WORK]-(w)"

#Uncomment the print command below to view the raw Cypher script used by Neo4j
#print(referenced_node_creation)

record, summary, keys = driver.execute_query(referenced_node_creation)
# summary.counters reports what the query changed in the database
print(summary.counters)
print("Referenced work relationships creation complete")

In [None]:
# This block reads the "citations" property of every existing Work node and
# creates a CITED_WORK relationship pointing from each work in that list to
# the Work node.
# NOTE(review): "citations" is filled from the Semantic Scholar `citations`
# field, which should be the papers that cite the Work - confirm against the
# API documentation.
# If a work in the list does not exist it is created using the
# id retrieved from the list of "citations".
# These works are identified with a 3 in the "pass" property to
# facilitate future processing.
# Note: the query calls apoc.text.regexGroups, so the APOC plugin must be
# available in the Neo4j database.
#
# How the Cypher works:
#  - the regular expression pulls every {'paperId': '...', 'title': '...'}
#    entry out of the stored text
#  - the split() calls separate the id from the title
#  - substring(..., size(...) -2) drops the two characters '} that close
#    each entry, so the title is stored without a trailing quote
#  - the braces are written as \\{ and \\} so that Python keeps the
#    backslash in the query text without an invalid-escape-sequence warning
#
# The variable name is shared with the referenced-works cell; it is kept
# unchanged here so that any other code using the name keeps working.

referenced_node_creation = "MATCH (w:Work) \
    WITH apoc.text.regexGroups(w.citations,   \
    \"\\{'paperId': '[^']*', 'title': '[^']*'\\}\")  \
    AS works,w UNWIND works AS stage1  \
    UNWIND stage1 as stage2 WITH split(stage2, \"', '\") AS stage3,w \
    WITH split(stage3[0], \": '\")[1] AS id,   \
    substring(split(stage3[1], \": '\")[1], 0, size(split(stage3[1], \": '\")[1]) -2) \
    AS title,w MERGE (z:Work {id: id}) SET z.pass =  \
    CASE WHEN any (x in z.pass WHERE x = 1)   \
    THEN z.pass ELSE 3 END \
    SET z.display_name = CASE WHEN any (x in z.display_name WHERE x = title)  \
    THEN z.display_name ELSE title END \
    WITH id, z, w  \
    MERGE (z)-[:CITED_WORK]->(w)"

#Uncomment the print command below to view the raw Cypher script used by Neo4j
#print(referenced_node_creation)

record, summary, keys = driver.execute_query(referenced_node_creation)
# summary.counters reports what the query changed in the database
print(summary.counters)
print("Cited work relationships creation complete")

In [None]:
# This block retrieves the concepts property imported earlier and
# creates a Concept Node for each unique concept.
# An ASSOC_CONCEPT relationship is created linking Work and Concept nodes
# (pointing from the Work to the Concept).
# Empty-string concepts are skipped by the WHERE clause.
# The work is run through apoc.periodic.iterate in batches of 200 with
# parallel processing turned off, so the APOC plugin must be available
# in the Neo4j database.

concept_node_creation = "CALL apoc.periodic.iterate(\"MATCH (w:Work) RETURN w\",\"WITH \
    w.concepts AS concepts,w UNWIND \
    concepts AS concept \
    WITH concept,w  WHERE concept <> '' \
    MERGE (c:Concept {id: concept}) SET \
    c.source = \'Semantic Scholar\' \
    MERGE (c)<-[:ASSOC_CONCEPT]-(w)\",{batchSize:200, parallel:false})"

# Print line provides the cypher syntax executed within neo4j
#print(concept_node_creation)

record, summary, keys = driver.execute_query(concept_node_creation)

# This print statement provides the total batches committed, failed along with any errors.
# It reads the value at index 8 of the first returned row.
# NOTE(review): presumably the `batch` column of apoc.periodic.iterate -
# confirm the column order for the APOC version in use.
print(record[0][8])

print("Concept import complete")

In [None]:
# This block reads the "authorships" property of every existing Work node
# and creates a WROTE relationship from each Author to the Work.
# Note: the query below does not set an "author_position" property on the
# relationship.
# If the author does not exist it is created using the
# id retrieved from the list of "authorships".
# Newly created authors are identified with a 3 in the "pass" property to
# allow processing to retrieve all information about the author.
# Note: the query calls apoc.text.join and apoc.text.regexGroups, so the
# APOC plugin must be available in the Neo4j database.
#
# How the Cypher works:
#  - "authorships" is stored as a list, so it is first joined back into a
#    single text with ','
#  - the regular expression pulls every {'authorId': '...', 'name': '...'}
#    entry out of that text
#  - the split() calls separate the id from the name
#  - substring(..., size(...) -2) drops the two characters '} that close
#    each entry
#  - the braces are written as \\{ and \\} so that Python keeps the
#    backslash in the query text without an invalid-escape-sequence warning

author_node_creation = "MATCH (w:Work) \
    WITH apoc.text.join(w.authorships,',') AS authorships,w \
    WITH apoc.text.regexGroups(authorships,   \
    \"\\{'authorId': '[^']*', 'name': '[^']*'\\}\")  \
    AS authors,w UNWIND authors AS stage1  \
    UNWIND stage1 as stage2 WITH split(stage2, \"', '\") AS stage3,w \
    WITH split(stage3[0], \": '\")[1] AS id,   \
    substring(split(stage3[1], \": '\")[1], 0, size(split(stage3[1], \": '\")[1]) -2) \
    AS name,w MERGE (z:Author {id: id}) SET z.pass =  \
    CASE WHEN any (x in z.pass WHERE x = 1)   \
    THEN z.pass ELSE 3 END \
    SET z.display_name = CASE WHEN any (x in z.display_name WHERE x = name)  \
    THEN z.display_name ELSE name END \
    WITH id, z, w  \
    MERGE (z)-[:WROTE]->(w)"

#Uncomment the print command below to view the raw Cypher script used by Neo4j
#print(author_node_creation)

record, summary, keys = driver.execute_query(author_node_creation)
# summary.counters reports what the query changed in the database
print(summary.counters)
print("Author relationships creation complete")