In [None]:
import json
from py2neo import Graph, Node, Relationship
#from py2neo.Graph import database 

# TODO: authentication is not wired up yet; the server currently runs with NEO4J_AUTH=none.
# Alternative endpoint that was tried earlier:
#graph = Graph("bolt://neo4j:7687")
graph = Graph(
    'bolt://localhost:7687',
    bolt=True,
)

# Start from a clean slate: clear whatever the connected graph currently holds.
graph.delete_all()

# Disabled sanity check that reported the size of the database after connecting:
#n_nodes = graph.database.primitive_counts['NumberOfNodeIdsInUse']
#n_relationships = graph.database.primitive_counts['NumberOfRelationshipIdsInUse']
#print("Connected to graph database with {:,} nodes and {:,} relationships!".format
     #(n_nodes, n_relationships))

In [None]:
# Uniqueness constraints, expressed as (progress message, Cypher statement) pairs
# and applied one after another in the order listed.
uniqueness_constraints = [
    # One id per Quanta
    ("Creating uniqueness constraint (and also index) on Quanta id's...",
     """CREATE CONSTRAINT ON (n:Quanta) ASSERT n.id IS UNIQUE;"""),
    # One name per Author
    ("Creating uniqueness constraint (and also index) on Author names...",
     """CREATE CONSTRAINT ON (a:Author) ASSERT a.name IS UNIQUE;"""),
    # One name per Organization
    ("Creating uniqueness constraint (and also index) on Organization names...",
     """CREATE CONSTRAINT ON (o:Organization) ASSERT o.name IS UNIQUE;"""),
]

for message, query in uniqueness_constraints:
    print(message, end=" ", flush=True)
    graph.run(query).evaluate()
    print("Done.")

In [None]:
# Plain (non-unique) indexes on Quanta properties.
# Each entry is (description shown in the progress message, Quanta property name).
quanta_indexes = [
    ("publication year", "year"),
    ("language", "lang"),
    # Open question from the original author: how should keywords be indexed?
    # For now only the `fos` property is indexed here.
    ("fos", "fos"),
    # Title index -- the original author was unsure whether this is a good idea.
    ("title", "title"),
]

for description, prop in quanta_indexes:
    print("Creating index for {}...".format(description), end=" ", flush=True)
    query = """CREATE INDEX ON :Quanta({});""".format(prop)
    graph.run(query).evaluate()
    print("Done.")

In [None]:
# Import every '.txt' JSON dump found under local_data_dir into Neo4j.
#
# Two batched APOC statements are run per file:
#   query1 MERGEs Quanta, Author and Organization nodes and the
#          (Author)-[:WORKS_AT]->(Organization) relationships;
#   query2 MERGEs the (Author)-[:AUTHORED]->(Quanta) relationships and sets
#          is_first_author / is_last_author on them.

#local_data_dir = '/tmp/data/'
#neo4j_data_dir = '/import/'

local_data_dir = '/Users/timholdsworth/code/scaling-science/notebooks/tmp/data/'
neo4j_data_dir = local_data_dir


import glob, os


def neo4j_import_path(local_root, neo4j_root, dirpath, filename):
    """Map a file found while walking ``local_root`` to the path Neo4j should load.

    Args:
        local_root: directory that was passed to ``os.walk``.
        neo4j_root: the same directory as seen by the Neo4j server; it is
            concatenated as-is, so it should end with '/'.
        dirpath: directory in which ``filename`` was found (the first item
            yielded by ``os.walk``).
        filename: bare file name.

    Returns:
        ``neo4j_root`` followed by the file's path relative to ``local_root``,
        using '/' separators. For files directly inside ``local_root`` this is
        simply ``neo4j_root + filename``.
    """
    relative = os.path.relpath(os.path.join(dirpath, filename), local_root)
    return neo4j_root + relative.replace(os.sep, '/')


# The templates are loop-invariant; only {path} changes from file to file.
# NOTE(review): query1 creates a Quanta only while unwinding q.authors, so a
# record without authors would produce no Quanta node -- confirm the data
# always carries authors.
query1_template = """
CALL apoc.periodic.iterate(
"CALL apoc.load.json('file://{path}') YIELD value AS q
RETURN q",
"UNWIND q.id AS id UNWIND q.authors as authors
MERGE (a:Author {{name:authors.name}})
MERGE (i:Quanta {{id:q.id}}) ON CREATE SET
i.refs=q.references, i.year=q.year, i.title=q.title,
i.fos=q.fos, i.url=q.url, i.lang=q.lang, i.keywords=q.keywords,
i.n_citation=q.n_citation, i.pdf=q.pdf, i.publisher=q.publisher
WITH a, q, authors
WHERE authors.org is not null
MERGE (o:Organization {{name:authors.org}})
MERGE (a)-[:WORKS_AT]->(o)",
{{batchSize:50000, iterateList:true, parallel:false}});
"""

# Plain SET (instead of ON MATCH SET) so the author-position flags are also
# written when MERGE has just created the relationship; with ON MATCH SET a
# single-author paper never received them.
# NOTE(review): q.authors is unwound twice in this statement, which repeats
# each author row; MERGE keeps the result correct but the work is redundant.
query2_template = """
CALL apoc.periodic.iterate(
"CALL apoc.load.json('file://{path}') YIELD value AS q RETURN q",
"UNWIND q.id AS id UNWIND q.authors as authors
WITH q.id AS id, head(q.authors).name as firstName, last(q.authors).name as lastName, q.authors as authors
UNWIND authors as author
MATCH (i:Quanta {{id:id}})
MATCH (a:Author {{name:author.name}})
WITH i, a, author.name = firstName as isFirstName, author.name = lastName as isLastName
MERGE (a)-[r:AUTHORED]->(i) SET r.is_first_author=isFirstName, r.is_last_author=isLastName",
{{batchSize:50000, iterateList:true, parallel:false}});
"""

for dirpath, _, files in os.walk(local_data_dir):
    for file in sorted(files):
        if not file.endswith('.txt'):
            continue

        # Keep the sub-directory part of the path; dropping it pointed Neo4j at
        # a non-existent file for anything below the top level.
        import_path = neo4j_import_path(local_data_dir, neo4j_data_dir, dirpath, file)

        print("Importing {}...".format(file), end=" ", flush=True)
        query1 = query1_template.format(path=import_path)
        query2 = query2_template.format(path=import_path)

        graph.run(query1).evaluate()
        print("Done with query 1.")
        graph.run(query2).evaluate()
        print("Done with query 2.")




In [None]:
# Add all citations as relationships between Quanta

# # Simple but slow
# query = """
# MATCH (a:Quanta), (b:Quanta)
# WHERE a.id IN b.refs
# CREATE (b)-[:CITES]->(a)
# """

# # Faster but more complex (not benchmarked though)
# query = """
# CALL apoc.periodic.iterate(
#    "MATCH (a:Quanta), (a2:Quanta) WHERE a.id IN a2.refs
#     WITH a, COLLECT(a2) as b
#     RETURN a, b",
#    "UNWIND b AS a2
#     CREATE (a2)-[:CITES]->(a)",
#     {batchSize:5000, parallel:true,iterateList:true}
# """

# # Faster but simple (also not benchmarked)
# query = """
# CALL apoc.periodic.iterate(
#    "MATCH (a:Quanta), (b:Quanta) WHERE ID(a) < ID(b) AND a.id IN b.refs RETURN a, b",
#    "CREATE (b)-[:CITES]->(a)",
#     {batchSize:10000, parallel:true,iterateList:true});
# """

# # Take advantage of indexing performed by constraints
# print("Adding citations...", end=" ", flush=True)
# query = """
# MATCH (b:Quanta)
# UNWIND b.refs AS ref
# MATCH (a:Quanta)
# WHERE a.id = ref
# CREATE (b)-[:CITES]->(a);
# """

# # Take advantage of indexing and also run in batches
# query = """
# CALL apoc.periodic.iterate(
# "MATCH (b:Quanta) 
#  UNWIND b.refs AS ref 
#  MATCH (a:Quanta) 
#  WHERE a.id = ref
#  RETURN a, b",
# "MERGE (b)-[:CITES]->(a)",
#  {batchSize:20000, parallel:false,iterateList:true});
# """

# Variant currently in use: the outer statement streams (citing paper, referenced id)
# pairs; the inner statement looks each referenced Quanta up by id and MERGEs the
# CITES relationship. Batches are processed with parallel:false.
print("Adding citation relationships...", end=" ", flush=True)
query = """
CALL apoc.periodic.iterate(
"MATCH (b:Quanta) UNWIND b.refs AS ref RETURN b, ref",
"MATCH (a:Quanta {id: ref}) MERGE (b)-[:CITES]->(a)",
{batchSize:2000, iterateList:true, parallel:false})
"""
citation_cursor = graph.run(query)
citation_cursor.evaluate()
print("Done.")


In [None]:
# Summarise the finished import using the store's id-in-use counters.
summary_template = "Created graph database with {:,} nodes and {:,} relationships!"
n_nodes = graph.database.primitive_counts['NumberOfNodeIdsInUse']
n_relationships = graph.database.primitive_counts['NumberOfRelationshipIdsInUse']
print(summary_template.format(n_nodes, n_relationships))

### Various misc scripts below here

In [None]:
# Run PageRank once per cut-off year in [start_year, end_year] (inclusive).
# For each year the scores are written to a per-year node property
# (pageRank_<year>), read back into a DataFrame, and finally all years are
# pivoted into one title-by-year table.

import pandas as pd

start_year, end_year = 1900, 1901
dfs = []
for year in range(start_year, end_year+1):

    print("Running PageRank on works from <= {}...".format(year), end=" ")
    query = """
    CALL algo.pageRank(
    'MATCH (p:Quanta) WHERE p.year <= {} RETURN id(p) as id',
    'MATCH (p1:Quanta)-[:CITES]->(p2:Quanta) RETURN id(p1) as source, id(p2) as target',
    {{graph:'cypher', writeProperty:'pageRank_{}', iterations:5, write: true, concurrency:20}});
    """.format(year,year)
    graph.run(query).evaluate()
    
    print("Pulling out and saving results...", end=" ")
    query = """
    MATCH (a:Quanta) 
    WHERE a.year <= {} 
    RETURN id(a), a.title, a.pageRank_{}""".format(year,year)
    df = graph.run(query).to_data_frame()
    # The score column is named after the year (a.pageRank_<year>); give it one
    # shared name so the frames line up and the pivot below can find it.
    df = df.rename(columns={'a.pageRank_{}'.format(year): 'a.pageRank'})
    df['year'] = year
    dfs.append(df)
    print("Done.")
    
# Rows: titles, columns: cut-off years, values: PageRank score.
# pivot_table aggregates rows that share a title and year (mean by default).
result = pd.concat(dfs).pivot_table(index='a.title', columns='year', values='a.pageRank')    

In [None]:
# Rebuild the title-by-year PageRank table from the per-year frames in `dfs`.
combined_years = pd.concat(dfs)
result = combined_years.pivot_table(index='a.title', columns='year', values='a.pageRank')

In [None]:
# Write result to CSV
import os

file_path = '/tmp/data/result/impact_20M_{}-{}.csv'.format(start_year, end_year)
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
print("Writing results to {}...".format(file_path), end=" ")
# Strip commas from the titles (the index) before writing the comma-separated file.
# Note that this modifies `result` itself, so later cells see the stripped titles.
result.index = result.index.str.replace(",","")
result.to_csv(path_or_buf=file_path, sep=",", header=True, index=True)
print("Done.")

In [None]:
import numpy as np
%matplotlib inline
# Total score per title (summed across the year columns), shown as a histogram
# whose bin edges run from 0.5 to 49.5 in steps of 0.5.
scores = result.sum(axis=1)
half_step_edges = [step / 2 for step in range(1, 100)]
scores.plot.hist(grid=True, bins=half_step_edges)

In [None]:
# Keep only the papers whose summed score reaches the chosen quantile.
# (The previous code dropped every score >= 15 and never used the threshold,
# which contradicted the message printed below.)
score_threshold = scores.quantile(0.999999)
top_papers = scores[scores >= score_threshold]
print("Considering the top {} (score >= {:.2f}) papers.".format(len(top_papers), score_threshold))

In [None]:
import re

# A "word" is a run of at least four letters/apostrophes taken from a paper title.
word_pattern = re.compile(r"[A-Za-z']{4,}")
word_lists = top_papers.index.to_series().apply(word_pattern.findall)

# all_words: every distinct lower-cased word.
# word_map:  lower-cased word -> list of titles containing it (one entry per occurrence).
all_words = set()
word_map = {}
for paper_title, words in word_lists.items():
    for w in words:
        key = w.lower()
        all_words.add(key)
        # setdefault stores the list in the dict; dict.get(w, []) returned a
        # throw-away list, so nothing was ever recorded.
        word_map.setdefault(key, []).append(paper_title)
print("Built set and map of {:,} unique words.".format(len(all_words)))

In [None]:
# Allocate a zero-filled (word x year) table and walk the word -> titles map.
print("Filling {}x{} dataframe...".format(len(all_words),len(result.columns)), end = "")
# A sorted list gives a stable row order; recent pandas versions also refuse a
# set as an index.
word_scores = pd.DataFrame(0, index=sorted(all_words), columns=result.columns)
# .items() yields (word, titles) pairs; iterating the dict directly yields only
# the keys, and unpacking a key string into two names raises ValueError.
for key, value in word_map.items():
    print(key)
#     for w in ws:
#         print(w)
#         word_scores.loc[w.lower()] += result.loc[title].va
#         print(word_scores.loc[w.lower()])

In [None]:
# Show the per-title word lists as the cell output.
word_lists

In [None]:
# NOTE(review): `title` is not assigned anywhere in the visible notebook, so this
# raises NameError on a fresh kernel. Also, word_scores' columns come from
# result.columns, so indexing its columns by a title looks unintended -- confirm.
word_scores[title]

In [None]:
# Show the row of `result` labelled `title` across all columns.
# NOTE(review): `title` is not assigned anywhere in the visible notebook, so this
# raises NameError on a fresh kernel -- confirm which title was meant.
result.loc[title,:]