In [6]:
import json
from py2neo import Graph, Node, Relationship 

graph = Graph('bolt://localhost:7687', auth=('neo4j', 'password'))

graph.delete_all()

#n_nodes = graph.database.primitive_counts['NumberOfNodeIdsInUse']
#n_relationships = graph.database.primitive_counts['NumberOfRelationshipIdsInUse']
#print("Connected to graph database with {:,} nodes and {:,} relationships!".format
     #(n_nodes, n_relationships))

In [7]:
# Constrain one id per Quanta
print("Creating uniqueness constraint (and also index) on Quanta id's...", end=" ", flush=True)
query = """CREATE CONSTRAINT ON (n:Quanta) ASSERT n.id IS UNIQUE;"""
graph.run(query).evaluate()
print("Done.")

# Constrain one name per author
print("Creating uniqueness constraint (and also index) on Author names...", end=" ", flush=True)
query = """CREATE CONSTRAINT ON (a:Author) ASSERT a.name IS UNIQUE;"""
graph.run(query).evaluate()
print("Done.")

# Constrain one name per organization
print("Creating uniqueness constraint (and also index) on Organization names...", end=" ", flush=True)
query = """CREATE CONSTRAINT ON (o:Organization) ASSERT o.name IS UNIQUE;"""
graph.run(query).evaluate()
print("Done.")

Creating uniqueness constraint (and also index) on Quanta id's... Done.
Creating uniqueness constraint (and also index) on Author names... Done.
Creating uniqueness constraint (and also index) on Organization names... Done.


In [8]:
# Add index for year of publication
print("Creating index for publication year...", end=" ", flush=True)
query = """CREATE INDEX ON :Quanta(year);"""
graph.run(query).evaluate()
print("Done.")

# Add index for language
print("Creating index for langauge...", end=" ", flush=True)
query = """CREATE INDEX ON :Quanta(lang);"""
graph.run(query).evaluate()
print("Done.")

# Add index for keywords HOW?
print("Creating index for fos...", end=" ", flush=True)
query = """CREATE INDEX ON :Quanta(fos);"""
graph.run(query).evaluate()
print("Done.")

# Add index for title (good idea?)
print("Creating index for title...", end=" ", flush=True)
query = """CREATE INDEX ON :Quanta(title);"""
graph.run(query).evaluate()
print("Done.")

Creating index for publication year... Done.
Creating index for langauge... Done.
Creating index for fos... Done.
Creating index for title... Done.


In [11]:
import_query = """
CALL apoc.load.json('file:/Users/timholds/code/scaling-science/notebooks/data/fake-data.txt') YIELD value AS q UNWIND q.id AS id UNWIND q.authors as authors
MERGE (a:Author {name:authors.name})
WITH a, q, authors
WHERE authors.org is not null
MERGE (i:Quanta {id:q.id}) ON CREATE SET i.year=q.year, i.title=q.title, i.fos=q.fos
WITH q.id AS id, head(q.authors).name as firstName, last(q.authors).name as lastName, q.authors as authors
UNWIND authors as author
MATCH (i:Quanta {id:id}) 
MATCH (a:Author {name:author.name})
WITH i, a, author.name = firstName as isFirstName, author.name = lastName as isLastName
MERGE (a)-[:AUTHORED {is_first_author:isFirstName, is_last_author:isLastName}]-(i)
RETURN *
"""

graph.run(import_query)

<py2neo.graph.Cursor at 0x104f0e4e0>

In [12]:
# Create test citation relationships
query1 = """
Match (a:Quanta {title: "Paper13"})
Match (b:Quanta {title: "Paper7"})
WITH a, b
MERGE (a)-[:CITES]->(b)"""
graph.run(query1)

query2 = """
Match (c:Quanta {title: "Paper13"})
Match (d:Quanta {title: "Paper1"})
WITH c, d
MERGE (c)-[:CITES]->(d)
"""
graph.run(query2)

query3 = """
Match (e:Quanta {title: "Paper7"})
Match (f:Quanta {title: "Paper1"})
WITH e, f
MERGE (e)-[:CITES]->(f)
"""
graph.run(query3)

query4 = """
Match (g:Quanta {title: "Paper13"})
Match (h:Quanta {title: "Paper8"})
WITH g, h
MERGE (g)-[:CITES]->(h)
"""
graph.run(query4)

query5 = """
Match (i:Quanta {title: "Paper7"})
Match (j:Quanta {title: "Paper8"})
WITH i, j
MERGE (i)-[:CITES]->(j)
"""
graph.run(query5)

query6 = """
Match (k:Quanta {title: "Paper1"})
Match (l:Quanta {title: "Paper8"})
WITH k, l
MERGE (k)-[:CITES]->(l)

"""
graph.run(query6)

<py2neo.graph.Cursor at 0x104f0e588>

In [13]:
delete_extra_query = """
MATCH (n)
WHERE size((n)--())=0
DELETE (n)"""
graph.run(delete_extra_query)

<py2neo.graph.Cursor at 0x104f46ac8>

In [7]:
"""STOP HERE: the cells after this are not needed for importing test data"""

'STOP HERE: the cells after this are not needed for importing test data'

In [None]:
# Fastest: Put more work on thread running in parallel. 
print("Adding citation relationships...", end=" ", flush=True)
# query = """
# MATCH (b:Quanta) UNWIND b.refs AS ref
# MATCH (a:Quanta {id: ref}) MERGE (b)-[:CITES]->(a);
# """

Note this query ^^ should include the apoc.periodic.iterate() function, which would be:
query = """
CALL apoc.periodic.iterate(
"MATCH (b:Quanta) UNWIND b.refs AS ref RETURN b, ref",
"MATCH (a:Quanta {id: ref}) MERGE (b)-[:CITES]->(a)",
{batchSize:500, iterateList:true, parallel:false});
"""

graph.run(query).evaluate()
print("Done.")


In [21]:
n_nodes = graph.database.primitive_counts['NumberOfNodeIdsInUse']
n_relationships = graph.database.primitive_counts['NumberOfRelationshipIdsInUse']
print("Created graph database with {:,} nodes and {:,} relationships!".format
     (n_nodes, n_relationships))

Created graph database with 22 nodes and 46 relationships!


### Various misc scripts below here

In [24]:
# Run PageRank on each year from 1800 to 1805

import pandas as pd

start_year, end_year = 1985, 2010
dfs = []
for year in range(start_year, end_year+1):

    print("Running PageRank on works from <= {}...".format(year), end=" ")
    query = """
    CALL algo.pageRank(
    'MATCH (p:Quanta) WHERE p.year <= {} RETURN id(p) as id',
    'MATCH (p1:Quanta)-[:CITES]->(p2:Quanta) RETURN id(p1) as source, id(p2) as target',
    {{graph:'cypher', writeProperty:'pageRank_{}', iterations:5, write: true, concurrency:20}});
    """.format(year,year)
    graph.run(query).evaluate()
    
    print("Pulling out and saving results...", end=" ")
    query = """
    MATCH (a:Quanta) 
    WHERE a.year <= {} 
    RETURN id(a), a.title, a.pageRank_{}""".format(year,year)
    df = graph.run(query).to_data_frame()
    df['year'] = year
    dfs.append(df)
    print("Done.")
    
result = pd.concat(dfs).pivot_table(index='a.title', columns='year', values='a.pageRank')    

Running PageRank on works from <= 1985... Pulling out and saving results... Done.
Running PageRank on works from <= 1986... Pulling out and saving results... Done.
Running PageRank on works from <= 1987... Pulling out and saving results... Done.
Running PageRank on works from <= 1988... Pulling out and saving results... Done.
Running PageRank on works from <= 1989... Pulling out and saving results... Done.
Running PageRank on works from <= 1990... Pulling out and saving results... Done.
Running PageRank on works from <= 1991... Pulling out and saving results... Done.
Running PageRank on works from <= 1992... Pulling out and saving results... Done.
Running PageRank on works from <= 1993... Pulling out and saving results... Done.
Running PageRank on works from <= 1994... Pulling out and saving results... Done.
Running PageRank on works from <= 1995... Pulling out and saving results... Done.
Running PageRank on works from <= 1996... Pulling out and saving results... Done.
Running PageRank

KeyError: 'a.pageRank'

In [None]:
result = pd.concat(dfs).pivot_table(index='a.title', columns='year', values='a.pageRank')    

In [None]:
# Write result to CSV
file_path = '/tmp/data/result/impact_20M_{}-{}.csv'.format(start_year, end_year)
print("Writing results to {}...".format(file_path), end=" ")
result.index = result.index.str.replace(",","")
result.to_csv(path_or_buf=file_path, sep=",", header=True, index=True)
print("Done.")

In [None]:
import numpy as np
%matplotlib inline
scores = result.sum(axis=1)
scores.plot.hist(grid=True,bins=[i/2 for i in range(1,100)])

In [None]:
score_threshold = scores.quantile(0.999999)
top_papers = scores.drop(scores[scores.values>=15].index)
print("Considering the top {} (score >= {:.2f}) papers.".format(len(top_papers), score_threshold))

In [None]:
import re
word_lists = top_papers.index.to_series().apply(
    lambda x: [w for w in re.compile(r"[A-Za-z']{4,}").findall(x)])
all_words = set()
word_map = {}
for i,v in word_lists.items():
    for w in v:
        all_words.add(w.lower())
        word_map.get(w,[]).append(i)
print("Built set and map of {:,} unique words.".format(len(all_words)))

In [None]:
print("Filling {}x{} dataframe...".format(len(all_words),len(result.columns)), end = "")
word_scores = pd.DataFrame(0, index=all_words, columns=result.columns)
for key, value in word_map:
    print(key)
#     for w in ws:
#         print(w)
#         word_scores.loc[w.lower()] += result.loc[title].va
#         print(word_scores.loc[w.lower()])

In [None]:
word_lists

In [None]:
word_scores[title]

In [None]:
result.loc[title,:]