In [2]:
import json
from py2neo import Graph
from py2neo.data import Node, Relationship

# Create the graph handle for the Bolt endpoint.
# TODO: authentication still needs to be set up; the server currently runs with NEO4J_AUTH=none.
graph = Graph("bolt://neo4j:7687")

# Print the node / relationship ID counts reported by the database as a quick
# check that the connection is usable.
connected_message = "Connected to graph database with {:,} nodes and {:,} relationships!"
print(connected_message.format(
    graph.database.primitive_counts['NumberOfNodeIdsInUse'],
    graph.database.primitive_counts['NumberOfRelationshipIdsInUse'],
))

Connected to graph database with 166,192,182 nodes and 611,200,000 relationships!


In [None]:
# # FIRST ATTEMPT: STREAM PageRank OVER THE ENTIRE GRAPH (returns 1000 rows; does NOT write a pagerank property)
# import pandas as pd

# print("Running PageRank STREAM on the entire graph...", end=" ", flush=True)
# query = """
# CALL algo.pageRank.stream('Quanta','CITES',{iterations:20, concurrency:20})
# YIELD node, score
# RETURN node.id, node.title, score
# LIMIT 1000
# """
# df = graph.run(query).to_data_frame()
# print("Done.")

In [None]:
# # Attempt non-streaming version of pagerank 
# print("Running  PageRank on entire graph...", end=" ")
# query = """
# CALL algo.pageRank(
# 'Quanta',
# 'CITES',
# {iterations:20, write: true, writeProperty:'pageRank2018'});
# """

In [None]:
# # Run non-streaming PageRank for each cutoff year from 1900 to 1920, in steps of 5

# import pandas as pd
# import time

# start_year, end_year, step = 1900, 1920, 5
# dfs = []
# start_time = time.time()
# for year in range(start_year, end_year+1, step):

#     print("Running PageRank on works from <= {}...".format(year), end=" ")
#     query = """
#     CALL algo.pageRank(
#     'MATCH (p:Quanta) WHERE p.year <= {} RETURN id(p) as id',
#     'MATCH (p1:Quanta)-[:CITES]->(p2:Quanta) RETURN id(p1) as source, id(p2) as target',
#     {{graph:'cypher', writeProperty:'pageRank_{}', iterations:5, write: true, concurrency:20}});
#     """.format(year,year)
#     graph.run(query).evaluate()
    
#     print("Pulling out and saving results...", end=" ")
#     query = """
#     MATCH (a:Quanta) 
#     WHERE a.year <= {} 
#     RETURN id(a), a.title, a.pageRank_{}""".format(year,year)
#     df = graph.run(query).to_data_frame()
#     df['year'] = year
#     dfs.append(df)
#     print("Done.")
    
# end_time = time.time()
# print(end_time-start_time)
# # result = pd.concat(dfs).pivot_table(index='a.title', columns='year', values='a.pageRank')    

In [3]:
# Run STREAMING PageRank for each cutoff year from 1950 to 2020, in steps of 10

import pandas as pd
import time

# For each cutoff year, run streaming PageRank over the Quanta nodes published
# strictly before that year and keep the 1000 highest-scoring works.
# Results are collected in `dfs`, one DataFrame per cutoff year.
start_time = time.time()
start_year, end_year, step = 1950, 2020, 10
dfs = []
for year in range(start_year, end_year+1, step):
    
    # < IS MUCH FASTER THAN <=
    # flush=True so the progress message is emitted before the long-running query starts.
    print("Running PageRank on works from < {}...".format(year), end=" ", flush=True)
    query_start_time = time.time()
    # Doubled braces are literal Cypher map braces; the single {} receives the cutoff year.
    # The node's own year is returned as `pub_year` so that it is not clobbered
    # by the cutoff-year column assigned below.
    # NOTE(review): the relationship statement is not restricted by year; presumably
    # edges touching nodes outside the node statement are dropped by the projection —
    # confirm against the graph-algorithms documentation.
    query = """
    CALL algo.pageRank.stream(
    'MATCH (p:Quanta) WHERE p.year < {} RETURN id(p) as id',
    'MATCH (p1:Quanta)-[:CITES]->(p2:Quanta) RETURN id(p1) as source, id(p2) as target',
    {{graph:'cypher', iterations:5, concurrency:20}})
    YIELD node, score
    WITH * 
    ORDER BY score DESC
    LIMIT 1000
    RETURN node.id as id, node.title as title, node.lang as lang, node.year as pub_year, 
    node.keywords as keywords, node.fos as fos, score;
    """.format(year)
    df = graph.run(query).to_data_frame()
    # `year` is the cutoff year of this run (the pivot column used when exporting),
    # not the publication year of the work.
    df['year'] = year
    dfs.append(df)
    query_end_time = time.time()
    print("Done ({:.2f} minutes).".format((query_end_time-query_start_time)/60))
    
end_time = time.time()
print("Finished all calculations in {:.2f} minutes.".format((end_time-start_time)/60))


Running PageRank on works from < 1950... Done (100.33 minutes).
Running PageRank on works from < 1960... Done (47.19 minutes).
Running PageRank on works from < 1970... Done (44.08 minutes).
Running PageRank on works from < 1980... Done (44.80 minutes).
Running PageRank on works from < 1990... Done (46.91 minutes).
Running PageRank on works from < 2000... Done (49.85 minutes).
Running PageRank on works from < 2010... Done (56.03 minutes).
Running PageRank on works from < 2020... Done (60.00 minutes).
Finished all calculations in 449.18 minutes.


In [4]:
# Write result BY TITLE to CSV
import os

# Rows are titles, columns are cutoff years, cells are PageRank scores.
# pivot_table aggregates with the mean by default, so works that share a
# title within one cutoff year are averaged into a single row.
result = pd.concat(dfs).pivot_table(index='title', columns='year', values='score')    
file_path = '/tmp/data/result/impactByTitle_166M_{}-{}-{}.csv'.format(start_year, end_year,step)
print("Writing results to {}...".format(file_path), end=" ")
# Create the output directory first: to_csv does not create missing parent
# directories, and a failure here would come only after the long PageRank run.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
# Strip commas from the titles before writing the comma-separated file.
result.index = result.index.str.replace(",","")
result.to_csv(path_or_buf=file_path, sep=",", header=True, index=True)
print("Done.")

Writing results to /tmp/data/impactByTitle_166M_1950-2020-10.csv... Done.


In [None]:
# # Graph loading (notes; do not execute)
# # // Load graph
# CALL algo.graph.load('my-graph','Label','REL_TYPE',{graph:'heavy',..other config...})
#   YIELD name, graph, direction, undirected, sorted, nodes, loadMillis, alreadyLoaded,
#         nodeWeight, relationshipWeight, nodeProperty, loadNodes, loadRelationships;

# # // Info on loaded graph
# CALL algo.graph.info('my-graph')
#   YIELD name, type, exists, removed, nodes;

# # // Use graph
# CALL algo.pageRank(null,null,{graph:'my-graph',...})


# # // Remove graph
# CALL algo.graph.remove('my-graph')
#   YIELD name, type, exists, removed, nodes;