In [16]:
import json
from py2neo import Graph
from py2neo.data import Node, Relationship

# Need to get authentication working, currently NEO4J_AUTH=none
graph = Graph("bolt://neo4j:7687")
print("Connected to graph database with {:,} nodes and {:,} relationships!".format
     (graph.database.primitive_counts['NumberOfNodeIdsInUse'], 
      graph.database.primitive_counts['NumberOfRelationshipIdsInUse']))

Connected to graph database with 1,000,010 nodes and 37,041 relationships!


In [None]:
# # FIRST ATTEMPT AT A QUERY THAT WRITES pagerank PROPERTY TO THE ENTIRE GRAPH
# import pandas as pd

# print("Running PageRank STREAM on the entire graph...", end=" ", flush=True)
# query = """
# CALL algo.pageRank.stream('Quanta','CITES',{iterations:20, concurrency:20})
# YIELD node, score
# RETURN node.id, node.title, score
# LIMIT 1000
# """
# df = graph.run(query).to_data_frame()
# print("Done.")

In [None]:
df

In [None]:
# # Attempt non-streaming version of pagerank 
# print("Running  PageRank on entire graph...", end=" ")
# query = """
# CALL algo.pageRank(
# 'Quanta',
# 'CITES',
# {iterations:20, write: true, writeProperty:'pageRank2018'});
# """

In [None]:
# # Run non-streaming PageRank on each year from 1800 to 1805

# import pandas as pd
# import time

# start_year, end_year, step = 1900, 1920, 5
# dfs = []
# start_time = time.time()
# for year in range(start_year, end_year+1, step):

#     print("Running PageRank on works from <= {}...".format(year), end=" ")
#     query = """
#     CALL algo.pageRank(
#     'MATCH (p:Quanta) WHERE p.year <= {} RETURN id(p) as id',
#     'MATCH (p1:Quanta)-[:CITES]->(p2:Quanta) RETURN id(p1) as source, id(p2) as target',
#     {{graph:'cypher', writeProperty:'pageRank_{}', iterations:5, write: true, concurrency:20}});
#     """.format(year,year)
#     graph.run(query).evaluate()
    
#     print("Pulling out and saving results...", end=" ")
#     query = """
#     MATCH (a:Quanta) 
#     WHERE a.year <= {} 
#     RETURN id(a), a.title, a.pageRank_{}""".format(year,year)
#     df = graph.run(query).to_data_frame()
#     df['year'] = year
#     dfs.append(df)
#     print("Done.")
    
# end_time = time.time()
# print(end_time-start_time)
# # result = pd.concat(dfs).pivot_table(index='a.title', columns='year', values='a.pageRank')    

In [32]:
# Run STREAMING PageRank on each year from 1800 to 1805

import pandas as pd
import time

start_time = time.time()
start_year, end_year, step = 1900, 2000, 10
dfs = []
for year in range(start_year, end_year+1, step):
    
    print("Running PageRank on works from <= {}...".format(year), end=" ")
    query_start_time = time.time()
    query = """
    CALL algo.pageRank.stream(
    'MATCH (p:Quanta) WHERE p.year <= {} RETURN id(p) as id',
    'MATCH (p1:Quanta)-[:CITES]->(p2:Quanta) RETURN id(p1) as source, id(p2) as target',
    {{graph:'cypher', iterations:5, concurrency:20}})
    YIELD node, score
    RETURN node.id as id, node.title as title, node.lang as lang, node.year as year, 
    node.keywords as keywords, node.fos as fos, score
    LIMIT 1000;
    """.format(year,year)
    df = graph.run(query).to_data_frame()
    df['year'] = year
    dfs.append(df)
    query_end_time = time.time()
    print("Done ({:.2f} minutes).".format((query_end_time-query_start_time)/60))
    
end_time = time.time()
print("Finished all calculations in {:.2f} minutes.".format((end_time-start_time)/60))


Running PageRank on works from <= 1900... Done (0.03 minutes).
Running PageRank on works from <= 1910... Done (0.02 minutes).
Running PageRank on works from <= 1920... Done (0.02 minutes).
Running PageRank on works from <= 1930... Done (0.03 minutes).
Running PageRank on works from <= 1940... Done (0.03 minutes).
Running PageRank on works from <= 1950... Done (0.03 minutes).
Running PageRank on works from <= 1960... Done (0.02 minutes).
Running PageRank on works from <= 1970... Done (0.02 minutes).
Running PageRank on works from <= 1980... Done (0.02 minutes).
Running PageRank on works from <= 1990... Done (0.02 minutes).
Running PageRank on works from <= 2000... Done (0.03 minutes).
Finished all calculations in 0.26 minutes.


In [37]:
# Write result BY TITLE to CSV
result = pd.concat(dfs).pivot_table(index='title', columns='year', values='score')    
file_path = '/tmp/data/impact_20M_{}-{}.csv'.format(start_year, end_year)
print("Writing results to {}...".format(file_path), end=" ")
result.index = result.index.str.replace(",","")
result.to_csv(path_or_buf=file_path, sep=",", header=True, index=True)
print("Done.")

Writing results to /tmp/data/impact_20M_1900-2000.csv... Done.


Unnamed: 0,fos,id,keywords,lang,score,title,year
0,,0004bd0d-d398-4397-8627-1416cb9cc0c3,[œuvre auteur resume editions manuscrits criti...,fr,0.15,Le calife de Bagdad. Ouverture,2000
1,[Biology],0010dd7d-78d5-48ce-a1af-0b0deb01aa49,[new south wales],en,0.15,"XVIII. Description of Conserva umbilicata, a n...",2000
2,,001a9805-df08-4a02-a5ab-552b46a3e51f,,en,0.15,A Flea in Her Ear,2000
3,"[Demography, History, Performance art]",002794f5-ca24-45f8-88fd-a6807ba1ce66,,en,0.15,"Allan Kellehear, A Social History of Dying: Ed...",2000
4,,00311e08-7ab9-463d-b840-b394ae6440d7,,fr@@@ja,0.15,"Opinion d'Arnould, sur le projet de loi relati...",2000
5,,00528de8-eda6-4d78-9a1d-0037030f3ccd,"[kuropatnicki ewaryst andrzej 1734 1788, kazan...",pl,0.15,Kazanie na uroczystość poświęcenia kościoła ta...,2000
6,,0056d48a-5e23-4965-a1e6-8480bc2e910c,,,0.15,On an Improved Reflecting Circle. [Abstract],2000
7,"[Humanities, Visual arts, Art, Painting]",005f348a-e51e-407c-98cd-5c90e91be34e,"[painting paintings, nineteenth century, oil o...",en,0.15,Self portrait with Spectacles,2000
8,"[Art, Art history, Performance art]",006d4d1f-db93-40bf-bead-cd4408b28280,[œuvre auteur resume editions manuscrits criti...,fr,0.15,Die Seligkeit der Liebe. MH 783,2000
9,,00717262-df43-4c72-a612-d973ff6e76d4,,,0.15,Curiosités américaines ou Description des anim...,2000


In [None]:
# # Graph loading (notes; do not execute)
# # // Load graph
# CALL algo.graph.load('my-graph','Label','REL_TYPE',{graph:'heavy',..other config...})
#   YIELD name, graph, direction, undirected, sorted, nodes, loadMillis, alreadyLoaded,
#         nodeWeight, relationshipWeight, nodeProperty, loadNodes, loadRelationships;

# # // Info on loaded graph
# CALL algo.graph.info('my-graph')
#   YIELD name, type, exists, removed, nodes;

# # // Use graph
# CALL algo.pageRank(null,null,{graph:'my-graph',...})


# # // Remove graph
# CALL algo.graph.remove('my-graph')
#   YIELD name, type, exists, removed, nodes;