In [1]:
import json
import os
import time
from functools import reduce

import numpy as np
import pandas as pd
from py2neo import Graph, Node, Relationship
from tqdm.autonotebook import tqdm



In [3]:
# Connection settings are read from the environment so credentials do not have
# to live in the notebook. The fallbacks are the values this notebook originally
# hardcoded, so it behaves the same when the variables are unset.
# NOTE(review): drop the password fallback once NEO4J_PASSWORD is provided
# wherever this notebook runs.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://neo4j-magtwo:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "myneo")
graph = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Print the node / relationship ID counts reported by the server as a quick
# check that the connection works. The counts are fetched once and reused.
primitive_counts = graph.database.primitive_counts
n_nodes = primitive_counts['NumberOfNodeIdsInUse']
n_relationships = primitive_counts['NumberOfRelationshipIdsInUse']
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    n_nodes, n_relationships))

Connected to graph database with 758,202,153 nodes and 3,482,624,155 relationships!


In [4]:
# Years to process: the loop below visits start_year..end_year (both inclusive)
# in increments of `step`.
start_year = 2018
end_year = 2020
step = 1

# Lower bound on publication year: papers with year < min_year are excluded
# from every query built below.
min_year = 1950

In [5]:
# Write time-series of PageRank values to Quanta
#
# For every year in start_year..end_year (inclusive) the loop below:
#   [1] runs PageRank over the Quanta nodes selected by the year/venue filter
#       and writes the score to the node property tspr<year>,
#   [2] computes the mean and standard deviation of tspr<year>, grouped by each
#       node's publication year,
#   [3] writes tsprn<year> = ABS(tspr<year> - mean) / stdev, looking up the
#       statistics of the node's own publication year through a CASE expression.
#
# NOTE(review): the recorded run of this cell stopped at step [1] with
# "There is no procedure with the name `algo.pageRank` registered". The server
# must expose the `algo.*` procedures (presumably the Neo4j Graph Algorithms
# plugin -- confirm) before this cell can run.


def _case_expression(stats, value_column):
    """Build a Cypher ``CASE p.year WHEN <year> THEN <value> ... END`` expression.

    Parameters
    ----------
    stats : pandas.DataFrame
        Must contain a ``year`` column and the column named by ``value_column``.
    value_column : str
        Column whose values become the THEN result of each branch.

    Returns
    -------
    str
        The CASE expression, with one WHEN branch per row of ``stats``.
    """
    branches = [
        'WHEN {:.0f} THEN {}'.format(year, value)
        for year, value in zip(stats['year'], stats[value_column])
    ]
    return 'CASE p.year ' + ' '.join(branches) + ' END'


print("Writing PageRank for {} to {}...".format(start_year, end_year))

start_time = time.time()
for year in tqdm(range(start_year, end_year+1, step)):
    year_start_time = time.time()

    # Node filter shared by all three queries: papers published in
    # min_year..year (inclusive) in the venue "Nature".
    where_clause = 'WHERE p.year < {} AND p.year > {} AND p.venue="Nature"'.format(year+1, min_year-1)

    # [1] Write standard PageRank values for current-year graph
    print("\t{}: Writing standard PageRank values...".format(year), end=" ")
    query_start_time = time.time()
    pagerank_query = """
    CALL algo.pageRank(
    'MATCH (p:Quanta) {} RETURN id(p) as id',
    'MATCH (p1:Quanta)-[:CITES]->(p2:Quanta) RETURN id(p1) as source, id(p2) as target',
    {{graph:'cypher', iterations:35, write:true, writeProperty:"tspr{}"}});
    """.format(where_clause, year)
    graph.run(pagerank_query)
    print("Done ({:.2f} min).".format((time.time()-query_start_time)/60))

    # [2] Collect reference-set normalization values for all years up to the current year
    # (one row per publication year, because the aggregates are grouped by p.year).
    print("\t{}: Calculating reference-set statistics FOR SAME YEAR...".format(year), end=" ")
    query_start_time = time.time()
    stats_query = """
    MATCH (p:Quanta)
    {}
    RETURN 
        p.year as year,
        AVG(p.tspr{}) as avgpr, 
        stDev(p.tspr{}) as stdevpr
    ORDER BY year DESC
    """.format(where_clause, year, year)
    df = graph.run(stats_query).to_data_frame()
    print("Finished ({:.2f} min).".format((time.time()-query_start_time)/60))

    if df.empty:
        # No node matched the filter, so there are no statistics and nothing to
        # normalize. Skip the year instead of failing on a missing column.
        print("\t{}: No reference-set statistics returned; skipping normalization.".format(year))
        continue

    # [3] Normalize current-year PageRank values using calculated reference set from current year
    print("\t{}: Normalizing to time-scaled PageRank values...".format(year), end=" ")

    avg_case_str = _case_expression(df, 'avgpr')
    std_case_str = _case_expression(df, 'stdevpr')

    query_start_time = time.time()
    normalize_query = """
    CALL apoc.periodic.iterate(
    'MATCH (p:Quanta) {} RETURN p',
    'SET p.tsprn{} = ABS(p.tspr{}-{})/{}',
    {{batchSize:10000, parallel:true}})
    """.format(where_clause, year, year, avg_case_str, std_case_str)
    graph.run(normalize_query)
    print("Finished writing ({:.2f} min).".format((time.time()-query_start_time)/60))

    print("\t{}: Wrote ts-PageRank to \"tspr{}\" ({:.2f} min)".format(year, year, (time.time()-year_start_time)/60))

print("Finished everything ({:.2f} min).".format((time.time()-start_time)/60))

Writing PageRank for 2018 to 2020...


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

	2018: Writing standard PageRank values... 


ClientError: Procedure Not Found: There is no procedure with the name `algo.pageRank` registered for this database instance. Please ensure you've spelled the procedure name correctly and that the procedure is properly deployed.