In [None]:
import pandas as pd
import json
from py2neo import Graph, Node, Relationship
import numpy as np
import time

In [None]:
# PageRank over cumulative snapshots of the citation graph: for each cutoff
# year, rank every Quanta node whose `year` is strictly before the cutoff and
# pull the ranked nodes (with their metadata) into a DataFrame.
graph = Graph("bolt://neo4j:7687")

start_time = time.time()
start_year, end_year, step = 1985, 2010, 5
dfs = []  # one result frame per cutoff year, in the same order as the loop
for year in range(start_year, end_year+1, step):

    # < IS MUCH FASTER THAN <=
    print("Running PageRank on works from < {}...".format(year), end=" ")
    query_start_time = time.time()
    # The single {} is the cutoff year; the doubled braces are literal braces
    # of the Cypher config map that must survive str.format.
    query = """
    CALL algo.pageRank.stream(
    'MATCH (p:Quanta) WHERE p.year < {} RETURN id(p) as id',
    'MATCH (p1:Quanta)-[:CITES]->(p2:Quanta) RETURN id(p1) as source, id(p2) as target',
    {{graph:'cypher', iterations:20, write:false, concurrency:20}})
    YIELD node, score
    WITH
        *,
        node.id AS id,
        node.title AS title,
        node.lang AS lang,
        node.year AS year,
        node.keywords AS keywords,
        node.fos AS fos,
        node.publisher AS publisher,
        score AS page_rank
    ORDER BY page_rank DESC
    LIMIT 10000000
    RETURN
        id,
        title,
        lang,
        year,
        keywords,
        fos,
        publisher,
        page_rank;
    """.format(year)

    df = graph.run(query).to_data_frame()
    # Keep every snapshot; previously each iteration overwrote `df` and only
    # the last cutoff year survived the loop.
    dfs.append(df)
    print("Finished ({:.2f} minutes)".format((time.time()-query_start_time)/60))

print("Finished all calculations in {:.2f} minutes.".format((time.time()-start_time)/60))
    

In [None]:
# Time Scaled PageRank
# For each cutoff year, run streaming PageRank on the works published strictly
# before that year, then scale each score against the mean / standard deviation
# of the scores of works sharing the same publication year.
graph = Graph("bolt://neo4j:7687")

# Cutoff years run from start_year to end_year (inclusive) in `step` increments.
start_time = time.time()
start_year, end_year, step = 1985, 2010, 5
dfs = []  # one result frame per cutoff year, in the same order as the loop
for year in range(start_year, end_year+1, step):

    # < IS MUCH FASTER THAN <=
    print("Running PageRank on works from < {}...".format(year), end=" ")
    query_start_time = time.time()
    # The single {} is the cutoff year (it used to be a hard-coded 2015, which
    # made every iteration run the same query); doubled braces are literal
    # braces of Cypher maps that must survive str.format.
    # The collected map carries every field the RETURN clause reads: keys that
    # are missing from a Cypher map evaluate to null, which previously nulled
    # out id/lang/keywords/fos/publisher and the scaled score itself.
    # NOTE(review): ABS() folds below-average scores onto the positive side;
    # kept from the original query - confirm that is the intended ranking.
    query = """
    CALL algo.pageRank.stream(
    'MATCH (p:Quanta) WHERE p.year < {} RETURN id(p) as id',
    'MATCH (p1:Quanta)-[:CITES]->(p2:Quanta) RETURN id(p1) as source, id(p2) as target',
    {{graph:'cypher', iterations:20, write:false, concurrency:20}})
    YIELD node, score
    WITH
        *,
        node.id AS id,
        node.title AS title,
        node.lang AS lang,
        node.year AS year,
        node.keywords AS keywords,
        node.fos AS fos,
        node.publisher AS publisher,
        score AS pagerank
    ORDER BY pagerank DESC
    WITH year, COLLECT({{id: id, title: title, lang: lang, keywords: keywords, fos: fos, publisher: publisher, page_rank: pagerank}})[..1000] AS data, AVG(pagerank) AS avg_page_rank, stDev(pagerank) as stdDev
    UNWIND data AS d
    RETURN year,
        d.id AS id,
        d.title AS title,
        d.lang AS lang,
        d.keywords as keywords,
        d.fos as fos,
        d.publisher as publisher,
        ABS(d.page_rank-avg_page_rank)/stdDev AS scaled_score;
    """.format(year)

    df = graph.run(query).to_data_frame()
    # Keep every snapshot; previously each iteration overwrote `df` and only
    # the last cutoff year survived the loop.
    dfs.append(df)
    print("Finished ({:.2f} minutes)".format((time.time()-query_start_time)/60))


print("Finished all calculations in {:.2f} minutes.".format((time.time()-start_time)/60))


In [None]:
def normalize_and_sort(df, top_n=10000):
    """Standardise scores within each year and return the highest-scaled rows.

    For every row the per-year mean of ``score`` is subtracted and the result
    is divided by the per-year sample standard deviation (a z-score computed
    within each ``year`` group).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``year`` column (grouping key) and a numeric ``score``
        column. The input frame is not modified.
    top_n : int, default 10000
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus ``scores_minus_mean`` and
        ``scaled_scores``, sorted by ``scaled_scores`` descending and truncated
        to ``top_n`` rows. Years with a single row, or whose scores are all
        equal, yield NaN ``scaled_scores``; ``sort_values`` places those last.
    """
    yearly_scores = df.groupby('year')['score']

    # transform() broadcasts each group's statistic back onto the original row
    # index, so the arithmetic below is row-aligned with `df`.
    scores_minus_mean = df['score'] - yearly_scores.transform('mean')
    scaled_scores = scores_minus_mean / yearly_scores.transform('std')

    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    scored = df.assign(
        scores_minus_mean=scores_minus_mean,
        scaled_scores=scaled_scores,
    )
    return scored.sort_values(['scaled_scores'], ascending=False).head(top_n)

In [None]:
# Rank the most recently fetched result frame by per-year scaled score.
# NOTE(review): normalize_and_sort reads `year` and `score` columns, while the
# queries earlier in this notebook alias their score as `page_rank` /
# `scaled_score` - confirm `df` carries a `score` column before running this.
results = normalize_and_sort(df)