In [1]:
import json
import os
import time
import warnings
from functools import reduce

import numpy as np
import pandas as pd

# Keep the filter ahead of the imports below (same relative order as before)
# so it is already installed while those packages are being imported.
warnings.simplefilter(action='ignore', category=FutureWarning)
from tqdm.autonotebook import tqdm

from py2neo import Graph, Node, Relationship

In [2]:
# Connection settings are read from the environment so credentials do not have
# to live in the notebook. The fallbacks reproduce the previous hard-coded
# development values, so the cell behaves as before when nothing is set.
# NOTE(review): remove the password fallback once NEO4J_PASSWORD is provisioned.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://dev_neo4j:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "myneo")

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Evaluate primitive_counts once and reuse it for both figures instead of
# evaluating the attribute twice.
primitive_counts = graph.database.primitive_counts
n_nodes = primitive_counts['NumberOfNodeIdsInUse']
n_relationships = primitive_counts['NumberOfRelationshipIdsInUse']
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    n_nodes, n_relationships))

def run_query(query, graph, print_query=False, run_query=True,
              print_only=False, to_df=False, verbose=True):
    """Optionally print and/or execute a query, reporting the execution time.

    Args:
        query: Query text passed to ``graph.run``.
        graph: Object exposing ``run(query)``; with ``to_df=True`` the value it
            returns must provide ``to_data_frame()``.
        print_query: Print the query text before anything else happens.
        run_query: Execute the query. (The name shadows this function inside
            the body; it is kept because callers may pass it by keyword.)
        print_only: Shortcut that forces ``print_query=True`` and
            ``run_query=False``.
        to_df: Return the result of ``to_data_frame()`` instead of the
            placeholder value.
        verbose: Print the elapsed time once the query has been executed.

    Returns:
        The value of ``graph.run(query).to_data_frame()`` when the query is
        executed with ``to_df=True``; otherwise the integer placeholder ``1``.
    """
    if print_only:
        print_query = True
        run_query = False

    if print_query:
        print(query)

    # Placeholder handed back whenever no data frame is produced; kept at 1 so
    # existing callers observe the same value as before.
    result = 1
    if not run_query:
        # Nothing was executed, so reporting "Query completed" would mislead.
        return result

    # Time only the execution itself, not the printing above.
    start_time = time.time()
    if to_df:
        result = graph.run(query).to_data_frame()
    else:
        graph.run(query)
    minutes_elapsed = (time.time() - start_time) / 60

    if verbose:
        print("Query completed in {:.2f} minutes.".format(minutes_elapsed))
    return result

Connected to graph database with 370,269,897 nodes and 220,155,390 relationships!


In [5]:
# Collect data

# Publication-year window; both bounds are exclusive in the WHERE clause below.
min_year = 1990
max_year = 2015

# Cypher template. For each Quanta published inside the window it matches the
# METRICS_IN relationships to the publication year, the following year and
# five years later, and returns their metric properties (missing ones become
# 0 via coalesce). Doubled braces are literal Cypher braces escaped for
# str.format; LIMIT 100 caps the result at 100 rows.
query_template = """
MATCH (q:Quanta)-[:PUBLISHED_IN]->(y:Year)
WHERE y.value>{miny} AND y.value < {maxy}
MATCH (q)-[m0:METRICS_IN]->(:Year {{value:y.value}})
MATCH (q)-[m1:METRICS_IN]->(:Year {{value:y.value+1}})
MATCH (q)-[m5:METRICS_IN]->(:Year {{value:y.value+5}})
RETURN id(q) as id, y.value as year, 
    coalesce(m0.earlyAdopters,0) as earlyAdopters_0,
    coalesce(m0.timeScaledPageRank, 0) as timeScaledPageRank_0,
    coalesce(m0.node2vec, 0) as node2vec_0,
    coalesce(m1.earlyAdopters,0) as earlyAdopters_1,
    coalesce(m1.timeScaledPageRank, 0) as timeScaledPageRank_1,
    coalesce(m1.node2vec, 0) as node2vec_1,
    coalesce(m5.timeScaledPageRank, 0) as timeScaledPageRank_5
LIMIT 100
"""
query = query_template.format(miny=min_year, maxy=max_year)

# print_only=True only echoes the query and does not execute it, so `df`
# receives run_query's placeholder return value rather than a DataFrame.
df = run_query(query, graph, print_only=True)


MATCH (q:Quanta)-[:PUBLISHED_IN]->(y:Year)
WHERE y.value>1990 AND y.value < 2015
MATCH (q)-[m0:METRICS_IN]->(:Year {value:y.value})
MATCH (q)-[m1:METRICS_IN]->(:Year {value:y.value+1})
MATCH (q)-[m5:METRICS_IN]->(:Year {value:y.value+5})
RETURN id(q) as id, y.value as year, 
    coalesce(m0.earlyAdopters,0) as earlyAdopters_0,
    coalesce(m0.timeScaledPageRank, 0) as timeScaledPageRank_0,
    coalesce(m0.node2vec, 0) as node2vec_0,
    coalesce(m1.earlyAdopters,0) as earlyAdopters_1,
    coalesce(m1.timeScaledPageRank, 0) as timeScaledPageRank_1,
    coalesce(m1.node2vec, 0) as node2vec_1,
    coalesce(m5.timeScaledPageRank, 0) as timeScaledPageRank_5
LIMIT 100

Query completed in 0.00 minutes.
