In [None]:
import json
import os
import time

import pandas as pd
from py2neo import Graph, Node, Relationship
# Visible confirmation that the import cell ran to completion.
print('hi there')

In [None]:
# Connection settings are read from the environment so that real credentials do
# not have to be committed with the notebook. The fallbacks are the values this
# notebook originally hardcoded, so behaviour is unchanged when the variables
# are unset.
# NOTE(review): the default password is still visible here; drop the fallback
# once every environment that runs this notebook sets NEO4J_PASSWORD.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://neo4j-quanta:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "myneo")

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Read the store counters once and reuse them for both numbers in the message.
primitive_counts = graph.database.primitive_counts
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    primitive_counts['NumberOfNodeIdsInUse'],
    primitive_counts['NumberOfRelationshipIdsInUse']))

In [None]:
def query_to_df(query, graph):
    """Run a Cypher query and return its result as a DataFrame.

    Prints a start marker, executes the query, then prints the elapsed time
    in minutes on the same output line.

    Args:
        query: Cypher query text passed unchanged to ``graph.run``.
        graph: Object exposing ``run(query)`` whose result provides
            ``to_data_frame()`` (here, the py2neo ``Graph`` connection).

    Returns:
        Whatever ``graph.run(query).to_data_frame()`` returns.
    """
    print("Starting query...", end=" ")
    # perf_counter is monotonic, so the reported duration cannot go negative or
    # jump if the system clock is adjusted while a long query is running.
    query_start_time = time.perf_counter()
    df = graph.run(query).to_data_frame()
    elapsed_minutes = (time.perf_counter() - query_start_time) / 60
    print("Done ({:.2f} minutes).".format(elapsed_minutes))
    return df

In [None]:
# Number of Early Adopters by Year
#
# For each cited paper q (rows are grouped by q.title and q.year), counts the
# distinct author names of citing papers published exactly 1, 2 and 3 years
# after q. An author name is counted only in the first of those years in which
# it appears: year_2 excludes names already in year_1, and year_3 excludes
# names already in year_1 or year_2.
#
# NOTE: the `q.year > 2017` filter is a temporary limit to speed up test runs.
# Eventually the results will need to be written to CSV and processed in batches.
#
# NOTE(review): citing papers with p.year <= q.year pass the WHERE clause but
# match none of p1/p2/p3, so their authors are not counted here - confirm that
# this is intended.
# NOTE(review): grouping is on q.title/q.year rather than on q itself, so
# distinct papers sharing a title and year would be merged - confirm titles
# are unique.
# NOTE(review): authors are de-duplicated by a.name, so different authors with
# the same name count once.

query = """
MATCH (a:NatureAuthor)-[:AUTHORED]->(p:Quanta)-[:CITES]->(q:Quanta)
WHERE p.year < q.year + 4 AND q.year > 2017
WITH *, q.year+1 as p1, q.year+2 as p2, q.year+3 as p3
WITH q.title as title, p1, p2, p3, q.year as qyear, apoc.coll.sortMulti(collect({name:a.name, year:p.year}), ['^name','^year']) AS alist
WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p1 | x.name]) AS year_1
WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p2 and not x.name in year_1 | x.name]) AS year_2
WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p3 and not x.name in year_1 and not x.name in year_2 | x.name]) AS year_3
return 
    title, 
    size(year_1) as early_adopters_1, 
    size(year_2) as early_adopters_2,
    size(year_3) as early_adopters_3"""

# One row per cited title: early_adopters_1/2/3 are the sizes of the three sets.
df_earlyadopters = query_to_df(query, graph)
df_earlyadopters.head()

In [None]:
# Number of Uninfected Neighbors of Early Adopters
#
# First part: for each cited paper q (grouped by q.title and q.year), year_N is
# the cumulative set of author nodes that wrote a citing paper with
# p.year <= q.year + N.
# Second part: matches COAUTHOR pairs (n, b) where b is in year_3, and for each
# N emits n.name when b is in year_N and n is not in year_N; the RETURN sums
# the sizes of those per-row lists for each title.
#
# NOTE: the `q.year > 2017` filter is a temporary limit to speed up test runs.
# Eventually the results will need to be written to CSV and processed in batches.
#
# NOTE(review): `WITH *, COLLECT(n)` keeps n and b as grouping keys, so nlist
# holds only the current row's n. The sums therefore look like counts of
# (adopter, neighbor) pairs rather than distinct neighbors - confirm which
# of the two is wanted.
# NOTE(review): year_N is built from :NatureAuthor nodes while b is matched as
# :PhysicsAuthor, so `b IN year_3` presumably relies on authors carrying both
# labels - verify against the graph schema.

query = """
MATCH (a:NatureAuthor)-[:AUTHORED]->(p:Quanta)-[:CITES]->(q:Quanta)
WHERE p.year < q.year + 4 AND q.year > 2017
WITH *, q.year+1 as p1, q.year+2 as p2, q.year+3 as p3
WITH q.title as title, p1, p2, p3, q.year as qyear, 
    apoc.coll.sortMulti(collect({person:a, year:p.year}), ['^person.name','^year']) AS alist
// alist is people who have ever cited TITLE within 3 years of TITLE being published
WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p1|x.person]) AS year_1
WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p2|x.person]) AS year_2
WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p3|x.person]) AS year_3
// year_3 is the set of people who has written a paper that cites TITLE within 3 years of TITLE being published

MATCH (n:PhysicsAuthor)-[:COAUTHOR]-(b:PhysicsAuthor)
WHERE b IN year_3
WITH *, COLLECT(n) AS nlist
WITH *, apoc.coll.toSet([x in nlist where b in year_1 and not x in year_1 | x.name]) AS y1_neighbors
WITH *, apoc.coll.toSet([x in nlist where b in year_2 and not x in year_2 | x.name]) AS y2_neighbors
WITH *, apoc.coll.toSet([x in nlist where b in year_3 and not x in year_3 | x.name]) AS y3_neighbors
RETURN 
    title, 
    sum(size(y1_neighbors)) as neighbors_1, 
    sum(size(y2_neighbors)) as neighbors_2, 
    sum(size(y3_neighbors)) as neighbors_3"""

df_uninfectedneighbors = query_to_df(query, graph)
# Preview the frame assigned above (the original line misspelled the variable
# name, which raised NameError on a fresh kernel).
df_uninfectedneighbors.head()

In [None]:
# Number of Infected Communities
#
# For each cited paper q (grouped by q.title and q.year), year_N is the set of
# distinct head(a.louvain) values over authors of citing papers with
# p.year <= q.year + N (cumulative, so year_1 is a subset of year_2, and so
# on). The returned infected_communities_N is the size of that set.
#
# NOTE: the `q.year>2017` filter is a temporary limit to speed up test runs.
# Eventually the results will need to be written to CSV and processed in batches.
#
# NOTE(review): `louvain` is read with head(), so it is presumably a list of
# community ids per author; which hierarchy level head() selects depends on
# how the property was written - verify.

query = """
MATCH (a:NatureAuthor)-[:AUTHORED]->(p:Quanta)-[:CITES]->(q:Quanta)
WHERE p.year < q.year + 4 AND q.year>2017
WITH *, q.year+1 as p1, q.year+2 as p2, q.year+3 as p3
WITH q.title as title, p1, p2, p3, q.year as qyear, apoc.coll.sortMulti(collect({person:a, year:p.year}), ['^person.name','^year']) AS alist
WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p1|head(x.person.louvain)]) AS year_1
WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p2|head(x.person.louvain)]) AS year_2
WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p3|head(x.person.louvain)]) AS year_3
RETURN 
    title, 
    size(year_1) as infected_communities_1, 
    size(year_2) as infected_communities_2, 
    size(year_3) as infected_communities_3"""

# One row per cited title with the three cumulative community counts.
df_infectedcommunities = query_to_df(query, graph)
df_infectedcommunities.head()

In [None]:
# Usage Entropy
#
# For each cited paper q (grouped on the q node itself), clist is the set of
# distinct {paper, community} pairs, where community is last(a.louvain) of an
# author of the citing paper. For each N in 1..3:
#   year_N_count = frequency of each community among pairs whose paper has
#                  year <= q.year + N
#   sN           = number of distinct citing papers in that same window
# The returned usage_entropy_N is -sum(f * log2(f)) with f = count / sN
# (log(x)/log(2) converts Cypher's natural log to base 2).
#
# NOTE: the `q.year>2017` filter is a temporary limit to speed up test runs.
# Eventually the results will need to be written to CSV and processed in batches.
#
# NOTE(review): this cell uses last(a.louvain) while the infected-communities
# and adoption-entropy cells use head(...louvain) - confirm the different
# community level is intentional.
# NOTE(review): a paper whose authors span several communities contributes one
# pair per community, so the counts can sum to more than sN and the f values
# need not sum to 1 - confirm this is the intended normalisation.

query = """
MATCH (a:NatureAuthor)-[:AUTHORED]->(p:Quanta)-[:CITES]->(q:Quanta)
WHERE p.year < q.year + 4 AND q.year>2017
WITH 
    q, q.title as title, 
    apoc.coll.toSet(collect({paper:p, community:last(a.louvain)})) as clist, 
    q.year+1 as p1, q.year+2 as p2, q.year+3 as p3
WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper.year<=p1|x.community]) AS year_1_count, size(apoc.coll.toSet([x IN clist WHERE x.paper.year<=p1|x.paper])) as s1
WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper.year<=p2|x.community]) AS year_2_count, size(apoc.coll.toSet([x IN clist WHERE x.paper.year<=p2|x.paper])) as s2
WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper.year<=p3|x.community]) AS year_3_count, size(apoc.coll.toSet([x IN clist WHERE x.paper.year<=p3|x.paper])) as s3
RETURN 
    title, 
    reduce(i = 0.0, x IN year_1_count| i - toFloat(x.count)/s1*log(toFloat(x.count)/s1)/log(2)) as usage_entropy_1,
    reduce(i = 0.0, x IN year_2_count| i - toFloat(x.count)/s2*log(toFloat(x.count)/s2)/log(2)) as usage_entropy_2,
    reduce(i = 0.0, x IN year_3_count| i - toFloat(x.count)/s3*log(toFloat(x.count)/s3)/log(2)) as usage_entropy_3"""

# One row per cited paper; an empty window reduces over no items and yields 0.0.
df_usageentropy = query_to_df(query, graph)
df_usageentropy.head()

In [None]:
# Adoption Entropy
#
# For each cited paper q (grouped by q.title and q.year via p1..p3), alist is
# the set of distinct {person, year} pairs over authors of citing papers.
# For each N in 1..3:
#   year_N_people = distinct author nodes with a citing paper whose
#                   year <= q.year + N (cumulative)
#   year_N_count  = frequency of head(louvain) over those authors
#   sN            = number of those authors
# The returned adoption_entropy_N is -sum(f * log2(f)) with f = count / sN
# (log(x)/log(2) converts Cypher's natural log to base 2).
#
# NOTE: the `q.year>2017` filter is a temporary limit to speed up test runs.
# Eventually the results will need to be written to CSV and processed in batches.
#
# NOTE(review): grouping is on q.title and the derived years rather than on q
# itself, so distinct papers sharing a title and year would be merged -
# confirm titles are unique.

query = """
MATCH (a:NatureAuthor)-[:AUTHORED]->(p:Quanta)-[:CITES]->(q:Quanta)
WHERE p.year < q.year + 4 AND q.year>2017
WITH *, q.year+1 as p1, q.year+2 as p2, q.year+3 as p3
WITH q.title as title, p1, p2, p3, 
    apoc.coll.toSet(collect({person:a, year:p.year})) AS alist
WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p1|x.person]) AS year_1_people
WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p2|x.person]) AS year_2_people
WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p3|x.person]) AS year_3_people
WITH *, apoc.coll.frequencies([x IN year_1_people | head(x.louvain)]) AS year_1_count, size(year_1_people) as s1
WITH *, apoc.coll.frequencies([x IN year_2_people | head(x.louvain)]) AS year_2_count, size(year_2_people) as s2
WITH *, apoc.coll.frequencies([x IN year_3_people | head(x.louvain)]) AS year_3_count, size(year_3_people) as s3
RETURN 
    title, 
    reduce(i = 0.0, x IN year_1_count| i - toFloat(x.count)/s1*log(toFloat(x.count)/s1)/log(2)) as adoption_entropy_1,
    reduce(i = 0.0, x IN year_2_count| i - toFloat(x.count)/s2*log(toFloat(x.count)/s2)/log(2)) as adoption_entropy_2, 
    reduce(i = 0.0, x IN year_3_count| i - toFloat(x.count)/s3*log(toFloat(x.count)/s3)/log(2)) as adoption_entropy_3"""

# One row per cited title with the three cumulative entropy values.
df_adoptionentropy = query_to_df(query, graph)
df_adoptionentropy.head()