In [1]:
from py2neo import Graph, Node, Relationship 
from tqdm import tqdm
import glob, os, time
# Connection settings can be overridden through environment variables so the
# URI and credentials do not have to be edited in the notebook itself.
# NOTE(review): the fallbacks reproduce the values that used to be hard-coded
# here; drop the password fallback once NEO4J_PASSWORD is set wherever this
# notebook runs, so the secret no longer lives in the file.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://matlaber5.media.mit.edu:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "myneo")

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Read the counters once and reuse them for the summary line.
primitive_counts = graph.database.primitive_counts
n_nodes = primitive_counts['NumberOfNodeIdsInUse']
n_relationships = primitive_counts['NumberOfRelationshipIdsInUse']
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(n_nodes, n_relationships))

Connected to graph database with 758,202,148 nodes and 3,482,672,128 relationships!


In [2]:
# Display the primitive counters exposed by the connected database.
graph.database.primitive_counts

{'NumberOfRelationshipIdsInUse': 3482672128,
 'NumberOfPropertyIdsInUse': 2780481667,
 'NumberOfNodeIdsInUse': 758202148,
 'NumberOfRelationshipTypeIdsInUse': 13}

In [3]:
def query_to_df(query, graph):
    """Run ``query`` on ``graph`` and return the result as a data frame.

    A start marker is printed before the query is sent and the elapsed
    wall-clock time (in minutes) is printed once the frame has been built.

    Args:
        query: query text handed to ``graph.run``.
        graph: object whose ``run(query)`` result offers ``to_data_frame()``.

    Returns:
        Whatever ``to_data_frame()`` returns for the query result.
    """
    print("Starting query...", end=" ")
    started_at = time.time()
    cursor = graph.run(query)
    frame = cursor.to_data_frame()
    elapsed_minutes = (time.time() - started_at) / 60
    print("Done (%.2f minutes)." % elapsed_minutes)
    return frame

In [4]:
def run_query(query, graph, print_query=False, run_query=True, print_only=False, to_df=False, verbose=True):
    """Optionally print and/or execute ``query`` on ``graph``.

    Args:
        query: query text.
        graph: object exposing ``run(query)``; the result must offer
            ``to_data_frame()`` when ``to_df`` is true.
        print_query: print the query text before anything else.
        run_query: execute the query (ignored when ``print_only`` is true).
        print_only: shorthand for ``print_query=True, run_query=False``.
        to_df: return the result as a data frame instead of the sentinel.
        verbose: report the elapsed time after the query has been executed.

    Returns:
        The data frame when the query ran with ``to_df=True``; otherwise the
        sentinel value ``1`` (kept for backward compatibility).
    """
    if print_only:
        print_query = True
        run_query = False
    if print_query:
        print(query)

    df = 1
    if not run_query:
        # Nothing was executed, so there is no timing to report.
        return df

    # The timer is started for every executed query. It used to be started only
    # inside the print_only branch, and the elapsed time was only computed in
    # the non-to_df branch, so most call patterns raised NameError.
    start_time = time.time()
    cursor = graph.run(query)
    if to_df:
        df = cursor.to_data_frame()
    minutes_elapsed = (time.time() - start_time) / 60
    if verbose:
        print("Query completed in {:.2f} minutes.".format(minutes_elapsed))
    return df

In [None]:
############EARLY ADOPTERS#############

// Early adopters: for every paper q, count the distinct author names found on
// citing papers in the publication year (year_0) and in each of the three
// following years, excluding names already counted for an earlier year, and
// store the counts as `early_adopters` on q-[:METRICS_IN]->(:Year).
//
// Fixes relative to the previous draft:
//  * `year` is carried through the aggregating WITH; it was dropped there but
//    is still read by year_0 and by the first MERGE.
//  * Each Year node is MERGEd on its own before the relationship. A single
//    MERGE over the whole pattern creates a new Year node whenever q has no
//    matching METRICS_IN relationship yet.
//  * The USING INDEX hint is removed because the driving MATCH has no predicate
//    on y.value for the planner to apply it to.
// NOTE(review): collect() runs inside the batched action, so a q whose citing
// papers fall into different batches is presumably aggregated per batch only —
// confirm, or move the aggregation into the driving statement.
CALL apoc.periodic.iterate(
    "MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year)
    WITH DISTINCT q, y.value as year 
    MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
    WHERE p.year < q.year + 4 
    WITH DISTINCT p, q, year
    RETURN [p, q, year] as list",
    "WITH list[0] as p, list[1] AS q, list[2] as year
    MATCH (a:Author)-[:AUTHORED]->(p) 
    WITH q, year, year+1 as p1, year+2 as p2, year+3 as p3, apoc.coll.toSet(collect({name:a.name, year:p.year})) AS alist 
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year=year | x.name]) AS year_0 
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p1 and not x.name in year_0 | x.name]) AS year_1 
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p2 and not x.name in year_0 and not x.name in year_1 | x.name]) AS year_2
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p3 and not x.name in year_0 and not x.name in year_1 and not x.name in year_2 | x.name]) AS year_3
    MERGE (y0:Year {value: year})
    MERGE (q)-[m0:METRICS_IN]->(y0)
    SET m0.early_adopters = size(year_0)
    MERGE (y1:Year {value: p1})
    MERGE (q)-[m1:METRICS_IN]->(y1)
    SET m1.early_adopters = size(year_1)
    MERGE (y2:Year {value: p2})
    MERGE (q)-[m2:METRICS_IN]->(y2)
    SET m2.early_adopters = size(year_2)
    MERGE (y3:Year {value: p3})
    MERGE (q)-[m3:METRICS_IN]->(y3)
    SET m3.early_adopters = size(year_3)",
    {batchSize:5000, iterateList:true, parallel:false})

In [None]:
############UNINFECTED NEIGHBORS#############

// Uninfected neighbours: for every paper q and each horizon (publication year
// and the three following years), sum over the authors who have cited q by
// that year the number of their COAUTHOR neighbours who have not.
//
// Fixes relative to the previous draft:
//  * The adopter name sets are built per q. They used to be grouped by author
//    `a` as well, so each set could only ever contain that one author.
//  * `year` is carried through and passed to the action; the action used
//    year/p1/p2/p3 without ever receiving them.
//  * The per-author counts are summed in a WITH before the writes; sum() cannot
//    be used directly inside SET.
//  * Year nodes are MERGEd on their own so existing Year nodes are reused
//    instead of a new one being created with each new relationship.
//  * The USING INDEX hint is removed (no predicate on y.value to apply it to).
// NOTE(review): sums are taken per batch of the action statement; a q whose
// adopters span several batches presumably gets only the last batch's sum —
// confirm, or move the aggregation into the driving statement.
// NOTE(review): COAUTHOR links are not time-filtered, so neighbours added after
// the horizon year are counted too (already flagged in the original draft).
CALL apoc.periodic.iterate(
    "MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year)
    WITH DISTINCT q, y.value as year 
    MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
    WHERE p.year < q.year + 4 
    WITH DISTINCT p, q, year 
    MATCH (a:Author)-[:AUTHORED]->(p) 
    WITH q, year, year+1 as p1, year+2 as p2, year+3 as p3, apoc.coll.toSet(collect({name:a.name, year:p.year})) AS alist, collect(DISTINCT a) AS adopters
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=year|x.name]) AS year_0
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p1|x.name]) AS year_1
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p2|x.name]) AS year_2
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p3|x.name]) AS year_3
        // year_3 is the set of people who have written a paper that cites q within 3 years of q being published
    UNWIND adopters AS a
    RETURN [q, year, year_0, year_1, year_2, year_3, a] as list"
    ,
    "WITH list[0] as q, list[1] as year, list[2] as year_0, list[3] as year_1, list[4] as year_2, list[5] as year_3, list[6] as a
    MATCH (n:Author)-[:COAUTHOR]->(a)
        // TECHNICALLY WRONG BECAUSE COAUTHORS ARE ADDED OVER TIME
    WHERE a.name IN year_3
    WITH q, year, year_0, year_1, year_2, year_3, a, COLLECT(n.name) AS nlist
    WITH *, size(apoc.coll.toSet([x in nlist where a.name in year_0 and not x in year_0 | x])) AS y0_neighbors
    WITH *, size(apoc.coll.toSet([x in nlist where a.name in year_1 and not x in year_1 | x])) AS y1_neighbors
    WITH *, size(apoc.coll.toSet([x in nlist where a.name in year_2 and not x in year_2 | x])) AS y2_neighbors
    WITH *, size(apoc.coll.toSet([x in nlist where not x in year_3 | x])) AS y3_neighbors
    WITH q, year, sum(y0_neighbors) AS n0, sum(y1_neighbors) AS n1, sum(y2_neighbors) AS n2, sum(y3_neighbors) AS n3
    MERGE (y0:Year {value: year})
    MERGE (q)-[m0:METRICS_IN]->(y0)
    SET m0.uninfected_neighbors = n0
    MERGE (y1:Year {value: year+1})
    MERGE (q)-[m1:METRICS_IN]->(y1)
    SET m1.uninfected_neighbors = n1
    MERGE (y2:Year {value: year+2})
    MERGE (q)-[m2:METRICS_IN]->(y2)
    SET m2.uninfected_neighbors = n2
    MERGE (y3:Year {value: year+3})
    MERGE (q)-[m3:METRICS_IN]->(y3)
    SET m3.uninfected_neighbors = n3",
    {batchSize:5000, iterateList:true, parallel:false})

In [None]:
############INFECTED COMMUNITIES#############

// Infected communities (cumulative): for every paper q, count the distinct
// `labelprop` values of authors on citing papers published up to each horizon
// year (publication year .. +3) and store them as `infected_communities`.
//
// Fixes relative to the previous draft:
//  * The first MERGE bound its relationship as m1 while the SET wrote to m0
//    (unbound), and m1 was then declared a second time; it is now m0.
//  * `year` is carried through the aggregating WITH; it was dropped there but
//    is read by infected_communities_0 and the first MERGE.
//  * Year nodes are MERGEd on their own so existing Year nodes are reused
//    instead of a new one being created with each new relationship.
//  * The USING INDEX hint is removed (no predicate on y.value to apply it to).
// NOTE(review): the aggregation runs inside the batched action, so results for
// a q whose citing papers span batches are presumably per batch — confirm.
CALL apoc.periodic.iterate(
    "MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year)
    WITH DISTINCT q, y.value as year 
    MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
    WHERE p.year < q.year + 4 
    WITH DISTINCT p, q , year 
    RETURN [p, q, year] as list"
    ,
    "WITH list[0] as p, list[1] as q, list[2] as year
    MATCH (a:Author)-[:AUTHORED]->(p) 
    WITH q, year, year+1 as p1, year+2 as p2, year+3 as p3, apoc.coll.toSet(collect({label:a.labelprop, year:p.year})) AS alist     
    WITH *, SIZE(apoc.coll.toSet([x IN alist WHERE x.year<=year|x.label])) AS infected_communities_0
    WITH *, SIZE(apoc.coll.toSet([x IN alist WHERE x.year<=p1|x.label])) AS infected_communities_1
    WITH *, SIZE(apoc.coll.toSet([x IN alist WHERE x.year<=p2|x.label])) AS infected_communities_2
    WITH *, SIZE(apoc.coll.toSet([x IN alist WHERE x.year<=p3|x.label])) AS infected_communities_3
    MERGE (y0:Year {value: year})
    MERGE (q)-[m0:METRICS_IN]->(y0)
    SET m0.infected_communities = infected_communities_0
    MERGE (y1:Year {value: p1})
    MERGE (q)-[m1:METRICS_IN]->(y1)
    SET m1.infected_communities = infected_communities_1
    MERGE (y2:Year {value: p2})
    MERGE (q)-[m2:METRICS_IN]->(y2)
    SET m2.infected_communities = infected_communities_2
    MERGE (y3:Year {value: p3})
    MERGE (q)-[m3:METRICS_IN]->(y3)
    SET m3.infected_communities = infected_communities_3",
    {batchSize:5000, iterateList:true, parallel:false})

// Infected communities (incremental): for every paper q, count the distinct
// `labelprop` values first seen on citing papers in each horizon year,
// excluding communities already seen in an earlier horizon year.
//
// Fixes relative to the previous draft:
//  * The per-year community sets are kept as lists and only sized in the SET.
//    They used to be reduced to integers with SIZE() and then used on the
//    right-hand side of IN, which needs a list.
//  * The first MERGE bound m1 while the SET wrote to m0; it is now m0.
//  * `year` is carried through the aggregating WITH.
//  * Year nodes are MERGEd on their own so existing Year nodes are reused
//    instead of a new one being created with each new relationship.
//  * The USING INDEX hint is removed (no predicate on y.value to apply it to).
// NOTE(review): this writes the same `infected_communities` property as the
// cumulative statement earlier in this cell, so whichever runs last wins —
// confirm which variant is wanted or give this one its own property name.
// NOTE(review): the aggregation runs inside the batched action — see whether
// per-batch results are acceptable.
CALL apoc.periodic.iterate(
    "MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year)
    WITH DISTINCT q, y.value as year 
    MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
    WHERE p.year < q.year + 4 
    WITH DISTINCT p, q , year 
    RETURN [p, q, year] as list"
    ,
    "WITH list[0] as p, list[1] as q, list[2] as year
    MATCH (a:Author)-[:AUTHORED]->(p) 
    WITH q, year, year+1 as p1, year+2 as p2, year+3 as p3, apoc.coll.toSet(collect({label:a.labelprop, year:p.year})) AS alist 
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year=year |x.label]) AS communities_0
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p1 AND NOT x.label in communities_0 |x.label]) AS communities_1
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p2 AND NOT x.label in communities_0 AND NOT x.label in communities_1 |x.label]) AS communities_2
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p3 AND NOT x.label in communities_0 AND NOT x.label in communities_1 AND NOT x.label in communities_2 |x.label]) AS communities_3
    MERGE (y0:Year {value: year})
    MERGE (q)-[m0:METRICS_IN]->(y0)
    SET m0.infected_communities = size(communities_0)
    MERGE (y1:Year {value: p1})
    MERGE (q)-[m1:METRICS_IN]->(y1)
    SET m1.infected_communities = size(communities_1)
    MERGE (y2:Year {value: p2})
    MERGE (q)-[m2:METRICS_IN]->(y2)
    SET m2.infected_communities = size(communities_2)
    MERGE (y3:Year {value: p3})
    MERGE (q)-[m3:METRICS_IN]->(y3)
    SET m3.infected_communities = size(communities_3)",
    {batchSize:5000, iterateList:true, parallel:false})

In [None]:
############USAGE ENTROPY#############

def build_usage_entropy_query(years_post_pub):
    """Return the apoc.periodic.iterate call that stores usage entropy for one horizon.

    Args:
        years_post_pub: number of years after publication (0 = publication year).

    Returns:
        The Cypher text as a string; nothing is executed here.

    Fixes relative to the previous draft:
      * Literal Cypher braces are doubled so that ``str.format`` only fills the
        ``{offset}`` slots (the map literal and ``{{value: p1})`` used to make
        ``format`` raise).
      * The property written is ``usage_entropy``; the draft wrote
        ``adoption_entropy`` under the usage-entropy heading.
      * The Year node is MERGEd on its own so an existing Year node is reused
        instead of a new one being created with each new relationship.
      * The USING INDEX hint is removed (no predicate on y.value to apply it to).

    NOTE(review): ``s`` is the number of distinct citing-paper years, not the
    number of counted entries, so count/s need not sum to 1 — confirm the
    intended normaliser.
    """
    return """
    CALL apoc.periodic.iterate("
    MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year)
    WITH DISTINCT q, y.value as year 
    MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
    WHERE p.year = q.year+{offset} 
    WITH DISTINCT p, q, year 
    RETURN [p, q, year] as list"
    ,
    "WITH list[0] as p, list[1] as q, list[2] as year
    MATCH (a:Author)-[:AUTHORED]->(p) 
    WITH q, year+{offset} as p1, apoc.coll.toSet(collect({{paper_year:p.year, community:a.labelprop}})) as clist
    WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper_year<=p1|x.community]) AS year_count, size(apoc.coll.toSet([x IN clist WHERE x.paper_year<=p1|x.paper_year])) as s
    MERGE (y:Year {{value: p1}})
    MERGE (q)-[m:METRICS_IN]->(y)
    SET m.usage_entropy = reduce(i = 0.0, x IN year_count| i - toFloat(x.count)/s*log(toFloat(x.count)/s)/log(2)) 
    ",{{batchSize:5000, iterateList:true, parallel:false}});
    """.format(offset=years_post_pub)


# Print one statement per horizon (publication year and the three years after).
# The stray config line that used to follow the print was a syntax error and has
# been removed; the config map already lives inside the query text.
for years_post_pub in range(4):
    query = build_usage_entropy_query(years_post_pub)
    print(query)
    
// Usage entropy: for every paper q and each horizon year (publication year ..
// +3), compute a base-2 entropy over the communities (`labelprop`) seen on
// citing papers up to that year and store it as `usage_entropy`.
//
// Fixes relative to the previous draft:
//  * The aggregating WITH projected an undefined `title`; it now projects q
//    (which the MERGEs need) together with `year`.
//  * s0 compared against an undefined `p0`; it now uses `year`.
//  * The first MERGE bound m1 while the SET wrote to m0; it is now m0.
//  * Year nodes are MERGEd on their own so existing Year nodes are reused
//    instead of a new one being created with each new relationship.
//  * The USING INDEX hint is removed (no predicate on y.value to apply it to).
// NOTE(review): s0..s3 count distinct citing-paper years, not counted entries,
// so count/s need not sum to 1 — confirm the intended normaliser.
// NOTE(review): the aggregation runs inside the batched action, so results for
// a q whose citing papers span batches are presumably per batch — confirm.
CALL apoc.periodic.iterate(
    "MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year)
    WITH DISTINCT q, y.value as year 
    MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
    WHERE p.year < q.year + 4 
    WITH DISTINCT p, q, year 
    RETURN [p, q, year] as list"
    ,
    "WITH list[0] as p, list[1] as q, list[2] as year
    MATCH (a:Author)-[:AUTHORED]->(p) 
    WITH q, year, year+1 as p1, year+2 as p2, year+3 as p3, apoc.coll.toSet(collect({paper_year:p.year, community:a.labelprop})) as clist
    WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper_year<=year|x.community]) AS year_0_count, size(apoc.coll.toSet([x IN clist WHERE x.paper_year<=year|x.paper_year])) as s0
    WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper_year<=p1|x.community]) AS year_1_count, size(apoc.coll.toSet([x IN clist WHERE x.paper_year<=p1|x.paper_year])) as s1
    WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper_year<=p2|x.community]) AS year_2_count, size(apoc.coll.toSet([x IN clist WHERE x.paper_year<=p2|x.paper_year])) as s2
    WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper_year<=p3|x.community]) AS year_3_count, size(apoc.coll.toSet([x IN clist WHERE x.paper_year<=p3|x.paper_year])) as s3
    MERGE (y0:Year {value: year})
    MERGE (q)-[m0:METRICS_IN]->(y0)
    SET m0.usage_entropy = reduce(i = 0.0, x IN year_0_count| i - toFloat(x.count)/s0*log(toFloat(x.count)/s0)/log(2))
    MERGE (y1:Year {value: p1})
    MERGE (q)-[m1:METRICS_IN]->(y1)
    SET m1.usage_entropy = reduce(i = 0.0, x IN year_1_count| i - toFloat(x.count)/s1*log(toFloat(x.count)/s1)/log(2))
    MERGE (y2:Year {value: p2})
    MERGE (q)-[m2:METRICS_IN]->(y2)
    SET m2.usage_entropy = reduce(i = 0.0, x IN year_2_count| i - toFloat(x.count)/s2*log(toFloat(x.count)/s2)/log(2))
    MERGE (y3:Year {value: p3})
    MERGE (q)-[m3:METRICS_IN]->(y3)
    SET m3.usage_entropy = reduce(i = 0.0, x IN year_3_count| i - toFloat(x.count)/s3*log(toFloat(x.count)/s3)/log(2))",
    {batchSize:5000, iterateList:true, parallel:false})

In [None]:
############ADOPTION ENTROPY#############

def build_adoption_entropy_query(years_post_pub):
    """Return the apoc.periodic.iterate call that stores adoption entropy for one horizon.

    Args:
        years_post_pub: number of years after publication (0 = publication year).

    Returns:
        The Cypher text as a string; nothing is executed here.

    Fixes relative to the previous draft:
      * ``year`` is carried through the aggregating WITH; it was dropped there
        but is read by the MERGE.
      * Frequencies are taken over the communities of the distinct citing
        authors. The draft de-duplicated the community labels first, so every
        frequency was 1. This matches the multi-horizon statement later in
        this cell, which also counts ``labelprop`` per distinct person.
      * The Year node is MERGEd on its own so an existing Year node is reused
        instead of a new one being created with each new relationship.
      * The USING INDEX hint is removed (no predicate on y.value to apply it to).
    """
    return """
    CALL apoc.periodic.iterate("
    MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year)
    WITH DISTINCT q, y.value as year 
    MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
    WHERE p.year = q.year+{offset} 
    WITH DISTINCT p, q, year 
    RETURN [p, q, year] as list"
    ,
    "WITH list[0] as p, list[1] as q, list[2] as year
    MATCH (a:Author)-[:AUTHORED]->(p) 
    WITH q, year, collect(DISTINCT a) AS people
    WITH *, apoc.coll.frequencies([x IN people | x.labelprop]) AS year_count, size(people) as s
    MERGE (y:Year {{value: year+{offset}}})
    MERGE (q)-[m:METRICS_IN]->(y)
    SET m.adoption_entropy = reduce(i = 0.0, x IN year_count| i - toFloat(x.count)/s*log(toFloat(x.count)/s)/log(2)) 
    ",{{batchSize:5000, iterateList:true, parallel:false}});
    """.format(offset=years_post_pub)


# Print one statement per horizon (publication year and the three years after).
# The stray config line that used to follow the print was a syntax error and has
# been removed; the config map already lives inside the query text.
for years_post_pub in range(4):
    query = build_adoption_entropy_query(years_post_pub)
    print(query)
    
// Adoption entropy: for every paper q and each horizon year (publication year
// .. +3), compute a base-2 entropy over the communities (`labelprop`) of the
// distinct authors who have cited q by that year.
//
// Fixes relative to the previous draft:
//  * The aggregating WITH projected an undefined `title`; it now projects q
//    (which the MERGEs need) together with `year`.
//  * The first MERGE bound m1 while the SET wrote to m0; it is now m0.
//  * The property written is `adoption_entropy`. The draft wrote
//    `usage_entropy`, which is the property of the usage-entropy cell and
//    differs from the per-horizon loop at the top of this cell.
//  * Year nodes are MERGEd on their own so existing Year nodes are reused
//    instead of a new one being created with each new relationship.
//  * The USING INDEX hint is removed (no predicate on y.value to apply it to).
// NOTE(review): the aggregation runs inside the batched action, so results for
// a q whose citing papers span batches are presumably per batch — confirm.
CALL apoc.periodic.iterate(
    "MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year)
    WITH DISTINCT q, y.value as year 
    MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
    WHERE p.year < q.year + 4 
    WITH DISTINCT p, q, year 
    RETURN [p, q, year] as list"
    ,
    "WITH list[0] as p, list[1] as q, list[2] as year
    MATCH (a:Author)-[:AUTHORED]->(p) 
    WITH q, year, year+1 as p1, year+2 as p2, year+3 as p3, apoc.coll.toSet(collect({person:a, year:p.year})) AS alist
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=year|x.person]) AS year_0_people
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p1|x.person]) AS year_1_people
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p2|x.person]) AS year_2_people
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p3|x.person]) AS year_3_people
    WITH *, apoc.coll.frequencies([x IN year_0_people | x.labelprop]) AS year_0_count, size(year_0_people) as s0
    WITH *, apoc.coll.frequencies([x IN year_1_people | x.labelprop]) AS year_1_count, size(year_1_people) as s1
    WITH *, apoc.coll.frequencies([x IN year_2_people | x.labelprop]) AS year_2_count, size(year_2_people) as s2
    WITH *, apoc.coll.frequencies([x IN year_3_people | x.labelprop]) AS year_3_count, size(year_3_people) as s3
    MERGE (y0:Year {value: year})
    MERGE (q)-[m0:METRICS_IN]->(y0)
    SET m0.adoption_entropy = reduce(i = 0.0, x IN year_0_count| i - toFloat(x.count)/s0*log(toFloat(x.count)/s0)/log(2))
    MERGE (y1:Year {value: p1})
    MERGE (q)-[m1:METRICS_IN]->(y1)
    SET m1.adoption_entropy = reduce(i = 0.0, x IN year_1_count| i - toFloat(x.count)/s1*log(toFloat(x.count)/s1)/log(2))
    MERGE (y2:Year {value: p2})
    MERGE (q)-[m2:METRICS_IN]->(y2)
    SET m2.adoption_entropy = reduce(i = 0.0, x IN year_2_count| i - toFloat(x.count)/s2*log(toFloat(x.count)/s2)/log(2))
    MERGE (y3:Year {value: p3})
    MERGE (q)-[m3:METRICS_IN]->(y3)
    SET m3.adoption_entropy = reduce(i = 0.0, x IN year_3_count| i - toFloat(x.count)/s3*log(toFloat(x.count)/s3)/log(2))",
    {batchSize:5000, iterateList:true, parallel:false})

In [None]:
# Number of Early Adopters by Year
#
# NOTE limitation on q.year to speed test run time.
# Eventually will need to write to CSV and process in batches. 
#
# # 5316210 total db hits in 6233 ms

#try_next_year = get_last_year_completed("/tmp/data/result/FeatureExtractionResults/EarlyAdopters/")+1
#try_next_year = 1950 # included to bootstrap at 2000

# The query text is identical for every year except for the year literal, so
# the two constant halves are built once and the year is spliced in between.
_EARLY_ADOPTERS_QUERY_HEAD = """
    MATCH (a:Author)-[:AUTHORED]->(p:Quanta)-[:CITES]->(q:Quanta)
    WHERE p.year < q.year + 4 AND q.year = """
_EARLY_ADOPTERS_QUERY_TAIL = """ AND q.venue IN ['Angewandte Chemie','Blood','Cancer Cell','Cancer Discovery','Cancer Research','Cell','Cell Host & Microbe','Cell Metabolism','Cell Stem Cell','Chemistry & Biology','The EMBO Journal','Genes & Development','Immunity','Journal of Neurology','Journal of the American Chemical Society','JAMA','Journal of Biological Chemistry','Journal of Cell Biology','Journal of Clinical Investigation','Journal of Experimental Medicine','Journal of Medicinal Chemistry','The Lancet','Nature Cell Biology','Nature Chemical Biology','Nature Chemistry','Nature Medicine','Nature Methods','Nature','Nature Biotechnology','The New England Journal of Medicine','Neuron','Nature Genetics','Nature Immunology','Nature Neuroscience','Nature Structural & Molecular Biology','PLOS Biology','PLOS Genetics','PLOS Pathogens','Proceedings of the National Academy of Sciences of the United States of America','Science Signaling','Science Translational Medicine','Science']
    WITH *, q.year+1 as p1, q.year+2 as p2, q.year+3 as p3
    WITH q.title as title, p1, p2, p3, q.year as qyear, apoc.coll.toSet(collect({name:a.name, year:p.year})) AS alist
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p1 | x.name]) AS year_1
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p2 and not x.name in year_1 | x.name]) AS year_2
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p3 and not x.name in year_1 and not x.name in year_2 | x.name]) AS year_3
    return 
        title, 
        size(year_1) as early_adopters_1, 
        size(year_2) as early_adopters_2,
        size(year_3) as early_adopters_3"""

# One query per publication year; each result replaces the previous one in
# df_earlyadopters (the CSV export below is still disabled).
for pub_year in tqdm(range(1950, 2018)):
    print(pub_year)
    query = "".join([_EARLY_ADOPTERS_QUERY_HEAD, str(pub_year), _EARLY_ADOPTERS_QUERY_TAIL])
    df_earlyadopters = query_to_df(query, graph)
    #df_earlyadopters.to_csv('/tmp/data/result/FeatureExtractionResults/EarlyAdopters/early_adopters_'+str(pub_year)+'.csv', index=False, columns = ['title', 'early_adopters_1', 'early_adopters_2', 'early_adopters_3'])
    

In [None]:
#BEST VERSION

// Profiled early-adopter counts for papers published in 2009: per cited title,
// the number of distinct author names first seen on citing papers one, two and
// three years after publication.
PROFILE
MATCH (cited:Quanta)-[:PUBLISHED_IN_YEAR]->(pub:Year {value: 2009})
USING INDEX pub:Year(value)
WITH DISTINCT cited, pub.value AS pub_year
MATCH (citing:Quanta)-[:CITES]->(cited)
WHERE citing.year < cited.year + 4
WITH DISTINCT citing, cited.title AS title, pub_year
MATCH (author:Author)-[:AUTHORED]->(citing)
WITH title,
     pub_year + 1 AS p1,
     pub_year + 2 AS p2,
     pub_year + 3 AS p3,
     apoc.coll.toSet(collect({name: author.name, year: citing.year})) AS adopters
WITH *, apoc.coll.toSet([x IN adopters WHERE x.year = p1 | x.name]) AS year_1
WITH *, apoc.coll.toSet([x IN adopters WHERE x.year = p2 AND NOT x.name IN year_1 | x.name]) AS year_2
WITH *, apoc.coll.toSet([x IN adopters WHERE x.year = p3 AND NOT x.name IN year_1 AND NOT x.name IN year_2 | x.name]) AS year_3
RETURN title,
       size(year_1) AS early_adopters_1,
       size(year_2) AS early_adopters_2,
       size(year_3) AS early_adopters_3

In [None]:
############EARLY ADOPTERS#############

// Early adopters for the three years after publication: for every paper q,
// count the distinct author names first seen on citing papers in year+1,
// year+2 and year+3 and store them as `early_adopters` on
// q-[:METRICS_IN]->(:Year).
//
// Fixes relative to the previous draft:
//  * Each Year node is MERGEd on its own before the relationship. A single
//    MERGE over the whole pattern creates a new Year node whenever q has no
//    matching METRICS_IN relationship yet.
//  * The Year variables are named y1..y3 instead of a/b/c, so they cannot be
//    confused with the Author variable `a` used earlier in the statement.
//  * The USING INDEX hint is removed (no predicate on y.value to apply it to).
// NOTE(review): collect() runs inside the batched action, so a q whose citing
// papers fall into different batches is presumably aggregated per batch only —
// confirm, or move the aggregation into the driving statement.
CALL apoc.periodic.iterate(
    "MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year)
    WITH DISTINCT q, y.value as year 
    MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
    WHERE p.year < q.year + 4 
    WITH DISTINCT p, q, year
    RETURN [p, q, year] as list",
    "WITH list[0] as p, list[1] AS q, list[2] as year
    MATCH (a:Author)-[:AUTHORED]->(p) 
    WITH q, year+1 as p1, year+2 as p2, year+3 as p3, apoc.coll.toSet(collect({name:a.name, year:p.year})) AS alist 
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p1 | x.name]) AS year_1 
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p2 and not x.name in year_1 | x.name]) AS year_2
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p3 and not x.name in year_1 and not x.name in year_2 | x.name]) AS year_3
    MERGE (y1:Year {value: p1})
    MERGE (q)-[m1:METRICS_IN]->(y1)
    SET m1.early_adopters = size(year_1)
    MERGE (y2:Year {value: p2})
    MERGE (q)-[m2:METRICS_IN]->(y2)
    SET m2.early_adopters = size(year_2)
    MERGE (y3:Year {value: p3})
    MERGE (q)-[m3:METRICS_IN]->(y3)
    SET m3.early_adopters = size(year_3)",
    {batchSize:5000, iterateList:true, parallel:false})

In [None]:
# Early adopters per publication year, starting from the Year node so the
# Year(value) index can be used. Each result replaces the previous one in
# df_earlyadopters (the CSV export below is still disabled).
for pub_year in tqdm(range(1950, 2018)):
    print(pub_year)
    # The year and the three horizon years are spliced into the query as literals.
    query_parts = [
        '''
    MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year {value: ''',
        str(pub_year),
        '''})
    USING INDEX y:Year(value) 
    WITH DISTINCT q, y.value as year 
    MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
    WHERE p.year < q.year + 4 
    WITH DISTINCT p, q.title as title, year 
    MATCH (a:Author)-[:AUTHORED]->(p) 
    WITH title, ''',
        str(pub_year + 1),
        ''' as p1, ''',
        str(pub_year + 2),
        ''' as p2, ''',
        str(pub_year + 3),
        ''' as p3, apoc.coll.toSet(collect({name:a.name, year:p.year})) AS alist 
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p1 | x.name]) AS year_1 
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p2 and not x.name in year_1 | x.name]) AS year_2 
    RETURN 
        title, 
        size(year_1) as early_adopters_1, 
        size(year_2) as early_adopters_2, 
        size(apoc.coll.toSet([x IN alist WHERE x.year=p3 and not x.name in year_1 and not x.name in year_2 | x.name])) AS early_adopters_3''',
    ]
    query = ''.join(query_parts)

    df_earlyadopters = query_to_df(query, graph)
    #df_earlyadopters.to_csv('/tmp/data/result/FeatureExtractionResults/EarlyAdopters/early_adopters_'+str(pub_year)+'.csv', index=False, columns = ['title', 'early_adopters_1', 'early_adopters_2', 'early_adopters_3'])

In [None]:
# INCLUDING YEAR OF PUBLICATION

// Early-adopter counts for papers published in 2000, including the publication
// year itself: per cited title, the number of distinct author names first seen
// on citing papers in 2000, 2001, 2002 and 2003.
MATCH (citing:Quanta)-[:CITES]->(cited:Quanta)<-[:PUBLISHED_IN_VENUE]-(v:Venue)
WHERE cited.year = 2000 AND citing.year < cited.year + 4
WITH DISTINCT citing, cited.title AS title
MATCH (author:Author)-[:AUTHORED]->(citing)
WITH title,
     2000 AS p0,
     2001 AS p1,
     2002 AS p2,
     2003 AS p3,
     apoc.coll.toSet(collect({name: author.name, year: citing.year})) AS adopters
WITH *, apoc.coll.toSet([x IN adopters WHERE x.year = p0 | x.name]) AS year_0
WITH *, apoc.coll.toSet([x IN adopters WHERE x.year = p1 AND NOT x.name IN year_0 | x.name]) AS year_1
WITH *, apoc.coll.toSet([x IN adopters WHERE x.year = p2 AND NOT x.name IN year_0 AND NOT x.name IN year_1 | x.name]) AS year_2
WITH *, apoc.coll.toSet([x IN adopters WHERE x.year = p3 AND NOT x.name IN year_0 AND NOT x.name IN year_1 AND NOT x.name IN year_2 | x.name]) AS year_3
RETURN title,
       size(year_0) AS early_adopters_0,
       size(year_1) AS early_adopters_1,
       size(year_2) AS early_adopters_2,
       size(year_3) AS early_adopters_3

In [None]:
# EXTENDED 

// Profiled early-adopter counts for papers published in 2009, including the
// publication year itself (early_adopters_0).
//
// Fix relative to the previous draft: `p0` was read by year_0 but never
// defined; the aggregating WITH now projects `year AS p0` next to p1..p3.
PROFILE MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year {value:2009})
USING INDEX y:Year(value) 
WITH DISTINCT q, y.value as year 
MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
WHERE p.year < q.year + 4 
WITH DISTINCT p, q.title as title, year 
MATCH (a:Author)-[:AUTHORED]->(p) 
WITH title, year as p0, year+1 as p1, year+2 as p2, year+3 as p3, apoc.coll.toSet(collect({name:a.name, year:p.year})) AS alist 
WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p0 | x.name]) AS year_0
WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p1 and not x.name in year_0 | x.name]) AS year_1
WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p2 and not x.name in year_0 and not x.name in year_1 | x.name]) AS year_2
RETURN title, size(year_0) as early_adopters_0, size(year_1) as early_adopters_1, size(year_2) as early_adopters_2, SIZE(apoc.coll.toSet([x IN alist WHERE x.year=p3 and not x.name in year_0 and not x.name in year_1 and not x.name in year_2 | x.name])) AS early_adopters_3

In [None]:
# TOP 50 PAPERS
# This one ran because the collect statement was placed ahead of the sortMulti

// For papers q published in 2018, collect the citing papers (title and year)
// per cited title, sort each collection by ascending year via
// apoc.coll.sortMulti, and return the sorted citing titles per cited paper.
// NOTE(review): `ORDER BY p.year` follows an aggregating WITH that no longer
// projects p — confirm this still resolves on the target Neo4j version.
// NOTE(review): LIMIT 50 caps the number of cited-paper rows, while the 50
// passed to sortMulti caps the entries kept per paper — confirm both are meant.
MATCH (p:Quanta)-[:CITES]->(q:Quanta)
WHERE q.year = 2018 and p.year >= q.year
WITH DISTINCT q.title as cited_paper, collect({title:p.title, year:p.year}) as adopting_collection ORDER BY p.year LIMIT 50
UNWIND apoc.coll.sortMulti(adopting_collection, ['^year'], 50) AS early_adopting_papers
RETURN cited_paper, collect(early_adopting_papers.title) as early_adopting_papers_titles

In [None]:
# TOP 50 AUTHORS

// Take the 50 (citing paper, cited title) pairs with the lowest citing year
// among citations that post-date the cited paper, then list per cited title
// the author names of those citing papers, ordered by
// apoc.coll.sortMulti on `year`.
MATCH (citing:Quanta)-[:CITES]->(cited:Quanta)
WHERE cited.year < citing.year
WITH DISTINCT citing, cited.title AS cited_paper
ORDER BY citing.year
LIMIT 50
MATCH (author:Author)-[:AUTHORED]->(citing)
WITH cited_paper,
     apoc.coll.toSet(collect({name: author.name, year: citing.year})) AS author_mapping
UNWIND apoc.coll.sortMulti(author_mapping, ['year'], 50) AS adopter
RETURN cited_paper, collect(adopter.name) AS top_adopters

In [None]:
# Number of Uninfected Neighbors of Early Adopters
#
# NOTE limitation on q.year to speed test run time.
# Eventually will need to write to CSV and process in batches. 

years = [2012, 2013, 2014, 2015, 2016, 2018]

_UNINFECTED_DIR = "/tmp/data/result/FeatureExtractionResults/UninfectedNeighbors/"
try_next_year = get_last_year_completed(_UNINFECTED_DIR) + 1

# For each publication year, count the not-yet-citing COAUTHOR neighbours of the
# authors who cited a paper within one, two and three years, restricted to
# citing and cited papers from the venues in top_42, and write one CSV per year.
for pub_year in tqdm([2007, 2008]):
    # Planned alternative (not implemented): one query per (year, horizon) pair,
    #   MATCH (b:Author)-[:COAUTHOR]-(a:Author)-[:AUTHORED]->(p:Quanta)-[:CITES]->(q:Quanta)
    #   WHERE q.year=i AND p.year<=i+j
    venue_literal = str(top_42)
    query = "".join([
        """
    MATCH (a:Author)-[:AUTHORED]->(p:Quanta )-[:CITES]->(q:Quanta )
    WHERE p.venue IN """,
        venue_literal,
        """ AND q.venue IN """,
        venue_literal,
        """ AND p.year < q.year + 4 AND q.year = """,
        str(pub_year),
        """ 
    WITH *, q.year+1 as p1, q.year+2 as p2, q.year+3 as p3
    WITH q.title as title, p1, p2, p3,  
        apoc.coll.toSet(collect({person:a, year:p.year})) AS alist
    // alist is people who have ever cited TITLE within 3 years of TITLE being published
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p1|x.person.name]) AS year_1
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p2|x.person.name]) AS year_2
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p3|x.person.name]) AS year_3
    // year_3 is the set of people who has written a paper that cites TITLE within 3 years of TITLE being published

    MATCH (n:Author)-[:COAUTHOR]->(b:Author)
    // TECHNICALLY WRONG BECAUSE COAUTHORS ARE ADDED OVER TIME
    WHERE b.name IN year_3
    WITH *, COLLECT(n) AS nlist
    WITH *, apoc.coll.toSet([x in nlist where b.name in year_1 and not x.name in year_1 | x.name]) AS y1_neighbors
    WITH *, apoc.coll.toSet([x in nlist where b.name in year_2 and not x.name in year_2 | x.name]) AS y2_neighbors
    WITH *, apoc.coll.toSet([x in nlist where b.name in year_3 and not x.name in year_3 | x.name]) AS y3_neighbors
    
    RETURN 
        title, 
        sum(size(y1_neighbors)) as neighbors_1, 
        sum(size(y2_neighbors)) as neighbors_2, 
        sum(size(y3_neighbors)) as neighbors_3""",
    ])

    # split node space manually
    print(query)

    df_uninfectedneighbors = query_to_df(query, graph)
    csv_path = _UNINFECTED_DIR + 'uninfected_neighbors_' + str(pub_year) + '.csv'
    df_uninfectedneighbors.to_csv(
        csv_path,
        index=False,
        columns=['title', 'neighbors_1', 'neighbors_2', 'neighbors_3'],
    )

In [None]:
// Profiled uninfected-neighbour counts for papers published in 2009: per cited
// title, sum over the authors who have cited it by year+1, year+2 and year+3
// the number of their COAUTHOR neighbours who have not.
//
// Fixes relative to the previous draft:
//  * The coauthor MATCH now binds the adopter as (b:Author). The draft matched
//    `(a)`, a fresh unconstrained variable at that point, and then filtered on
//    `b`, which was never bound.
//  * nlist is grouped per title and adopter explicitly. `WITH *` also grouped
//    by n, so each nlist held a single neighbour.
PROFILE MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year {value:2009})
USING INDEX y:Year(value) 
WITH DISTINCT q, y.value as year 
MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
WHERE p.year < q.year + 4 
WITH DISTINCT p, q.title as title, year 
MATCH (a:Author)-[:AUTHORED]->(p) 
WITH title, year+1 as p1, year+2 as p2, year+3 as p3, apoc.coll.toSet(collect({name:a.name, year:p.year})) AS alist 
WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p1|x.name]) AS year_1
WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p2|x.name]) AS year_2
WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p3|x.name]) AS year_3
    // year_3 is the set of people who have written a paper that cites TITLE within 3 years of TITLE being published
MATCH (n:Author)-[:COAUTHOR]->(b:Author)
    // TECHNICALLY WRONG BECAUSE COAUTHORS ARE ADDED OVER TIME
WHERE b.name IN year_3
WITH title, year_1, year_2, year_3, b, COLLECT(n.name) AS nlist
WITH *, size(apoc.coll.toSet([x in nlist where b.name in year_1 and not x in year_1 | x])) AS y1_neighbors
WITH *, size(apoc.coll.toSet([x in nlist where b.name in year_2 and not x in year_2 | x])) AS y2_neighbors
RETURN 
    title, 
    sum(y1_neighbors) as neighbors_1, 
    sum(y2_neighbors) as neighbors_2, 
    sum(size(apoc.coll.toSet([x in nlist where b.name in year_3 and not x in year_3 | x]))) as neighbors_3

In [None]:
############UNINFECTED NEIGHBORS#############

// Uninfected neighbours for one publication year, supplied as the query
// parameter $year: for every paper q of that year, sum over the authors who
// have cited q by year+1, year+2 and year+3 the number of their COAUTHOR
// neighbours who have not, and store the sums as `uninfected_neighbors`.
//
// Fixes relative to the previous draft:
//  * {year}, {p1}, {p2}, {p3} were parameters that nothing supplied. Only the
//    year is a parameter now, forwarded to the inner statements through the
//    `params` config entry; the horizon years are derived from it.
//  * The adopter name sets are built per q. They used to be grouped by author
//    `a` as well, so each set could only ever contain that one author.
//  * The action filtered on `b`, which was never bound; it now uses `a`, the
//    adopter passed in from the driving statement.
//  * The MERGEs re-declared `a`/`b` as Year nodes and read p1..p3, which the
//    action never received; Year nodes are now y1..y3, keyed on year+1..year+3.
//  * The per-author counts are summed in a WITH before the writes; sum() cannot
//    be used directly inside SET.
//  * Year nodes are MERGEd on their own so existing Year nodes are reused
//    instead of a new one being created with each new relationship.
// NOTE(review): sums are taken per batch of the action statement; a q whose
// adopters span several batches presumably gets only the last batch's sum —
// confirm, or move the aggregation into the driving statement.
CALL apoc.periodic.iterate(
    "MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year {value: $year})
    USING INDEX y:Year(value) 
    WITH DISTINCT q, y.value as year 
    MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
    WHERE p.year < q.year + 4 
    WITH DISTINCT p, q, year 
    MATCH (a:Author)-[:AUTHORED]->(p) 
    WITH q, year, year+1 as p1, year+2 as p2, year+3 as p3, apoc.coll.toSet(collect({name:a.name, year:p.year})) AS alist, collect(DISTINCT a) AS adopters
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p1|x.name]) AS year_1
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p2|x.name]) AS year_2
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p3|x.name]) AS year_3
        // year_3 is the set of people who have written a paper that cites q within 3 years of q being published
    UNWIND adopters AS a
    RETURN [q, year, year_1, year_2, year_3, a] as list"
    ,
    "WITH list[0] as q, list[1] as year, list[2] as year_1, list[3] as year_2, list[4] as year_3, list[5] as a
    MATCH (n:Author)-[:COAUTHOR]->(a)
        // TECHNICALLY WRONG BECAUSE COAUTHORS ARE ADDED OVER TIME
    WHERE a.name IN year_3
    WITH q, year, year_1, year_2, year_3, a, COLLECT(n.name) AS nlist
    WITH *, size(apoc.coll.toSet([x in nlist where a.name in year_1 and not x in year_1 | x])) AS y1_neighbors
    WITH *, size(apoc.coll.toSet([x in nlist where a.name in year_2 and not x in year_2 | x])) AS y2_neighbors
    WITH *, size(apoc.coll.toSet([x in nlist where a.name in year_3 and not x in year_3 | x])) AS y3_neighbors
    WITH q, year, sum(y1_neighbors) AS n1, sum(y2_neighbors) AS n2, sum(y3_neighbors) AS n3
    MERGE (y1:Year {value: year+1})
    MERGE (q)-[m1:METRICS_IN]->(y1)
    SET m1.uninfected_neighbors = n1
    MERGE (y2:Year {value: year+2})
    MERGE (q)-[m2:METRICS_IN]->(y2)
    SET m2.uninfected_neighbors = n2
    MERGE (y3:Year {value: year+3})
    MERGE (q)-[m3:METRICS_IN]->(y3)
    SET m3.uninfected_neighbors = n3",
    {batchSize:5000, iterateList:true, parallel:false, params:{year: $year}})

In [None]:
def find_cited(i):
    """Build the Cypher query listing citing authors for Nature papers of year ``i``.

    The query matches authors of papers that cite a paper with venue "Nature"
    published in year ``i`` (citing year below ``i + 4``) and returns, per cited
    paper, the three horizon years, the title, the publication year, the
    distinct (author name, citing year) pairs and the distinct
    (citing paper, author community) pairs.

    Args:
        i: publication year spliced into the query as a literal.

    Returns:
        The query text. The previous version built the string but never
        returned it, so callers received ``None``.
    """
    return """
    MATCH (a:Author)-[:AUTHORED]->(p:Quanta)-[:CITES]->(q:Quanta {venue:"Nature"})
    WHERE p.year < q.year + 4 AND q.year = """+str(i)+"""
    RETURN q.year+1 as p1, q.year+2 as p2, q.year+3 as p3, q.title as title, q.year as qyear, 
    apoc.coll.toSet(collect({name:a.name, year:p.year})) AS alist, 
    apoc.coll.toSet(collect({paper:p, community:a.labelprop})) as clist
    """

In [None]:
def find_early_adopters_1(p1, alist):
    """Return the distinct author names that cited in year ``p1``.

    Python counterpart of the Cypher expression that used to form the body
    (which was not valid Python).

    Args:
        p1: year to select.
        alist: iterable of mappings with ``"name"`` and ``"year"`` keys.

    Returns:
        List of names in first-seen order, without duplicates.
    """
    y1 = []
    for x in alist:
        if x["year"] == p1 and x["name"] not in y1:
            y1.append(x["name"])
    return y1

def find_early_adopters_2(p2, alist, y1):
    """Return the distinct author names that cited in year ``p2`` but are not in ``y1``.

    Python counterpart of the Cypher expression that used to form the body
    (which was not valid Python).

    Args:
        p2: year to select.
        alist: iterable of mappings with ``"name"`` and ``"year"`` keys.
        y1: names already counted for the previous year.

    Returns:
        List of names in first-seen order, without duplicates.
    """
    y2 = []
    for x in alist:
        name = x["name"]
        if x["year"] == p2 and name not in y1 and name not in y2:
            y2.append(name)
    return y2

def find_early_adopters_3(p3, alist, y1, y2):
    """Return distinct author names first seen in year ``p3`` (in neither ``y1`` nor ``y2``).

    Python port of the Cypher comprehension
    ``[x IN alist WHERE x.year=p3 and not x.name in y1 and not x.name in y2 | x.name]``
    wrapped in ``apoc.coll.toSet``.

    Args:
        p3: Year to match exactly.
        alist: Iterable of mappings with ``"name"`` and ``"year"`` keys
            (assumed shape — TODO confirm against the query result).
        y1: Names already counted for the first year.
        y2: Names already counted for the second year.

    Returns:
        list: Unique names, in first-seen order.
    """
    # Use the y1/y2 arguments; the earlier draft read the globals year_1/year_2 instead.
    already_counted = set(y1) | set(y2)
    y3 = list(dict.fromkeys(
        x["name"] for x in alist
        if x["year"] == p3 and x["name"] not in already_counted
    ))
    return y3

# NOTE(review): `p1`, `p2`, `p3` and `alist` are not defined in any visible cell;
# presumably they come from one row of the `find_cited` query result — confirm before running.
year_1 = find_early_adopters_1(p1, alist)
year_2 = find_early_adopters_2(p2, alist, year_1)
year_3 = find_early_adopters_3(p3, alist, year_1, year_2)

# len() returns an int, which cannot be joined to a str with `+`; format it instead.
print('early_adopters_1: {}'.format(len(year_1)), 'early_adopters_2: {}'.format(len(year_2)), 'early_adopters_3: {}'.format(len(year_3)))

In [None]:
# Number of Infected Communities
#
# NOTE limitation on q.year to speed test run time.
# Eventually will need to write to CSV and process in batches. 
#

base_dir = '/tmp/data/result/FeatureExtractionResults/'
data_dir = base_dir + 'InfectedCommunities/'

# Resume from the year after the one reported by get_last_year_completed for data_dir.
try_next_year = get_last_year_completed(data_dir)+1
for i in tqdm(range(try_next_year, 2018)):
    # For each paper title published in year i, count the distinct author
    # communities (a.labelprop) among citing papers dated no later than
    # q.year+1, q.year+2 and q.year+3.
    query = """
    MATCH (a:NatureAuthor)-[:AUTHORED]->(p:Quanta)-[:CITES]->(q:Quanta)
    WHERE p.year < q.year + 4 AND q.year = """+str(i)+""" 
    WITH *, q.year+1 as p1, q.year+2 as p2, q.year+3 as p3
    WITH q.title as title, p1, p2, p3, q.year as qyear, apoc.coll.toSet(collect({person:a, year:p.year})) AS alist
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p1|x.person.labelprop]) AS year_1
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p2|x.person.labelprop]) AS year_2
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p3|x.person.labelprop]) AS year_3
    RETURN 
        title, 
        size(year_1) as infected_communities_1, 
        size(year_2) as infected_communities_2, 
        size(year_3) as infected_communities_3"""

    df_infectedcommunities = query_to_df(query, graph)
    # Write into data_dir (the directory the resume check reads) instead of a
    # second hard-coded copy of the same path, so the two cannot drift apart.
    df_infectedcommunities.to_csv(
        data_dir + 'infected_communities_' + str(i) + '.csv',
        index=False,
        columns=['title', 'infected_communities_1', 'infected_communities_2', 'infected_communities_3'])

In [None]:
// Profiles the infected-communities query for papers linked to Year {value:2009}.
// Stage 1: select the papers q published in that year (index hint on Year.value).
// Stage 2: find citing papers p with p.year < q.year + 4.
// Stage 3: per cited title, collect distinct {label: a.labelprop, year: p.year} maps
//          for the authors of the citing papers.
// Output: per title, the number of distinct labels among entries dated
//         no later than year+1, year+2 and year+3 respectively (cumulative).
PROFILE MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year {value:2009})
USING INDEX y:Year(value) 
WITH DISTINCT q, y.value as year 
MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
WHERE p.year < q.year + 4 
WITH DISTINCT p, q.title as title, year 
MATCH (a:Author)-[:AUTHORED]->(p) 
// Rows are grouped by title here, so papers sharing a title are merged.
WITH title, year+1 as p1, year+2 as p2, year+3 as p3, apoc.coll.toSet(collect({label:a.labelprop, year:p.year})) AS alist 
WITH *, SIZE(apoc.coll.toSet([x IN alist WHERE x.year<=p1|x.label])) AS infected_communities_1
WITH *, SIZE(apoc.coll.toSet([x IN alist WHERE x.year<=p2|x.label])) AS infected_communities_2
RETURN title, infected_communities_1, infected_communities_2, SIZE(apoc.coll.toSet([x IN alist WHERE x.year<=p3|x.label])) AS infected_communities_3

In [None]:
for i in range(2000, 2018):
    year, p1, p2, p3 = i, i+1, i+2, i+3
    # {year}/{p1}/{p2}/{p3} are str.format placeholders filled in below; the
    # braces that belong to Cypher map literals are doubled so format leaves them.
    # Previously the template was sent unformatted, so the values above were unused.
    query = '''
    PROFILE MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year {{value: {year}}})
    USING INDEX y:Year(value) 
    WITH DISTINCT q, y.value as year 
    MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
    WHERE p.year < q.year + 4 
    WITH DISTINCT p, q.title as title, year 
    MATCH (a:Author)-[:AUTHORED]->(p) 
    WITH title, {p1} as p1, {p2} as p2, {p3} as p3, apoc.coll.toSet(collect({{label:a.labelprop, year:p.year}})) AS alist 
    WITH *, SIZE(apoc.coll.toSet([x IN alist WHERE x.year<=p1|x.label])) AS infected_communities_1
    WITH *, SIZE(apoc.coll.toSet([x IN alist WHERE x.year<=p2|x.label])) AS infected_communities_2
    RETURN title, infected_communities_1, infected_communities_2, SIZE(apoc.coll.toSet([x IN alist WHERE x.year<=p3|x.label])) AS infected_communities_3
    '''.format(year=year, p1=p1, p2=p2, p3=p3)
    
    # NOTE(review): the frame holds infected-communities columns although the
    # variable (and the commented-out path below) say "early adopters".
    df_earlyadopters = query_to_df(query, graph)
    #df_earlyadopters.to_csv('/tmp/data/result/FeatureExtractionResults/EarlyAdopters/early_adopters_'+str(i)+'.csv', index=False, columns = ['title', 'infected_communities_1', 'infected_communities_2', 'infected_communities_3'])

In [None]:
############INFECTED COMMUNITIES#############

// Writes the cumulative infected-community counts onto METRICS_IN relationships.
// Outer statement: (citing paper p, cited paper q, publication year) rows for papers
//   published in the year given by the `year` parameter.
// Inner statement: counts distinct a.labelprop values among citing authors dated
//   no later than year+1 / year+2 / year+3 and stores each count on the
//   relationship to the matching Year node.
// Fixes: (1) m2/m3 previously received infected_communities_1;
//        (2) the Year node is merged on its own first, because MERGE on the whole
//            pattern creates a new Year node whenever the relationship is missing.
// NOTE(review): collect() in the inner statement only sees the rows of one batch;
//   a paper whose citing rows span batches may get partial counts — verify.
CALL apoc.periodic.iterate(
    "MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year {value: {year}})
    USING INDEX y:Year(value) 
    WITH DISTINCT q, y.value as year 
    MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
    WHERE p.year < q.year + 4 
    WITH DISTINCT p, q , year 
    RETURN [p, q, year] as list"
    ,
    "WITH list[0] as p, list[1] as q, list[2] as year
    MATCH (a:Author)-[:AUTHORED]->(p) 
    WITH q, year+1 as p1, year+2 as p2, year+3 as p3, apoc.coll.toSet(collect({label:a.labelprop, year:p.year})) AS alist 
    WITH *, SIZE(apoc.coll.toSet([x IN alist WHERE x.year<=p1|x.label])) AS infected_communities_1
    WITH *, SIZE(apoc.coll.toSet([x IN alist WHERE x.year<=p2|x.label])) AS infected_communities_2
    WITH *, SIZE(apoc.coll.toSet([x IN alist WHERE x.year<=p3|x.label])) AS infected_communities_3
    MERGE (y1:Year {value: p1})
    MERGE (q)-[m1:METRICS_IN]->(y1)
    SET m1.infected_communities = infected_communities_1
    MERGE (y2:Year {value: p2})
    MERGE (q)-[m2:METRICS_IN]->(y2)
    SET m2.infected_communities = infected_communities_2
    MERGE (y3:Year {value: p3})
    MERGE (q)-[m3:METRICS_IN]->(y3)
    SET m3.infected_communities = infected_communities_3",
    {batchSize:5000, iterateList:true, parallel:false})

// Variant of the infected-communities writer that excludes communities already
// counted in an earlier year (appears intended to count newly infected
// communities per year — confirm).
// Fixes: (1) the exclusion tests used `IN` against SIZE(...) integers; the label
//            lists are now kept and only sized when written;
//        (2) m2/m3 previously received the year-1 value;
//        (3) the Year node is merged separately so a missing relationship does
//            not create a duplicate Year node.
// NOTE(review): collect() in the inner statement only sees the rows of one batch;
//   a paper whose citing rows span batches may get partial counts — verify.
CALL apoc.periodic.iterate(
    "MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year {value: {year}})
    USING INDEX y:Year(value) 
    WITH DISTINCT q, y.value as year 
    MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
    WHERE p.year < q.year + 4 
    WITH DISTINCT p, q , year 
    RETURN [p, q, year] as list"
    ,
    "WITH list[0] as p, list[1] as q, list[2] as year
    MATCH (a:Author)-[:AUTHORED]->(p) 
    WITH q, year+1 as p1, year+2 as p2, year+3 as p3, apoc.coll.toSet(collect({label:a.labelprop, year:p.year})) AS alist 
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p1 |x.label]) AS communities_1
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p2 AND NOT x.label IN communities_1 |x.label]) AS communities_2
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p3 AND NOT x.label IN communities_1 AND NOT x.label IN communities_2 |x.label]) AS communities_3
    MERGE (y1:Year {value: p1})
    MERGE (q)-[m1:METRICS_IN]->(y1)
    SET m1.infected_communities = size(communities_1)
    MERGE (y2:Year {value: p2})
    MERGE (q)-[m2:METRICS_IN]->(y2)
    SET m2.infected_communities = size(communities_2)
    MERGE (y3:Year {value: p3})
    MERGE (q)-[m3:METRICS_IN]->(y3)
    SET m3.infected_communities = size(communities_3)",
    {batchSize:5000, iterateList:true, parallel:false})

In [None]:
# IGNORE FOR NOW

def find_infected(alist, p1, p2, p3):
    """Return the distinct author communities seen up to each of three cut-off years.

    Python port of ``apoc.coll.toSet([x IN alist WHERE x.year<=pN | x.person.labelprop])``.

    Args:
        alist: Iterable of mappings with a ``"year"`` key and a ``"person"`` entry
            that supports ``["labelprop"]`` lookup (assumed shape — TODO confirm).
        p1, p2, p3: Inclusive cut-off years.

    Returns:
        list: ``[y1, y2, y3]``, each a list of unique ``labelprop`` values in
        first-seen order for entries with ``year <= pN``.
    """
    def _communities_up_to(cutoff):
        # dict.fromkeys removes duplicates while keeping encounter order.
        return list(dict.fromkeys(
            x["person"]["labelprop"] for x in alist if x["year"] <= cutoff
        ))

    return [_communities_up_to(p1), _communities_up_to(p2), _communities_up_to(p3)]

# NOTE(review): `alist`, `p1`, `p2`, `p3` are not defined in any visible cell — confirm their source.
# The function defined in this cell is `find_infected`; `find_year_1_infected` does not exist.
infected = find_infected(alist, p1, p2, p3)
# The entries are not strings, so they are formatted rather than joined with `+`.
print('infected communities 1: {}'.format(infected[0]), 'infected communities 2: {}'.format(infected[1]), 'infected communities 3: {}'.format(infected[2]))

In [None]:
# Usage Entropy
#
# NOTE limitation on q.year to speed test run time.
# Eventually will need to write to CSV and process in batches. 
#
# Pick up one year after the last year reported for the UsageEntropy output folder.
try_next_year = 1 + get_last_year_completed("/tmp/data/result/FeatureExtractionResults/UsageEntropy/")

for i in tqdm(range(try_next_year, 2018)):
    # The Cypher text is two fixed fragments joined around the target year.
    # s1..s3 count distinct citing papers up to each cut-off; the reduce() calls
    # turn the per-community frequencies into a base-2 entropy.
    query = "".join(["""
    MATCH (a:NatureAuthor)-[:AUTHORED]->(p:Quanta)-[:CITES]->(q:Quanta)
    WHERE p.year < q.year + 4 AND q.year= """, str(i), """
    WITH 
        q, q.title as title, 
        apoc.coll.toSet(collect({paper:p, community:a.labelprop})) as clist, 
        q.year+1 as p1, q.year+2 as p2, q.year+3 as p3
    WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper.year<=p1|x.community]) AS year_1_count, size(apoc.coll.toSet([x IN clist WHERE x.paper.year<=p1|x.paper])) as s1
    WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper.year<=p2|x.community]) AS year_2_count, size(apoc.coll.toSet([x IN clist WHERE x.paper.year<=p2|x.paper])) as s2
    WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper.year<=p3|x.community]) AS year_3_count, size(apoc.coll.toSet([x IN clist WHERE x.paper.year<=p3|x.paper])) as s3
    RETURN 
        title, 
        reduce(i = 0.0, x IN year_1_count| i - toFloat(x.count)/s1*log(toFloat(x.count)/s1)/log(2)) as usage_entropy_1,
        reduce(i = 0.0, x IN year_2_count| i - toFloat(x.count)/s2*log(toFloat(x.count)/s2)/log(2)) as usage_entropy_2,
        reduce(i = 0.0, x IN year_3_count| i - toFloat(x.count)/s3*log(toFloat(x.count)/s3)/log(2)) as usage_entropy_3"""])

    df_usageentropy = query_to_df(query, graph)
    # One CSV per cited-paper year.
    df_usageentropy.to_csv(
        '/tmp/data/result/FeatureExtractionResults/UsageEntropy/usage_entropy_{}.csv'.format(i),
        columns=['title', 'usage_entropy_1', 'usage_entropy_2', 'usage_entropy_3'],
        index=False,
    )
    #df_usageentropy.head()

In [None]:
// Profiles the usage-entropy query for papers linked to Year {value:2009}.
// Per cited title: frequencies of author communities (a.labelprop) among citing
// papers dated no later than year+1 / year+2 / year+3, normalised by the number of
// distinct citing papers in that window and folded into a base-2 entropy.
// Fix: each collected map now carries id(p), and s1..s3 count distinct papers.
// Before, the maps held only (paper_year, community) and s1..s3 counted distinct
// years, which disagrees with the per-year export query that sizes distinct papers.
PROFILE MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year {value:2009})
USING INDEX y:Year(value) 
WITH DISTINCT q, y.value as year 
MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
WHERE p.year < q.year + 4 
WITH DISTINCT p, q.title as title, year 
MATCH (a:Author)-[:AUTHORED]->(p) 
WITH title, year+1 as p1, year+2 as p2, year+3 as p3, apoc.coll.toSet(collect({paper:id(p), paper_year:p.year, community:a.labelprop})) as clist
WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper_year<=p1|x.community]) AS year_1_count, size(apoc.coll.toSet([x IN clist WHERE x.paper_year<=p1|x.paper])) as s1
WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper_year<=p2|x.community]) AS year_2_count, size(apoc.coll.toSet([x IN clist WHERE x.paper_year<=p2|x.paper])) as s2
WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper_year<=p3|x.community]) AS year_3_count, size(apoc.coll.toSet([x IN clist WHERE x.paper_year<=p3|x.paper])) as s3
RETURN 
        title, 
        reduce(i = 0.0, x IN year_1_count| i - toFloat(x.count)/s1*log(toFloat(x.count)/s1)/log(2)) as usage_entropy_1,
        reduce(i = 0.0, x IN year_2_count| i - toFloat(x.count)/s2*log(toFloat(x.count)/s2)/log(2)) as usage_entropy_2,
        reduce(i = 0.0, x IN year_3_count| i - toFloat(x.count)/s3*log(toFloat(x.count)/s3)/log(2)) as usage_entropy_3

In [None]:
for i in range(2000, 2018):
    year, p1, p2, p3 = i, i+1, i+2, i+3
    # {year}/{p1}/{p2}/{p3} are str.format placeholders filled in below; braces that
    # belong to Cypher map literals are doubled. Previously the template was sent
    # unformatted, so the values computed above were never used.
    # Each collected map carries id(p) so s1..s3 count distinct citing papers
    # (they used to count distinct years).
    query = '''
    PROFILE MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year {{value: {year}}})
    USING INDEX y:Year(value) 
    WITH DISTINCT q, y.value as year 
    MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
    WHERE p.year < q.year + 4 
    WITH DISTINCT p, q.title as title, year 
    MATCH (a:Author)-[:AUTHORED]->(p) 
    WITH title, {p1} as p1, {p2} as p2, {p3} as p3, apoc.coll.toSet(collect({{paper:id(p), paper_year:p.year, community:a.labelprop}})) as clist
    WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper_year<=p1|x.community]) AS year_1_count, size(apoc.coll.toSet([x IN clist WHERE x.paper_year<=p1|x.paper])) as s1
    WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper_year<=p2|x.community]) AS year_2_count, size(apoc.coll.toSet([x IN clist WHERE x.paper_year<=p2|x.paper])) as s2
    WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper_year<=p3|x.community]) AS year_3_count, size(apoc.coll.toSet([x IN clist WHERE x.paper_year<=p3|x.paper])) as s3
    RETURN 
            title, 
            reduce(i = 0.0, x IN year_1_count| i - toFloat(x.count)/s1*log(toFloat(x.count)/s1)/log(2)) as usage_entropy_1,
            reduce(i = 0.0, x IN year_2_count| i - toFloat(x.count)/s2*log(toFloat(x.count)/s2)/log(2)) as usage_entropy_2,
            reduce(i = 0.0, x IN year_3_count| i - toFloat(x.count)/s3*log(toFloat(x.count)/s3)/log(2)) as usage_entropy_3
    '''.format(year=year, p1=p1, p2=p2, p3=p3)
    
    # NOTE(review): the frame holds usage-entropy columns although the variable
    # (and the commented-out path below) say "early adopters".
    df_earlyadopters = query_to_df(query, graph)
    #df_earlyadopters.to_csv('/tmp/data/result/FeatureExtractionResults/EarlyAdopters/early_adopters_'+str(i)+'.csv', index=False, columns = ['title', 'usage_entropy_1', 'usage_entropy_2', 'usage_entropy_3'])

In [None]:
############USAGE ENTROPY#############

# Prints one apoc.periodic.iterate statement per publication offset (0..3 years).
# Each statement stores the usage entropy of the papers citing q exactly
# `years_post_pub` years after publication on the METRICS_IN relationship to that Year.
#
# Fixes relative to the earlier draft:
#   * Cypher braces are doubled so str.format only fills {offset}; `{year}` is left
#     in the output as a Cypher parameter to be supplied when the statement is run.
#   * A stray `{batchSize:1000, ...})` line after print() (a syntax error) is removed.
#   * The property written is usage_entropy (was adoption_entropy in this section).
#   * The outer WHERE already restricts p.year to one year, so the inner `<= p1`
#     filter (which emptied offsets 2 and 3) is gone; the normaliser counts
#     distinct papers via id(p) rather than distinct years.
#   * The Year node is merged on its own so a missing relationship does not
#     create a duplicate Year node.
for years_post_pub in range(4):
    query = """
    CALL apoc.periodic.iterate("
    MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year {{value: {{year}}}})
    USING INDEX y:Year(value) 
    WITH DISTINCT q, y.value as year 
    MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
    WHERE p.year = q.year+{offset} 
    WITH DISTINCT p, q, year 
    RETURN [p, q, year] as list"
    ,
    "WITH list[0] as p, list[1] as q, list[2] as year
    MATCH (a:Author)-[:AUTHORED]->(p) 
    WITH q, year, apoc.coll.toSet(collect({{paper:id(p), community:a.labelprop}})) as clist
    WITH *, apoc.coll.frequencies([x IN clist|x.community]) AS year_count, size(apoc.coll.toSet([x IN clist|x.paper])) as s
    MERGE (target:Year {{value: year+{offset}}})
    MERGE (q)-[m:METRICS_IN]->(target)
    SET m.usage_entropy = reduce(i = 0.0, x IN year_count| i - toFloat(x.count)/s*log(toFloat(x.count)/s)/log(2)) 
    ",{{batchSize:5000, iterateList:true, parallel:false}});
    """.format(offset=years_post_pub)
    print(query)
    
// Writes cumulative usage entropy (citing papers dated no later than year+1 /
// year+2 / year+3) onto the METRICS_IN relationships of each cited paper q.
// Fixes: (1) the inner WITH referenced `title` and `{p1}`..`{p3}`, none of which
//            exist in the inner statement, and dropped `q`; it now keeps `q` and
//            derives p1..p3 from `year`;
//        (2) the normalisers s1..s3 count distinct papers via id(p) (they used to
//            count distinct years);
//        (3) the Year node is merged on its own so a missing relationship does
//            not create a duplicate Year node.
// NOTE(review): collect() in the inner statement only sees the rows of one batch;
//   a paper whose citing rows span batches may get partial values — verify.
CALL apoc.periodic.iterate(
    "MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year {value: {year}})
    USING INDEX y:Year(value) 
    WITH DISTINCT q, y.value as year 
    MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
    WHERE p.year < q.year + 4 
    WITH DISTINCT p, q, year 
    RETURN [p, q, year] as list"
    ,
    "WITH list[0] as p, list[1] as q, list[2] as year
    MATCH (a:Author)-[:AUTHORED]->(p) 
    WITH q, year+1 as p1, year+2 as p2, year+3 as p3, apoc.coll.toSet(collect({paper:id(p), paper_year:p.year, community:a.labelprop})) as clist
    WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper_year<=p1|x.community]) AS year_1_count, size(apoc.coll.toSet([x IN clist WHERE x.paper_year<=p1|x.paper])) as s1
    WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper_year<=p2|x.community]) AS year_2_count, size(apoc.coll.toSet([x IN clist WHERE x.paper_year<=p2|x.paper])) as s2
    WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper_year<=p3|x.community]) AS year_3_count, size(apoc.coll.toSet([x IN clist WHERE x.paper_year<=p3|x.paper])) as s3
    MERGE (y1:Year {value: p1})
    MERGE (q)-[m1:METRICS_IN]->(y1)
    SET m1.usage_entropy = reduce(i = 0.0, x IN year_1_count| i - toFloat(x.count)/s1*log(toFloat(x.count)/s1)/log(2))
    MERGE (y2:Year {value: p2})
    MERGE (q)-[m2:METRICS_IN]->(y2)
    SET m2.usage_entropy = reduce(i = 0.0, x IN year_2_count| i - toFloat(x.count)/s2*log(toFloat(x.count)/s2)/log(2))
    MERGE (y3:Year {value: p3})
    MERGE (q)-[m3:METRICS_IN]->(y3)
    SET m3.usage_entropy = reduce(i = 0.0, x IN year_3_count| i - toFloat(x.count)/s3*log(toFloat(x.count)/s3)/log(2))",
    {batchSize:5000, iterateList:true, parallel:false})

In [None]:
def find_usage_entropy(clist, p1, p2, p3):
    """Compute the usage entropy for three cumulative year windows.

    Python port of the Cypher expression used in the usage-entropy queries: for
    each cut-off, count how often each community occurs among the entries whose
    paper year is ``<= cutoff``, divide by the number of distinct papers in that
    window, and sum ``-(c/s) * log2(c/s)``.

    Args:
        clist: Sequence of mappings with a ``"community"`` key and a ``"paper"``
            entry supporting ``["year"]`` lookup (assumed shape, and assumed to be
            free of duplicate entries as produced by ``apoc.coll.toSet`` — TODO confirm).
        p1, p2, p3: Inclusive cut-off years.

    Returns:
        list: ``[entropy_1, entropy_2, entropy_3]`` as floats; ``0.0`` for an
        empty window.
    """
    from collections import Counter
    from math import log2

    def _entropy_up_to(cutoff):
        window = [x for x in clist if x["paper"]["year"] <= cutoff]
        # Distinct papers by equality, so unhashable mappings work too.
        papers = []
        for x in window:
            if x["paper"] not in papers:
                papers.append(x["paper"])
        total = len(papers)
        entropy = 0.0
        for count in Counter(x["community"] for x in window).values():
            share = count / total
            entropy -= share * log2(share)
        return entropy

    return [_entropy_up_to(p1), _entropy_up_to(p2), _entropy_up_to(p3)]

# NOTE(review): `clist`, `p1`, `p2`, `p3` are not defined in any visible cell — confirm their source.
usage_entropy = find_usage_entropy(clist, p1, p2, p3)
# The entropies are numbers, so they are formatted rather than joined to a str with `+`.
print('usage_entropy_1: {}'.format(usage_entropy[0]), 'usage_entropy_2: {}'.format(usage_entropy[1]), 'usage_entropy_3: {}'.format(usage_entropy[2]))

In [None]:
# Adoption Entropy
#
# NOTE limitation on q.year to speed test run time.
# Eventually will need to write to CSV and process in batches. 
#
# Pick up one year after the last year reported for the AdoptionEntropy output folder.
try_next_year = 1 + get_last_year_completed("/tmp/data/result/FeatureExtractionResults/AdoptionEntropy/")

for i in tqdm(range(try_next_year, 2018)):
    # The Cypher text is two fixed fragments joined around the target year.
    # year_N_people holds the distinct citing authors up to each cut-off; their
    # labelprop frequencies, divided by the number of people, feed a base-2 entropy.
    query = "".join(["""
    MATCH (a:NatureAuthor)-[:AUTHORED]->(p:Quanta)-[:CITES]->(q:Quanta)
    WHERE p.year < q.year + 4 AND q.year= """, str(i), """
    WITH *, q.year+1 as p1, q.year+2 as p2, q.year+3 as p3
    WITH q.title as title, p1, p2, p3, 
        apoc.coll.toSet(collect({person:a, year:p.year})) AS alist
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p1|x.person]) AS year_1_people
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p2|x.person]) AS year_2_people
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p3|x.person]) AS year_3_people
    WITH *, apoc.coll.frequencies([x IN year_1_people | x.labelprop]) AS year_1_count, size(year_1_people) as s1
    WITH *, apoc.coll.frequencies([x IN year_2_people | x.labelprop]) AS year_2_count, size(year_2_people) as s2
    WITH *, apoc.coll.frequencies([x IN year_3_people | x.labelprop]) AS year_3_count, size(year_3_people) as s3
    RETURN 
        title, 
        reduce(i = 0.0, x IN year_1_count| i - toFloat(x.count)/s1*log(toFloat(x.count)/s1)/log(2)) as adoption_entropy_1,
        reduce(i = 0.0, x IN year_2_count| i - toFloat(x.count)/s2*log(toFloat(x.count)/s2)/log(2)) as adoption_entropy_2, 
        reduce(i = 0.0, x IN year_3_count| i - toFloat(x.count)/s3*log(toFloat(x.count)/s3)/log(2)) as adoption_entropy_3"""])

    df_adoptionentropy = query_to_df(query, graph)
    # One CSV per cited-paper year.
    df_adoptionentropy.to_csv(
        '/tmp/data/result/FeatureExtractionResults/AdoptionEntropy/adoption_entropy_{}.csv'.format(i),
        columns=['title', 'adoption_entropy_1', 'adoption_entropy_2', 'adoption_entropy_3'],
        index=False,
    )
    #df_adoptionentropy.head()
    #df_adoptionentropy.head()

In [None]:
// Profiles the adoption-entropy query for papers linked to Year {value:2009}.
// Per cited title: distinct citing authors dated no later than year+1 / year+2 /
// year+3, the frequency of each community label among those authors, normalised
// by the number of authors and folded into a base-2 entropy.
// Fix: the collected maps now carry id(a) and year_N_people holds distinct
// (person, label) pairs. Before, year_N_people was the set of distinct labels,
// so every frequency was 1 and s was the label count, unlike the per-year export
// query, which counts people per label.
PROFILE MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year {value:2009})
USING INDEX y:Year(value) 
WITH DISTINCT q, y.value as year 
MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
WHERE p.year < q.year + 4 
WITH DISTINCT p, q.title as title, year 
MATCH (a:Author)-[:AUTHORED]->(p) 
WITH title, year+1 as p1, year+2 as p2, year+3 as p3, apoc.coll.toSet(collect({person:id(a), label:a.labelprop, year:p.year})) AS alist 
WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p1|{person:x.person, label:x.label}]) AS year_1_people
WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p2|{person:x.person, label:x.label}]) AS year_2_people
WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p3|{person:x.person, label:x.label}]) AS year_3_people
WITH *, apoc.coll.frequencies([x IN year_1_people | x.label]) AS year_1_count, size(year_1_people) as s1
WITH *, apoc.coll.frequencies([x IN year_2_people | x.label]) AS year_2_count, size(year_2_people) as s2
WITH *, apoc.coll.frequencies([x IN year_3_people | x.label]) AS year_3_count, size(year_3_people) as s3
RETURN 
    title, 
    reduce(i = 0.0, x IN year_1_count| i - toFloat(x.count)/s1*log(toFloat(x.count)/s1)/log(2)) as adoption_entropy_1,
    reduce(i = 0.0, x IN year_2_count| i - toFloat(x.count)/s2*log(toFloat(x.count)/s2)/log(2)) as adoption_entropy_2, 
    reduce(i = 0.0, x IN year_3_count| i - toFloat(x.count)/s3*log(toFloat(x.count)/s3)/log(2)) as adoption_entropy_3

In [None]:
for i in range(2000, 2018):
    year, p1, p2, p3 = i, i+1, i+2, i+3
    # {year}/{p1}/{p2}/{p3} are str.format placeholders filled in below; braces that
    # belong to Cypher map literals are doubled. Previously the template was sent
    # unformatted, so the values computed above were never used.
    # year_N_people holds distinct (person, label) pairs so the frequencies count
    # people per label (it used to hold distinct labels, making every count 1).
    query = '''
    PROFILE MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year {{value: {year}}})
    USING INDEX y:Year(value) 
    WITH DISTINCT q, y.value as year 
    MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
    WHERE p.year < q.year + 4 
    WITH DISTINCT p, q.title as title, year 
    MATCH (a:Author)-[:AUTHORED]->(p) 
    WITH title, {p1} as p1, {p2} as p2, {p3} as p3, apoc.coll.toSet(collect({{person:id(a), label:a.labelprop, year:p.year}})) AS alist 
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p1|{{person:x.person, label:x.label}}]) AS year_1_people
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p2|{{person:x.person, label:x.label}}]) AS year_2_people
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p3|{{person:x.person, label:x.label}}]) AS year_3_people
    WITH *, apoc.coll.frequencies([x IN year_1_people | x.label]) AS year_1_count, size(year_1_people) as s1
    WITH *, apoc.coll.frequencies([x IN year_2_people | x.label]) AS year_2_count, size(year_2_people) as s2
    WITH *, apoc.coll.frequencies([x IN year_3_people | x.label]) AS year_3_count, size(year_3_people) as s3
    RETURN 
        title, 
        reduce(i = 0.0, x IN year_1_count| i - toFloat(x.count)/s1*log(toFloat(x.count)/s1)/log(2)) as adoption_entropy_1,
        reduce(i = 0.0, x IN year_2_count| i - toFloat(x.count)/s2*log(toFloat(x.count)/s2)/log(2)) as adoption_entropy_2, 
        reduce(i = 0.0, x IN year_3_count| i - toFloat(x.count)/s3*log(toFloat(x.count)/s3)/log(2)) as adoption_entropy_3
    '''.format(year=year, p1=p1, p2=p2, p3=p3)
    
    # NOTE(review): the frame holds adoption-entropy columns although the variable
    # (and the commented-out path below) say "early adopters"; the commented column
    # names also say "adoption_energy" while the query returns adoption_entropy_N.
    df_earlyadopters = query_to_df(query, graph)
    #df_earlyadopters.to_csv('/tmp/data/result/FeatureExtractionResults/EarlyAdopters/early_adopters_'+str(i)+'.csv', index=False, columns = ['title', 'adoption_energy_1', 'adoption_energy_2', 'adoption_energy_3'])

In [None]:
############ADOPTION ENTROPY#############

# Prints one apoc.periodic.iterate statement per publication offset (0..3 years).
# Each statement stores the adoption entropy of the authors citing q exactly
# `years_post_pub` years after publication on the METRICS_IN relationship to that Year.
#
# Fixes relative to the earlier draft:
#   * Cypher braces are doubled so str.format only fills {offset}; `{year}` is left
#     in the output as a Cypher parameter to be supplied when the statement is run.
#   * A stray `{batchSize:1000, ...})` line after print() (a syntax error) is removed.
#   * `year` is carried through the aggregating WITH; it was dropped before
#     being used in `year+offset`.
#   * `people` holds distinct (person, label) pairs so the frequencies count
#     authors per label (it used to hold distinct labels, making every count 1).
#   * The Year node is merged on its own so a missing relationship does not
#     create a duplicate Year node.
for years_post_pub in range(4):
    query = """
    CALL apoc.periodic.iterate("
    MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year {{value: {{year}}}})
    USING INDEX y:Year(value) 
    WITH DISTINCT q, y.value as year 
    MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
    WHERE p.year = q.year+{offset} 
    WITH DISTINCT p, q, year 
    RETURN [p, q, year] as list"
    ,
    "WITH list[0] as p, list[1] as q, list[2] as year
    MATCH (a:Author)-[:AUTHORED]->(p) 
    WITH q, year, apoc.coll.toSet(collect({{person:id(a), label:a.labelprop}})) AS people
    WITH *, apoc.coll.frequencies([x IN people | x.label]) AS year_count, size(people) as s
    MERGE (target:Year {{value: year+{offset}}})
    MERGE (q)-[m:METRICS_IN]->(target)
    SET m.adoption_entropy = reduce(i = 0.0, x IN year_count| i - toFloat(x.count)/s*log(toFloat(x.count)/s)/log(2)) 
    ",{{batchSize:5000, iterateList:true, parallel:false}});
    """.format(offset=years_post_pub)
    print(query)
    
// Writes cumulative adoption entropy (distinct citing authors dated no later than
// year+1 / year+2 / year+3, grouped by labelprop) onto the METRICS_IN
// relationships of each cited paper q.
// Fixes: (1) the inner WITH referenced `title` and `{p1}`..`{p3}`, none of which
//            exist in the inner statement, and dropped `q`; it now keeps `q` and
//            derives p1..p3 from `year`;
//        (2) the property written is adoption_entropy; writing usage_entropy here
//            would overwrite the usage-entropy metric;
//        (3) the Year node is merged on its own so a missing relationship does
//            not create a duplicate Year node.
// NOTE(review): collect() in the inner statement only sees the rows of one batch;
//   a paper whose citing rows span batches may get partial values — verify.
CALL apoc.periodic.iterate(
    "MATCH (q:Quanta)-[:PUBLISHED_IN_YEAR]->(y:Year {value: {year}})
    USING INDEX y:Year(value) 
    WITH DISTINCT q, y.value as year 
    MATCH (p:Quanta)-[:CITES]->(q:Quanta) 
    WHERE p.year < q.year + 4 
    WITH DISTINCT p, q, year 
    RETURN [p, q, year] as list"
    ,
    "WITH list[0] as p, list[1] as q, list[2] as year
    MATCH (a:Author)-[:AUTHORED]->(p) 
    WITH q, year+1 as p1, year+2 as p2, year+3 as p3, apoc.coll.toSet(collect({person:a, year:p.year})) AS alist
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p1|x.person]) AS year_1_people
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p2|x.person]) AS year_2_people
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p3|x.person]) AS year_3_people
    WITH *, apoc.coll.frequencies([x IN year_1_people | x.labelprop]) AS year_1_count, size(year_1_people) as s1
    WITH *, apoc.coll.frequencies([x IN year_2_people | x.labelprop]) AS year_2_count, size(year_2_people) as s2
    WITH *, apoc.coll.frequencies([x IN year_3_people | x.labelprop]) AS year_3_count, size(year_3_people) as s3
    MERGE (y1:Year {value: p1})
    MERGE (q)-[m1:METRICS_IN]->(y1)
    SET m1.adoption_entropy = reduce(i = 0.0, x IN year_1_count| i - toFloat(x.count)/s1*log(toFloat(x.count)/s1)/log(2))
    MERGE (y2:Year {value: p2})
    MERGE (q)-[m2:METRICS_IN]->(y2)
    SET m2.adoption_entropy = reduce(i = 0.0, x IN year_2_count| i - toFloat(x.count)/s2*log(toFloat(x.count)/s2)/log(2))
    MERGE (y3:Year {value: p3})
    MERGE (q)-[m3:METRICS_IN]->(y3)
    SET m3.adoption_entropy = reduce(i = 0.0, x IN year_3_count| i - toFloat(x.count)/s3*log(toFloat(x.count)/s3)/log(2))",
    {batchSize:5000, iterateList:true, parallel:false})

In [None]:
def find_adoption_energy(alist, p1, p2, p3):
    """Compute the adoption entropy for three cumulative year windows.

    Python port of the Cypher expression used in the adoption-entropy queries:
    for each cut-off, take the distinct people whose entry has ``year <= cutoff``,
    count people per ``labelprop``, divide by the number of people, and sum
    ``-(c/s) * log2(c/s)``.

    Args:
        alist: Iterable of mappings with a ``"year"`` key and a ``"person"`` entry
            supporting ``["labelprop"]`` lookup (assumed shape — TODO confirm).
        p1, p2, p3: Inclusive cut-off years.

    Returns:
        list: ``[entropy_1, entropy_2, entropy_3]`` as floats; ``0.0`` for an
        empty window.
    """
    from collections import Counter
    from math import log2

    def _entropy_up_to(cutoff):
        # Distinct people by equality, so unhashable mappings work too.
        people = []
        for x in alist:
            if x["year"] <= cutoff and x["person"] not in people:
                people.append(x["person"])
        total = len(people)
        entropy = 0.0
        for count in Counter(person["labelprop"] for person in people).values():
            share = count / total
            entropy -= share * log2(share)
        return entropy

    return [_entropy_up_to(p1), _entropy_up_to(p2), _entropy_up_to(p3)]

# NOTE(review): `alist`, `p1`, `p2`, `p3` are not defined in any visible cell — confirm their source.
adoption_energy = find_adoption_energy(alist, p1, p2, p3)
# The results are numbers, so they are formatted rather than joined to a str with `+`.
print('adoption energy 1: {}'.format(adoption_energy[0]), 'adoption energy 2: {}'.format(adoption_energy[1]), 'adoption energy 3: {}'.format(adoption_energy[2]))

In [None]:
def find_all_information():
    # The only statement visible in this function is the string literal below, so
    # calling it executes nothing. The string is an unexecuted sketch that mixes
    # Python with Cypher-style CALL/YIELD syntax and is not runnable as written;
    # it also uses year_1/year_2/year_3 although it assigns year_N_early_adopters.
    '''
    for i in tqdm(range(try_next_year, 2018)):
        CALL find_cited(i)
        YIELD q.year+1 as p1, q.year+2 as p2, q.year+3 as p3, q.title as title, q.year as qyear, 
        apoc.coll.toSet(collect({name:a.name, year:p.year})) AS alist, 
        apoc.coll.toSet(collect({paper:p, community:a.labelprop})) as clist
        
        year_1_early_adopters = find_early_adopters_1(p1, alist)
        year_2_early_adopters = find_early_adopters_2(p2, alist, year_1)
        year_3_early_adopters = find_early_adopters_3(p3, alist, year_1, year_2)

print ('early_adopters_1: ' + len(year_1), 'early_adopters_2: ' + len(year_2), 'early_adopters_3: ' + len(year_3))
    '''