In [1]:
import time
import json
import pandas as pd
from py2neo import Graph, Node, Relationship
import glob
import os
from tqdm import tqdm
import threading

In [2]:
# Connection settings can be overridden through environment variables so the
# notebook can target another Neo4j instance (or use other credentials)
# without editing code. The fallbacks are the values this notebook has always
# used, so behaviour is unchanged when nothing is set.
# NOTE(review): the fallback password is still stored in plain text here;
# consider removing it once NEO4J_PASSWORD is provided wherever this runs.
graph = Graph(
    os.environ.get("NEO4J_URI", "bolt://neo4j-magone:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "myneo"),
    ),
)

# Journal names; a later cell interpolates str(top_42) into a Cypher
# `venue IN [...]` filter, so the entries must stay plain strings.
top_42 = [
    'Angewandte Chemie',
    'Blood',
    'Cancer Cell',
    'Cancer Discovery',
    'Cancer Research',
    'Cell',
    'Cell Host & Microbe',
    'Cell Metabolism',
    'Cell Stem Cell',
    'Chemistry & Biology',
    'The EMBO Journal',
    'Genes & Development',
    'Immunity',
    'Journal of Neurology',
    'Journal of the American Chemical Society',
    'JAMA',
    'Journal of Biological Chemistry',
    'Journal of Cell Biology',
    'Journal of Clinical Investigation',
    'Journal of Experimental Medicine',
    'Journal of Medicinal Chemistry',
    'The Lancet',
    'Nature Cell Biology',
    'Nature Chemical Biology',
    'Nature Chemistry',
    'Nature Medicine',
    'Nature Methods',
    'Nature',
    'Nature Biotechnology',
    'The New England Journal of Medicine',
    'Neuron',
    'Nature Genetics',
    'Nature Immunology',
    'Nature Neuroscience',
    'Nature Structural & Molecular Biology',
    'PLOS Biology',
    'PLOS Genetics',
    'PLOS Pathogens',
    'Proceedings of the National Academy of Sciences of the United States of America',
    'Science Signaling',
    'Science Translational Medicine',
    'Science',
]

# Sanity check that the connection works: report the store's id counters.
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    graph.database.primitive_counts['NumberOfNodeIdsInUse'],
    graph.database.primitive_counts['NumberOfRelationshipIdsInUse']))

Connected to graph database with 278,432,359 nodes and 1,844,501,832 relationships!


In [3]:
def query_to_df(query, graph):
    """Run ``query`` on ``graph`` and return the result as a data frame.

    Prints a start marker before the query is issued and the elapsed
    wall-clock time (in minutes) once the result has been materialised.

    Args:
        query: Query text handed to ``graph.run``.
        graph: Object exposing ``run(query)`` whose return value provides
            ``to_data_frame()``.

    Returns:
        Whatever ``graph.run(query).to_data_frame()`` returns.
    """
    print("Starting query...", end=" ")
    started_at = time.time()

    cursor = graph.run(query)
    result_frame = cursor.to_data_frame()

    elapsed_minutes = (time.time() - started_at) / 60
    print("Done ({:.2f} minutes).".format(elapsed_minutes))
    return result_frame

def get_last_year_completed(mypath, default=2000):
    """Return the most recent year for which a result CSV already exists.

    Result files are expected to be named ``<anything>_<year>.csv``: the year
    is the text after the last underscore of the file name (directory names
    are not considered). Files whose suffix is not an integer are ignored.

    Args:
        mypath: Glob prefix of the result files, normally a directory path
            ending in a path separator (``"*.csv"`` is appended to it as-is).
        default: Value returned when no file with a parsable year is found.

    Returns:
        The largest year found among the matching files, or ``default``.
    """
    completed_years = []
    for csv_path in glob.glob(mypath + "*.csv"):
        stem = os.path.splitext(os.path.basename(csv_path))[0]
        try:
            completed_years.append(int(stem.split("_")[-1]))
        except ValueError:
            # Not a per-year result file (e.g. "notes.csv"); skip it rather
            # than letting it reset the resume point.
            continue
    # Compare numerically instead of relying on lexicographic path order.
    return max(completed_years, default=default)

# Print the value get_last_year_completed reports for two result folders.
for results_dir in (
    "/tmp/data/result/FeatureExtractionResults/InfectedCommunities-Louvain/",
    "/tmp/data/result/FeatureExtractionResults/EarlyAdopters/",
):
    x = get_last_year_completed(results_dir)
    print(x)

def query_by_year(year):
    """Fetch up to 50000 Quanta titles for ``year`` and report the running thread.

    The query result is discarded; the function only prints the timing output
    produced by ``query_to_df`` followed by the current thread object.

    Args:
        year: Value substituted into the ``n.year = ...`` filter.
    """
    cypher_template = 'MATCH (n: Quanta) WHERE n.year = %s RETURN n.title LIMIT 50000'
    query_to_df(cypher_template % year, graph)

    worker = threading.current_thread()
    print(worker)

# start = time.time()
# threads = []
# for i in range(2000, 2018):
#     t = threading.Thread(target=query_by_year, args=(i,))
#     threads.append(t)
#     t.start()
    
# for thread in threads:
#     thread.join()
# #     query_by_year(i)
    
# print(time.time() - start)


2000
2018


In [None]:
# Number of Early Adopters by Year
#
# For each Quanta node with venue "Nature" published in year i, count the
# distinct author names on citing papers dated q.year+1, q.year+2 and q.year+3.
# A name is counted only in the first of those years in which it appears.
# One CSV is written per year.
#
# NOTE: each query is limited to a single q.year to keep test runs short.
# Eventually this will need to write to CSV and be processed in batches.
#
early_adopters_dir = "/tmp/data/result/FeatureExtractionResults/EarlyAdopters/"
early_adopters_columns = ['title', 'early_adopters_1', 'early_adopters_2', 'early_adopters_3']

# Cypher text placed before and after the year literal.
early_adopters_query_head = """
    MATCH (a:Author)-[:AUTHORED]->(p:Quanta)-[:CITES]->(q:Quanta {venue:"Nature"})
    WHERE p.year < q.year + 4 AND q.year = """
early_adopters_query_tail = """ 
    WITH *, q.year+1 as p1, q.year+2 as p2, q.year+3 as p3
    WITH q.title as title, p1, p2, p3, q.year as qyear, apoc.coll.toSet(collect({name:a.name, year:p.year})) AS alist
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p1 | x.name]) AS year_1
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p2 and not x.name in year_1 | x.name]) AS year_2
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year=p3 and not x.name in year_1 and not x.name in year_2 | x.name]) AS year_3
    return 
        title, 
        size(year_1) as early_adopters_1, 
        size(year_2) as early_adopters_2,
        size(year_3) as early_adopters_3"""

try_next_year = get_last_year_completed(early_adopters_dir) + 1
# NOTE(review): this override discards the resume point computed just above, so
# every run starts again at 1950. The original comment said it was "included to
# bootstrap at 2000" -- confirm the intended start year.
try_next_year = 1950

for i in tqdm(range(try_next_year, 2018)):
    print(i)
    query = early_adopters_query_head + str(i) + early_adopters_query_tail

    df_earlyadopters = query_to_df(query, graph)
    df_earlyadopters.to_csv(
        early_adopters_dir + 'early_adopters_' + str(i) + '.csv',
        index=False,
        columns=early_adopters_columns,
    )
    

In [None]:
# Number of Uninfected Neighbors of Early Adopters
#
# Citing and cited papers are both restricted to the top_42 venues. For each
# cited title the query collects the authors of citing papers dated no later
# than q.year+1 / +2 / +3, then follows COAUTHOR links into that set and sums
# the neighbours that are not in the corresponding citing set.
# One CSV is written per year.
#
# NOTE: each query is limited to a single q.year to keep test runs short.
# Eventually this will need to write to CSV and be processed in batches.

years = [2012, 2013, 2014, 2015, 2016, 2018]  # NOTE(review): not referenced in this cell

uninfected_neighbors_dir = "/tmp/data/result/FeatureExtractionResults/UninfectedNeighbors/"
uninfected_neighbors_columns = ['title', 'neighbors_1', 'neighbors_2', 'neighbors_3']

# NOTE(review): the resume point is computed but the loop below iterates over a
# fixed pair of years instead of range(try_next_year, ...) -- confirm intent.
try_next_year = get_last_year_completed(uninfected_neighbors_dir) + 1

# The repr of a list of plain strings is interpolated as the Cypher list literal.
venues_literal = str(top_42)

uninfected_neighbors_query_head = """
    MATCH (a:Author)-[:AUTHORED]->(p:Quanta )-[:CITES]->(q:Quanta )
    WHERE p.venue IN """
uninfected_neighbors_query_tail = """ 
    WITH *, q.year+1 as p1, q.year+2 as p2, q.year+3 as p3
    WITH q.title as title, p1, p2, p3,  
        apoc.coll.toSet(collect({person:a, year:p.year})) AS alist
    // alist is people who have ever cited TITLE within 3 years of TITLE being published
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p1|x.person.name]) AS year_1
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p2|x.person.name]) AS year_2
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p3|x.person.name]) AS year_3
    // year_3 is the set of people who has written a paper that cites TITLE within 3 years of TITLE being published

    MATCH (n:Author)-[:COAUTHOR]->(b:Author)
    // TECHNICALLY WRONG BECAUSE COAUTHORS ARE ADDED OVER TIME
    WHERE b.name IN year_3
    WITH *, COLLECT(n) AS nlist
    WITH *, apoc.coll.toSet([x in nlist where b.name in year_1 and not x.name in year_1 | x.name]) AS y1_neighbors
    WITH *, apoc.coll.toSet([x in nlist where b.name in year_2 and not x.name in year_2 | x.name]) AS y2_neighbors
    WITH *, apoc.coll.toSet([x in nlist where b.name in year_3 and not x.name in year_3 | x.name]) AS y3_neighbors
    
    RETURN 
        title, 
        sum(size(y1_neighbors)) as neighbors_1, 
        sum(size(y2_neighbors)) as neighbors_2, 
        sum(size(y3_neighbors)) as neighbors_3"""

for i in tqdm([2007, 2008]):
    # Earlier per-(year, offset) sketch, kept for reference:
    #     for j in range(1,4):
    #         query = """
    #         MATCH (b:Author)-[:COAUTHOR]-(a:Author)-[:AUTHORED]->(p:Quanta)-[:CITES]->(q:Quanta)
    #         WHERE q.year=i AND p.year<=i+j
    #         // Calculate the number of uninfected neighbors for each paper published in year i at year i+j
    #         """
    query = (
        uninfected_neighbors_query_head
        + venues_literal
        + """ AND q.venue IN """
        + venues_literal
        + """ AND p.year < q.year + 4 AND q.year = """
        + str(i)
        + uninfected_neighbors_query_tail
    )

    # split node space manually
    print(query)

    df_uninfectedneighbors = query_to_df(query, graph)
    df_uninfectedneighbors.to_csv(
        uninfected_neighbors_dir + 'uninfected_neighbors_' + str(i) + '.csv',
        index=False,
        columns=uninfected_neighbors_columns,
    )
  0%|          | 0/2 [00:00<?, ?it/s]


    MATCH (a:Author)-[:AUTHORED]->(p:Quanta )-[:CITES]->(q:Quanta )
    WHERE p.venue IN ['Angewandte Chemie', 'Blood', 'Cancer Cell', 'Cancer Discovery', 'Cancer Research', 'Cell', 'Cell Host & Microbe', 'Cell Metabolism', 'Cell Stem Cell', 'Chemistry & Biology', 'The EMBO Journal', 'Genes & Development', 'Immunity', 'Journal of Neurology', 'Journal of the American Chemical Society', 'JAMA', 'Journal of Biological Chemistry', 'Journal of Cell Biology', 'Journal of Clinical Investigation', 'Journal of Experimental Medicine', 'Journal of Medicinal Chemistry', 'The Lancet', 'Nature Cell Biology', 'Nature Chemical Biology', 'Nature Chemistry', 'Nature Medicine', 'Nature Methods', 'Nature', 'Nature Biotechnology', 'The New England Journal of Medicine', 'Neuron', 'Nature Genetics', 'Nature Immunology', 'Nature Neuroscience', 'Nature Structural & Molecular Biology', 'PLOS Biology', 'PLOS Genetics', 'PLOS Pathogens', 'Proceedings of the National Academy of Sciences of the United States of Am

In [None]:
# Number of Infected Communities
#
# For each cited title published in year i, count the distinct `labelprop`
# values of the NatureAuthor nodes on citing papers dated no later than
# q.year+1, q.year+2 and q.year+3 (cumulative). One CSV is written per year.
# (labelprop is presumably the community label -- confirm against the graph schema.)
#
# NOTE: each query is limited to a single q.year to keep test runs short.
# Eventually this will need to write to CSV and be processed in batches.
#

base_dir = '/tmp/data/result/FeatureExtractionResults/'
data_dir = base_dir + 'InfectedCommunities/'
infected_communities_columns = [
    'title',
    'infected_communities_1',
    'infected_communities_2',
    'infected_communities_3',
]

# Cypher text placed before and after the year literal.
infected_communities_query_head = """
    MATCH (a:NatureAuthor)-[:AUTHORED]->(p:Quanta)-[:CITES]->(q:Quanta)
    WHERE p.year < q.year + 4 AND q.year = """
infected_communities_query_tail = """ 
    WITH *, q.year+1 as p1, q.year+2 as p2, q.year+3 as p3
    WITH q.title as title, p1, p2, p3, q.year as qyear, apoc.coll.toSet(collect({person:a, year:p.year})) AS alist
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p1|x.person.labelprop]) AS year_1
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p2|x.person.labelprop]) AS year_2
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p3|x.person.labelprop]) AS year_3
    RETURN 
        title, 
        size(year_1) as infected_communities_1, 
        size(year_2) as infected_communities_2, 
        size(year_3) as infected_communities_3"""

# Earlier per-(year, offset) variant, kept for reference:
#     for j in range(1,4):
#         query = """
#         MATCH (a:NatureAuthor)-[:AUTHORED]->(p:Quanta)-[:CITES]->(q:Quanta)
#         WHERE p.year={}+{} AND q.year={}
#         RETURN q.title as title, size(apoc.coll.toSet(collect(a.labelprop)))
#         """.format(i,j,i)
#         df_infectedcommunities = query_to_df(query, graph)
#         df_infectedcommunities.to_csv(data_dir + 'infected_communities_{}_{}.csv'.format(i,j),
#                                       index=False,
#                                       columns = ['title', 'infected_communities'])

# Resume after the last year that already has a CSV in data_dir.
try_next_year = get_last_year_completed(data_dir) + 1
for i in tqdm(range(try_next_year, 2018)):
    query = infected_communities_query_head + str(i) + infected_communities_query_tail

    df_infectedcommunities = query_to_df(query, graph)
    df_infectedcommunities.to_csv(
        data_dir + 'infected_communities_' + str(i) + '.csv',
        index=False,
        columns=infected_communities_columns,
    )

In [None]:
# Usage Entropy
#
# For each cited title published in year i, collect distinct
# (citing paper, author labelprop) pairs and compute a base-2 entropy over the
# labelprop frequencies, dividing each count by the number of distinct citing
# papers dated no later than q.year+1, q.year+2 and q.year+3 respectively.
# One CSV is written per year.
#
# NOTE: each query is limited to a single q.year to keep test runs short.
# Eventually this will need to write to CSV and be processed in batches.
#
usage_entropy_dir = "/tmp/data/result/FeatureExtractionResults/UsageEntropy/"
usage_entropy_columns = ['title', 'usage_entropy_1', 'usage_entropy_2', 'usage_entropy_3']

# Cypher text placed before and after the year literal.
usage_entropy_query_head = """
    MATCH (a:NatureAuthor)-[:AUTHORED]->(p:Quanta)-[:CITES]->(q:Quanta)
    WHERE p.year < q.year + 4 AND q.year= """
usage_entropy_query_tail = """
    WITH 
        q, q.title as title, 
        apoc.coll.toSet(collect({paper:p, community:a.labelprop})) as clist, 
        q.year+1 as p1, q.year+2 as p2, q.year+3 as p3
    WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper.year<=p1|x.community]) AS year_1_count, size(apoc.coll.toSet([x IN clist WHERE x.paper.year<=p1|x.paper])) as s1
    WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper.year<=p2|x.community]) AS year_2_count, size(apoc.coll.toSet([x IN clist WHERE x.paper.year<=p2|x.paper])) as s2
    WITH *, apoc.coll.frequencies([x IN clist WHERE x.paper.year<=p3|x.community]) AS year_3_count, size(apoc.coll.toSet([x IN clist WHERE x.paper.year<=p3|x.paper])) as s3
    RETURN 
        title, 
        reduce(i = 0.0, x IN year_1_count| i - toFloat(x.count)/s1*log(toFloat(x.count)/s1)/log(2)) as usage_entropy_1,
        reduce(i = 0.0, x IN year_2_count| i - toFloat(x.count)/s2*log(toFloat(x.count)/s2)/log(2)) as usage_entropy_2,
        reduce(i = 0.0, x IN year_3_count| i - toFloat(x.count)/s3*log(toFloat(x.count)/s3)/log(2)) as usage_entropy_3"""

# Resume after the last year that already has a CSV in the output folder.
try_next_year = get_last_year_completed(usage_entropy_dir) + 1

for i in tqdm(range(try_next_year, 2018)):
    query = usage_entropy_query_head + str(i) + usage_entropy_query_tail

    df_usageentropy = query_to_df(query, graph)
    df_usageentropy.to_csv(
        usage_entropy_dir + 'usage_entropy_' + str(i) + '.csv',
        index=False,
        columns=usage_entropy_columns,
    )

In [None]:
# Adoption Entropy
#
# For each cited title published in year i, take the distinct NatureAuthor
# nodes on citing papers dated no later than q.year+1, q.year+2 and q.year+3,
# and compute a base-2 entropy over the frequencies of their labelprop values,
# dividing each count by the number of distinct authors in that window.
# One CSV is written per year.
#
# NOTE: each query is limited to a single q.year to keep test runs short.
# Eventually this will need to write to CSV and be processed in batches.
#
adoption_entropy_dir = "/tmp/data/result/FeatureExtractionResults/AdoptionEntropy/"
adoption_entropy_columns = ['title', 'adoption_entropy_1', 'adoption_entropy_2', 'adoption_entropy_3']

# Cypher text placed before and after the year literal.
adoption_entropy_query_head = """
    MATCH (a:NatureAuthor)-[:AUTHORED]->(p:Quanta)-[:CITES]->(q:Quanta)
    WHERE p.year < q.year + 4 AND q.year= """
adoption_entropy_query_tail = """
    WITH *, q.year+1 as p1, q.year+2 as p2, q.year+3 as p3
    WITH q.title as title, p1, p2, p3, 
        apoc.coll.toSet(collect({person:a, year:p.year})) AS alist
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p1|x.person]) AS year_1_people
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p2|x.person]) AS year_2_people
    WITH *, apoc.coll.toSet([x IN alist WHERE x.year<=p3|x.person]) AS year_3_people
    WITH *, apoc.coll.frequencies([x IN year_1_people | x.labelprop]) AS year_1_count, size(year_1_people) as s1
    WITH *, apoc.coll.frequencies([x IN year_2_people | x.labelprop]) AS year_2_count, size(year_2_people) as s2
    WITH *, apoc.coll.frequencies([x IN year_3_people | x.labelprop]) AS year_3_count, size(year_3_people) as s3
    RETURN 
        title, 
        reduce(i = 0.0, x IN year_1_count| i - toFloat(x.count)/s1*log(toFloat(x.count)/s1)/log(2)) as adoption_entropy_1,
        reduce(i = 0.0, x IN year_2_count| i - toFloat(x.count)/s2*log(toFloat(x.count)/s2)/log(2)) as adoption_entropy_2, 
        reduce(i = 0.0, x IN year_3_count| i - toFloat(x.count)/s3*log(toFloat(x.count)/s3)/log(2)) as adoption_entropy_3"""

# Resume after the last year that already has a CSV in the output folder.
try_next_year = get_last_year_completed(adoption_entropy_dir) + 1

for i in tqdm(range(try_next_year, 2018)):
    query = adoption_entropy_query_head + str(i) + adoption_entropy_query_tail

    df_adoptionentropy = query_to_df(query, graph)
    df_adoptionentropy.to_csv(
        adoption_entropy_dir + 'adoption_entropy_' + str(i) + '.csv',
        index=False,
        columns=adoption_entropy_columns,
    )