In [None]:
import json
from py2neo import Graph, Node, Relationship
#from py2neo.Graph import database 

# TODO: authentication is not wired up yet; the server currently runs with NEO4J_AUTH=none.
graph = Graph("bolt://neo4j:7687")
# Alternative endpoint kept for reference:
# graph = Graph('bolt://localhost:7687', bolt=True)

# Left disabled on purpose:
# graph.delete_all()

# Read the two store counters used for the connection banner.
n_nodes, n_relationships = (
    graph.database.primitive_counts[counter_name]
    for counter_name in ('NumberOfNodeIdsInUse', 'NumberOfRelationshipIdsInUse')
)
print(
    "Connected to graph database with {:,} nodes and {:,} relationships!".format(
        n_nodes, n_relationships
    )
)

# Identify papers with strong "early warning" signal


In [None]:
import glob, os, time
import pandas as pd
# n_years: largest year offset after publication that is sampled.
# step_size: spacing between sampled offsets; later cells build
# range(0, n_years+1, step_size) from these two values.
n_years, step_size = 5, 1

## Calculate average citation AUC by year for Nature and Science papers

In [None]:
# Year offsets (relative to the publication year) at which citations are counted.
offsets = list(range(0, n_years+1, step_size))

# One Cypher projection per offset k: c<k> is the number of :Quanta nodes that
# cite b and whose `year` equals b.year + k, cast to float so the later
# division by 2 is not an integer division.
c = "toFloat(SIZE((b)<-[:CITES]-(:Quanta {{year: b.year+{}}}))) as c{}"
all_c = [c.format(i, i) for i in offsets]
joined_c = ",\n".join(all_c)

# Trapezoid rule over consecutive sampled offsets: width * (left + right) / 2.
# Pairing neighbours from `offsets` (instead of i and i+1) keeps every c<k>
# referenced here defined for any step_size.
s = "{}*(c{}+c{})/2"
all_s = [s.format(hi - lo, lo, hi) for lo, hi in zip(offsets, offsets[1:])]
joined_s = "+".join(all_s)

# The last RETURN item must not be followed by a comma (Cypher rejects a
# trailing comma), and the template has exactly three placeholders.
query = """
MATCH (b:Quanta)
WHERE (b.venue="Science" OR b.venue="Nature") AND b.year>=1950
WITH b, {}
RETURN 
count(b) as num_articles,
sum({}) as sum_area, 
avg({}) as avg_area
    """.format(joined_c, joined_s, joined_s)

# print(query)
query_start_time = time.time()
df = graph.run(query).to_data_frame()
print("Done in {:.2f} minutes.".format((time.time()-query_start_time)/60))

# The aggregate query returns a single row; take its average area.
# NOTE(review): a literal 20.24 used to overwrite this value right after it was
# computed -- presumably a cached result from an earlier run; confirm it matches.
mean_area = df['avg_area'].iloc[0]

## Identify papers with the top citation area

In [None]:
# Year offsets (relative to the publication year) at which citations are counted.
offsets = list(range(0, n_years+1, step_size))

# One Cypher projection per offset k: c<k> is the number of :Quanta nodes that
# cite b and whose `year` equals b.year + k. SIZE() yields an integer, and
# integer / integer truncates in Cypher, so the count is cast to float to keep
# the half-units of the trapezoid terms below.
c = "toFloat(SIZE((b)<-[:CITES]-(:Quanta {{year: b.year+{}}}))) as c{}"
all_c = [c.format(i, i) for i in offsets]
joined_c = ",\n".join(all_c)

# Trapezoid rule over consecutive sampled offsets: width * (left + right) / 2.
# Pairing neighbours from `offsets` (instead of i and i+1) keeps every c<k>
# referenced here defined for any step_size.
s = "{}*(c{}+c{})/2"
all_s = [s.format(hi - lo, lo, hi) for lo, hi in zip(offsets, offsets[1:])]
joined_s = "+".join(all_s)

# Rank Nature/Science papers published from 1970 on by citation area and keep
# the 100 largest.
query = """
MATCH (b:Quanta)
WHERE b.year>=1970 AND (b.venue="Nature" or b.venue="Science")
WITH b, {}
RETURN 
    b.title as title, 
    b.year as year_published,
    {} as citation_area
ORDER BY citation_area DESC
LIMIT 100
    """.format(joined_c, joined_s)

# print(query)
query_start_time = time.time()
df = graph.run(query).to_data_frame()
print("Done in {:.2f} minutes.".format((time.time()-query_start_time)/60))

In [None]:
# Write the current result frame to CSV, then leave it as the cell's last
# expression so the notebook renders it.
df.to_csv(path_or_buf="/tmp/data/result/TopAreaAllQuantaNatureScience.csv")
df

# Calculate citations per year post publication for specific journals

In [None]:
import glob, os, time
import pandas as pd
start_time = time.time()
dfs = []
# One aggregate query per offset i = 0..19 years after publication; each result
# frame (grouped by journal) is collected in `dfs`.
for i in range(20):
    # AND binds tighter than OR in Cypher, so the venue test is parenthesised;
    # without the parentheses the year filter applied to Nature only and
    # Science papers of every year were included.
    query = """
    MATCH (b:Quanta)
    WHERE (b.venue="Science" OR b.venue="Nature") AND b.year>=1950
    WITH b, SIZE((b)<-[:CITES]-(:Quanta {{year: b.year+{}}})) as citationCount
    RETURN 
        b.venue as journal, 
        {} as years_post_publication, 
        count(b) as num_articles,
        sum(citationCount) as total_citations, 
        avg(citationCount) as avg_citations,
        percentileCont(citationCount, 0.5) as median_citations
        """.format(i,i)
#     print(query)
    query_start_time = time.time()
    dfs.append(graph.run(query).to_data_frame())
    query_end_time = time.time()
    print("Done ({:.2f} minutes).".format((query_end_time-query_start_time)/60))
    
end_time = time.time()
print("Finished all calculations in {:.2f} minutes.".format((end_time-start_time)/60))

In [None]:
# Stack the per-offset result frames and index the combined table by journal
# before writing it out as CSV.
r = pd.concat(dfs).set_index('journal')
r.to_csv('/tmp/data/result/NatureScienceCitations.csv', sep=",", header=True, index=True)
print("Done.")

In [None]:
# Citation timeline for a single paper, looked up by exact title: citing
# :Quanta nodes are grouped by their `year`, and each group is reported as an
# offset from the cited paper's publication year.
# The rows are ordered inside the query: the frame built from the result has a
# plain positional index, so sorting that index afterwards did not order the
# rows by years_post_publication.
query = """
MATCH (b:Quanta)
WHERE b.title = "CRISPR provides acquired resistance against viruses in prokaryotes."
WITH b
MATCH (b)<-[:CITES]-(a:Quanta)
WITH b.year as pubYear, COUNT(a) as numCitations, a.year as citationYear
RETURN 
    citationYear - pubYear as years_post_publication, 
    numCitations as num_citations
ORDER BY years_post_publication
"""
query_start_time = time.time()
df = graph.run(query).to_data_frame()
query_end_time = time.time()
print("Done in ({:.2f} minutes).".format((query_end_time-query_start_time)/60))