In [4]:
import os
import time
from getpass import getpass

import pandas as pd
from py2neo import Graph, Node, Relationship

In [5]:
def query_to_df(query, graph):
    """Run ``query`` against ``graph`` and return the result as a data frame.

    Prints a start message and, once the result has been materialised, the
    elapsed wall-clock time in minutes.

    Parameters
    ----------
    query : str
        Query text handed to ``graph.run``.
    graph : object
        Any object whose ``run(query)`` result exposes ``to_data_frame()``.

    Returns
    -------
    The value returned by ``graph.run(query).to_data_frame()``.
    """
    print("Starting query...", end=" ")
    started_at = time.time()
    result_frame = graph.run(query).to_data_frame()
    elapsed_minutes = (time.time() - started_at) / 60
    print("Done ({:.2f} minutes).".format(elapsed_minutes))
    return result_frame

In [6]:
# Connection settings are taken from the environment so that the database
# password is not stored in the notebook. NEO4J_URI and NEO4J_USER fall back to
# the values this notebook has always used; the password has no fallback and is
# prompted for when NEO4J_PASSWORD is unset (set it for unattended runs).
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://matlaber5.media.mit.edu:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, neo4j_password))

# Read the store statistics once and reuse them for both numbers in the banner.
primitive_counts = graph.database.primitive_counts
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    primitive_counts['NumberOfNodeIdsInUse'],
    primitive_counts['NumberOfRelationshipIdsInUse']))

Connected to graph database with 278,432,359 nodes and 1,844,501,832 relationships!


In [7]:
# Journal (venue) name lists of increasing size.
top_5 = [
    'Cell',
    'Nature',
    'Nature Biotechnology',
    'Proceedings of the National Academy of Sciences of the United States of America',
    'Science',
]

# The first five entries of top_10 are exactly top_5, in the same order.
top_10 = top_5 + [
    'Journal of the American Chemical Society',
    'JAMA',
    'The New England Journal of Medicine',
    'Nature Genetics',
    'Neuron',
]

top_42 = [
    'Angewandte Chemie',
    'Blood',
    'Cancer Cell',
    'Cancer Discovery',
    'Cancer Research',
    'Cell',
    'Cell Host & Microbe',
    'Cell Metabolism',
    'Cell Stem Cell',
    'Chemistry & Biology',
    'The EMBO Journal',
    'Genes & Development',
    'Immunity',
    'Journal of Neurology',
    'Journal of the American Chemical Society',
    'JAMA',
    'Journal of Biological Chemistry',
    'Journal of Cell Biology',
    'Journal of Clinical Investigation',
    'Journal of Experimental Medicine',
    'Journal of Medicinal Chemistry',
    'The Lancet',
    'Nature Cell Biology',
    'Nature Chemical Biology',
    'Nature Chemistry',
    'Nature Medicine',
    'Nature Methods',
    'Nature',
    'Nature Biotechnology',
    'The New England Journal of Medicine',
    'Neuron',
    'Nature Genetics',
    'Nature Immunology',
    'Nature Neuroscience',
    'Nature Structural & Molecular Biology',
    'PLOS Biology',
    'PLOS Genetics',
    'PLOS Pathogens',
    'Proceedings of the National Academy of Sciences of the United States of America',
    'Science Signaling',
    'Science Translational Medicine',
    'Science',
]

In [8]:
# Scheme 1: Post-2015-Profs PR
#
# For every Top42Author whose last_author_yr is after 2015, aggregate
# pageRank_2018 over two sets of papers published in the top-42 venues:
#   * "pre"  - papers before last_author_yr on which they are first author
#   * "post" - papers from last_author_yr onward on which they are last author
# Both stages use a plain MATCH, so an author with no matching papers in either
# set produces no output row.
#
# The venue filter is rendered once from top_42 instead of being pasted into
# the query twice, so the Python list and the Cypher text cannot drift apart.
# The names are embedded as single-quoted Cypher literals inside a
# double-quoted inner query, hence the guard against quote characters.
assert not any("'" in venue or '"' in venue for venue in top_42), \
    "venue names containing quotes cannot be embedded in the query text"
venues_cypher = "[" + ",".join("'{}'".format(venue) for venue in top_42) + "]"

# Doubled braces in the config map are literal braces after str.format.
query = """
call apoc.export.csv.query(
"MATCH (a:Top42Author)
WHERE a.last_author_yr > 2015

MATCH(a)-[r:AUTHORED]->(q:Quanta)
WHERE q.venue in {venues}
AND q.year < a.last_author_yr
AND r.is_first_author
WITH a,
    COUNT(q) AS num_pubs_pre,
    SUM(q.pageRank_2018)/COUNT(q) AS avg_PR_pre,
    MAX(q.pageRank_2018) AS max_PR_pre,
    percentileCont(q.pageRank_2018, 0.5) AS median_PR_pre

MATCH(a)-[r:AUTHORED]->(q:Quanta)
WHERE q.venue in {venues}
AND q.year >= a.last_author_yr
AND r.is_last_author
RETURN a.name AS name,
    num_pubs_pre, avg_PR_pre, max_PR_pre, median_PR_pre,
    COUNT(q) AS num_pubs_post,
    SUM(q.pageRank_2018)/COUNT(q) AS avg_PR_post,
    MAX(q.pageRank_2018) AS max_PR_post,
    percentileCont(q.pageRank_2018, 0.5) AS median_PR_post"

, "/data/csv/PR_Early_Profs.csv", {{batchSize:100, iterateList:true, parallel:true}})
""".format(venues=venues_cypher)

# NOTE(review): df_pr holds whatever the export procedure call returns
# (presumably an export summary rather than the per-author rows - confirm).
# The per-author data is loaded from the CSV in a later cell as df_PR.
df_pr = query_to_df(query, graph)

Starting query... Done (5.64 minutes).


In [None]:
import pandas as pd
# Load the per-author PageRank summary, indexed by author name.
# NOTE(review): the export query writes to /data/csv/PR_Early_Profs.csv; this
# path is assumed to point at the same file (copied or mounted) - confirm.
df_PR = pd.read_csv(
    '~/scaling-science/data/magone/PR_Early_Profs.csv',
    index_col = 'name',
    encoding = "UTF-8",
)

In [None]:
# Keep only authors with at least two publications both before and after the
# transition; df_PR itself is left untouched.
print("Total: %d authors" % len(df_PR))
has_enough_pre = df_PR['num_pubs_pre'] >= 2
has_enough_post = df_PR['num_pubs_post'] >= 2
df_PR_trim = df_PR[has_enough_pre & has_enough_post]
print("Remaining after trimming: %d authors" % len(df_PR_trim))
df_PR_trim

In [None]:
def top_100_to_csv(df, cols, name, num = 100, to_csv = False):
    """Build a side-by-side ranking table of the top ``num`` rows per column.

    For each column in ``cols`` the rows with a non-null value are sorted in
    descending order and the first ``num`` are kept. The result has two
    columns per input column: ``<col>`` holding the index labels of ``df``
    and ``<col>_val`` holding the corresponding values. Rows are labelled by
    rank, starting at 1. A column with fewer than ``num`` non-null values is
    padded with NaN at the bottom.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; its index labels are what gets ranked.
    cols : iterable of str
        Columns of ``df`` to rank on.
    name : str
        Prefix used in the output file name when ``to_csv`` is true.
    num : int, default 100
        Maximum number of ranked rows per column.
    to_csv : bool, default False
        When true, also write the table (without the rank index) to
        ``~/scaling-science/data/magone/<name>_Early_Profs_rank.csv``.

    Returns
    -------
    pandas.DataFrame
        The ranking table described above.
    """
    ranked = {}
    for col in cols:
        top = df[col].dropna().sort_values(ascending = False).head(num)
        # Use a per-column length. Shrinking ``num`` here would silently cap
        # every later column at the size of the shortest earlier one.
        ranks = list(range(1, len(top) + 1))
        ranked[col] = pd.Series(top.index.tolist(), index = ranks)
        ranked[col + '_val'] = pd.Series(top.tolist(), index = ranks)
    # Series of different lengths are aligned on rank; missing ranks become NaN.
    df_rank = pd.DataFrame(ranked)
    if to_csv:
        df_rank.to_csv('~/scaling-science/data/magone/{}_Early_Profs_rank.csv'.format(name), index = False, encoding = "UTF-8")
    return df_rank

# Rank the trimmed authors on every column except the publication counts,
# and write the resulting table to CSV as well as displaying it.
top_100_to_csv(
    df_PR_trim,
    [c for c in df_PR.columns if 'num_pubs' not in c],
    'PR',
    to_csv = True,
)