In [34]:
import os
import time

import pandas as pd
from py2neo import Graph, Node, Relationship

In [35]:
def query_to_df(query, graph):
    """Run ``query`` on ``graph`` and return the result as a data frame.

    A start marker is printed before the query is issued and the elapsed
    wall-clock time, in minutes, is printed once the result is available.

    Args:
        query: Query text passed unchanged to ``graph.run``.
        graph: Object exposing ``run(query)``; the value it returns must
            provide ``to_data_frame()``.

    Returns:
        The value returned by ``graph.run(query).to_data_frame()``.
    """
    print("Starting query...", end=" ")
    started_at = time.time()
    cursor = graph.run(query)
    result_frame = cursor.to_data_frame()
    elapsed_minutes = (time.time() - started_at) / 60
    print("Done ({:.2f} minutes).".format(elapsed_minutes))
    return result_frame

In [36]:
# Connection settings are read from the environment so they can be overridden
# without editing the notebook; the fallbacks are the values this cell has
# always used, so behaviour is unchanged when the variables are not set.
# TODO(security): remove the password fallback once NEO4J_PASSWORD is set
# everywhere this notebook runs -- credentials should not live in the notebook.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://matlaber5.media.mit.edu:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "myneo")

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Fetch the counters once so both figures come from the same read.
primitive_counts = graph.database.primitive_counts
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    primitive_counts['NumberOfNodeIdsInUse'],
    primitive_counts['NumberOfRelationshipIdsInUse']))

Connected to graph database with 278,432,359 nodes and 1,844,501,832 relationships!


In [37]:
# Journal short-lists of increasing size. Only `top_42` is used by the cells
# visible in this notebook (as the venue filter of the export query).
top_5 = [
    'Cell',
    'Nature',
    'Nature Biotechnology',
    'Proceedings of the National Academy of Sciences of the United States of America',
    'Science',
]

# The five journals of `top_5`, followed by five more.
top_10 = top_5 + [
    'Journal of the American Chemical Society',
    'JAMA',
    'The New England Journal of Medicine',
    'Nature Genetics',
    'Neuron',
]

top_42 = [
    'Angewandte Chemie',
    'Blood',
    'Cancer Cell',
    'Cancer Discovery',
    'Cancer Research',
    'Cell',
    'Cell Host & Microbe',
    'Cell Metabolism',
    'Cell Stem Cell',
    'Chemistry & Biology',
    'The EMBO Journal',
    'Genes & Development',
    'Immunity',
    'Journal of Neurology',
    'Journal of the American Chemical Society',
    'JAMA',
    'Journal of Biological Chemistry',
    'Journal of Cell Biology',
    'Journal of Clinical Investigation',
    'Journal of Experimental Medicine',
    'Journal of Medicinal Chemistry',
    'The Lancet',
    'Nature Cell Biology',
    'Nature Chemical Biology',
    'Nature Chemistry',
    'Nature Medicine',
    'Nature Methods',
    'Nature',
    'Nature Biotechnology',
    'The New England Journal of Medicine',
    'Neuron',
    'Nature Genetics',
    'Nature Immunology',
    'Nature Neuroscience',
    'Nature Structural & Molecular Biology',
    'PLOS Biology',
    'PLOS Genetics',
    'PLOS Pathogens',
    'Proceedings of the National Academy of Sciences of the United States of America',
    'Science Signaling',
    'Science Translational Medicine',
    'Science',
]

### Get Top Profs

In [9]:
import pandas as pd

# Three CSV tables, each indexed by its `name` column so they can later be
# joined on that index.
df_impact = pd.read_csv(
    '~/Workspace/UROP/scaling-science/notebooks/Collaboration/Data/impact_top42.csv',
    index_col='name',
    encoding="UTF-8",
)
df_orgs = pd.read_csv(
    '~/Workspace/UROP/scaling-science/notebooks/Collaboration/Data/orgs_top42.csv',
    index_col='name',
    encoding="UTF-8",
)
# Only the `num_pubs` column of this table is kept.
df_num_pubs = pd.read_csv(
    '~/Workspace/UROP/scaling-science/notebooks/Collaboration/Data/pct_collaborative_pubs_top42.csv',
    index_col='name',
    encoding="UTF-8",
)[['num_pubs']]

In [10]:
# Discard every impact column whose name contains "min".
df_impact = df_impact[
    [column for column in df_impact.columns if "min" not in column]
]

# Outer joins on the shared index: a row present in any of the three tables
# is kept, with missing values where a table has no entry for it.
df = (
    df_impact
    .join(df_orgs, how='outer')
    .join(df_num_pubs, how='outer')
)

In [25]:
# Keep rows with at least 50 in `num_pubs` and at most 1000 in `orgs`.
# Rows with a missing value in either column fail the comparison and are dropped.
print("Total: %d authors" % len(df))
df_trimmed = df[(df['num_pubs'] >= 50) & (df['orgs'] <= 1000)]
print("Remaining after trimming: %d authors" % len(df_trimmed))

Total: 1965872 authors
Remaining after trimming: 525543 authors


In [26]:
def top_100_to_csv(trimmed_df, cols, name, num, to_csv = False):
    """Rank the index labels of ``trimmed_df`` by each column in ``cols``.

    For every column, rows with a non-null value are sorted in descending
    order and the first ``num`` are kept. A column with fewer than ``num``
    non-null rows yields a shorter ranking; the missing ranks show up as NaN
    in the result instead of raising a length-mismatch error.

    NOTE: a later cell of this notebook defines another function under this
    same name; once that cell has run, it replaces this definition.

    Args:
        trimmed_df: DataFrame whose index holds the labels to rank.
        cols: Iterable of column names of ``trimmed_df`` to rank by.
        name: Label inserted into the CSV file name when ``to_csv`` is true.
        num: Maximum number of ranks kept per column; also part of the CSV
            file name.
        to_csv: When true, the ranking is also written to
            ``{name}_rank_{num}.csv`` in the notebook's Data directory.

    Returns:
        DataFrame indexed by 1-based rank with two columns per ranked column:
        ``<col>`` (index labels) and ``<col>_val`` (the corresponding values).
    """
    ranked = dict()
    for col in cols:
        has_value = pd.notna(trimmed_df[col])
        top_rows = trimmed_df[has_value][[col]].sort_values(col, ascending = False).head(num)
        # Size the rank index from what was actually found, not from `num`.
        ranks = list(range(1, len(top_rows) + 1))
        ranked[col] = pd.Series(top_rows.index.tolist(), index=ranks)
        ranked[col+'_val'] = pd.Series(top_rows[col].tolist(), index=ranks)
    df_rank = pd.DataFrame(ranked)
    if to_csv:
        df_rank.to_csv('~/Workspace/UROP/scaling-science/notebooks/Collaboration/Data/{}_rank_{}.csv'.format(name, num), index = False, encoding = "UTF-8")
    return df_rank

In [56]:
# Rank on every impact column, keeping up to 1000 ranks per column.
# Nothing is written to disk (to_csv is off).
df_impact_rank = top_100_to_csv(
    trimmed_df=df_trimmed,
    cols=df_impact.columns,
    name='impact',
    num=1000,
    to_csv=False,
)
df_impact_rank

Unnamed: 0,avg_PR,avg_PR_val,max_PR,max_PR_val,median_PR,median_PR_val,avg_AR,avg_AR_val,max_AR,max_AR_val,median_AR,median_AR_val
1,Ulrich K. Laemmli,35.501431,Joseph Sambrook,4942.389210,Patricia M. McNamara,3.531750,Ulrich K. Laemmli,18.632654,Oliver H. Lowry,2160.619064,Patricia M. McNamara,1.520226
2,Stephen P. Timoshenko,24.433140,Tom Maniatis,4942.389210,Thomas R. Dawber,2.912980,Stephen F. Altschul,12.874065,Tom Maniatis,1502.146478,C Redmond,1.185122
3,R. E. Kalman,23.923494,Oliver H. Lowry,4713.407514,Tavia Gordon,2.518032,David J. Lipman,12.561464,Joseph Sambrook,1502.146478,W. M. Cowan,1.163485
4,Stephen F. Altschul,23.041292,Ulrich K. Laemmli,2664.815842,Frank W. Foote,2.358066,Piotr Chomczynski,9.520731,Ulrich K. Laemmli,1408.893785,Robert T. DeBoy,1.158474
5,David J. Lipman,22.622525,Lars Hedin,2039.665534,C Redmond,2.283521,Oliver H. Lowry,8.934023,Paul Meier,600.372834,Robert J. Dodson,1.119536
6,S.M. Sze,22.118643,Donald A. McQuarrie,1472.552308,Parkhurst A. Shore,2.244719,Tom Maniatis,7.046123,Donald A. McQuarrie,450.608741,Hoda Khouri,1.116985
7,Tom Maniatis,20.387078,S.M. Sze,1429.941927,J. M. Dziedzic,2.216630,Brian P. Flannery,6.296602,Piotr Chomczynski,422.102206,Tavia Gordon,1.105510
8,Oliver H. Lowry,19.520199,Paul Meier,1335.049007,Stephen W. Kuffler,2.047200,Stephen P. Timoshenko,6.017492,Nicoletta Sacchi,422.102206,Thomas R. Dawber,1.097980
9,Piotr Chomczynski,17.288263,Linus Pauling,1062.882866,W. M. Cowan,1.989251,Mark S. Guyer,5.915697,Marshal F. Folstein,383.737923,Wallace V. Friesen,1.038105
10,Joseph Sambrook,17.173986,David S. Johnson,1062.522355,Arthur Purdy Stout,1.968363,Kary B. Mullis,5.502184,Paul R. McHugh,383.737923,Ellen E. Walters,1.015886


In [60]:
# Names that appear within the first 500 ranks of BOTH `median_PR` and
# `median_AR`, listed in `median_PR` rank order.
top_500 = df_impact_rank.head(500)
# Built once as a set so each membership test is O(1) instead of a list scan.
top_median_ar = set(top_500['median_AR'])
top_profs = [p for p in top_500['median_PR'] if p in top_median_ar]

# Drop names containing an apostrophe. Presumably this is because the list is
# later interpolated with str() into a double-quoted Cypher string: Python
# renders such names with double quotes, which would end that string early.
top_profs = [p for p in top_profs if "'" not in p]
print(len(top_profs))

328


### Young Profs Rank

In [67]:
# Impact deweighted by num_authors: every publication's pageRank_2018 is scaled
# by avg_num_authors / (avg_num_authors + q.num_authors) before aggregation.
# NOTE(review): the value presumably comes from the avg(q.num_authors) export
# in the "Misc queries" section -- confirm before changing it.
avg_num_authors = 14.6906612

# Passed as a format argument because literal braces inside the template would
# be read as str.format fields.
export_config = "{batchSize:100, iterateList:true, parallel:true}"

# The query, as written:
#   1. takes Top42Author nodes with last_author_yr > 2013;
#   2. keeps those with at least one first-authored publication in a top-42
#      venue, dated before last_author_yr, co-authored with a name in
#      `top_profs` (collected as top_collabs);
#   3. aggregates their last-authored top-42 publications dated from
#      last_author_yr onward into num_pubs / avg_PR / max_PR / median_PR;
#   4. exports the rows to /data/csv/PR_Early_Profs_2.csv.
# `avg_num_authors` is interpolated everywhere the constant is needed so the
# Python variable and the query cannot drift apart.
query = """
call apoc.export.csv.query(
"MATCH (a:Top42Author)
WHERE a.last_author_yr > 2013

MATCH (a)-[r:AUTHORED]->(q:Quanta)<-[:AUTHORED]-(b:Author)
WHERE q.venue in {venues}
    AND q.year < a.last_author_yr
    AND r.is_first_author
    AND b.name in {top_profs}
WITH a, COUNT(q) AS num_top_collabs, COLLECT(DISTINCT b.name) as top_collabs
WHERE num_top_collabs > 0

MATCH (a)-[r:AUTHORED]->(q:Quanta)
WHERE q.venue in {venues}
    AND a.last_author_yr <= q.year
    AND r.is_last_author

RETURN a.name AS name,
    top_collabs,
    COUNT(q) AS num_pubs,
    SUM({avg} * q.pageRank_2018 / ({avg} + q.num_authors)) / COUNT(q) AS avg_PR,
    MAX({avg} * q.pageRank_2018 / ({avg} + q.num_authors)) AS max_PR,
    percentileCont(({avg} * q.pageRank_2018 / ({avg} + q.num_authors)), 0.5) AS median_PR"
    , "/data/csv/PR_Early_Profs_2.csv", {config})
""".format(
    venues=str(top_42),
    top_profs=str(top_profs),
    avg=avg_num_authors,
    config=export_config,
)
df_pr = query_to_df(query, graph)

Starting query... Done (0.01 minutes).


In [None]:
import pandas as pd

# Load the per-author PageRank table, indexed by its `name` column.
# NOTE(review): presumably a local copy of the PR_Early_Profs_2.csv file that
# the export query writes under /data/csv on the database host -- confirm.
df_PR = pd.read_csv(
    '~/scaling-science/data/magone/PR_Early_Profs_2.csv',
    index_col='name',
    encoding="UTF-8",
)

In [None]:
# Keep only rows with at least 3 in `num_pubs`; report the size before and after.
print("Total: %d authors" % len(df_PR))
df_PR_trim = df_PR.loc[df_PR['num_pubs'] >= 3]
print("Remaining after trimming: %d authors" % len(df_PR_trim))

In [None]:
def top_100_to_csv(trimmed_df, cols, name, num, to_csv = False):
    """Rank the index labels of ``trimmed_df`` by each column, keeping ``top_collabs``.

    Every column in ``cols`` except ``'top_collabs'`` is sorted in descending
    order and its first ``num`` rows are kept, together with the
    ``top_collabs`` value of each of those rows. When ``trimmed_df`` has fewer
    than ``num`` rows the ranking is simply shorter instead of raising a
    length-mismatch error.

    NOTE: this reuses the name of a function defined in an earlier cell of
    this notebook and replaces it once this cell has run.

    Args:
        trimmed_df: DataFrame whose index holds the labels to rank; it must
            contain a ``top_collabs`` column.
        cols: Iterable of column names; ``'top_collabs'`` itself is skipped.
        name: Label inserted into the CSV file name when ``to_csv`` is true.
        num: Maximum number of ranks kept per column.
        to_csv: When true, the ranking is also written to
            ``{name}_Early_Profs_rank_2.csv``.

    Returns:
        DataFrame indexed by 1-based rank with three columns per ranked
        column: ``<col>`` (index labels), ``<col>_val`` (values) and
        ``<col>_top_collabs`` (the matching ``top_collabs`` entries).
    """
    ranked = dict()
    for col in cols:
        if col == 'top_collabs':
            continue
        top_rows = trimmed_df[[col, 'top_collabs']].sort_values(col, ascending = False).head(num)
        # Size the rank index from what was actually found, not from `num`.
        ranks = list(range(1, len(top_rows) + 1))
        ranked[col] = pd.Series(top_rows.index.tolist(), index=ranks)
        ranked[col+'_val'] = pd.Series(top_rows[col].tolist(), index=ranks)
        ranked[col+'_top_collabs'] = pd.Series(top_rows['top_collabs'].tolist(), index=ranks)
    df_rank = pd.DataFrame(ranked)
    if to_csv:
        df_rank.to_csv('~/scaling-science/data/magone/{}_Early_Profs_rank_2.csv'.format(name), index = False, encoding = "UTF-8")
    return df_rank

In [None]:
# Rank every row left after trimming: `num` is the number of remaining rows.
# Nothing is written to disk (to_csv is off).
num = len(df_PR_trim)
top_100_to_csv(
    trimmed_df=df_PR_trim,
    cols=df_PR.columns,
    name='PR',
    num=num,
    to_csv=False,
)

### Misc queries

In [None]:
# Step 1: store on each Quanta node, as `num_authors`, the number of distinct
# Top42Author nodes connected to it by an AUTHORED relationship (only
# Top42Author nodes are counted, in either relationship direction).
query = """call apoc.periodic.iterate(
    "MATCH (b:Top42Author)-[:AUTHORED]-(pub:Quanta)
    RETURN pub, COUNT(DISTINCT b) as num_authors",
    "SET pub.num_authors = num_authors", {batchSize:10000, iterateList:true, parallel:true})
    """
df = query_to_df(query, graph)

# Step 2: average num_authors over top-42 venues, exported to
# /data/csv/avg_num_authors.csv. The venue list comes from `top_42` so this
# query cannot drift from the other queries that filter on it.
# NOTE(review): the MATCH yields one row per (author, publication) pair, so a
# publication with more Top42Author authors contributes more rows to the
# average -- confirm this weighting is intended.
# `top_42` must not contain names with an apostrophe: str() would then switch
# to double quotes and terminate the double-quoted Cypher string early.
query = """
call apoc.export.csv.query(
"MATCH (a:Top42Author)-[:AUTHORED]->(q:Quanta)
    WHERE q.venue in {venues}
    RETURN avg(q.num_authors)"
, "/data/csv/avg_num_authors.csv", {config})
    """.format(
    venues=str(top_42),
    # Passed as an argument because literal braces would be read as format fields.
    config="{batchSize:10000, iterateList:true, parallel:true}",
)
df = query_to_df(query, graph)
df

Starting query... 