In [1]:
import json
import math
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
from py2neo import Graph, Node, Relationship
from scipy import stats

%matplotlib inline

In [2]:
def query_to_df(query, graph):
    """Run a Cypher query and return its result as a pandas DataFrame.

    Prints a start marker before the query is sent and the wall-clock
    duration (in minutes) once the result has been materialised.

    Parameters
    ----------
    query : str
        Cypher statement passed to ``graph.run``.
    graph : object
        Connection exposing ``run(query)`` whose result offers
        ``to_data_frame()`` (a py2neo ``Graph`` in this notebook).

    Returns
    -------
    pandas.DataFrame
        Whatever ``graph.run(query).to_data_frame()`` produces.
    """
    print("Starting query...", end=" ")
    started_at = time.time()
    result_frame = graph.run(query).to_data_frame()
    elapsed_minutes = (time.time() - started_at) / 60
    print("Done ({:.2f} minutes).".format(elapsed_minutes))
    return result_frame

In [3]:
# Connection settings may be supplied through the environment so credentials do
# not have to live in the notebook. The fallbacks are the values this notebook
# has always used, so it keeps working unchanged when the variables are unset.
# TODO: drop the fallback password once NEO4J_PASSWORD is configured everywhere.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://matlaber5.media.mit.edu:7687")
neo4j_user = os.environ.get("NEO4J_USER", 'neo4j')
neo4j_password = os.environ.get("NEO4J_PASSWORD", 'myneo')
graph = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Read the store counters once so both printed numbers come from the same
# lookup instead of accessing `primitive_counts` twice.
primitive_counts = graph.database.primitive_counts
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    primitive_counts['NumberOfNodeIdsInUse'],
    primitive_counts['NumberOfRelationshipIdsInUse']))

Connected to graph database with 278,432,359 nodes and 1,844,501,832 relationships!


In [4]:
# Journal (venue) name lists of increasing size; the strings are the venue
# names used when filtering papers in the queries of this notebook.
top_5 = [
    'Cell',
    'Nature',
    'Nature Biotechnology',
    'Proceedings of the National Academy of Sciences of the United States of America',
    'Science',
]

# The ten-venue list is the five above followed by five more.
top_10 = top_5 + [
    'Journal of the American Chemical Society',
    'JAMA',
    'The New England Journal of Medicine',
    'Nature Genetics',
    'Neuron',
]

top_42 = [
    'Angewandte Chemie',
    'Blood',
    'Cancer Cell',
    'Cancer Discovery',
    'Cancer Research',
    'Cell',
    'Cell Host & Microbe',
    'Cell Metabolism',
    'Cell Stem Cell',
    'Chemistry & Biology',
    'The EMBO Journal',
    'Genes & Development',
    'Immunity',
    'Journal of Neurology',
    'Journal of the American Chemical Society',
    'JAMA',
    'Journal of Biological Chemistry',
    'Journal of Cell Biology',
    'Journal of Clinical Investigation',
    'Journal of Experimental Medicine',
    'Journal of Medicinal Chemistry',
    'The Lancet',
    'Nature Cell Biology',
    'Nature Chemical Biology',
    'Nature Chemistry',
    'Nature Medicine',
    'Nature Methods',
    'Nature',
    'Nature Biotechnology',
    'The New England Journal of Medicine',
    'Neuron',
    'Nature Genetics',
    'Nature Immunology',
    'Nature Neuroscience',
    'Nature Structural & Molecular Biology',
    'PLOS Biology',
    'PLOS Genetics',
    'PLOS Pathogens',
    'Proceedings of the National Academy of Sciences of the United States of America',
    'Science Signaling',
    'Science Translational Medicine',
    'Science',
]

In [28]:
# Disruption
# Per-author summary of q.Disruption over the papers an author published in the
# top-42 venues with last_author_yr <= q.year <= last_author_yr + 5, restricted
# to papers that carry a Disruption property. The server writes the rows to
# /data/csv/Disrupt_Early_Profs.csv through apoc.export.csv.query.
#
# The venue list is rendered from the shared `top_42` list rather than repeated
# inline, so the filter cannot drift from the notebook's definition. Names are
# wrapped in single quotes because the statement is itself a double-quoted
# string argument of the APOC call; a quote inside a name would break it.
assert not any("'" in venue or '"' in venue for venue in top_42), "venue names must not contain quotes"
venue_list = "[" + ",".join("'{}'".format(venue) for venue in top_42) + "]"

# Doubled braces are literal braces for str.format (the APOC config map).
query = """
call apoc.export.csv.query(
"MATCH (a:Top42Author)
MATCH (a)-[:AUTHORED]->(q:Quanta)
WHERE q.venue in {venues}
     AND exists(q.Disruption) AND a.last_author_yr <= q.year AND q.year <= a.last_author_yr + 5
WITH a,
    COUNT(q) AS num_disrupt,
    SUM(q.Disruption) AS tot_disrupt,
    MIN(q.Disruption) AS min_disrupt,
    MAX(q.Disruption) AS max_disrupt,
    percentileCont(q.Disruption, 0.5) AS median_disrupt
RETURN a.last_author_yr AS last_author_yr,
    a.name AS name,
    num_disrupt,
    toFloat(tot_disrupt)/num_disrupt AS avg_disrupt,
    min_disrupt,
    max_disrupt,
    median_disrupt"
, "/data/csv/Disrupt_Early_Profs.csv", {{batchSize:100, iterateList:true, parallel:true}})
""".format(venues=venue_list)

# NOTE(review): the statement is an export call, so the frame returned here is
# presumably the export summary yielded by APOC, not the per-author rows (those
# go to the CSV path above) -- confirm before using df_disruption as data.
df_disruption = query_to_df(query, graph)
# df_disruption.to_csv('~/Workspace/UROP/scaling-science/notebooks/Collaboration/Data/Disrupt_Early_Profs.csv', index = False, encoding = "UTF-8")

Starting query... Done (1.70 minutes).


In [38]:
# Collaboration
# For every pair of Top42Author nodes (a, b) that share a paper q in the top-42
# venues -- with both a.last_author_yr and b.last_author_yr <= q.year and
# q.year <= a.last_author_yr + 5 -- compute a weight 1 / f(avg_degree + avg(q.num_profs_v2))
# for f in {identity, sqrt, square, exp}. Per (name, last_author_yr) the query
# then returns the number of such co-authors and the weight sums rescaled by
# f(avg_degree). Rows are written server-side to /data/csv/Collab_Early_Profs.csv.
#
# NOTE(review): the RETURN groups by a.name / a.last_author_yr rather than by the
# node `a`, so distinct authors sharing a name and year would be merged -- confirm
# this is intended.

# Damping offset used throughout the query. Previously this variable was
# assigned but the literal was repeated inside the query text; it is now the
# single source of the value.
avg_degree = 11.327052

# Render the venue filter from the shared `top_42` list instead of an inline
# copy. Names are single-quoted because the statement is a double-quoted string
# argument of the APOC call; a quote inside a name would break it.
assert not any("'" in venue or '"' in venue for venue in top_42), "venue names must not contain quotes"
venue_list = "[" + ",".join("'{}'".format(venue) for venue in top_42) + "]"

# Doubled braces are literal braces for str.format (the APOC config map).
query = """
call apoc.export.csv.query(
"MATCH (a:Top42Author)-[:AUTHORED]->(q:Quanta)<-[:AUTHORED]-(b:Top42Author)
    WHERE q.venue in {venues}
        AND a.last_author_yr <= q.year AND b.last_author_yr <= q.year AND q.year <= a.last_author_yr + 5
    WITH a, b,
        (toFloat(1) / ({k} + avg(q.num_profs_v2))) as u_lin,
        (toFloat(1) / sqrt({k} + avg(q.num_profs_v2))) as u_sqrt,
        (toFloat(1) / ({k} + avg(q.num_profs_v2))^2) as u_square,
        (toFloat(1) / exp({k} + avg(q.num_profs_v2))) as u_exp

    RETURN a.name as name,
        a.last_author_yr as last_author_yr,
        count(u_lin) as unique_coauthor_profs,
        {k}*sum(u_lin) as unique_coauthor_profs_lin_damp,
        sqrt({k})*sum(u_sqrt) as unique_coauthor_profs_sqrt_damp,
        {k}^2*sum(u_square) as unique_coauthor_profs_square_damp,
        exp({k})*sum(u_exp) as unique_coauthor_profs_exp_damp"
        , "/data/csv/Collab_Early_Profs.csv", {{batchSize:100, iterateList:true, parallel:true}})
    """.format(venues=venue_list, k=avg_degree)

# NOTE(review): the statement is an export call, so the frame returned here is
# presumably the export summary yielded by APOC, not the per-author rows --
# confirm before using df_collab as data.
df_collab = query_to_df(query, graph)
# df_collab.to_csv('~/Workspace/UROP/scaling-science/notebooks/Collaboration/Data/Collab_Early_Profs.csv', index = False, encoding = "UTF-8")

Starting query... Done (0.00 minutes).


In [42]:
# Impact
def _rank_export_query(rank_property, suffix, venues):
    """Build the APOC CSV-export statement summarising one rank property.

    The generated query selects, for each Top42Author ``a``, the papers ``q``
    in ``venues`` with ``a.last_author_yr <= q.year <= a.last_author_yr + 5``
    and returns the author name plus the mean (sum / paper count), minimum,
    maximum and median of ``q.<rank_property>``. The export target is
    ``/data/csv/<suffix>_Early_Profs.csv``.

    Parameters
    ----------
    rank_property : str
        Name of the numeric property on ``Quanta`` nodes to summarise.
    suffix : str
        Tag appended to the output column names (``avg_<suffix>`` etc.) and
        used as the prefix of the exported file name.
    venues : list of str
        Venue names for the ``q.venue in [...]`` filter.

    Returns
    -------
    str
        The complete ``call apoc.export.csv.query(...)`` statement.
    """
    # Names are single-quoted because the statement is a double-quoted string
    # argument of the APOC call; a quote inside a name would break it.
    assert not any("'" in venue or '"' in venue for venue in venues), "venue names must not contain quotes"
    venue_list = "[" + ",".join("'{}'".format(venue) for venue in venues) + "]"

    # NOTE(review): COUNT(q) counts every matched paper while SUM skips papers
    # whose rank property is missing, so the average treats missing ranks as 0.
    # The Disruption query filters with exists(...) instead -- confirm intent.
    # Doubled braces are literal braces for str.format (the APOC config map).
    return """
call apoc.export.csv.query(
"MATCH (a:Top42Author)-[:AUTHORED]->(q:Quanta)
WHERE q.venue in {venues}
    AND a.last_author_yr <= q.year AND q.year <= a.last_author_yr + 5

WITH a, COUNT(q) AS num_pubs,
    SUM(q.{prop}) AS tot_rank,
    MIN(q.{prop}) AS min_rank,
    MAX(q.{prop}) AS max_rank,
    percentileCont(q.{prop}, 0.5) AS median_rank

RETURN a.name AS name,
    toFloat(tot_rank)/num_pubs AS avg_{suffix},
    min_rank AS min_{suffix},
    max_rank AS max_{suffix},
    median_rank AS median_{suffix}"
    , "/data/csv/{suffix}_Early_Profs.csv", {{batchSize:100, iterateList:true, parallel:true}})

""".format(venues=venue_list, prop=rank_property, suffix=suffix)


# The two impact exports differ only in the rank property and column suffix,
# so both are produced by the same builder using the shared `top_42` list.
# NOTE(review): each statement is an export call, so the frames returned here
# are presumably APOC export summaries, not the per-author rows -- confirm.
query = _rank_export_query("pageRank_2018", "PR", top_42)
df_pr = query_to_df(query, graph)

query = _rank_export_query("articleRank2018", "AR", top_42)
df_ar = query_to_df(query, graph)

Starting query... Done (0.00 minutes).
Starting query... Done (0.00 minutes).


In [None]:
# Persist the two rank tables, reload them keyed by author name, and combine
# them side by side into a single impact table.
data_dir = '~/Workspace/UROP/scaling-science/notebooks/Collaboration/Data/'

# NOTE(review): df_pr / df_ar come from apoc.export.csv.query calls; if those
# frames hold the export summary rather than the per-author rows, these writes
# replace any exported CSV copied to this folder and the reads below will fail
# for lack of a 'name' column -- confirm before running.
df_pr.to_csv(data_dir + 'PR_Early_Profs.csv', index = False, encoding = "UTF-8")
df_ar.to_csv(data_dir + 'AR_Early_Profs.csv', index = False, encoding = "UTF-8")

df_pr = pd.read_csv(data_dir + 'PR_Early_Profs.csv', index_col = 'name')
df_ar = pd.read_csv(data_dir + 'AR_Early_Profs.csv', index_col = 'name')

# Outer join keeps authors present in only one of the two tables.
# NOTE(review): the join key is the author name; repeated names would multiply
# rows -- verify names are unique in both files.
df_impact = df_pr.join(df_ar, how = 'outer')

# The author name is the index of df_impact, so it must be written out;
# with index=False the combined file would contain the statistics but no
# column identifying which author each row belongs to.
df_impact.to_csv(data_dir + 'Impact_Early_Profs.csv', index = True, index_label = 'name', encoding = "UTF-8")