In [1]:
import json
import math
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
from py2neo import Graph, Node, Relationship
from scipy import stats

%matplotlib inline

In [2]:
def query_to_df(query, graph):
    """Run a Cypher query and return the result as a DataFrame.

    Prints a start marker before the query is sent and, once the result has
    been materialised, the wall-clock duration in minutes.

    Args:
        query: Cypher query text passed to ``graph.run``.
        graph: Object exposing ``run(query)`` whose result provides
            ``to_data_frame()`` (a py2neo ``Graph`` in this notebook).

    Returns:
        Whatever ``graph.run(query).to_data_frame()`` returns.
    """
    print("Starting query...", end=" ")
    started_at = time.time()
    cursor = graph.run(query)
    result_frame = cursor.to_data_frame()
    elapsed_minutes = (time.time() - started_at) / 60
    print("Done ({:.2f} minutes).".format(elapsed_minutes))
    return result_frame

In [3]:
# Connect to the Neo4j instance that every query below runs against.
# Connection settings are read from the environment (NEO4J_URI, NEO4J_USER,
# NEO4J_PASSWORD) so they do not have to live in the notebook.
# NOTE(review): the fallbacks are the values that used to be hard-coded here,
# kept only so existing runs keep working; rotate the password and drop the
# default once the environment variables are set everywhere.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://matlaber5.media.mit.edu:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "myneo")
graph = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Read the counts once and reuse them for both numbers in the message
# (presumably each access to primitive_counts asks the server again — confirm).
primitive_counts = graph.database.primitive_counts
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    primitive_counts['NumberOfNodeIdsInUse'],
    primitive_counts['NumberOfRelationshipIdsInUse']))

Connected to graph database with 278,432,359 nodes and 1,844,501,832 relationships!


In [None]:
# Augment nodes with pct_last_author
# For each Top42Author, the batched statement divides the number of the
# author's AUTHORED relationships to Quanta nodes that have r.is_last_author
# set by the total number of such relationships, and stores the result on the
# author node as pct_last_author.
# This cell only builds the query string; nothing in it sends the query to
# the database.
query = """call apoc.periodic.iterate(
"MATCH (a:Top42Author) RETURN a",
"MATCH (a)-[r:AUTHORED]->(:Quanta)
WITH a, toFloat(SUM(CASE WHEN r.is_last_author THEN 1 ELSE 0 END))/COUNT(r) as pct_last_author
SET a.pct_last_author = pct_last_author", {batchSize:100, parallel:true})
"""

In [None]:
# Augment quanta with num_profs, num_authors
# First statement: for every Quanta linked by AUTHORED to Top42Author nodes
# whose pct_last_author is above .25, store the number of distinct such
# authors as pub.num_profs.
# NOTE(review): `query` is reassigned below before anything runs it, and this
# cell never executes either string, so as written only the second statement
# is left in `query` afterwards.
query = """call apoc.periodic.iterate(
    "MATCH (b:Top42Author)-[:AUTHORED]-(pub:Quanta)
    WHERE b.pct_last_author > .25
    RETURN pub, COUNT(DISTINCT b) as num_profs",
    "SET pub.num_profs = num_profs", {batchSize:1000, parallel:true})
    """
# Second statement: same shape without the pct_last_author filter; stores the
# number of distinct Top42Author nodes per Quanta as pub.num_authors (only
# nodes carrying the Top42Author label are counted).
query = """call apoc.periodic.iterate(
    "MATCH (b:Top42Author)-[:AUTHORED]-(pub:Quanta)
    RETURN pub, COUNT(DISTINCT b) as num_authors",
    "SET pub.num_authors = num_authors", {batchSize:1000, parallel:true})
    """

In [None]:
# All versions of pct_collaborative_pubs for Top42Authors
# Step 1: over an author's Quanta with num_profs > 1 and num_authors > 1, sum
#         five per-publication weights: 1/(num_profs-1), 1/sqrt(num_profs-1),
#         1/sqrt(num_authors-1), 1/(num_authors-1) and a plain 1.
# Step 2: re-match all of the author's Quanta (the earlier `q` is no longer in
#         scope after the WITH) to get num_pubs, then divide each sum by it.
# Authors with no Quanta passing the step-1 filter produce no row, because the
# first MATCH is not optional.
# The execution and CSV export at the bottom of the cell are commented out.
query = """
MATCH (a:Top42Author)-[:AUTHORED]->(q:Quanta)
	WHERE q.num_profs > 1 AND q.num_authors > 1 
    WITH a, 
        SUM(toFloat(1)/(q.num_profs - 1)) AS collab_linprof,
        SUM(toFloat(1)/(sqrt(q.num_profs - 1))) AS collab_sqrtprof,
        SUM(toFloat(1)/(sqrt(q.num_authors - 1))) AS collab_sqrtauth,
        SUM(toFloat(1)/(q.num_authors - 1)) AS collab_linauth,
        SUM(toFloat(1)) AS collab_unweighted
    MATCH (a)-[:AUTHORED]-(q:Quanta)
    WITH a, COUNT(q) AS num_pubs, collab_linprof, collab_sqrtauth, collab_linauth, collab_unweighted, collab_sqrtprof
    RETURN a.name AS name, 
        num_pubs,
        collab_linprof/num_pubs AS pct_collab_linprof,
        collab_sqrtprof/num_pubs AS pct_collab_sqrtprof,
        collab_unweighted/num_pubs AS pct_collab_unweighted,
        collab_sqrtauth/num_pubs AS pct_collab_sqrtauth,
        collab_linauth/num_pubs AS pct_collab_linauth
    """
# df = query_to_df(query, graph)
# df.to_csv('C:\\Users\\Brend\\Downloads\\pct_collaborative_pubs_top42.csv', index = False, encoding = "UTF-8")

In [None]:
#Ratio of cross-cluster and intra-cluster edges to total edges
# den (TotalConns): number of rows matched for u's COAUTHOR relationships to
#   other Top42Author nodes.
# IntraClusterConns: the same count restricted to neighbours whose last
#   `louvain` entry equals u's last `louvain` entry.
# IntraClusterRatio = IntraClusterConns / den; InterClusterRatio is its
#   complement, and InterClusterConns is den - IntraClusterConns.
# NOTE(review): the first COAUTHOR MATCH is not optional, so authors without
# any such relationship never reach the `den=0` branch — confirm whether they
# should appear in the output with zeros.
# The execution and CSV export at the bottom of the cell are commented out.
query = """
MATCH (u:Top42Author)
MATCH (u)-[e:COAUTHOR]-(b:Top42Author)
WITH u, COUNT(b) AS k_u
WITH u, k_u AS den

// intracommunity edges
OPTIONAL MATCH (u)-[e:COAUTHOR]-(b:Top42Author)
WHERE last(b.louvain) = last(u.louvain)
WITH den, u, COUNT(b) AS k_intra_u
WITH den, u, k_intra_u AS IntraClusterConns, CASE WHEN den=0 THEN 0 ELSE toFloat(k_intra_u)/den END AS IntraClusterRatio
WITH den, u, IntraClusterRatio, IntraClusterConns, 1.0 - IntraClusterRatio AS InterClusterRatio

RETURN u.name AS name, IntraClusterRatio, InterClusterRatio, den AS TotalConns, IntraClusterConns, den - IntraClusterConns AS InterClusterConns
"""

# df_clusters = query_to_df(query, graph)
# df_clusters.to_csv('C:\\Users\\Brend\\Downloads\\clusters_top42.csv', index = False, encoding = "UTF-8")

In [None]:
# Author focus
# den: the COAUTHOR relationships from u to Top42Author nodes are grouped by
#   e.strength; den is sum(strength * group size) / sum(group size), i.e. the
#   count-weighted mean strength over those relationships.
# IntraCommunityFocus: the same weighted mean, taken over relationships whose
#   neighbour has the same last `louvain` entry as u, divided by den.
# InterCommunityFocus: likewise for neighbours whose last `louvain` entry
#   differs from u's.
# A missing relationship from an OPTIONAL MATCH contributes weight 0, and an
# empty group yields a numerator of 0.
# NOTE(review): the denominator matches neighbours labelled :Top42Author while
# both numerators match neighbours labelled :NatureAuthor — confirm that the
# label difference is intended.
# The execution and CSV export at the bottom of the cell are commented out.
query = """
MATCH (u:Top42Author)
MATCH (u)-[e:COAUTHOR]-(b:Top42Author)
WITH e.strength AS weight, u, COUNT(b) AS k_u
WITH u, toFloat(sum(weight*k_u))/sum(k_u) AS den

OPTIONAL MATCH (u)-[e:COAUTHOR]-(b:NatureAuthor)
WHERE last(b.louvain) = last(u.louvain)
WITH den, u, COUNT(b) AS k_intra_u, CASE WHEN e IS NULL THEN 0 ELSE e.strength END AS weight
WITH den, u, CASE WHEN sum(k_intra_u)=0 THEN 0 ELSE toFloat(sum(weight*k_intra_u))/sum(k_intra_u) END AS num
WITH den, u, CASE WHEN den=0 THEN 0 ELSE num/den END AS NatureIntraCommunityFocus
WITH den, u, NatureIntraCommunityFocus

OPTIONAL MATCH (u)-[e:COAUTHOR]-(b:NatureAuthor)
WHERE last(b.louvain) <> last(u.louvain)
WITH den, u, NatureIntraCommunityFocus, COUNT(b) AS k_inter_u, CASE WHEN e IS NULL THEN 0 ELSE e.strength END AS weight
WITH den, u, NatureIntraCommunityFocus, CASE WHEN sum(k_inter_u)=0 THEN 0 ELSE toFloat(sum(weight*k_inter_u))/sum(k_inter_u) END AS num
WITH u, NatureIntraCommunityFocus, CASE WHEN den=0 THEN 0 ELSE num/den END AS NatureInterCommunityFocus

RETURN u.name AS name, NatureIntraCommunityFocus AS IntraCommunityFocus, NatureInterCommunityFocus AS InterCommunityFocus
"""

# df_community_focus = query_to_df(query, graph)
# df_community_focus.to_csv('C:\\Users\\Brend\\Downloads\\community_focus_top42.csv', index = False, encoding = "UTF-8")

In [None]:
# Impact statistics per Top42Author: mean, min, max and median of the
# pageRank_2018 (PR) and articleRank2018 (AR) properties over the author's
# Quanta. Each frame is exported on its own, then both are combined into one
# "impact" table.

# PageRank statistics
query = """
MATCH (a:Top42Author)-[:AUTHORED]->(q:Quanta)
WITH a, COUNT(q) AS num_pubs, SUM(q.pageRank_2018) AS tot_rank, MIN(q.pageRank_2018) AS min_rank, MAX(q.pageRank_2018) AS max_rank, percentileCont(q.pageRank_2018, 0.5) AS median_rank 
RETURN a.name AS name, toFloat(tot_rank)/num_pubs AS avg_PR, min_rank AS min_PR, max_rank AS max_PR, median_rank AS median_PR
"""
df_pr = query_to_df(query, graph)

# ArticleRank statistics
query = """
MATCH (a:Top42Author)-[:AUTHORED]->(q:Quanta)
WITH a, COUNT(q) AS num_pubs, SUM(q.articleRank2018) AS tot_rank, MIN(q.articleRank2018) AS min_rank, MAX(q.articleRank2018) AS max_rank, percentileCont(q.articleRank2018, 0.5) AS median_rank 
RETURN a.name AS name, toFloat(tot_rank)/num_pubs AS avg_AR, min_rank AS min_AR, max_rank AS max_AR, median_rank AS median_AR
"""
df_ar = query_to_df(query, graph)

df_pr.to_csv('C:\\Users\\Brend\\Downloads\\pr_top42.csv', index = False, encoding = "UTF-8")
df_ar.to_csv('C:\\Users\\Brend\\Downloads\\ar_top42.csv', index = False, encoding = "UTF-8")

# Combine only after BOTH frames exist: the earlier version joined before
# df_ar was assigned, which fails on a fresh kernel. Both frames carry a
# `name` column, so an index-based DataFrame.join without suffixes raises on
# the overlap; merging on `name` also aligns rows by author rather than by
# row position.
# NOTE(review): assumes a.name identifies an author uniquely; duplicated names
# would be cross-matched by the merge — confirm, or return a stable id from
# both queries and merge on that instead.
df_impact = df_pr.merge(df_ar, on = 'name', how = 'outer')
# index = False matches the other exports; the merged frame's index is just a
# row counter.
df_impact.to_csv('C:\\Users\\Brend\\Downloads\\impact_top42.csv', index = False, encoding = "UTF-8")

In [None]:
# Augment nodes with last_author_yr
# COMPLETE on ML 5
# For each Top42Author, looks at the Quanta the author is linked to by an
# AUTHORED relationship with r.is_last_author set and stores min(q.year) —
# the earliest such year — on the author node as last_author_yr.
# Authors with no such relationship are left without the property, since the
# inner MATCH/WHERE yields no row for them.
# Unlike the earlier augmentation cells, this one sends the statement to the
# database via query_to_df.
query = """call apoc.periodic.iterate(
"MATCH (a:Top42Author) RETURN a",
"MATCH (a)-[r:AUTHORED]->(q:Quanta)
WHERE r.is_last_author
WITH a, min(q.year) AS last_author_yr
SET a.last_author_yr = last_author_yr", {batchSize:100, parallel:true})
"""
df = query_to_df(query, graph)

In [None]:
# Augment quanta with num_profs_v2
# COMPLETE on ML 5
# For every Quanta in the listed venues, counts the distinct Top42Author nodes
# linked by AUTHORED whose last_author_yr is not later than the publication
# year, and stores that count as pub.num_profs_v2.
# Quanta with no qualifying author are not returned by the driving query and
# therefore do not get the property.
query = """call apoc.periodic.iterate(
    "MATCH (b:Top42Author)-[:AUTHORED]-(pub:Quanta)
    WHERE pub.venue in['Angewandte Chemie','Blood','Cancer Cell','Cancer Discovery','Cancer Research','Cell','Cell Host & Microbe','Cell Metabolism','Cell Stem Cell','Chemistry & Biology','The EMBO Journal','Genes & Development','Immunity','Journal of Neurology','Journal of the American Chemical Society','JAMA','Journal of Biological Chemistry','Journal of Cell Biology','Journal of Clinical Investigation','Journal of Experimental Medicine','Journal of Medicinal Chemistry','The Lancet','Nature Cell Biology','Nature Chemical Biology','Nature Chemistry','Nature Medicine','Nature Methods','Nature','Nature Biotechnology','The New England Journal of Medicine','Neuron','Nature Genetics','Nature Immunology','Nature Neuroscience','Nature Structural & Molecular Biology','PLOS Biology','PLOS Genetics','PLOS Pathogens','Proceedings of the National Academy of Sciences of the United States of America','Science Signaling','Science Translational Medicine','Science']
        AND b.last_author_yr <= pub.year
    RETURN pub, COUNT(DISTINCT b) as num_profs",
    "SET pub.num_profs_v2 = num_profs", {batchSize:1000, parallel:true})
    """
df = query_to_df(query, graph)

Starting query... 

In [None]:
# All versions of pct_collaborative_pubs for Top42Authors
# Same computation as the num_profs-based cell, but the "prof" weights use
# q.num_profs_v2:
# Step 1: over an author's Quanta with num_profs_v2 > 1 and num_authors > 1,
#         sum 1/(num_profs_v2-1), 1/sqrt(num_profs_v2-1),
#         1/sqrt(num_authors-1), 1/(num_authors-1) and a plain 1.
# Step 2: re-match all of the author's Quanta to get num_pubs and divide each
#         sum by it.
# Authors with no Quanta passing the step-1 filter produce no row.
# The result is written to pct_collaborative_pubs_top42_v2.csv.
query = """
MATCH (a:Top42Author)-[:AUTHORED]->(q:Quanta)
	WHERE q.num_profs_v2 > 1 AND q.num_authors > 1 
    WITH a, 
        SUM(toFloat(1)/(q.num_profs_v2 - 1)) AS collab_linprof,
        SUM(toFloat(1)/(sqrt(q.num_profs_v2 - 1))) AS collab_sqrtprof,
        SUM(toFloat(1)/(sqrt(q.num_authors - 1))) AS collab_sqrtauth,
        SUM(toFloat(1)/(q.num_authors - 1)) AS collab_linauth,
        SUM(toFloat(1)) AS collab_unweighted
    MATCH (a)-[:AUTHORED]-(q:Quanta)
    WITH a, COUNT(q) AS num_pubs, collab_linprof, collab_sqrtauth, collab_linauth, collab_unweighted, collab_sqrtprof
    RETURN a.name AS name, 
        num_pubs,
        collab_linprof/num_pubs AS pct_collab_linprof,
        collab_sqrtprof/num_pubs AS pct_collab_sqrtprof,
        collab_unweighted/num_pubs AS pct_collab_unweighted,
        collab_sqrtauth/num_pubs AS pct_collab_sqrtauth,
        collab_linauth/num_pubs AS pct_collab_linauth
    """
df = query_to_df(query, graph)
df.to_csv('C:\\Users\\Brend\\Downloads\\pct_collaborative_pubs_top42_v2.csv', index = False, encoding = "UTF-8")

In [None]:
# Augment Quanta with disruption
# COMPLETE ON ML 5
# For each Quanta q in the listed venues that cites at least one other Quanta:
#   references = the Quanta that q cites
#   ij         = the Quanta that cite q
#   jk         = the Quanta that cite any of q's references
#   j = members of ij that are also in jk
#   i = members of ij that are not in j
#   k = members of jk that are not in j
# The sizes are stored as q.i, q.j, q.k and q.Disruption = (i - j)/(i + j + k).
# NOTE(review): jk is collected without DISTINCT across all references, and q
# itself cites every reference, so jk (and therefore k) appears to include q
# and to count a paper once per reference it cites — confirm this is the
# intended definition of k.
# NOTE(review): q with no outgoing CITES relationship is skipped, because the
# first MATCH in the batched statement is not optional.
query = """
call apoc.periodic.iterate(
"MATCH (q:Quanta) 
WHERE q.venue in ['Angewandte Chemie','Blood','Cancer Cell','Cancer Discovery','Cancer Research','Cell','Cell Host & Microbe','Cell Metabolism','Cell Stem Cell','Chemistry & Biology','The EMBO Journal','Genes & Development','Immunity','Journal of Neurology','Journal of the American Chemical Society','JAMA','Journal of Biological Chemistry','Journal of Cell Biology','Journal of Clinical Investigation','Journal of Experimental Medicine','Journal of Medicinal Chemistry','The Lancet','Nature Cell Biology','Nature Chemical Biology','Nature Chemistry','Nature Medicine','Nature Methods','Nature','Nature Biotechnology','The New England Journal of Medicine','Neuron','Nature Genetics','Nature Immunology','Nature Neuroscience','Nature Structural & Molecular Biology','PLOS Biology','PLOS Genetics','PLOS Pathogens','Proceedings of the National Academy of Sciences of the United States of America','Science Signaling','Science Translational Medicine','Science']
RETURN q",

"MATCH (q)-[:CITES]->(p:Quanta)
WITH q, collect(p) as references

OPTIONAL MATCH (p:Quanta)-[:CITES]->(q)
WITH q, references, collect(p) as ij

UNWIND references as ref
OPTIONAL MATCH (p:Quanta)-[r:CITES]->(ref)
WITH q, references, ij, COLLECT(p) as jk

WITH q, references, ij, jk, [p in ij where p in jk] as j
WITH q, references, ij, jk, j, [p in ij where not(p in j)] as i, [p in jk where not(p in j)] as k
WITH q, size(j) as j, size(k) as k, size(i) as i
SET q.j = j, q.i = i, q.k = k, q.Disruption = toFloat(i-j)/(i+j+k)", {batchSize:10000, parallel:true})
"""
df = query_to_df(query, graph)

In [None]:
# Recalculate Disruption
# COMPLETE ON ML 5
# Recomputes q.Disruption = (i - j)/(i + j + k) from the stored q.i, q.j and
# q.k for Quanta in the listed venues that have all three properties.
# The Cypher used to sit in this cell as bare text, which is not valid Python
# and stopped the notebook from running; it is now held in a string like the
# other queries. The cell only builds the string and does not execute it.
query = """
call apoc.periodic.iterate(
"MATCH (q:Quanta) WHERE exists(q.i) AND exists(q.j) AND exists(q.k) 
AND q.venue in ['Angewandte Chemie','Blood','Cancer Cell','Cancer Discovery','Cancer Research','Cell','Cell Host & Microbe','Cell Metabolism','Cell Stem Cell','Chemistry & Biology','The EMBO Journal','Genes & Development','Immunity','Journal of Neurology','Journal of the American Chemical Society','JAMA','Journal of Biological Chemistry','Journal of Cell Biology','Journal of Clinical Investigation','Journal of Experimental Medicine','Journal of Medicinal Chemistry','The Lancet','Nature Cell Biology','Nature Chemical Biology','Nature Chemistry','Nature Medicine','Nature Methods','Nature','Nature Biotechnology','The New England Journal of Medicine','Neuron','Nature Genetics','Nature Immunology','Nature Neuroscience','Nature Structural & Molecular Biology','PLOS Biology','PLOS Genetics','PLOS Pathogens','Proceedings of the National Academy of Sciences of the United States of America','Science Signaling','Science Translational Medicine','Science']
RETURN q",

"SET q.Disruption = toFloat(q.i-q.j)/(q.i+q.j+q.k)", {batchSize:1, parallel:true})
"""

In [None]:
# Fix for j=null
# COMPLETE ON ML 5
# For Quanta in the listed venues that have q.i and q.k but no q.j, sets
# q.Disruption = i/(i + k), i.e. the disruption formula with j taken as 0.
# The opening quotes now sit on the assignment line: `query =` followed by a
# line break is a Python syntax error. The cell only builds the string and
# does not execute it.
query = """
call apoc.periodic.iterate(
"MATCH (q:Quanta) WHERE exists(q.i) AND NOT exists(q.j) AND exists(q.k) 
AND q.venue in ['Angewandte Chemie','Blood','Cancer Cell','Cancer Discovery','Cancer Research','Cell','Cell Host & Microbe','Cell Metabolism','Cell Stem Cell','Chemistry & Biology','The EMBO Journal','Genes & Development','Immunity','Journal of Neurology','Journal of the American Chemical Society','JAMA','Journal of Biological Chemistry','Journal of Cell Biology','Journal of Clinical Investigation','Journal of Experimental Medicine','Journal of Medicinal Chemistry','The Lancet','Nature Cell Biology','Nature Chemical Biology','Nature Chemistry','Nature Medicine','Nature Methods','Nature','Nature Biotechnology','The New England Journal of Medicine','Neuron','Nature Genetics','Nature Immunology','Nature Neuroscience','Nature Structural & Molecular Biology','PLOS Biology','PLOS Genetics','PLOS Pathogens','Proceedings of the National Academy of Sciences of the United States of America','Science Signaling','Science Translational Medicine','Science']
RETURN q",

"SET q.Disruption = toFloat(q.i)/(q.i+q.k)", {batchSize:10000, parallel:true})
"""

New queries

In [5]:
# Disruption by author
# COMPLETE ON ML 5
# Per Top42Author, over the author's Quanta in the listed venues that have a
# Disruption property: the number of such Quanta (num_disrupt) and the mean,
# min, max and median (percentileCont at 0.5) of q.Disruption.
# The result is written to disrupt_top42.csv.
query = """
MATCH (a:Top42Author)-[:AUTHORED]->(q:Quanta)
WHERE exists(q.Disruption)
AND q.venue in ['Angewandte Chemie','Blood','Cancer Cell','Cancer Discovery','Cancer Research','Cell','Cell Host & Microbe','Cell Metabolism','Cell Stem Cell','Chemistry & Biology','The EMBO Journal','Genes & Development','Immunity','Journal of Neurology','Journal of the American Chemical Society','JAMA','Journal of Biological Chemistry','Journal of Cell Biology','Journal of Clinical Investigation','Journal of Experimental Medicine','Journal of Medicinal Chemistry','The Lancet','Nature Cell Biology','Nature Chemical Biology','Nature Chemistry','Nature Medicine','Nature Methods','Nature','Nature Biotechnology','The New England Journal of Medicine','Neuron','Nature Genetics','Nature Immunology','Nature Neuroscience','Nature Structural & Molecular Biology','PLOS Biology','PLOS Genetics','PLOS Pathogens','Proceedings of the National Academy of Sciences of the United States of America','Science Signaling','Science Translational Medicine','Science']
WITH a, COUNT(q) AS num_disrupt, SUM(q.Disruption) AS tot_disrupt, MIN(q.Disruption) AS min_disrupt, MAX(q.Disruption) AS max_disrupt, percentileCont(q.Disruption, 0.5) AS median_disrupt 
RETURN a.name AS name, num_disrupt, toFloat(tot_disrupt)/num_disrupt AS avg_disrupt, min_disrupt, max_disrupt, median_disrupt
"""
df_disruption = query_to_df(query, graph)
df_disruption.to_csv('C:\\Users\\Brend\\Downloads\\disrupt_top42.csv', index = False, encoding = "UTF-8")

Starting query... Done (2.19 minutes).


In [15]:
# Disruption by professor
# COMPLETE ON ML 5
# Same statistics as the per-author disruption cell, but only over Quanta
# published in or after the author's last_author_yr
# (a.last_author_yr <= q.year). Authors without last_author_yr, or without
# any qualifying Quanta, produce no row.
# The result is written to disrupt_profs_top42.csv.
query = """
MATCH (a:Top42Author)
MATCH (a)-[:AUTHORED]->(q:Quanta)
WHERE exists(q.Disruption) AND a.last_author_yr <= q.year
AND q.venue in ['Angewandte Chemie','Blood','Cancer Cell','Cancer Discovery','Cancer Research','Cell','Cell Host & Microbe','Cell Metabolism','Cell Stem Cell','Chemistry & Biology','The EMBO Journal','Genes & Development','Immunity','Journal of Neurology','Journal of the American Chemical Society','JAMA','Journal of Biological Chemistry','Journal of Cell Biology','Journal of Clinical Investigation','Journal of Experimental Medicine','Journal of Medicinal Chemistry','The Lancet','Nature Cell Biology','Nature Chemical Biology','Nature Chemistry','Nature Medicine','Nature Methods','Nature','Nature Biotechnology','The New England Journal of Medicine','Neuron','Nature Genetics','Nature Immunology','Nature Neuroscience','Nature Structural & Molecular Biology','PLOS Biology','PLOS Genetics','PLOS Pathogens','Proceedings of the National Academy of Sciences of the United States of America','Science Signaling','Science Translational Medicine','Science']
WITH a, COUNT(q) AS num_disrupt, SUM(q.Disruption) AS tot_disrupt, MIN(q.Disruption) AS min_disrupt, MAX(q.Disruption) AS max_disrupt, percentileCont(q.Disruption, 0.5) AS median_disrupt 
RETURN a.name AS name, num_disrupt, toFloat(tot_disrupt)/num_disrupt AS avg_disrupt, min_disrupt, max_disrupt, median_disrupt
"""
df_disruption_profs = query_to_df(query, graph)
df_disruption_profs.to_csv('C:\\Users\\Brend\\Downloads\\disrupt_profs_top42.csv', index = False, encoding = "UTF-8")

Starting query... Done (1.83 minutes).


In [24]:
# unique_coauthor_profs
# RUNNING ON ML 5
# Per Top42Author a: the number of distinct Top42Author nodes b that share at
# least one Quanta with a in the listed venues, where both a.last_author_yr
# and b.last_author_yr are not later than that publication's year.
# Grouping is by a.name (the only non-aggregated return value), so authors
# sharing a name would be merged into one row.
# The result is written to unique_profs_top42.csv.
query = """
MATCH (a:Top42Author)-[:AUTHORED]->(q:Quanta)<-[:AUTHORED]-(b:Top42Author)
	WHERE q.venue in ['Angewandte Chemie','Blood','Cancer Cell','Cancer Discovery','Cancer Research','Cell','Cell Host & Microbe','Cell Metabolism','Cell Stem Cell','Chemistry & Biology','The EMBO Journal','Genes & Development','Immunity','Journal of Neurology','Journal of the American Chemical Society','JAMA','Journal of Biological Chemistry','Journal of Cell Biology','Journal of Clinical Investigation','Journal of Experimental Medicine','Journal of Medicinal Chemistry','The Lancet','Nature Cell Biology','Nature Chemical Biology','Nature Chemistry','Nature Medicine','Nature Methods','Nature','Nature Biotechnology','The New England Journal of Medicine','Neuron','Nature Genetics','Nature Immunology','Nature Neuroscience','Nature Structural & Molecular Biology','PLOS Biology','PLOS Genetics','PLOS Pathogens','Proceedings of the National Academy of Sciences of the United States of America','Science Signaling','Science Translational Medicine','Science']
        AND a.last_author_yr <= q.year AND b.last_author_yr <= q.year
    RETURN a.name AS name, count(distinct b) as unique_coauthor_profs
    """
df = query_to_df(query, graph)
df.to_csv('C:\\Users\\Brend\\Downloads\\unique_profs_top42.csv', index = False, encoding = "UTF-8")

Starting query... Done (9.53 minutes).


In [7]:
# Average num_profs_v2
# TO RUN ON ML 5
# Averages q.num_profs_v2 over every (Top42Author, Quanta) AUTHORED pair in
# the listed venues. The average is taken over pairs, not over distinct
# Quanta, so a publication contributes once per Top42Author linked to it.
# The bare `df` at the end displays the single-row result in the notebook.
query = """
MATCH (a:Top42Author)-[:AUTHORED]->(q:Quanta)
	WHERE q.venue in ['Angewandte Chemie','Blood','Cancer Cell','Cancer Discovery','Cancer Research','Cell','Cell Host & Microbe','Cell Metabolism','Cell Stem Cell','Chemistry & Biology','The EMBO Journal','Genes & Development','Immunity','Journal of Neurology','Journal of the American Chemical Society','JAMA','Journal of Biological Chemistry','Journal of Cell Biology','Journal of Clinical Investigation','Journal of Experimental Medicine','Journal of Medicinal Chemistry','The Lancet','Nature Cell Biology','Nature Chemical Biology','Nature Chemistry','Nature Medicine','Nature Methods','Nature','Nature Biotechnology','The New England Journal of Medicine','Neuron','Nature Genetics','Nature Immunology','Nature Neuroscience','Nature Structural & Molecular Biology','PLOS Biology','PLOS Genetics','PLOS Pathogens','Proceedings of the National Academy of Sciences of the United States of America','Science Signaling','Science Translational Medicine','Science']
    RETURN avg(q.num_profs_v2)
    """
df = query_to_df(query, graph)
df

Starting query... Done (1.21 minutes).


Unnamed: 0,avg(q.num_profs_v2)
0,11.327052


In [4]:
# unique_coauthor_profs_damped
# TO RUN ON ML 5
# For every pair (a, b) of Top42Authors sharing Quanta in the listed venues
# (both with last_author_yr <= q.year), m = avg(q.num_profs_v2) over the
# shared Quanta and c = avg_degree give four per-pair weights:
#   1/(c + m), 1/sqrt(c + m), 1/(c + m)^2, 1/exp(c + m)
# Per author the weights are summed over b and rescaled by c, sqrt(c), c^2 and
# exp(c) respectively; count(u_lin) is the number of (a, b) pairs, reported as
# the undamped unique_coauthor_profs.
# avg_degree is pasted in by hand; it equals the value shown in the output of
# the "Average num_profs_v2" cell and has to be updated manually if that
# result changes.
# All eight `{}` placeholders in the query receive the same avg_degree value.
# NOTE(review): the CSV path is the same file name that the undamped
# unique_coauthor_profs cell writes, so this export replaces that file —
# confirm that is intended.
avg_degree = 11.327052
query = """
MATCH (a:Top42Author)-[:AUTHORED]->(q:Quanta)<-[:AUTHORED]-(b:Top42Author)
	WHERE q.venue in ['Angewandte Chemie','Blood','Cancer Cell','Cancer Discovery','Cancer Research','Cell','Cell Host & Microbe','Cell Metabolism','Cell Stem Cell','Chemistry & Biology','The EMBO Journal','Genes & Development','Immunity','Journal of Neurology','Journal of the American Chemical Society','JAMA','Journal of Biological Chemistry','Journal of Cell Biology','Journal of Clinical Investigation','Journal of Experimental Medicine','Journal of Medicinal Chemistry','The Lancet','Nature Cell Biology','Nature Chemical Biology','Nature Chemistry','Nature Medicine','Nature Methods','Nature','Nature Biotechnology','The New England Journal of Medicine','Neuron','Nature Genetics','Nature Immunology','Nature Neuroscience','Nature Structural & Molecular Biology','PLOS Biology','PLOS Genetics','PLOS Pathogens','Proceedings of the National Academy of Sciences of the United States of America','Science Signaling','Science Translational Medicine','Science']
        AND a.last_author_yr <= q.year AND b.last_author_yr <= q.year
    WITH a, b, 
        (toFloat(1) / ({} + avg(q.num_profs_v2))) as u_lin,
        (toFloat(1) / sqrt({} + avg(q.num_profs_v2))) as u_sqrt,
        (toFloat(1) / ({} + avg(q.num_profs_v2))^2) as u_square,
        (toFloat(1) / exp({} + avg(q.num_profs_v2))) as u_exp
        
    RETURN a.name as name, 
        count(u_lin) as unique_coauthor_profs, 
        {}*sum(u_lin) as unique_coauthor_profs_lin_damp,
        sqrt({})*sum(u_sqrt) as unique_coauthor_profs_sqrt_damp,
        {}^2*sum(u_square) as unique_coauthor_profs_square_damp,
        exp({})*sum(u_exp) as unique_coauthor_profs_exp_damp
    """.format(avg_degree, avg_degree, avg_degree, avg_degree, avg_degree, avg_degree, avg_degree, avg_degree)
df = query_to_df(query, graph)
df.to_csv('C:\\Users\\Brend\\Downloads\\unique_profs_top42.csv', index = False, encoding = "UTF-8")

Starting query... Done (34.22 minutes).


In [4]:
# num affiliations for each author
# Counts, per a.name, the rows matched for AFFILIATED_WITH relationships from
# a Top42Author to an Organization. COUNT(o) is not DISTINCT, so several
# relationships to the same Organization are each counted. Authors without any
# such relationship produce no row.
# The result is written to orgs_top42.csv.
query = """
MATCH (a:Top42Author)-[:AFFILIATED_WITH]->(o:Organization)
RETURN a.name as name, COUNT(o) as orgs
"""
df = query_to_df(query, graph)
df.to_csv('C:\\Users\\Brend\\Downloads\\orgs_top42.csv', index = False, encoding = "UTF-8")

Starting query... Done (6.72 minutes).
