In [7]:
import json
import math
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
from py2neo import Graph, Node, Relationship
from scipy import stats
%matplotlib inline

In [8]:
def query_to_df(query, graph):
    """Run a Cypher query and return its result converted by ``to_data_frame()``.

    Prints a start marker before the query is sent and, once the result has
    been converted, the elapsed wall-clock time in minutes.

    Args:
        query: Cypher query text handed to ``graph.run``.
        graph: Object exposing ``run(query)`` whose result provides
            ``to_data_frame()`` (the py2neo ``Graph`` in this notebook).

    Returns:
        Whatever ``graph.run(query).to_data_frame()`` returns.
    """
    print("Starting query...", end=" ")
    started_at = time.time()
    result_frame = graph.run(query).to_data_frame()
    elapsed_minutes = (time.time() - started_at) / 60
    print("Done ({:.2f} minutes).".format(elapsed_minutes))
    return result_frame

In [9]:
# Connection settings. Each value can be overridden through an environment
# variable so the notebook can target another server or account without
# being edited.
# SECURITY NOTE(review): the fallbacks keep the original hard-coded values so
# the notebook still runs unchanged. Prefer exporting NEO4J_PASSWORD and
# removing the literal password from the notebook before sharing it.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://matlaber10.media.mit.edu:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "myneo")

graph = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Read the store counters once and reuse them for the status line.
primitive_counts = graph.database.primitive_counts
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    primitive_counts['NumberOfNodeIdsInUse'],
    primitive_counts['NumberOfRelationshipIdsInUse']))

Connected to graph database with 278,590,931 nodes and 1,844,902,937 relationships!


In [None]:
# Augment nodes with pct_last_author.
# For every Top42Author, pct_last_author is the fraction of that author's
# outgoing AUTHORED relationships (to Quanta nodes) whose `is_last_author`
# value is truthy. apoc.periodic.iterate passes the authors returned by the
# first statement to the second one, configured with batchSize 100 and
# parallel:true.
# NOTE: this cell only builds the query string; nothing in it executes the query.
query = """call apoc.periodic.iterate(
"MATCH (a:Top42Author) RETURN a",
"MATCH (a)-[r:AUTHORED]->(:Quanta)
WITH a, toFloat(SUM(CASE WHEN r.is_last_author THEN 1 ELSE 0 END))/COUNT(r) as pct_last_author
SET a.pct_last_author = pct_last_author", {batchSize:100, parallel:true})
"""

In [None]:
# Augment quanta with num_profs, num_authors.
#
# The original cell assigned both statements to `query`, so the num_profs
# statement was overwritten before it could be used. Each statement now has
# its own name; `query` still ends up bound to the num_authors statement, as
# it was after the original cell ran.
# NOTE: this cell only builds the query strings; nothing in it executes them.

# num_profs: per Quanta, the number of distinct Top42Author nodes linked by
# AUTHORED (either direction) whose pct_last_author exceeds 0.25.
query_num_profs = """call apoc.periodic.iterate(
    "MATCH (b:Top42Author)-[:AUTHORED]-(pub:Quanta)
    WHERE b.pct_last_author > .25
    RETURN pub, COUNT(DISTINCT b) as num_profs",
    "SET pub.num_profs = num_profs", {batchSize:1000, parallel:true})
    """

# num_authors: per Quanta, the number of distinct Top42Author nodes linked by
# AUTHORED (either direction). Only nodes labelled Top42Author are counted.
query_num_authors = """call apoc.periodic.iterate(
    "MATCH (b:Top42Author)-[:AUTHORED]-(pub:Quanta)
    RETURN pub, COUNT(DISTINCT b) as num_authors",
    "SET pub.num_authors = num_authors", {batchSize:1000, parallel:true})
    """

query = query_num_authors

In [None]:
# All versions of pct_collaborative_pubs for Top42Authors
#
# Step 1: for each author, look only at authored Quanta with num_profs > 1 and
#         num_authors > 1 and sum a per-publication weight:
#           collab_linprof    -> 1 / (num_profs - 1)
#           collab_sqrtprof   -> 1 / sqrt(num_profs - 1)
#           collab_sqrtauth   -> 1 / sqrt(num_authors - 1)
#           collab_linauth    -> 1 / (num_authors - 1)
#           collab_unweighted -> 1 (i.e. the count of qualifying publications)
# Step 2: re-match every Quanta linked to the author by AUTHORED to get
#         num_pubs, then divide each sum by num_pubs.
# NOTE: the first MATCH is not OPTIONAL, so an author with no qualifying
#       publication yields no row at all.
# NOTE: the second MATCH is undirected while the first one is directed.
query = """
MATCH (a:Top42Author)-[:AUTHORED]->(q:Quanta)
	WHERE q.num_profs > 1 AND q.num_authors > 1 
    WITH a, 
        SUM(toFloat(1)/(q.num_profs - 1)) AS collab_linprof,
        SUM(toFloat(1)/(sqrt(q.num_profs - 1))) AS collab_sqrtprof,
        SUM(toFloat(1)/(sqrt(q.num_authors - 1))) AS collab_sqrtauth,
        SUM(toFloat(1)/(q.num_authors - 1)) AS collab_linauth,
        SUM(toFloat(1)) AS collab_unweighted
    MATCH (a)-[:AUTHORED]-(q:Quanta)
    WITH a, COUNT(q) AS num_pubs, collab_linprof, collab_sqrtauth, collab_linauth, collab_unweighted, collab_sqrtprof
    RETURN a.name AS name, 
        num_pubs,
        collab_linprof/num_pubs AS pct_collab_linprof,
        collab_sqrtprof/num_pubs AS pct_collab_sqrtprof,
        collab_unweighted/num_pubs AS pct_collab_unweighted,
        collab_sqrtauth/num_pubs AS pct_collab_sqrtauth,
        collab_linauth/num_pubs AS pct_collab_linauth
    """
# Execution and export are disabled here; the CSV path below is the one read
# back later in the notebook.
# df = query_to_df(query, graph)
# df.to_csv('C:\\Users\\Brend\\Downloads\\pct_collaborative_pubs_top42.csv', index = False, encoding = "UTF-8")

In [None]:
#Ratio of cross-cluster and intra-cluster edges to total edges
#
# For each Top42Author u:
#   den               = number of COAUTHOR matches from u to Top42Author nodes
#   IntraClusterConns = number of those matches where last(b.louvain) equals
#                       last(u.louvain)
#   IntraClusterRatio = IntraClusterConns / den
#   InterClusterRatio = 1.0 - IntraClusterRatio
# NOTE: the first COAUTHOR match is not OPTIONAL, so authors without any
#       Top42Author coauthor produce no row and the `den=0` guard is never hit.
# NOTE(review): last(x.louvain) presumably selects the final Louvain level of a
#       list-valued property - confirm against the graph schema.
query = """
MATCH (u:Top42Author)
MATCH (u)-[e:COAUTHOR]-(b:Top42Author)
WITH u, COUNT(b) AS k_u
WITH u, k_u AS den

// intracommunity edges
OPTIONAL MATCH (u)-[e:COAUTHOR]-(b:Top42Author)
WHERE last(b.louvain) = last(u.louvain)
WITH den, u, COUNT(b) AS k_intra_u
WITH den, u, k_intra_u AS IntraClusterConns, CASE WHEN den=0 THEN 0 ELSE toFloat(k_intra_u)/den END AS IntraClusterRatio
WITH den, u, IntraClusterRatio, IntraClusterConns, 1.0 - IntraClusterRatio AS InterClusterRatio

RETURN u.name AS name, IntraClusterRatio, InterClusterRatio, den AS TotalConns, IntraClusterConns, den - IntraClusterConns AS InterClusterConns
"""

# Execution and export are disabled here; the CSV path below is the one read
# back later in the notebook.
# df_clusters = query_to_df(query, graph)
# df_clusters.to_csv('C:\\Users\\Brend\\Downloads\\clusters_top42.csv', index = False, encoding = "UTF-8")

In [None]:
# Intra- and inter-community "focus" per Top42Author.
#
#   den = mean e.strength over u's COAUTHOR matches to Top42Author nodes
#         (computed as sum(weight * count) / sum(count), grouped by weight)
#   IntraCommunityFocus = mean strength of COAUTHOR matches whose other end has
#         the same last(louvain) value as u, divided by den (0 if no such match)
#   InterCommunityFocus = same, for matches whose last(louvain) value differs
#
# NOTE(review): the denominator matches neighbours labelled Top42Author, while
#       both numerators match neighbours labelled NatureAuthor (and the
#       intermediate names start with "Nature"). This looks like a leftover
#       from another version of the query - confirm which label is intended.
# NOTE: the first COAUTHOR match is not OPTIONAL, so authors without any
#       Top42Author coauthor produce no row.
query = """
MATCH (u:Top42Author)
MATCH (u)-[e:COAUTHOR]-(b:Top42Author)
WITH e.strength AS weight, u, COUNT(b) AS k_u
WITH u, toFloat(sum(weight*k_u))/sum(k_u) AS den

OPTIONAL MATCH (u)-[e:COAUTHOR]-(b:NatureAuthor)
WHERE last(b.louvain) = last(u.louvain)
WITH den, u, COUNT(b) AS k_intra_u, CASE WHEN e IS NULL THEN 0 ELSE e.strength END AS weight
WITH den, u, CASE WHEN sum(k_intra_u)=0 THEN 0 ELSE toFloat(sum(weight*k_intra_u))/sum(k_intra_u) END AS num
WITH den, u, CASE WHEN den=0 THEN 0 ELSE num/den END AS NatureIntraCommunityFocus
WITH den, u, NatureIntraCommunityFocus

OPTIONAL MATCH (u)-[e:COAUTHOR]-(b:NatureAuthor)
WHERE last(b.louvain) <> last(u.louvain)
WITH den, u, NatureIntraCommunityFocus, COUNT(b) AS k_inter_u, CASE WHEN e IS NULL THEN 0 ELSE e.strength END AS weight
WITH den, u, NatureIntraCommunityFocus, CASE WHEN sum(k_inter_u)=0 THEN 0 ELSE toFloat(sum(weight*k_inter_u))/sum(k_inter_u) END AS num
WITH u, NatureIntraCommunityFocus, CASE WHEN den=0 THEN 0 ELSE num/den END AS NatureInterCommunityFocus

RETURN u.name AS name, NatureIntraCommunityFocus AS IntraCommunityFocus, NatureInterCommunityFocus AS InterCommunityFocus
"""

# Execution and export are disabled here; the CSV path below is the one read
# back later in the notebook.
# df_community_focus = query_to_df(query, graph)
# df_community_focus.to_csv('C:\\Users\\Brend\\Downloads\\community_focus_top42.csv', index = False, encoding = "UTF-8")

In [None]:
# Reload the three per-author tables from the CSV paths named in the
# (commented-out) export lines of the query cells, indexed by author name.
df_clusters = pd.read_csv(
    'C:\\Users\\Brend\\Downloads\\clusters_top42.csv',
    index_col='name',
    encoding='UTF-8',
)
df_pct_collab = pd.read_csv(
    'C:\\Users\\Brend\\Downloads\\pct_collaborative_pubs_top42.csv',
    index_col='name',
    encoding='UTF-8',
)
df_author_focus = pd.read_csv(
    'C:\\Users\\Brend\\Downloads\\community_focus_top42.csv',
    index_col='name',
    encoding='UTF-8',
)

In [None]:
# Outer-join the collaboration, community-focus and selected cluster columns
# on the author-name index, then write the merged table to disk.
df_metrics = (
    df_pct_collab
    .join(df_author_focus, how='outer')
    .join(
        df_clusters[['TotalConns', 'InterClusterRatio', 'IntraClusterRatio']],
        how='outer',
    )
)
df_metrics.to_csv(
    'C:\\Users\\Brend\\Downloads\\metrics_top42.csv',
    encoding='UTF-8',
)

In [None]:
# Per-author summary (mean, min, max, median) of one rank property over the
# author's outgoing AUTHORED publications. `{prop}` is the Quanta property to
# summarise and `{tag}` the suffix used in the returned column names.
_RANK_SUMMARY_QUERY = """
MATCH (a:Top42Author)-[:AUTHORED]->(q:Quanta)
WITH a, COUNT(q) AS num_pubs, SUM(q.{prop}) AS tot_rank, MIN(q.{prop}) AS min_rank, MAX(q.{prop}) AS max_rank, percentileCont(q.{prop}, 0.5) AS median_rank 
RETURN a.name AS name, toFloat(tot_rank)/num_pubs AS avg_{tag}, min_rank AS min_{tag}, max_rank AS max_{tag}, median_rank AS median_{tag}
"""

# PageRank summary (columns avg_PR, min_PR, max_PR, median_PR).
query = _RANK_SUMMARY_QUERY.format(prop="pageRank_2018", tag="PR")
df_pr = query_to_df(query, graph)

# ArticleRank summary (columns avg_AR, min_AR, max_AR, median_AR).
query = _RANK_SUMMARY_QUERY.format(prop="articleRank2018", tag="AR")
df_ar = query_to_df(query, graph)

# Persist both tables without the positional index.
df_pr.to_csv(
    'C:\\Users\\Brend\\Downloads\\pr_top42.csv',
    index=False,
    encoding='UTF-8',
)
df_ar.to_csv(
    'C:\\Users\\Brend\\Downloads\\ar_top42.csv',
    index=False,
    encoding='UTF-8',
)

In [None]:
# Reload the PageRank and ArticleRank summaries, indexed by author name.
df_pr = pd.read_csv(
    'C:\\Users\\Brend\\Downloads\\pr_top42.csv',
    index_col='name',
    encoding='UTF-8',
)
df_ar = pd.read_csv(
    'C:\\Users\\Brend\\Downloads\\ar_top42.csv',
    index_col='name',
    encoding='UTF-8',
)

In [None]:
# Outer-join the two rank summaries on the author-name index and save the
# combined impact table.
df_impact = df_pr.join(df_ar, how='outer')
df_impact.to_csv(
    'C:\\Users\\Brend\\Downloads\\impact_top42.csv',
    encoding='UTF-8',
)

In [None]:
# Reload the saved metrics and impact tables (indexed by author name),
# outer-join them, and write the combined table used for correlations.
df_metrics = pd.read_csv(
    'C:\\Users\\Brend\\Downloads\\metrics_top42.csv',
    index_col='name',
    encoding='UTF-8',
)
df_impact = pd.read_csv(
    'C:\\Users\\Brend\\Downloads\\impact_top42.csv',
    index_col='name',
    encoding='UTF-8',
)
df_all = df_metrics.join(df_impact, how='outer')
df_all.to_csv(
    'C:\\Users\\Brend\\Downloads\\correlation_top42.csv',
    encoding='UTF-8',
)

Define a professor (prof) as an author from the year of their first last-authored publication onward

In [4]:
# Augment nodes with last_author_yr
# For every Top42Author, last_author_yr is the minimum `year` among the Quanta
# reached through an outgoing AUTHORED relationship whose `is_last_author`
# value is truthy, i.e. the earliest year with a last-author publication.
# Authors with no such relationship match nothing in the inner statement, so
# the property is not set for them.
# TO RUN
query = """call apoc.periodic.iterate(
"MATCH (a:Top42Author) RETURN a",
"MATCH (a)-[r:AUTHORED]->(q:Quanta)
WHERE r.is_last_author
WITH a, min(q.year) AS last_author_yr
SET a.last_author_yr = last_author_yr", {batchSize:100, parallel:true})
"""
# `df` holds whatever the apoc.periodic.iterate call itself returns.
df = query_to_df(query, graph)

Starting query... Done (1.23 minutes).


In [5]:
# Augment quanta with num_profs_v2
# num_profs_v2 is, per Quanta, the number of distinct Top42Author nodes linked
# by AUTHORED (either direction) whose last_author_yr is less than or equal to
# the publication's year. Authors without last_author_yr fail the comparison
# and are not counted.
# TO RUN
query = """call apoc.periodic.iterate(
    "MATCH (b:Top42Author)-[:AUTHORED]-(pub:Quanta)
    WHERE b.last_author_yr <= pub.year
    RETURN pub, COUNT(DISTINCT b) as num_profs",
    "SET pub.num_profs_v2 = num_profs", {batchSize:1000, parallel:true})
    """
# `df` holds whatever the apoc.periodic.iterate call itself returns.
df = query_to_df(query, graph)

Starting query... Done (36.90 minutes).


In [6]:
# All versions of pct_collaborative_pubs for Top42Authors
# Same computation as the earlier pct_collaborative_pubs query, but using the
# num_profs_v2 property in place of num_profs:
#   - per author, sum a per-publication weight over authored Quanta with
#     num_profs_v2 > 1 and num_authors > 1 (1/(n-1), 1/sqrt(n-1) or 1, with n
#     taken from num_profs_v2 or num_authors);
#   - divide each sum by num_pubs, the count of all Quanta linked to the author
#     by AUTHORED (undirected match).
# NOTE: the first MATCH is not OPTIONAL, so an author with no qualifying
#       publication yields no row at all.
# TO RUN
query = """
MATCH (a:Top42Author)-[:AUTHORED]->(q:Quanta)
	WHERE q.num_profs_v2 > 1 AND q.num_authors > 1 
    WITH a, 
        SUM(toFloat(1)/(q.num_profs_v2 - 1)) AS collab_linprof,
        SUM(toFloat(1)/(sqrt(q.num_profs_v2 - 1))) AS collab_sqrtprof,
        SUM(toFloat(1)/(sqrt(q.num_authors - 1))) AS collab_sqrtauth,
        SUM(toFloat(1)/(q.num_authors - 1)) AS collab_linauth,
        SUM(toFloat(1)) AS collab_unweighted
    MATCH (a)-[:AUTHORED]-(q:Quanta)
    WITH a, COUNT(q) AS num_pubs, collab_linprof, collab_sqrtauth, collab_linauth, collab_unweighted, collab_sqrtprof
    RETURN a.name AS name, 
        num_pubs,
        collab_linprof/num_pubs AS pct_collab_linprof,
        collab_sqrtprof/num_pubs AS pct_collab_sqrtprof,
        collab_unweighted/num_pubs AS pct_collab_unweighted,
        collab_sqrtauth/num_pubs AS pct_collab_sqrtauth,
        collab_linauth/num_pubs AS pct_collab_linauth
    """
df = query_to_df(query, graph)
# Written to a separate "_v2" file so the earlier export is not overwritten.
df.to_csv('C:\\Users\\Brend\\Downloads\\pct_collaborative_pubs_top42_v2.csv', index = False, encoding = "UTF-8")

Starting query... Done (23.77 minutes).


Disruption

In [10]:
# Citation-set sizes around one focal paper q (the building blocks of a
# disruption-style measure):
#   references = papers q cites
#   ij         = papers citing q
#   jk         = distinct papers citing at least one of q's references
#   j          = papers in both ij and jk
#   i          = papers in ij only
#   k          = papers in jk only
#
# FIX: q cites every paper in `references`, so without a guard q itself
# matches as p3 and is counted in jk (and therefore in k). The added
# `p3 <> q` condition keeps the focal paper out of its own citing set.
# NOTE: both citation MATCH clauses are non-optional, so a paper with no
#       references or no citing papers yields no row.
# NOTE: this cell only builds the query string; nothing in it executes the query.
query = """
MATCH (q:Quanta)
WHERE q.id = "8b3b810b-50bc-4d27-866a-ed30b22fde6a"
MATCH (q)-[:CITES]->(p:Quanta)
WITH q, collect(p) as references
MATCH (p2:Quanta)-[:CITES]->(q)
WITH q, references, collect(p2) as ij
MATCH (p3:Quanta)-[:CITES]->(p4:Quanta)
WHERE p4 in references AND p3 <> q
WITH q, references, ij, collect(DISTINCT p3) as jk
WITH q, references, ij, jk, [p in ij where p in jk] as j
WITH q, references, ij, jk, j, [p in ij where not(p  in j)] as i, [p in jk where not(p in j)] as k
RETURN q.id as id,
	size(references),
    size(ij), 
    size(jk),
    size(j),
    size(k),
    size(i)
"""

In [None]:
# Variant of the disruption-count query that builds jk by expanding from each
# reference instead of scanning every CITES relationship.
#
# FIX: the original expression was not valid Cypher (a MATCH ... RETURN inside
# COLLECT/EXTRACT, UNION used as a list operator, a malformed REDUCE and no
# alias for the result). It is replaced with:
#   - a pattern comprehension per reference, collecting the papers citing it;
#   - apoc.coll.flatten to merge the per-reference lists;
#   - apoc.coll.toSet to drop papers that cite several references.
# The focal paper q cites all of its references, so it is excluded explicitly
# (`citer <> q`) to keep it out of jk and k.
# NOTE: the citation MATCH clauses are non-optional, so a paper with no
#       references or no citing papers yields no row.
# NOTE: this cell only builds the query string; nothing in it executes the query.
query = """
MATCH (q:Quanta)
WHERE q.id = "8b3b810b-50bc-4d27-866a-ed30b22fde6a"
MATCH (q)-[:CITES]->(p:Quanta)
WITH q, collect(p) as references
MATCH (p2:Quanta)-[:CITES]->(q)
WITH q, references, collect(p2) as ij
WITH q, references, ij,
    apoc.coll.toSet(apoc.coll.flatten(
        [ref IN references | [(citer:Quanta)-[:CITES]->(ref) WHERE citer <> q | citer]]
    )) AS jk
WITH q, references, ij, jk, [p in ij where p in jk] as j
WITH q, references, ij, jk, j, [p in ij where not(p  in j)] as i, [p in jk where not(p in j)] as k
RETURN q.id as id,
	size(references),
    size(ij), 
    size(jk),
    size(j),
    size(k),
    size(i)
"""