In [2]:
import json
import math
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
from py2neo import Graph, Node, Relationship
from scipy import stats
%matplotlib inline



In [3]:
def query_to_df(query, graph):
    """Run a Cypher query and return its result as a DataFrame.

    Prints a "Starting query..." message before the query is sent and the
    elapsed time in minutes once the result has been materialised.

    Args:
        query: Query text handed unchanged to ``graph.run``.
        graph: Object exposing ``run(query)`` whose return value provides
            ``to_data_frame()`` (the py2neo ``Graph`` in this notebook).

    Returns:
        Whatever ``graph.run(query).to_data_frame()`` returns.
    """
    # Flush explicitly: the message has no trailing newline, so without it the
    # text can sit in the output buffer for the entire (long) query.
    print("Starting query...", end=" ", flush=True)
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # a system clock adjustment while the query is running.
    query_start_time = time.perf_counter()
    df = graph.run(query).to_data_frame()
    elapsed_minutes = (time.perf_counter() - query_start_time) / 60
    print("Done ({:.2f} minutes).".format(elapsed_minutes))
    return df

In [4]:
# Connection settings. Each value can be overridden through an environment
# variable; the fallbacks are the literals this cell used before, so the
# notebook behaves exactly as it did when nothing is set.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://matlaber10.media.mit.edu:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
# NOTE(review): a password literal in a shared notebook is a credential leak.
# Prefer setting NEO4J_PASSWORD in the environment; rotate the password and
# remove this fallback once that is in place.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "myneo")

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Read the counts once so both printed numbers come from a single lookup.
primitive_counts = graph.database.primitive_counts
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    primitive_counts['NumberOfNodeIdsInUse'],
    primitive_counts['NumberOfRelationshipIdsInUse']))

Connected to graph database with 278,590,931 nodes and 1,844,902,937 relationships!


In [4]:
# All versions of pct_collaborative_pubs for Top42Authors
#
# For each Top42Author, the first MATCH keeps only the Quanta they AUTHORED
# that have num_profs > 1 AND num_authors > 1, and sums a per-Quanta weight:
#   collab_linprof    : 1 / (num_profs - 1)
#   collab_sqrtprof   : 1 / sqrt(num_profs - 1)
#   collab_sqrtauth   : 1 / sqrt(num_authors - 1)
#   collab_linauth    : 1 / (num_authors - 1)
#   collab_unweighted : 1 / (num_profs + 1 - num_profs); the denominator
#                       evaluates to 1, so this is a count of the matched Quanta
# The second MATCH has no WHERE clause, so num_pubs counts every Quanta linked
# to the author through AUTHORED, not only the ones that passed the filter.
# Each sum is then divided by num_pubs to give the pct_collab_* columns.
#
# NOTE(review): the second MATCH is undirected, (a)-[:AUTHORED]-(q), while the
# first is directed, (a)-[:AUTHORED]->(q). Presumably equivalent if AUTHORED
# always points from author to Quanta -- confirm.
# NOTE(review): an author with no Quanta passing the WHERE filter produces no
# row from the first MATCH and is therefore absent from the result rather than
# reported with 0 -- confirm this is intended.
query = """
MATCH (a:Top42Author)-[:AUTHORED]->(q:Quanta)
	WHERE q.num_profs > 1 AND q.num_authors > 1 
    WITH a, 
        SUM(toFloat(1)/(q.num_profs - 1)) AS collab_linprof,
        SUM(toFloat(1)/(sqrt(q.num_profs - 1))) AS collab_sqrtprof,
        SUM(toFloat(1)/(sqrt(q.num_authors - 1))) AS collab_sqrtauth,
        SUM(toFloat(1)/(q.num_authors - 1)) AS collab_linauth,
        SUM(toFloat(1)/(q.num_profs + 1 - q.num_profs)) AS collab_unweighted
    MATCH (a)-[:AUTHORED]-(q:Quanta)
    WITH a, COUNT(q) AS num_pubs, collab_linprof, collab_sqrtauth, collab_linauth, collab_unweighted, collab_sqrtprof
    RETURN a.name AS name, 
        num_pubs,
        collab_linprof/num_pubs AS pct_collab_linprof,
        collab_sqrtprof/num_pubs AS pct_collab_sqrtprof,
        collab_unweighted/num_pubs AS pct_collab_unweighted,
        collab_sqrtauth/num_pubs AS pct_collab_sqrtauth,
        collab_linauth/num_pubs AS pct_collab_linauth
    """
# The query is NOT executed by this cell as saved: the two lines below are
# commented out, and the notebook later reloads pct_collaborative_pubs_top42.csv
# from disk instead. Un-comment them to regenerate that file (the saved output
# of this cell reports a run time of about 30 minutes).
# df = query_to_df(query, graph)
# df.to_csv('C:\\Users\\Brend\\Downloads\\pct_collaborative_pubs_top42.csv', index = False, encoding = "UTF-8")

Starting query... Done (30.38 minutes).


In [5]:
# Ratio of cross-cluster and intra-cluster edges to total edges
#
# Per Top42Author u:
#   den               : number of COAUTHOR matches from u to any Top42Author
#                       (COUNT(b) counts matched rows, in either direction)
#   IntraClusterConns : the subset of those matches where
#                       last(b.louvain) = last(u.louvain)
#   IntraClusterRatio : IntraClusterConns / den
#   InterClusterRatio : 1.0 - IntraClusterRatio
#   InterClusterConns : den - IntraClusterConns
#
# NOTE(review): the first COAUTHOR MATCH is not OPTIONAL, so an author with no
# Top42Author co-author yields no row at all; the `den=0` branch of the CASE
# therefore looks unreachable and such authors are missing from the output
# rather than reported with zeros -- confirm this is intended.
# NOTE(review): `louvain` is read with last(), so it is presumably a list whose
# final element is the community id being compared -- verify against the
# property's definition.
query = """
MATCH (u:Top42Author)
MATCH (u)-[e:COAUTHOR]-(b:Top42Author)
WITH u, COUNT(b) AS k_u
WITH u, k_u AS den

// intracommunity edges
OPTIONAL MATCH (u)-[e:COAUTHOR]-(b:Top42Author)
WHERE last(b.louvain) = last(u.louvain)
WITH den, u, COUNT(b) AS k_intra_u
WITH den, u, k_intra_u AS IntraClusterConns, CASE WHEN den=0 THEN 0 ELSE toFloat(k_intra_u)/den END AS IntraClusterRatio
WITH den, u, IntraClusterRatio, IntraClusterConns, 1.0 - IntraClusterRatio AS InterClusterRatio

RETURN u.name AS name, IntraClusterRatio, InterClusterRatio, den AS TotalConns, IntraClusterConns, den - IntraClusterConns AS InterClusterConns
"""

# The query is NOT executed by this cell as saved: the lines below are
# commented out and the notebook later reloads clusters_top42.csv from disk.
# Un-comment them to regenerate that file (the saved output of this cell
# reports a run time of about 24 minutes).
# df_clusters = query_to_df(query, graph)
# df_clusters.to_csv('C:\\Users\\Brend\\Downloads\\clusters_top42.csv', index = False, encoding = "UTF-8")

Starting query... Done (23.58 minutes).


In [4]:
# Intra- / inter-community "focus" per Top42Author
#
# Step 1 (den): u's COAUTHOR matches to Top42Author nodes are grouped by
#   e.strength; k_u is the number of matches at that strength, so
#   sum(weight*k_u)/sum(k_u) is the mean strength over those matches.
# Step 2 (IntraCommunityFocus): the same mean-strength computation over
#   COAUTHOR matches to NatureAuthor nodes with
#   last(b.louvain) = last(u.louvain), using 0 when there are none, then
#   divided by den.
# Step 3 (InterCommunityFocus): identical to step 2 but for neighbours with
#   last(b.louvain) <> last(u.louvain).
#
# NOTE(review): the denominator is computed over Top42Author neighbours while
# both numerators are computed over NatureAuthor neighbours. If the two labels
# do not cover the same nodes the ratios mix two populations -- confirm whether
# the numerators were meant to use Top42Author as well.
# NOTE(review): the first COAUTHOR MATCH is not OPTIONAL, so authors without a
# Top42Author co-author produce no row and the `den=0` branches look
# unreachable -- confirm.
# The line `WITH den, u, NatureIntraCommunityFocus` only passes the three
# values through unchanged.
query = """
MATCH (u:Top42Author)
MATCH (u)-[e:COAUTHOR]-(b:Top42Author)
WITH e.strength AS weight, u, COUNT(b) AS k_u
WITH u, toFloat(sum(weight*k_u))/sum(k_u) AS den

OPTIONAL MATCH (u)-[e:COAUTHOR]-(b:NatureAuthor)
WHERE last(b.louvain) = last(u.louvain)
WITH den, u, COUNT(b) AS k_intra_u, CASE WHEN e IS NULL THEN 0 ELSE e.strength END AS weight
WITH den, u, CASE WHEN sum(k_intra_u)=0 THEN 0 ELSE toFloat(sum(weight*k_intra_u))/sum(k_intra_u) END AS num
WITH den, u, CASE WHEN den=0 THEN 0 ELSE num/den END AS NatureIntraCommunityFocus
WITH den, u, NatureIntraCommunityFocus

OPTIONAL MATCH (u)-[e:COAUTHOR]-(b:NatureAuthor)
WHERE last(b.louvain) <> last(u.louvain)
WITH den, u, NatureIntraCommunityFocus, COUNT(b) AS k_inter_u, CASE WHEN e IS NULL THEN 0 ELSE e.strength END AS weight
WITH den, u, NatureIntraCommunityFocus, CASE WHEN sum(k_inter_u)=0 THEN 0 ELSE toFloat(sum(weight*k_inter_u))/sum(k_inter_u) END AS num
WITH u, NatureIntraCommunityFocus, CASE WHEN den=0 THEN 0 ELSE num/den END AS NatureInterCommunityFocus

RETURN u.name AS name, NatureIntraCommunityFocus AS IntraCommunityFocus, NatureInterCommunityFocus AS InterCommunityFocus
"""

# The query is NOT executed by this cell as saved: the lines below are
# commented out and the notebook later reloads community_focus_top42.csv from
# disk. Un-comment them to regenerate that file (the saved output of this cell
# reports a run time of about 46 minutes).
# df_community_focus = query_to_df(query, graph)
# df_community_focus.to_csv('C:\\Users\\Brend\\Downloads\\community_focus_top42.csv', index = False, encoding = "UTF-8")

Starting query... Done (46.45 minutes).


In [3]:
# Reload the cached query results from disk. All three files are read the same
# way (UTF-8, indexed by the `name` column), in the same order as before.
df_clusters, df_pct_collab, df_author_focus = (
    pd.read_csv(csv_path, index_col='name', encoding='UTF-8')
    for csv_path in (
        'C:\\Users\\Brend\\Downloads\\clusters_top42.csv',
        'C:\\Users\\Brend\\Downloads\\pct_collaborative_pubs_top42.csv',
        'C:\\Users\\Brend\\Downloads\\community_focus_top42.csv',
    )
)

In [4]:
# Combine the three per-author tables on their shared `name` index. Outer
# joins keep an author even when one of the tables has no row for them.
# NOTE(review): the index is the author's name, not an id; if two authors share
# a name the joins will multiply their rows -- verify names are unique.
df_metrics = (
    df_pct_collab
    .join(df_author_focus, how='outer')
    .join(
        df_clusters[['TotalConns', 'InterClusterRatio', 'IntraClusterRatio']],
        how='outer',
    )
)
df_metrics.to_csv('C:\\Users\\Brend\\Downloads\\metrics_top42.csv', encoding="UTF-8")

In [24]:
def _rank_summary_query(rank_property, suffix):
    """Build the per-author summary query for one rank property of Quanta.

    For every Top42Author the query returns the author's name plus the mean
    (sum of the property divided by the number of authored Quanta), minimum,
    maximum and median (percentileCont at 0.5) of ``rank_property`` over the
    Quanta they AUTHORED.

    Args:
        rank_property: Name of the Quanta property to summarise.
        suffix: Suffix for the output columns (avg_<suffix>, min_<suffix>,
            max_<suffix>, median_<suffix>).

    Returns:
        The Cypher query text.

    Both arguments are pasted into the query verbatim, so only pass trusted
    constants.
    """
    # NOTE(review): the mean divides SUM(property) by COUNT(q). Cypher
    # aggregates skip nulls, so if any Quanta lacks the property the mean is
    # taken over more publications than contributed to the sum -- confirm
    # every Quanta carries the property.
    return """
MATCH (a:Top42Author)-[:AUTHORED]->(q:Quanta)
WITH a, COUNT(q) AS num_pubs, SUM(q.{prop}) AS tot_rank, MIN(q.{prop}) AS min_rank, MAX(q.{prop}) AS max_rank, percentileCont(q.{prop}, 0.5) AS median_rank
RETURN a.name AS name, toFloat(tot_rank)/num_pubs AS avg_{sfx}, min_rank AS min_{sfx}, max_rank AS max_{sfx}, median_rank AS median_{sfx}
""".format(prop=rank_property, sfx=suffix)


# Each result is written to disk as soon as it is available, so a failure in
# the second long-running query cannot discard the first one's output.
# NOTE(review): the two property names follow different conventions
# ("pageRank_2018" vs "articleRank2018"); both are kept exactly as they were --
# verify they match the properties stored on Quanta.
query = _rank_summary_query("pageRank_2018", "PR")
df_pr = query_to_df(query, graph)
df_pr.to_csv('C:\\Users\\Brend\\Downloads\\pr_top42.csv', index = False, encoding = "UTF-8")

query = _rank_summary_query("articleRank2018", "AR")
df_ar = query_to_df(query, graph)
df_ar.to_csv('C:\\Users\\Brend\\Downloads\\ar_top42.csv', index = False, encoding = "UTF-8")

Starting query... Done (33.32 minutes).
Starting query... Done (21.81 minutes).
