#### Definitions

In [None]:
import time
import json
import math
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from py2neo import Graph, Node, Relationship
%matplotlib inline



In [None]:
def query_to_df(query, graph):
    """Run a Cypher query and return its result as a data frame.

    A progress message and the elapsed time (in minutes) are printed to
    stdout around the call.

    Args:
        query: Cypher query text handed to ``graph.run``.
        graph: Object exposing ``run(query)`` whose result provides
            ``to_data_frame()`` (in this notebook, a py2neo ``Graph``).

    Returns:
        Whatever ``graph.run(query).to_data_frame()`` returns.
    """
    # Flush explicitly: the message has no trailing newline, so without it
    # a line-buffered stdout would only show it after the query finishes.
    print("Starting query...", end=" ", flush=True)
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    query_start_time = time.perf_counter()
    df = graph.run(query).to_data_frame()
    elapsed_minutes = (time.perf_counter() - query_start_time) / 60
    print("Done ({:.2f} minutes).".format(elapsed_minutes))
    return df

In [None]:
# Open the connection to the Neo4j server over Bolt.
# NOTE(review): host and credentials are hard-coded here; consider loading
# them from configuration before this notebook is shared.
graph = Graph("bolt://matlaber10.media.mit.edu:7687", auth=('neo4j', 'myneo'))

# Read the counts a single time so both figures in the message come from the
# same snapshot. NOTE(review): py2neo exposes primitive_counts as a property
# that appears to query the server on each access -- confirm for the
# installed version.
primitive_counts = graph.database.primitive_counts
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    primitive_counts['NumberOfNodeIdsInUse'],
    primitive_counts['NumberOfRelationshipIdsInUse']))

#### High-level Inventory

In [None]:
# Count the nodes labelled NatureAuthor; the query yields one row with one
# column, which is read out of the first record.
author_count_query = "MATCH (a:NatureAuthor) RETURN COUNT(a) as number_of_nature_authors"
d = graph.run(author_count_query).data()[0]
num_nature_authors = d['number_of_nature_authors']

print("Number of authors: {:,}".format(num_nature_authors))

In [None]:
print("Author properties:")
# Only a couple of sample records are shown, so cap the result with LIMIT
# rather than pulling every NatureAuthor record back to the client.
d = graph.run("MATCH (a:NatureAuthor) RETURN properties(a), ID(a) LIMIT 2").data()
# Iterate instead of indexing d[0] / d[1] so that a database holding fewer
# than two authors does not raise IndexError.
for record in d:
    print(record)

In [None]:
# List the node labels reported by the database's db.labels() procedure.
print("Node types")
labels_query = "CALL db.labels() YIELD label RETURN label;"
d = graph.run(labels_query).data()
d

In [None]:
# List the relationship types reported by db.relationshipTypes().
print("Relationship types")
relationship_types_query = "CALL db.relationshipTypes() YIELD relationshipType RETURN relationshipType;"
d = graph.run(relationship_types_query).data()
d


In [None]:
# List the property keys reported by db.propertyKeys().
property_keys_query = "CALL db.propertyKeys() YIELD propertyKey  RETURN propertyKey"
d = graph.run(property_keys_query).data()
d

#### Number of coauthors

In [None]:
# One row per NatureAuthor: the author's name and the number of COAUTHOR
# relationships attached to that node (either direction).
query = """MATCH (a:NatureAuthor)
WITH a, size((a)-[:COAUTHOR]-()) as Num_Coauthors
RETURN a.name, Num_Coauthors"""

df_coauthors = query_to_df(query=query, graph=graph)

# Keep just the count column for the summary statistics below.
degree_df = df_coauthors.loc[:, 'Num_Coauthors']

print("Coauthorship statistics:\n")
degree_df.describe()

In [None]:
# Show the five authors with the largest coauthor counts.
print("Rank by number of coauthors:")
ranked_by_coauthors = df_coauthors.sort_values(by='Num_Coauthors', ascending=False)
ranked_by_coauthors.head(5)


#### Number of coauthors in different communities

In [None]:
# For each NatureAuthor, count the COAUTHOR neighbours whose entry at index 3
# of the `louvain` property differs from the author's own.
# NOTE(review): presumably louvain[3] is a community id at one level of a
# Louvain clustering -- confirm against how the property was written.
# Authors with no such neighbour produce no row at all (the MATCH/WHERE finds
# nothing for them), so they are absent from the frame rather than listed as 0.
query = """MATCH (a:NatureAuthor)-[:COAUTHOR]-(b)
WHERE NOT b.louvain[3] = a.louvain[3]
WITH a, size(collect(b)) AS Num_Coauthors_In_Different_Community
RETURN a.name, Num_Coauthors_In_Different_Community"""

df_diff_community = query_to_df(query, graph)
degree_df = df_diff_community['Num_Coauthors_In_Different_Community']

print("Coauthorship statistics:\n")
# Summarise the count column itself, the same way the plain coauthor-count
# cell does, so degree_df is actually used and the two summaries are
# directly comparable.
degree_df.describe()

In [None]:
# Show the five authors with the most coauthors in a different community.
print("Rank by number of coauthors in different communities:")
ranked_by_diff_community = df_diff_community.sort_values(
    by='Num_Coauthors_In_Different_Community', ascending=False)
ranked_by_diff_community.head(5)


#### Proportion of coauthors in different communities

In [None]:
# Preview the first five rows of the per-author coauthor counts.
df_coauthors.head(5)

In [None]:
# Preview the first five rows of the per-author different-community counts.
df_diff_community.head(5)

#### Number of coauthors affiliated with different organizations

In [None]:
# One row per NatureAuthor that has at least one AFFILIATED_WITH relationship:
# the author's name and the collected list of nodes at the other end.
query = """MATCH (a:NatureAuthor)-[:AFFILIATED_WITH]-(b)
WITH a, collect(b) as organizations
RETURN a.name, organizations"""

df = query_to_df(query=query, graph=graph)

In [None]:
# Inspect the collected organizations for the row labelled 0.
df['organizations'][0]

In [None]:
MATCH ()