In [None]:
import json
import math
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from py2neo import Graph, Node, Relationship
from scipy import stats

%matplotlib inline



In [None]:
# Connection settings can be overridden through environment variables so that
# credentials do not have to be edited into the notebook. The fallbacks are the
# values this notebook has always used, so existing setups keep working.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://neo4j-quanta:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "myneo")

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Read the counters once instead of fetching the property for each printed value.
primitive_counts = graph.database.primitive_counts
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    primitive_counts['NumberOfNodeIdsInUse'],
    primitive_counts['NumberOfRelationshipIdsInUse']))

In [None]:
def query_to_df(query, graph, verbose=False):
    """Run a Cypher query and return its result converted by ``to_data_frame()``.

    Args:
        query: Cypher query text, passed unchanged to ``graph.run``.
        graph: Object exposing ``run(query)`` whose result has a
            ``to_data_frame()`` method (in this notebook, the py2neo ``Graph``).
        verbose: When True, print a start message and the elapsed time in
            minutes. Defaults to False, which prints nothing.

    Returns:
        The object returned by ``to_data_frame()`` for the query result.
    """
    if not verbose:
        return graph.run(query).to_data_frame()

    print("Starting query...", end=" ")
    # perf_counter is a monotonic clock, so the elapsed time cannot go negative
    # if the system clock is adjusted while the query runs.
    query_start_time = time.perf_counter()
    df = graph.run(query).to_data_frame()
    print("Done ({:.2f} minutes).".format((time.perf_counter() - query_start_time) / 60))
    return df

In [None]:
# Disabled cell: everything below is wrapped in a string literal, so none of it
# is executed as code. The wrapped code reads the number of Louvain iterations
# from the size of `a.louvain` on one NatureAuthor node and then plots a
# community-size histogram for each iteration.
'''
## Histogram of community size

# Get number of Louvain iterations run
n_iters = query_to_df("MATCH (a:NatureAuthor) RETURN size(a.louvain) as n_iters LIMIT 1", graph)['n_iters'][0]

# Plot histogram at each iteration
for i in range(n_iters):
    query = """
    MATCH (a:NatureAuthor) 
    RETURN 
        a.louvain[{}] as community_id, 
        COUNT(a) as community_size""".format(i)
    df = query_to_df(query, graph)
    plt.figure()
    sns.distplot(df['community_size'])
'''

In [None]:
# Number of authors

d = graph.run("MATCH (a:NatureAuthor) RETURN COUNT(a) as number_of_nature_authors").data()[0]
num_nature_authors = d['number_of_nature_authors']
print("Number of authors: {:,}".format(num_nature_authors))

# Number of communities after the final Louvain iteration.
# last() selects the final entry of the a.louvain list. tail() returns the list
# without its first element, so counting distinct tail() values does not count
# final community ids.

d = graph.run("MATCH (a:NatureAuthor) RETURN COUNT(DISTINCT last(a.louvain)) as number_of_communities").data()[0]
num_communities = d['number_of_communities']
print("Number of communities: {:,}".format(num_communities))
print("------------------------------\n")

# Distribution of community sizes, one pass per Louvain iteration.
# NOTE(review): the iteration count is hard-coded; presumably it matches the
# length of a.louvain stored in the database -- confirm before reusing on new data.
n_iters = 4

for i in range(n_iters):
    # i is an integer produced by range(), so formatting it into the query text is safe.
    query = """
    MATCH (a:NatureAuthor) 
    RETURN 
        a.louvain[{}] as community_id, 
        COUNT(a) as community_size""".format(i)

    df = query_to_df(query, graph)
    community_sizes = df['community_size']

    d = graph.run("MATCH (a:NatureAuthor) RETURN COUNT(DISTINCT a.louvain[{}]) as number_of_communities".format(i)).data()[0]
    num_communities = d['number_of_communities']

    num_communities_size_1 = int((community_sizes == 1).sum())
    # Guard the division so an empty result prints "nan%" instead of raising.
    if num_communities:
        share_size_1 = num_communities_size_1 / float(num_communities)
    else:
        share_size_1 = float('nan')

    print("Statistics after iteration {}:".format(i+1))
    print("Number of communities: {:,}".format(num_communities))
    print("Average community size:", community_sizes.mean())
    print("Number of communities of size 1: {:,}".format(num_communities_size_1))
    # The label says "Percentage", so format the fraction as a percentage.
    print("Percentage of communities of size 1: {:.2%}".format(share_size_1))
    print("------------------------------")

    if i == n_iters-1:
        print("Community statistics:\n")
        print(community_sizes.describe())

    plt.figure()
    sns.distplot(community_sizes)

# df still holds the result of the last iteration processed by the loop above.
louvain_community_sizes_array = df['community_size'].values

In [None]:
# Number of authors

d = graph.run("MATCH (a:NatureAuthor) RETURN COUNT(a) as number_of_nature_authors").data()[0]
num_nature_authors = d['number_of_nature_authors']
print("Number of authors: {:,}".format(num_nature_authors))

##### Distribution of community sizes #####

# One row per a.labelprop value with the number of authors carrying it.
# The query has no placeholders, so no .format() call is needed; this also keeps
# the cell independent of loop variables defined in other cells.
query = """
MATCH (a:NatureAuthor) 
RETURN 
    a.labelprop as community_id, 
    COUNT(a) as community_size"""

df = query_to_df(query, graph)
community_size_df = df['community_size']
# Kept as a plain array for the later comparison against the Louvain community sizes.
labelprop_community_sizes_array = community_size_df.values

d = graph.run("MATCH (a:NatureAuthor) RETURN COUNT(DISTINCT a.labelprop) as number_of_communities").data()[0]
num_communities = d['number_of_communities']
num_communities_size_1 = int((community_size_df == 1).sum())
# Guard the division so an empty result prints "nan%" instead of raising.
if num_communities:
    share_size_1 = num_communities_size_1 / float(num_communities)
else:
    share_size_1 = float('nan')


print("Number of communities: {:,}".format(num_communities))
print("Average community size:", community_size_df.mean())
print("Number of communities of size 1: {:,}".format(num_communities_size_1))
# The label says "Percentage", so format the fraction as a percentage.
print("Percentage of communities of size 1: {:.2%}".format(share_size_1))
print("------------------------------\n")

plt.figure()
sns.distplot(community_size_df)
print("Community statistics:\n")
print(community_size_df.describe())
print()

##### Coauthor Distributions #####

# Number of COAUTHOR relationships (either direction) per NatureAuthor node.
query = """MATCH (a:NatureAuthor)
WITH a, size((a)-[:COAUTHOR]-()) as Num_Coauthors
RETURN a.name, Num_Coauthors"""

# Note: df is rebound here to the coauthor result, as in the original cell.
df = query_to_df(query, graph)
degree_df = df['Num_Coauthors']

print("Coauthorship statistics:\n")
print(degree_df.describe())

plt.figure()
sns.distplot(degree_df)

# Features from Communities Formed by the Louvain Algorithm

Box Plots for Average Intracommunity (inner) and Average Intercommunity (outer) Edge Weights

In [None]:
# Load the inner and outer average edge weight columns exported for the
# Louvain communities, then draw them side by side (inner first, outer second).
inner_weights_frame = pd.read_csv('./CSV/louvain/avg_inner_edge_weight_nature.csv')
outer_weights_frame = pd.read_csv('./CSV/louvain/avg_outer_edge_weight_nature.csv')
inner = inner_weights_frame['avg_inner_edge_weight'].values
outer = outer_weights_frame['avg_outer_edge_weight'].values

sns.boxplot(data=[inner, outer])
# Fix the y-range to 0..7; values above 7 fall outside the visible area.
plt.ylim(0, 7)

In [None]:
# Mann-Whitney U test on the inner/outer samples used for the box plot.
# The alternative is stated explicitly because SciPy's default for this argument
# has differed between releases; naming it keeps the p-value two-sided everywhere.
_, p = stats.mannwhitneyu(inner, outer, alternative='two-sided')
p

Box Plots for Authors' Focus

In [None]:
# Author focus for the Louvain communities: intra-community focus (inner)
# versus inter-community focus (outer).
# Both samples used to be read from 'NatureInterCommunityFocus', which made the
# two boxes (and the test that follows) compare a column with itself.
# NOTE(review): 'NatureIntraCommunityFocus' is the column name used by the
# label propagation author-focus cell; confirm it is present in this CSV too.
author_focus = pd.read_csv('./CSV/louvain/author_focus_nature.csv')
inner = author_focus['NatureIntraCommunityFocus'].values
outer = author_focus['NatureInterCommunityFocus'].values

sns.boxplot(data=[inner,outer])
inner

In [None]:
# Mann-Whitney U test on the inner/outer samples used for the box plot.
# The alternative is stated explicitly because SciPy's default for this argument
# has differed between releases; naming it keeps the p-value two-sided everywhere.
_, p = stats.mannwhitneyu(inner, outer, alternative='two-sided')
p

In [None]:
# Mann-Whitney U test comparing the community sizes produced by the two
# algorithms (Louvain vs. label propagation).
# The alternative is stated explicitly because SciPy's default for this argument
# has differed between releases; naming it keeps the p-value two-sided everywhere.
_, p = stats.mannwhitneyu(
    louvain_community_sizes_array,
    labelprop_community_sizes_array,
    alternative='two-sided',
)
p

# Features from Communities Formed by the Label Propagation Algorithm

Box Plots for Average Intracommunity (inner) and Average Intercommunity (outer) Edge Weights

In [None]:
# Load the inner and outer average edge weight columns exported for the
# label propagation communities, then draw them side by side (inner first).
inner_weights_frame = pd.read_csv('./CSV/labelprop/avg_inner_edge_weight.csv')
outer_weights_frame = pd.read_csv('./CSV/labelprop/avg_outer_edge_weight.csv')
inner = inner_weights_frame['avg_inner_edge_weight'].values
outer = outer_weights_frame["avg_outer_edge_weight"].values

sns.boxplot(data=[inner, outer])
# Fix the y-range to 0..7; values above 7 fall outside the visible area.
plt.ylim(0, 7)

In [None]:
# Mann-Whitney U test on the inner/outer samples used for the box plot.
# The alternative is stated explicitly because SciPy's default for this argument
# has differed between releases; naming it keeps the p-value two-sided everywhere.
_, p = stats.mannwhitneyu(inner, outer, alternative='two-sided')
p

Box Plots for Authors' Focus

In [None]:
# Author focus for the label propagation communities: intra-community focus
# (inner) versus inter-community focus (outer).
# Both columns live in the same file, so it is parsed once rather than twice.
author_focus = pd.read_csv('./CSV/labelprop/author_focus.csv')
inner = author_focus['NatureIntraCommunityFocus'].values
outer = author_focus['NatureInterCommunityFocus'].values

sns.boxplot(data=[inner,outer])
inner

In [None]:
# Mann-Whitney U test on the inner/outer samples used for the box plot.
# The alternative is stated explicitly because SciPy's default for this argument
# has differed between releases; naming it keeps the p-value two-sided everywhere.
_, p = stats.mannwhitneyu(inner, outer, alternative='two-sided')
p