### Analyse node statistics for benchmark results

In this notebook we analyse node statistics, e.g. the average degree, for correctly classified and
misclassified nodes, given the benchmark results of any community detection method.

First, we import the necessary packages.

In [None]:
%reload_ext autoreload
%autoreload 2
import os

import matplotlib.pyplot as plt
import numpy as np
from clusim.clustering import Clustering

from src.utils.cluster_analysis import get_cluster_properties, get_node_properties
from src.utils.plotting import plot_histogram, init_plot_style
from src.wrappers.igraph import read_graph

%pylab

init_plot_style()

Next, we specify the network to be analyzed, load it, and glance at its basic properties.

In [None]:
# name of the network to analyse
network = 'github'

# input and output locations derived from the network name
graph_file = f'../data/empirical/clean/{network}.txt'
results_dir = f'../results/empirical/{network}/'
fig_dir = '../figures/'  # generated figures are stored here

# make sure the results and figure directories exist
for directory in (results_dir, fig_dir):
    os.makedirs(directory, exist_ok=True)

# read the graph and collect its size and degree statistics
graph = read_graph(graph_file)
node_degrees = graph.degree()
avg_degree = np.mean(node_degrees)
n_nodes, n_edges = len(graph.vs), len(graph.es)
deg_min, deg_max = np.min(node_degrees), np.max(node_degrees)

# report the basic properties
print(f'Network size is {n_nodes} nodes, {n_edges} edges')
print(f'Min/Max/Average degrees are {deg_min}, {deg_max}, {avg_degree}.')

Here we compute single-number characteristics of the detected clusters.

In [None]:
# community detection methods whose stored clusterings are analysed
# (the histogram cells further down iterate over the same list)
methods = ['infomap', 'synwalk', 'walktrap']

# NOTE: `graph` is the network loaded in the configuration cell above. Nothing
# modifies it in between, so it is reused here instead of being re-read from disk.

for method in methods:
    # load the clustering stored for this method
    clu = Clustering().load(f'{results_dir}clustering_{method}.json')

    # clusters with fewer than three nodes are counted as trivial
    trivial_clu_sizes = [len(cluster) for cluster in clu.to_cluster_list() if len(cluster) < 3]
    num_non_trivial = clu.n_clusters - len(trivial_clu_sizes)

    print('\nCluster statistics for ' + method + ': ')
    print(f'Number of detected clusters: {clu.n_clusters}')
    print(f'Number of non-trivial clusters: {num_non_trivial}')
    print(f'Fraction of non-trivial clusters: {num_non_trivial/clu.n_clusters}')
    print(f'Fraction of nodes in non-trivial clusters: {1.0 - sum(trivial_clu_sizes)/clu.n_elements}')
    print(f'Modularity: {graph.modularity(clu.to_membership_list())}')

Here we plot the degree distribution (relative degree occurrences) of the network.

In [None]:
# plot parameters
bin_size = 1  # integer bin width for aggregating degrees
save_figure = False  # if True, the figure is saved as .pdf in `fig_dir`
plt.close('all')

# degree statistics of the network loaded in the configuration cell
# (`graph` is reused instead of being re-read from disk)
node_degrees = graph.degree()
avg_degree = np.mean(node_degrees)

# compute the degree pmf over bins of `bin_size` consecutive integer degrees
min_deg = np.min(node_degrees)
max_deg = np.max(node_degrees)
# Bin edges sit on half-integers so that no degree falls onto an edge. The
# stop value guarantees that the last edge lies at or beyond max_deg + 0.5,
# so the highest degrees are not dropped when bin_size > 1.
bin_edges = np.arange(min_deg, max_deg + bin_size + 1, bin_size) - 0.5
bin_centers = bin_edges[:-1] + 0.5 * bin_size
# Normalise the counts to probability mass per bin. A density would
# additionally divide by the bin width and would no longer match the axis
# label for bin_size > 1; for bin_size == 1 both are identical.
counts = np.histogram(node_degrees, bins=bin_edges)[0]
occurrences = counts / counts.sum()

# plot the degree distribution
fig, ax = plt.subplots(figsize=(12, 9))
ax.plot(bin_centers, occurrences, 'x', label='Node Degrees')
# mark the average degree; axvline spans the full axes height and does not
# depend on how the logarithmic y-axis treats the non-positive value y = 0
ax.axvline(avg_degree, color='crimson',
           label=fr'Average Degree, $\bar{{k}} = {avg_degree:.2f}$')

ax.set_ylabel(r'Probability Mass, $p(k_\alpha)$')
ax.set_xlabel(r'Node Degree, $k_\alpha$')
ax.loglog()
ax.legend(loc='upper right')
plt.tight_layout()

# save figure as .pdf
if save_figure:
    fig_path = fig_dir + 'degrees_' + network + '.pdf'
    fig.savefig(fig_path, dpi=600, format='pdf')
    plt.close(fig)


The next cell plots the histogram of cluster sizes.

In [None]:
# histogram settings
feature = 'size'
n_bins = 25
xmax = 1e3
save_figure = True  # if True, the figure is written as .pdf to `fig_dir`
plt.close('all')

# cluster sizes, one entry per method
data = [
    get_cluster_properties(
        graph, Clustering().load(f'{results_dir}clustering_{method}.json'), feature=feature)
    for method in methods
]

# draw the histogram of all methods into a single axes
fig, ax = plt.subplots(figsize=(12, 9))
plot_histogram(ax, data, methods, n_bins, normalization='pmf', log_scale=True, xmax=xmax)
ax.set(xlabel=r'Cluster sizes, $|\mathcal{Y}_i|$',
       ylabel=r'Bin Probability Mass, $p(|\mathcal{Y}_i|)$')
ax.legend(loc='best')
plt.tight_layout()

# optionally store the figure as .pdf and close it afterwards
if save_figure:
    fig_path = f'{fig_dir}{feature}_{network}.pdf'
    fig.savefig(fig_path, dpi=600, format='pdf')
    plt.close(fig)

The next cell plots the histogram of cluster densities.

In [None]:
# histogram settings
feature = 'density'
n_bins = 25
xmin = 1e-2
save_figure = True  # if True, the figure is written as .pdf to `fig_dir`
plt.close('all')

# cluster densities, one entry per method
data = [
    get_cluster_properties(
        graph, Clustering().load(f'{results_dir}clustering_{method}.json'), feature=feature)
    for method in methods
]

# draw the histogram of all methods into a single axes
fig, ax = plt.subplots(figsize=(12, 9))
plot_histogram(ax, data, methods, n_bins, normalization='pmf', log_scale=True, xmin=xmin)
ax.set(xlabel=r'Cluster Density, $\rho(\mathcal{Y}_i)$',
       ylabel=r'Bin Probability Mass, $p(\rho(\mathcal{Y}_i))$')
ax.legend(loc='best')
plt.tight_layout()

# optionally store the figure as .pdf and close it afterwards
if save_figure:
    fig_path = f'{fig_dir}{feature}_{network}.pdf'
    fig.savefig(fig_path, dpi=600, format='pdf')
    plt.close(fig)

The next cell plots the histogram of clustering coefficients.

In [None]:
# histogram settings
feature = 'clustering_coefficient'
n_bins = 25
xmin = 1e-2
save_figure = True  # if True, the figure is written as .pdf to `fig_dir`
plt.close('all')

# clustering coefficients of the clusters, one entry per method
data = [
    get_cluster_properties(
        graph, Clustering().load(f'{results_dir}clustering_{method}.json'), feature=feature)
    for method in methods
]

# draw the histogram of all methods into a single axes
fig, ax = plt.subplots(figsize=(12, 9))
plot_histogram(ax, data, methods, n_bins, normalization='pmf', log_scale=True, xmin=xmin)
ax.set(xlabel=r'Clustering coefficient, $c(\mathcal{Y}_i)$',
       ylabel=r'Bin Probability Mass, $p(c(\mathcal{Y}_i))$')
ax.legend(loc='best')
plt.tight_layout()

# optionally store the figure as .pdf and close it afterwards
if save_figure:
    fig_path = f'{fig_dir}{feature}_{network}.pdf'
    fig.savefig(fig_path, dpi=600, format='pdf')
    plt.close(fig)


The next cell plots the histogram of cluster conductances.

In [None]:
# histogram settings
feature = 'conductance'
n_bins = 25
save_figure = True  # if True, the figure is written as .pdf to `fig_dir`
plt.close('all')

# cluster conductances, one entry per method
data = [
    get_cluster_properties(
        graph, Clustering().load(f'{results_dir}clustering_{method}.json'), feature=feature)
    for method in methods
]

# draw the histogram of all methods into a single axes (linear scale)
fig, ax = plt.subplots(figsize=(12, 9))
plot_histogram(ax, data, methods, n_bins, normalization='pmf', log_scale=False)
ax.set(xlabel=r'Conductance, $\kappa(\mathcal{Y}_i)$',
       ylabel=r'Bin Probability Mass, $p(\kappa(\mathcal{Y}_i))$')
ax.legend(loc='best')
plt.tight_layout()

# optionally store the figure as .pdf and close it afterwards
if save_figure:
    fig_path = f'{fig_dir}{feature}_{network}.pdf'
    fig.savefig(fig_path, dpi=600, format='pdf')
    plt.close(fig)



The next cell plots the histogram of cluster cut ratios.

In [None]:
# histogram settings
feature = 'cut_ratio'
n_bins = 25
xmin = None  # no explicit lower x-limit is passed to the histogram helper
save_figure = True  # if True, the figure is written as .pdf to `fig_dir`
plt.close('all')

# cluster cut ratios, one entry per method
data = [
    get_cluster_properties(
        graph, Clustering().load(f'{results_dir}clustering_{method}.json'), feature=feature)
    for method in methods
]

# draw the histogram of all methods into a single axes
fig, ax = plt.subplots(figsize=(12, 9))
plot_histogram(ax, data, methods, n_bins, normalization='pmf', log_scale=True, xmin=xmin)
ax.set(xlabel=r'Cut Ratio, $\xi(\mathcal{Y}_i)$',
       ylabel=r'Bin Probability Mass, $p(\xi(\mathcal{Y}_i))$')
ax.legend(loc='best')
plt.tight_layout()

# optionally store the figure as .pdf and close it afterwards
if save_figure:
    fig_path = f'{fig_dir}{feature}_{network}.pdf'
    fig.savefig(fig_path, dpi=600, format='pdf')
    plt.close(fig)

The next cell plots the histogram of node mixing parameters.

In [None]:
# histogram settings
feature = 'mixing_parameter'
n_bins = 15
xmin = 1e-2
save_figure = True  # if True, the figure is written as .pdf to `fig_dir`
plt.close('all')

# node mixing parameters, one entry per method
data = [
    get_node_properties(
        graph, Clustering().load(f'{results_dir}clustering_{method}.json'), feature=feature)
    for method in methods
]

# draw the histogram of all methods into a single axes
fig, ax = plt.subplots(figsize=(12, 9))
plot_histogram(ax, data, methods, n_bins, normalization='pmf', log_scale=True, xmin=xmin)
ax.set(xlabel=r'Mixing parameter, $\mu_\alpha$',
       ylabel=r'Bin Probability Mass, $p(\mu_\alpha)$')
ax.legend(loc='best')
plt.tight_layout()

# optionally store the figure as .pdf and close it afterwards
if save_figure:
    fig_path = f'{fig_dir}{feature}_{network}.pdf'
    fig.savefig(fig_path, dpi=600, format='pdf')
    plt.close(fig)


The next cell plots the histogram of normalized local degrees.

In [None]:
# histogram settings
feature = 'nld'
n_bins = 25
save_figure = True  # if True, the figure is saved as .pdf in `fig_dir`
plt.close('all')

# compute node properties (normalized local degrees), one entry per method
data = []
for method in methods:
    clu = Clustering().load(results_dir + 'clustering_' + method + '.json')
    data.append(get_node_properties(graph, clu, feature=feature))

# plot histogram
fig, ax = plt.subplots(figsize=(12, 9))
plot_histogram(ax, data, methods, n_bins, normalization='pmf', log_scale=True)

ax.set_xlabel(r'Normalized local degree, $\hat{k}_\alpha$')
# the values are aggregated into `n_bins` bins, so the y-axis shows the
# probability mass per bin -- labelled like the other binned histograms
ax.set_ylabel(r'Bin Probability Mass, $p(\hat{k}_\alpha)$')
ax.legend(loc='best')
plt.tight_layout()

# save figure as .pdf
if save_figure:
    fig_path = fig_dir + feature + '_' + network + '.pdf'
    fig.savefig(fig_path, dpi=600, format='pdf')
    plt.close(fig)