### Classification analysis using node statistics

In this notebook we conduct a classification analysis by considering node statistics, such as e.g. average degree,
for correct and misclassified nodes, given the benchmark results of any community detection method.

First, we import the necessary packages.

In [1]:
%reload_ext autoreload
%autoreload 2
import os
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
from clusim.clustering import Clustering

from src.data.lfr_io import get_benchmark_files
from src.utils.cluster_analysis import normalized_local_degrees, get_misclassfied_nodes
from src.utils.plotting import init_plot_style, plot_histogram
from src.wrappers.igraph import read_graph
%pylab

init_plot_style()

# output directory for storing generated figures
fig_dir = '../figures/'
os.makedirs(fig_dir, exist_ok=True)


Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


Here we load the graph & clustering data and extract correctly classified and misclassified nodes.

In [2]:

# analysis parameters
method = 'walktrap'  # name of the method sub-directory holding the predicted clusterings
avg_degree = 50
n = 1200
mu = 0.63
num_samples = 100    # upper bound on the number of benchmark samples that are evaluated

# assemble file paths
# round() rather than int(): int() truncates, and float products such as 100*0.57
# evaluate to 56.99999999999999, which would select the '56mu' directory.
# NOTE(review): assumes the directories were named with the rounded percentage - confirm
# against the code that generated them.
mu_percent = round(100 * mu)
dir_suffix = f'{avg_degree}deg/{n}n/{mu_percent}mu/'
pred_dir = '../results/lfr/clustering/' + method + '/' + dir_suffix
benchmark_dir = '../data/lfr_benchmark/' + dir_suffix

# node degrees, pooled over all evaluated samples
degrees_correct = deque()
degrees_wrong = deque()

# normalized local degrees, pooled over all evaluated samples
nlds_correct = deque()
nlds_wrong = deque()

# look up graph and clustering files; all three mappings are indexed by the same seed keys
graph_files, clustering_files_true = get_benchmark_files(benchmark_dir)
_, clustering_files_pred = get_benchmark_files(pred_dir)

# iterate over (at most `num_samples`) graph files and evaluate
for seed in list(graph_files.keys())[:num_samples]:
    # load graph & clusterings
    graph = read_graph(graph_files[seed])
    clu_true = Clustering().load(clustering_files_true[seed])
    clu_pred = Clustering().load(clustering_files_pred[seed])

    # extract correctly classified and misclassified nodes
    correctly_classified, misclassified = get_misclassfied_nodes(clu_true, clu_pred)

    # gather node degrees (the node ids are used as indices into the degree sequence)
    node_degrees = graph.degree()
    degrees_correct.extend(node_degrees[node] for node in correctly_classified)
    degrees_wrong.extend(node_degrees[node] for node in misclassified)

    # gather normalized local degrees, computed with respect to the ground-truth clustering
    nlds = normalized_local_degrees(graph, clu_true)
    nlds_correct.extend(nlds[node] for node in correctly_classified)
    nlds_wrong.extend(nlds[node] for node in misclassified)


Plot the degree distributions.

In [3]:
save_figure = False # if True, we save the figure as .pdf in `fig_dir`

# pool the degrees of both groups once to determine the common degree range
all_degrees = np.asarray(degrees_correct + degrees_wrong)
if all_degrees.size == 0:
    raise ValueError('No node degrees collected - run the data loading cell and check the input paths.')
min_deg = all_degrees.min()
max_deg = all_degrees.max()

# bin edges at half-integers (min_deg - 0.5, ..., max_deg + 0.5),
# so that every integer degree gets its own bin
bin_edges = np.arange(min_deg, max_deg + 2) - 0.5

data = [degrees_correct, degrees_wrong]
labels = [f'$C$ = Correct ({len(degrees_correct)} nodes)',
          f'$C$ = Misclassified ({len(degrees_wrong)} nodes)']

fig, ax = plt.subplots(figsize=(12,9))
plot_histogram(ax, data, labels, normalization = 'pmf', bin_edges=bin_edges, tick_fmt='%d')

ax.set_xlabel(r'Node Degree, $k_\alpha$')
ax.set_ylabel(r'Probability mass, $p(k_\alpha | C)$')
ax.legend(loc=0)
plt.tight_layout()

# save figure as .pdf
if save_figure:
    # round() instead of int(): int() truncates float products such as 100*0.57 (-> 56)
    fig_name = 'degrees_' + method + f'_{avg_degree}k_{n}n_{round(100*mu)}mu.pdf'
    fig.savefig(os.path.join(fig_dir, fig_name), dpi=600, format='pdf')
    plt.close(fig)


Plot the distributions of the normalized local degrees on a log scale.

In [4]:
save_figure = False # if True, we save the figure as .pdf in `fig_dir`

# The number of bins is the width of the observed degree range, as before. It is computed
# here from the collected degrees so that this cell does not rely on variables defined in
# the degree-plot cell, and it is clamped to at least one bin (all degrees equal -> 0).
all_degrees = np.asarray(degrees_correct + degrees_wrong)
n_bins = max(int(all_degrees.max() - all_degrees.min()), 1)

data = [nlds_correct, nlds_wrong]
labels = [f'$C$ = Correct ({len(nlds_correct)} nodes)',
          f'$C$ = Misclassified ({len(nlds_wrong)} nodes)']

fig, ax = plt.subplots(figsize=(12,9))
plot_histogram(ax, data, labels, n_bins, normalization = 'pmf', log_scale=True)

ax.set_xlabel(r'Normalized local degree, $\hat{k}_\alpha$')
ax.set_ylabel(r'Bin probability mass, $p(\hat{k}_\alpha | C)$')
ax.legend(loc='upper left')
plt.tight_layout()

# save figure as .pdf
if save_figure:
    # round() instead of int(): int() truncates float products such as 100*0.57 (-> 56)
    fig_name = 'nld_' + method + f'_{avg_degree}k_{n}n_{round(100*mu)}mu.pdf'
    fig.savefig(os.path.join(fig_dir, fig_name), dpi=600, format='pdf')
    plt.close(fig)