# Benchmark for Community detection

In [1]:
"""Load/import helper functions"""

import time
import random
from LocalPopular import locally_popular_clustering, extract_labels_from_communities, time_tester, calculate_scores_CD

from GraphFunctions import generate_agents, calculate_euclidian_relationships, create_graph, \
    my_make_circles, create_graphs_euclid, create_graphs_kNN, \
    generate_graph,create_graphs_hop_distance, create_graphs_hop_distance_abs,randomize_graph_node_labels

from PlotHelperFunctions import plot_clustering, plot_stuff

from sklearn.cluster import KMeans, DBSCAN
from sklearn.datasets import make_moons
from sklearn.metrics import rand_score
import numpy as np
import networkx as nx
from scipy.spatial import distance

from community_detection.leiden import leiden
from community_detection.louvain import louvain
from community_detection.quality_functions import CPM, Modularity


import data.jazz as jazz
import data.cora as cora

## Create Graphs

In [2]:
# Build the four benchmark graphs and, where available, their ground-truth labels.
# Convention used by the benchmark cell: nodes are labelled 0..n-1 and a truth
# list is indexed by node id (truth[i] belongs to node i).

# Seed the global RNGs so the label shuffling and the synthetic graph are
# repeatable on "Restart & Run All".
# NOTE(review): assumes the GraphFunctions helpers draw from `random` and/or
# `np.random` - confirm; otherwise pass the seed to them explicitly.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# --- Cora citation network ---
cora_graph = cora.get_graph()
# Relabel the nodes to 0..n-1 in iteration order. One pass with enumerate gives
# the same mapping as indexing a freshly built node list for every i.
cora_graph = nx.relabel_nodes(
    cora_graph, {node: idx for idx, node in enumerate(cora_graph.nodes())}
)
cora_graph, _ = randomize_graph_node_labels(cora_graph, None)
# Read the truth by node id (same convention as karate_truth below) rather than
# by node iteration order, which may no longer be 0..n-1 after the shuffle.
# NOTE(review): assumes randomize_graph_node_labels only permutes the labels
# 0..n-1 - confirm against GraphFunctions.
cora_truth = [cora_graph.nodes[i]['subject'] for i in range(cora_graph.number_of_nodes())]

# --- Jazz musicians network (no ground truth available) ---
jazz_graph = jazz.get_graph()
# Shift every label down by one.
# NOTE(review): presumably the jazz data set is 1-indexed, in which case key 0
# is not a node and is simply ignored by relabel_nodes - confirm.
jazz_graph = nx.relabel_nodes(jazz_graph, {i: i - 1 for i in range(len(jazz_graph) + 1)})
jazz_graph, _ = randomize_graph_node_labels(jazz_graph, None)
jazz_truth = None

# --- Zachary's karate club ---
karate_graph = nx.karate_club_graph()
karate_graph, _ = randomize_graph_node_labels(karate_graph, None)
# Node count taken from the graph instead of the hard-coded 34.
karate_truth = [karate_graph.nodes[i]["club"] for i in range(karate_graph.number_of_nodes())]

# --- Synthetic graph ---
# NOTE(review): the benchmark cell labels this graph '25 quasi cliques' and
# expects 25 clusters; the meaning of the individual arguments is defined in
# GraphFunctions.generate_graph - confirm there.
graph, graph_truth = generate_graph(10, 25, 0.2, 0.05)
graph, graph_truth = randomize_graph_node_labels(graph, graph_truth)

## Run the algorithms


In [3]:
import itertools
import timeit

# Number of timed repetitions per (algorithm, graph) run; handed to time_tester.
r = 1

# Parallel lists: one entry per benchmark graph.
graphs = [karate_graph, jazz_graph, cora_graph, graph]
expected_clusters = [2, None, 7, 25]  # None: no expected cluster count for this graph
graph_names = ['Karate Club', 'Jazz Musicians', 'Cora Citations', '25 quasi cliques (Name WIP)']
graph_truths = [karate_truth, jazz_truth, cora_truth, graph_truth]

# Quality function shared by the Louvain and Leiden baselines.
𝓗 = Modularity(1.0)

fn_louvain_mod = lambda G: louvain(G, 𝓗)
fn_leiden_mod  = lambda G: leiden(G, 𝓗)

algorithms = [fn_louvain_mod, fn_leiden_mod]
algo_names = ['Louvain (Mod)', 'Leiden (Mod)']

# Locally-popular heuristics, one wrapper per `mode` value.
lp_a_b = lambda agents, G_F, G_E, initial_clustering: locally_popular_clustering(agents, G_F, G_E, initial_clustering, mode='B')
lp_a_f = lambda agents, G_F, G_E, initial_clustering: locally_popular_clustering(agents, G_F, G_E, initial_clustering, mode='F')
lp_a_e = lambda agents, G_F, G_E, initial_clustering: locally_popular_clustering(agents, G_F, G_E, initial_clustering, mode='E')

# Parallel lists: one entry per benchmarked algorithm.
algorithms2 = [lp_a_b, lp_a_f, lp_a_e, fn_louvain_mod, fn_leiden_mod]
algo_names2 = ['LP (Balanced) Heuristic',
               'LP (Friend-Oriented) Heuristic', 'LP (Enemy-Averse) Heuristic',
               'Louvain (Mod)', 'Leiden (Mod)']
is_lp_heuristic = [True, True, True, False, False]


def _time_and_score(label, g_name, test_callable, truth, scored_graph,
                    repeats, results, to_labels=None):
    """Time one benchmark run, score it, store the scores and print them.

    Parameters:
        label: algorithm label; first half of the key in `results`.
        g_name: graph name; second half of the key in `results`.
        test_callable: zero-argument callable handed to `time_tester`.
        truth: ground truth forwarded to `calculate_scores_CD` (may be None).
        scored_graph: graph forwarded to `calculate_scores_CD`.
        repeats: repetition count forwarded to `time_tester`.
        results: dict updated in place with `results[(label, g_name)] = scores`.
        to_labels: optional converter applied to every raw output before scoring.

    Returns:
        The score dict from `calculate_scores_CD`, with the mean run time
        added under the key 'Time'.
    """
    times, outputs = time_tester(test_callable, repeats)
    if to_labels is not None:
        outputs = [to_labels(raw) for raw in outputs]
    avg_time = sum(times) / len(times)
    scores = calculate_scores_CD(outputs, truth, scored_graph)
    scores['Time'] = avg_time

    results[(label, g_name)] = scores
    print(f"execution time: ~ {avg_time:.6f} seconds.")
    for score_name, value in scores.items():
        print(score_name, ": ~", value)
    return scores


def _leiden_initial_clustering(target_graph, n_agents):
    """Run Leiden on `target_graph` and return its labels as {agent index: label}."""
    partition = fn_leiden_mod(target_graph)
    leiden_labels = extract_labels_from_communities(partition.communities)
    return {i: leiden_labels[i] for i in range(n_agents)}


collected_data = {}
# `current_graph` (not `graph`) so the loop does not rebind the synthetic graph
# created in the previous cell.
for (current_graph, g_name, clusters, truth), (algo, a_name, lp_heuristic) in itertools.product(
        zip(graphs, graph_names, expected_clusters, graph_truths),
        zip(algorithms2, algo_names2, is_lp_heuristic)):

    if not lp_heuristic:
        # Louvain / Leiden: take the graph directly and return an object whose
        # `.communities` is converted to labels before scoring.
        print(f"Running {a_name} on {g_name} … ", end='')
        _time_and_score(
            a_name, g_name, lambda: algo(current_graph), truth, current_graph,
            r, collected_data,
            to_labels=lambda result: extract_labels_from_communities(result.communities),
        )
        continue

    # The hop-distance graphs are only consumed by the LP heuristics, so they
    # are built here rather than for every algorithm.
    agents = current_graph.nodes()
    G_F, G_E = create_graphs_hop_distance(current_graph, 0.2, 0.2)
    n_agents = len(agents)

    # Each entry: (label suffix, zero-argument builder of the initial clustering).
    # Initial clusterings are keyed by 0..n-1.
    starts = []
    if current_graph is cora_graph:
        # Cora does not start from singletons; agents are dealt round-robin into 6 clusters.
        # NOTE(review): presumably because the singleton start is too slow on Cora - confirm.
        starts.append((' starting with everyone alone *6 starting clusters',
                       lambda: {i: i % 6 for i in range(n_agents)}))
    else:
        starts.append((' starting with everyone alone',
                       lambda: {i: i for i in range(n_agents)}))
    if clusters is not None:
        # Round-robin assignment into the expected number of clusters.
        starts.append((' starting with predicted number of clusters',
                       lambda: {i: i % clusters for i in range(n_agents)}))
    starts.append((' starting with the output of leiden',
                   lambda: _leiden_initial_clustering(current_graph, n_agents)))

    for suffix, build_start in starts:
        a_name_modified = a_name + suffix
        print(f"Running {a_name_modified} on {g_name} … ", end='')
        # Built outside the timed callable so only the heuristic itself is timed.
        initial_clustering = build_start()
        _time_and_score(
            a_name_modified, g_name,
            lambda: algo(agents, G_F, G_E, initial_clustering),
            truth, current_graph, r, collected_data,
        )


Running LP (Balanced) Heuristic starting with everyone alone on Karate Club … execution time: ~ 0.005057 seconds.
Rand Index : ~ 0.5632798573975044
Modularity : ~ 0.24550701823429086
Time : ~ 0.005057499976828694
Running LP (Balanced) Heuristic starting with predicted number of clusters on Karate Club … execution time: ~ 0.000563 seconds.
Rand Index : ~ 0.8859180035650623
Modularity : ~ 0.3998332115215232
Time : ~ 0.00056310003856197
Running LP (Balanced) Heuristic starting with the output of leiden on Karate Club … execution time: ~ 0.000567 seconds.
Rand Index : ~ 0.6827094474153298
Modularity : ~ 0.4287119806600326
Time : ~ 0.0005673999548889697
Running LP (Friend-Oriented) Heuristic starting with everyone alone on Karate Club … execution time: ~ 0.005337 seconds.
Rand Index : ~ 0.5704099821746881
Modularity : ~ 0.2655216356515058
Time : ~ 0.005337499955203384
Running LP (Friend-Oriented) Heuristic starting with predicted number of clusters on Karate Club … execution time: ~ 0.00055

## Gather the numbers

We can use the `collected_data` dictionary to build a table for easier comparison.


In [4]:
# Dump the raw benchmark results. Keys are (algorithm label, graph name) tuples;
# each value is the score dict returned by calculate_scores_CD plus the mean
# run time stored under 'Time'.
print(collected_data)

{('LP (Balanced) Heuristic starting with everyone alone', 'Karate Club'): {'Rand Index': np.float64(0.5632798573975044), 'Modularity': 0.24550701823429086, 'Time': 0.005057499976828694}, ('LP (Balanced) Heuristic starting with predicted number of clusters', 'Karate Club'): {'Rand Index': np.float64(0.8859180035650623), 'Modularity': 0.3998332115215232, 'Time': 0.00056310003856197}, ('LP (Balanced) Heuristic starting with the output of leiden', 'Karate Club'): {'Rand Index': np.float64(0.6827094474153298), 'Modularity': 0.4287119806600326, 'Time': 0.0005673999548889697}, ('LP (Friend-Oriented) Heuristic starting with everyone alone', 'Karate Club'): {'Rand Index': np.float64(0.5704099821746881), 'Modularity': 0.2655216356515058, 'Time': 0.005337499955203384}, ('LP (Friend-Oriented) Heuristic starting with predicted number of clusters', 'Karate Club'): {'Rand Index': np.float64(0.8859180035650623), 'Modularity': 0.3998332115215232, 'Time': 0.0005548999761231244}, ('LP (Friend-Oriented) H