# Benchmark for Clustering

In [1]:
"""Load/import helper functions"""

import time
import random
from LocalPopular import locally_popular_clustering, extract_labels_from_communities, time_tester, calculate_scores_clustering, \
    locally_popular_clustering_with_euclid_graphs

from GraphFunctions import generate_agents, calculate_euclidian_relationships, create_graph, \
    my_make_circles, create_graphs_euclid, create_graphs_kNN, \
    generate_graph,create_graphs_hop_distance, create_graphs_hop_distance_abs,randomize_graph_pos_labels

from PlotHelperFunctions import plot_clustering, plot_stuff

from sklearn.cluster import KMeans, DBSCAN
from sklearn.datasets import make_moons, load_breast_cancer, load_iris
from sklearn.metrics import rand_score
import numpy as np
import networkx as nx
from scipy.spatial import distance

from community_detection.leiden import leiden
from community_detection.louvain import louvain
from community_detection.quality_functions import CPM, Modularity



## Create Graphs

In [2]:
repetitions = 1    #Number of random isomorph permutation for each graph


moon_agents,moon_truth = make_moons(n_samples=300, noise=0.05)
moon_perm_agents = []
moon_perm_truth = []
for i in range(repetitions):
    g,t = randomize_graph_pos_labels(moon_agents,moon_truth)
    moon_perm_agents += [g]
    moon_perm_truth += [t]



circle_agents, circle_truth = my_make_circles(300)

circle_perm_agents = []
circle_perm_truth = []
for i in range(repetitions):
    g,t = randomize_graph_pos_labels(circle_agents,circle_truth)
    circle_perm_agents += [g]
    circle_perm_truth += [t]

cancer = load_breast_cancer()

cancer_agents = cancer['data']
cancer_truth = cancer['target']

cancer_perm_agents = []
cancer_perm_truth = []
for i in range(repetitions):
    g,t = randomize_graph_pos_labels(cancer_agents,cancer_truth)
    cancer_perm_agents += [g]
    cancer_perm_truth += [t]

iris = load_iris()
iris_agents = iris['data']
iris_truth = iris['target']


iris_perm_agents = []
iris_perm_truth = []
for i in range(repetitions):
    g,t = randomize_graph_pos_labels(iris_agents,iris_truth)
    iris_perm_agents += [g]
    iris_perm_truth += [t]


## Run the algorithms


In [3]:
import itertools
import timeit

f = 0.2   #f-bound
e = 0.3   #e-bound

data = [ moon_perm_agents,circle_perm_agents,cancer_perm_agents,iris_perm_agents]
expected_clusters = [2,3,2,3]
graph_names = ['Moons','My 3 Circles', 'Cancer', 'Iris']
graph_truths =  [moon_perm_truth,circle_perm_truth,cancer_perm_truth,iris_perm_truth]


kmeans = lambda agents, clusters: KMeans(n_clusters = clusters).fit_predict(agents)
dbscan = lambda agents, clusters: DBSCAN(eps=0.2, min_samples=5).fit_predict(agents)

lp_a_b =lambda agents, initial_clustering, pre: locally_popular_clustering_with_euclid_graphs(agents, f, e, initial_clustering,mode='B',pre=pre)
lp_a_f =lambda agents, initial_clustering, pre: locally_popular_clustering_with_euclid_graphs(agents, f, e, initial_clustering,mode='F',pre=pre)
lp_a_e =lambda agents, initial_clustering, pre: locally_popular_clustering_with_euclid_graphs(agents, f, e, initial_clustering,mode='E',pre=pre)

algorithms = [ lp_a_b,lp_a_f,lp_a_e,kmeans, dbscan]
algo_names = [ 'LP (Balanced) Heuristic',\
               'LP (Friend-Oriented) Heuristic','LP (Enemy-Averse) Heuristic',\
               'kmeans', 'dbscan']
is_lp_heuristic = [True,True,True, False, False]

collected_data = {}

for ((graph, g_name,clusters,truth), (algo, a_name,lp_heuristic)) in \
    itertools.product(zip(data, graph_names, expected_clusters,graph_truths), zip(algorithms, algo_names,is_lp_heuristic)):

        
    agents = graph

    if lp_heuristic:
        # start with everyone alone
        a_name_modified = a_name + ' starting with everyone alone'
        print(f"Running {a_name_modified} on {g_name} … ", end='')
        test_callable = lambda a: list(algo(a,len(agents[0]),None).values())
        times,outputs = time_tester(test_callable,graph)
        avg_time = sum(times)/len(times)
        scores = calculate_scores_clustering(outputs,truth,agents)
        scores['Time'] = avg_time

        collected_data[(a_name_modified,g_name)] = scores
        print(f"execution time: ~ {avg_time:.6f} seconds.")
        for score_name in scores.keys():
            print(score_name,": ~",scores.get(score_name))

        # start with random clustering 

        a_name_modified = a_name + ' starting with predicted number of clusters'
        print(f"Running {a_name_modified} on {g_name} … ", end='')
        test_callable = lambda a: list(algo(a,clusters,None).values())
        times,outputs = time_tester(test_callable,graph)
        avg_time = sum(times)/len(times)
        scores = calculate_scores_clustering(outputs,truth,agents)
        scores['Time'] = avg_time

        collected_data[(a_name_modified,g_name)] = scores
        print(f"execution time: ~ {avg_time:.6f} seconds.")
        for score_name in scores.keys():
            print(score_name,": ~",scores.get(score_name))


        # start with the output of k-means
        a_name_modified = a_name + ' starting with the output of k-means'
        print(f"Running {a_name_modified} on {g_name} … ", end='')
        test_callable = lambda a: list(algo(a,clusters,kmeans).values())
        times,outputs = time_tester(test_callable,graph)
        avg_time = sum(times)/len(times)
        scores = calculate_scores_clustering(outputs,truth,agents)
        scores['Time'] = avg_time

        collected_data[(a_name_modified,g_name)] = scores
        print(f"execution time: ~ {avg_time:.6f} seconds.")
        for score_name in scores.keys():
            print(score_name,": ~",scores.get(score_name))

        # start with the output of dbscan
        a_name_modified = a_name + ' starting with the output of dbscan'
        print(f"Running {a_name_modified} on {g_name} … ", end='')
        test_callable = lambda a: list(algo(a,clusters,kmeans).values())
        times,outputs = time_tester(test_callable,graph)
        avg_time = sum(times)/len(times)
        scores = calculate_scores_clustering(outputs,truth,agents)
        scores['Time'] = avg_time

        collected_data[(a_name_modified,g_name)] = scores
        print(f"execution time: ~ {avg_time:.6f} seconds.")
        for score_name in scores.keys():
            print(score_name,": ~",scores.get(score_name))

            
    else:

        print(f"Running {a_name} on {g_name} … ", end='')
        test_callable = lambda a : algo(a, clusters)
            
        times,outputs = time_tester(test_callable,graph)
        avg_time = sum(times)/len(times)
        scores = calculate_scores_clustering(outputs,truth,agents)
        scores['Time'] = avg_time
        
        collected_data[(a_name,g_name)] = scores
        print(f"execution time: ~ {avg_time:.6f} seconds.")
        for score_name in scores.keys():
            print(score_name,": ~",scores.get(score_name))



Running LP (Balanced) Heuristic starting with everyone alone on Moons … execution time: ~ 4.244243 seconds.
Rand Index : ~ 0.6267112597547381
Silhouette Score : ~ 0.18982124107834342
Davies Bouldin Score : ~ 0.7144303028665547
Time : ~ 4.244242799934
Running LP (Balanced) Heuristic starting with predicted number of clusters on Moons … execution time: ~ 0.348634 seconds.
Rand Index : ~ 0.4983500557413601
Silhouette Score : ~ 0.1923356358663492
Davies Bouldin Score : ~ 149.99864020231377
Time : ~ 0.34863379993475974
Running LP (Balanced) Heuristic starting with the output of k-means on Moons … execution time: ~ 0.417323 seconds.
Rand Index : ~ 0.7488517279821628
Silhouette Score : ~ 0.4663885745588683
Davies Bouldin Score : ~ 0.8255088522736317
Time : ~ 0.4173232999164611
Running LP (Balanced) Heuristic starting with the output of dbscan on Moons … execution time: ~ 0.303922 seconds.
Rand Index : ~ 0.7488517279821628
Silhouette Score : ~ 0.4663885745588683
Davies Bouldin Score : ~ 0.8255

## Gather the numbers

We can use the collected_data dictionairy to build a table for better comparison


In [4]:
print(collected_data)

{('LP (Balanced) Heuristic starting with everyone alone', 'Moons'): {'Rand Index': np.float64(0.6267112597547381), 'Silhouette Score': np.float64(0.18982124107834342), 'Davies Bouldin Score': np.float64(0.7144303028665547), 'Time': 4.244242799934}, ('LP (Balanced) Heuristic starting with predicted number of clusters', 'Moons'): {'Rand Index': np.float64(0.4983500557413601), 'Silhouette Score': np.float64(0.1923356358663492), 'Davies Bouldin Score': np.float64(149.99864020231377), 'Time': 0.34863379993475974}, ('LP (Balanced) Heuristic starting with the output of k-means', 'Moons'): {'Rand Index': np.float64(0.7488517279821628), 'Silhouette Score': np.float64(0.4663885745588683), 'Davies Bouldin Score': np.float64(0.8255088522736317), 'Time': 0.4173232999164611}, ('LP (Balanced) Heuristic starting with the output of dbscan', 'Moons'): {'Rand Index': np.float64(0.7488517279821628), 'Silhouette Score': np.float64(0.4663885745588683), 'Davies Bouldin Score': np.float64(0.8255088522736317),