# Benchmark for Clustering

In [1]:
"""Load/import helper functions"""

import time
import random
from LocalPopular import locally_popular_clustering, extract_labels_from_communities, time_tester, calculate_scores_clustering

from GraphFunctions import generate_agents, calculate_euclidian_relationships, create_graph, \
    my_make_circles, create_graphs_euclid, create_graphs_kNN, \
    generate_graph,create_graphs_hop_distance, create_graphs_hop_distance_abs,randomize_graph_node_labels

from PlotHelperFunctions import plot_clustering, plot_stuff

from sklearn.cluster import KMeans, DBSCAN
from sklearn.datasets import make_moons, load_breast_cancer, load_iris
from sklearn.metrics import rand_score
import numpy as np
import networkx as nx
from scipy.spatial import distance

from community_detection.leiden import leiden
from community_detection.louvain import louvain
from community_detection.quality_functions import CPM, Modularity



## Create Graphs

In [2]:

moon_agents,moon_truth = make_moons(n_samples=150, noise=0.05)

circle_agents, circle_truth = my_make_circles(150)

cancer = load_breast_cancer()

cancer_agents = cancer['data']
cancer_truth = cancer['target']

iris = load_iris()
iris_agents = iris['data']
iris_truth = iris['target']


## Run the algorithms


In [3]:
import itertools
import timeit



data = [ moon_agents, circle_agents, cancer_agents,iris_agents]
expected_clusters = [2,3,2,2]
graph_names = [ 'Moons', 'Three Circles', 'Breast Cancer', 'Iris']
graph_truths =  [moon_truth,circle_truth, cancer_truth,iris_truth]


kmeans = lambda clusters, agents: KMeans(n_clusters = clusters).fit_predict(agents)
dbscan = lambda agents: DBSCAN(eps=0.2, min_samples=5).fit_predict(agents)

lp_a_b =lambda agents, G_F, G_E, initial_clustering: locally_popular_clustering(agents, G_F, G_E, initial_clustering,mode='B')
lp_a_f =lambda agents, G_F, G_E, initial_clustering: locally_popular_clustering(agents, G_F, G_E, initial_clustering,mode='F')
lp_a_e =lambda agents, G_F, G_E, initial_clustering: locally_popular_clustering(agents, G_F, G_E, initial_clustering,mode='E')

algorithms2 = [ lp_a_b,lp_a_f,lp_a_e,kmeans, dbscan]
algo_names2 = [ 'LP (Balanced) Heuristic',\
               'LP (Friend-Oriented) Heuristic','LP (Enemy-Averse) Heuristic',\
               'kmeans', 'dbscan']
is_lp_heuristic = [True,True,True, False, False]

collected_data = {}

for ((graph, g_name,clusters,truth), (algo, a_name,lp_heuristic)) in \
    itertools.product(zip(data, graph_names, expected_clusters,graph_truths), zip(algorithms2, algo_names2,is_lp_heuristic)):
    agents = graph
    G_F, G_E = create_graphs_euclid(agents,0.2,0.3)
    if clusters is not None:
        initial_clustering = {i : i%clusters for i in range(len(agents))}
    else:
        initial_clustering = {i : i for i in range(len(agents))}

        
    if lp_heuristic:
        # start with everyone alone
        a_name_modiified = a_name + ' starting with everyone alone'
        print(f"Running {a_name_modiified} on {g_name} … ", end='')
        initial_clustering = {i : i for i in range(len(agents))}
        test_callable = lambda: list(algo(agents,G_F,G_E,initial_clustering).values())
        times,outputs = time_tester(test_callable,5)
        avg_time = sum(times)/len(times)
        scores = calculate_scores_clustering(outputs,truth,agents)
        scores['Time'] = avg_time

        collected_data[(a_name_modiified,g_name)] = scores
        print(f"execution time: ~ {avg_time:.6f} seconds.")
        for score_name in scores.keys():
            print(score_name,": ~",scores.get(score_name))
        
        
        # start with random clustering 

        a_name_modified = a_name + ' starting with predicted number of clusters'
        print(f"Running {a_name_modified} on {g_name} … ", end='')
        initial_clustering = {i : i%clusters for i in range(len(agents))}
        test_callable = lambda: list(algo(agents,G_F,G_E,initial_clustering).values())
        times,outputs = time_tester(test_callable,5)
        avg_time = sum(times)/len(times)
        scores = calculate_scores_clustering(outputs,truth,agents)
        scores['Time'] = avg_time

        collected_data[(a_name_modified,g_name)] = scores
        print(f"execution time: ~ {avg_time:.6f} seconds.")
        for score_name in scores.keys():
            print(score_name,": ~",scores.get(score_name))

        # start with the output of k-means
        a_name_modifiied = a_name + ' starting with the output of k-means'
        print(f"Running {a_name_modifiied} on {g_name} … ", end='')
        kmeans_pre = KMeans(n_clusters = clusters)
        k_means_labels = kmeans_pre.fit_predict(agents)
        initial_clustering = {i : k_means_labels[i] for i in range(len(agents))}
        test_callable = lambda: list(algo(agents,G_F,G_E,initial_clustering).values())
        times,outputs = time_tester(test_callable,5)
        avg_time = sum(times)/len(times)
        scores = calculate_scores_clustering(outputs,truth,agents)

        collected_data[(a_name_modifiied,g_name)] = scores
        print(f"execution time: ~ {avg_time:.6f} seconds.")
        for score_name in scores.keys():
            print(score_name,": ~",scores.get(score_name))

        # start with the output of DBSCAN
        a_name_modifiied = a_name + ' starting with the output of DBSCAN'
        print(f"Running {a_name_modifiied} on {g_name} … ", end='')
        dbscan_pre = DBSCAN(eps=0.2, min_samples=5)
        dbscan_labels = dbscan_pre.fit_predict(agents)
        initial_clustering = {i : dbscan_labels[i] for i in range(len(agents))}
        test_callable = lambda: list(algo(agents,G_F,G_E,initial_clustering).values())
        times,outputs = time_tester(test_callable,5)
        avg_time = sum(times)/len(times)
        scores = calculate_scores_clustering(outputs,truth,agents)
        scores['Time'] = avg_time
        
        collected_data[(a_name_modifiied,g_name)] = scores
        print(f"execution time: ~ {avg_time:.6f} seconds.")
        for score_name in scores.keys():
            print(score_name,": ~",scores.get(score_name))
        
    else:

        print(f"Running {a_name} on {g_name} … ", end='')
        test_callable = lambda: algo(agents)
        if algo == kmeans:
            if clusters is not None:
                test_callable = lambda: algo(clusters,agents)
            
        times,outputs = time_tester(test_callable,5)
        avg_time = sum(times)/len(times)
        scores = calculate_scores_clustering(outputs,truth,agents)
        scores['Time'] = avg_time
        
        collected_data[(a_name,g_name)] = scores
        print(f"execution time: ~ {avg_time:.6f} seconds.")
        for score_name in scores.keys():
            print(score_name,": ~",scores.get(score_name))



Running LP (Balanced) Heuristic starting with everyone alone on Moons … execution time: ~ 0.521910 seconds.
Rand Index : ~ 0.6193288590604027
Silhouette Score : ~ 0.3198500064515449
Davies Bouldin Score : ~ 1.174834402968167
Time : ~ 0.5219102399889379
Running LP (Balanced) Heuristic starting with predicted number of clusters on Moons … execution time: ~ 0.007470 seconds.
Rand Index : ~ 0.7386129753914988
Silhouette Score : ~ 0.3198500064515449
Davies Bouldin Score : ~ 1.174834402968167
Time : ~ 0.007469599996693432
Running LP (Balanced) Heuristic starting with the output of k-means on Moons … execution time: ~ 0.004509 seconds.
Rand Index : ~ 0.7386129753914988
Silhouette Score : ~ 0.3198500064515449
Davies Bouldin Score : ~ 1.174834402968167
Running LP (Balanced) Heuristic starting with the output of DBSCAN on Moons … execution time: ~ 0.005119 seconds.
Rand Index : ~ 0.7386129753914988
Silhouette Score : ~ 0.3198500064515449
Davies Bouldin Score : ~ 1.174834402968167
Time : ~ 0.0051

## Gather the numbers

We can use the collected_data dictionairy to build a table for better comparison


In [None]:
print(collected_data)