# Experiments on real data

### Imports

In [None]:
import time
import numpy as np
import networkx as nx

In [None]:
from paris import paris
from louvain import louvain
from spectral import spectral
from hierarchy import select_clustering, top_clusterings
from real_data import load_dataset
from plot_tools import plot_clusterings, plot_running_times

### Openstreet

In [None]:
# Select the "openstreet" dataset and load it. load_dataset returns three
# values, bound here to G, pos and name for the cells below.
# NOTE(review): presumably G is a networkx graph, pos holds the node positions
# used for plotting and name maps nodes to labels — confirm in
# real_data.load_dataset.
data = "openstreet"
G, pos, name = load_dataset(data)

In [None]:
# nx.info() was removed in NetworkX 3.0, so build the one-line summary from
# Graph methods instead of relying on it.
print("{} with {} nodes and {} edges".format(
    type(G).__name__, G.number_of_nodes(), G.number_of_edges()))

In [None]:
# Snapshot the node list, run paris on the graph, then ask top_clusterings
# for clusterings derived from its output D.
# NOTE(review): the last argument (2) is presumably the number of clusterings
# to return — confirm in hierarchy.top_clusterings.
nodes = list(G.nodes())
D = paris(G)
C_list = top_clusterings(D, nodes, 2)

In [None]:
# Plot the clusterings in C_list for graph G, passing the loaded pos.
plot_clusterings(G, C_list, pos)

### Openflights

In [None]:
# Select the "openflights" dataset and load it; this rebinds the notebook
# globals G, pos and name used by the cells below.
data = "openflights"
G, pos, name = load_dataset(data)

In [None]:
# nx.info() was removed in NetworkX 3.0, so build the one-line summary from
# Graph methods instead of relying on it.
print("{} with {} nodes and {} edges".format(
    type(G).__name__, G.number_of_nodes(), G.number_of_edges()))

In [None]:
# Snapshot the node list, run paris on the graph, then ask top_clusterings
# for clusterings derived from its output D.
# NOTE(review): the last argument (3) is presumably the number of clusterings
# to return — confirm in hierarchy.top_clusterings.
nodes = list(G.nodes())
D = paris(G)
C_list = top_clusterings(D, nodes, 3)

In [None]:
# Plot the clusterings in C_list for graph G, passing the loaded pos.
plot_clusterings(G, C_list, pos)

### SchoolsWikipedia

In [None]:
def show_largest_clusters(C, G, name, k=10, nb_nodes=10):
    """Print the biggest clusters of a clustering with their top-degree nodes.

    Clusters are visited in the order given by ``np.argsort`` on their
    negated sizes, i.e. largest first. For each visited cluster the function
    prints its rank, its size and a comma-separated list of node names, the
    nodes being taken in the order given by ``np.argsort`` on their negated
    degrees in ``G``.

    Args:
        C: Sequence of clusters; each cluster is an indexable sequence of
            nodes.
        G: Object exposing ``degree(u)`` for every node ``u`` in the clusters.
        name: Container indexed by node that yields the string printed for
            that node.
        k: Maximum number of clusters to print.
        nb_nodes: Maximum number of node names to print per cluster.

    Returns:
        None; everything is written to standard output.
    """
    by_size = np.argsort([-len(cluster) for cluster in C])
    # Clamp at zero so a negative limit prints nothing instead of slicing
    # from the end of the array.
    nb_shown = max(min(len(C), k), 0)
    for rank, cluster_pos in enumerate(by_size[:nb_shown], start=1):
        cluster = C[cluster_pos]
        by_degree = np.argsort([-G.degree(node) for node in cluster])
        print("#{}".format(rank))
        print("Size = {}".format(len(cluster)))
        nb_listed = max(min(nb_nodes, len(cluster)), 0)
        labels = [name[cluster[j]] for j in by_degree[:nb_listed]]
        # The explicit "\n" adds a blank line after each cluster.
        print(", ".join(labels) + "\n")

In [None]:
# Select the "wikipedia-school" dataset and load it; this rebinds the notebook
# globals G, pos and name. name is later handed to show_largest_clusters,
# which indexes it by node to get the printed label.
data = "wikipedia-school"
G, pos, name = load_dataset(data)

In [None]:
# nx.info() was removed in NetworkX 3.0, so build the one-line summary from
# Graph methods instead of relying on it.
print("{} with {} nodes and {} edges".format(
    type(G).__name__, G.number_of_nodes(), G.number_of_edges()))

In [None]:
# Run paris on the graph, snapshot the node list, then ask top_clusterings
# for clusterings derived from D.
# NOTE(review): the last argument (6) is presumably the number of clusterings
# to return — confirm in hierarchy.top_clusterings.
D = paris(G)
nodes = list(G.nodes())
C_list = top_clusterings(D, nodes, 6)

In [None]:
# Number of clusters in each clustering of C_list (displayed as the cell's
# output value when run in a notebook).
[len(C) for C in C_list]

In [None]:
# Best clustering (first in the list)
# Prints up to 10 of its largest clusters (the default k).
show_largest_clusters(C_list[0], G, name)

In [None]:
# Last clustering in the list, printing up to 20 of its largest clusters.
# NOTE(review): the original comment labelled this entry "Best" as well, the
# same as the first entry — confirm how top_clusterings orders its results.
show_largest_clusters(C_list[-1], G, name, k = 20)

In [None]:
# Clustering with 100 clusters
# NOTE(review): n - k is passed as the third argument, presumably the number
# of merges to apply so that k clusters remain — confirm in
# hierarchy.select_clustering.
n = len(nodes)
k = 100
# dist (the second returned value) is not used afterwards in this cell.
C, dist = select_clustering(D, nodes, n - k)
show_largest_clusters(C, G, name)

In [None]:
# Clustering with 500 clusters
# NOTE(review): n - k is passed as the third argument, presumably the number
# of merges to apply so that k clusters remain — confirm in
# hierarchy.select_clustering.
n = len(nodes)
k = 500
# dist (the second returned value) is not used afterwards in this cell.
C, dist = select_clustering(D, nodes, n - k)
show_largest_clusters(C, G, name)

### Running times

In [None]:
def test_algo(G, algo, nb_samples):
    """Time repeated runs of one clustering algorithm on a graph.

    Args:
        G: Graph passed unchanged to the selected algorithm on every run.
        algo: Algorithm name: "paris", "louvain" or "spectral".
        nb_samples: Number of timed runs to perform.

    Returns:
        List of ``nb_samples`` elapsed times in seconds, one per run, in
        execution order.

    Raises:
        ValueError: If ``algo`` is not one of the supported names. Without
            this check an unknown name would time an empty body and report
            meaningless near-zero durations.
    """
    # Resolve the callable once, outside the timed region, so the string
    # comparisons are not part of the measurement.
    if algo == "paris":
        run = paris
    elif algo == "louvain":
        run = louvain
    elif algo == "spectral":
        run = spectral
    else:
        raise ValueError("unknown algorithm: {!r}".format(algo))

    result = []
    for _ in range(nb_samples):
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump when the system clock
        # is adjusted.
        start = time.perf_counter()
        run(G)  # The algorithm's return value is not needed for timing.
        result.append(time.perf_counter() - start)
    return result

In [None]:
def get_running_times(nb_samples=100):
    """Benchmark every algorithm on every dataset.

    Each dataset ("openstreet", "openflights", "wikipedia-school") is loaded
    once with ``load_dataset``; each algorithm ("paris", "louvain",
    "spectral") is then timed on it through ``test_algo``. The dataset and
    algorithm names are printed as the benchmark progresses.

    Args:
        nb_samples: Number of timed runs per (dataset, algorithm) pair,
            forwarded to ``test_algo``.

    Returns:
        A list with one entry per algorithm, in the order paris, louvain,
        spectral. Each entry is a list with one element per dataset (in the
        order above), that element being the value returned by ``test_algo``.
    """
    dataset_names = ("openstreet", "openflights", "wikipedia-school")
    algo_names = ("paris", "louvain", "spectral")

    # One accumulator per algorithm, filled dataset by dataset.
    timings = [[] for _ in algo_names]
    for dataset_name in dataset_names:
        print("* {}".format(dataset_name))
        # Only the graph is needed here; the other two returned values are
        # discarded.
        graph, _, _ = load_dataset(dataset_name)
        for per_algo, algo_name in zip(timings, algo_names):
            print(algo_name)
            per_algo.append(test_algo(graph, algo_name, nb_samples))
    return timings

In [None]:
# For quick test
# Uses 5 timed runs per (dataset, algorithm) pair instead of the default of
# 100 declared by get_running_times.
nb_samples = 5

results = get_running_times(nb_samples)

In [None]:
# Plot the timings collected by get_running_times.
plot_running_times(results)