# Do logarithmic proximity measures outperform plain ones in graph clustering?

In [None]:
import os
# Set OPENBLAS_NUM_THREADS to "1" in this process's environment.
# NOTE(review): presumably meant to stop OpenBLAS from spawning its own threads
# on top of the parallel jobs (n_jobs=-1) used later in the notebook, and
# presumably needs to run before numpy is imported — confirm.
os.environ["OPENBLAS_NUM_THREADS"] = "1"

In [1]:
import sys
from collections import defaultdict
from itertools import combinations, product
import pickle
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import adjusted_rand_score

In [2]:
import warnings
# Suppress all warnings for the rest of the session.
# NOTE(review): this also hides numerical warnings (e.g. from numpy), which can
# make numerical problems in the cells below harder to diagnose.
warnings.filterwarnings("ignore")

In [3]:
# Add the parent directory to the import path; presumably this is where the
# local `py_graphs` package lives relative to the notebook — confirm.
sys.path.append('..')
from py_graphs.graphs.generator import StochasticBlockModel
# NOTE(review): the wildcard imports below bring names into the notebook
# namespace that are not listed here (e.g. ParallelByGraphs is used later but
# never imported explicitly).
from py_graphs.measure import *
from py_graphs.measure import H_kernels_plus_RSP_FE
from py_graphs.cluster.ward import Ward
from py_graphs.colors import d3
from py_graphs.scenario import *
from py_graphs.scorer import copeland

## 3. Competition by Copeland’s score
The competition has been performed on random graphs generated with the $G(N,(m)p_{in}, p_{out})$ model and the following parameters: $N \in \{100, 200\}$, the number of classes $m \in \{2, 4\}$, $p_{in} = 0.3$, $p_{out} \in \{0.1, 0.15\}$. For every combination of parameters, we generated 50 graphs and for each of them we computed the best ARIs the measure families reached. The results are presented in Table 1(a).

### find best params and 95th percentile

In [8]:
# Parameter sweep used to pick the best parameter per measure.
# For every (n_nodes, n_classes, p_out) column, generate graphs from the
# stochastic block model (p_in fixed at 0.3) and store, under
# results[column][measure name], whatever ParallelByGraphs.perform returns
# for a 31-point grid on [0, 1] scored with the adjusted Rand index.
results = defaultdict(lambda: defaultdict(int))
all_columns = list(product([100, 200], [2, 4], [0.1, 0.15]))
for column in tqdm(all_columns):
    (n_nodes, n_classes, p_out) = column
    sbm = StochasticBlockModel(n_nodes, n_classes, 0.3, p_out)
    graphs, info = sbm.generate_graphs(2)
    classic_plot = ParallelByGraphs(
        adjusted_rand_score, np.linspace(0, 1, 31), progressbar=False)
    for measure_class in tqdm(H_kernels_plus_RSP_FE, desc=str(column)):
        sweep = classic_plot.perform(Ward, measure_class, graphs, n_classes, n_jobs=-1)
        results[column][measure_class.name] = sweep

FloatingPointError: divide by zero encountered in double_scalars

In [None]:
# For every column and measure, select two grid points from the sweep results:
#   * the point where y is maximal, and
#   * the point whose y equals the 95th percentile of y ('lower' interpolation,
#     so the percentile is always an actual element of y).
# Each sweep result is unpacked as (x, y, error); presumably x is the parameter
# grid, y the score and error its spread — confirm against ParallelByGraphs.
best_params = defaultdict(lambda: defaultdict(int))
percentile_params = defaultdict(lambda: defaultdict(int))
row_template = '{}\t{}\t{:0.2f} ({:0.2f})\t{:0.2f} ({:0.2f})'
for column in results:
    measures = results[column]
    for measure_name in measures:
        measure_results = measures[measure_name]
        x, y, error = measure_results
        best_idx = np.argmax(y)
        percentile_value = np.percentile(y, 95, interpolation='lower')
        # First position in y holding the percentile value.
        percentile_idx = list(y).index(percentile_value)

        padded_name = measure_name.ljust(8, ' ')
        print(row_template.format(
            column, padded_name,
            x[best_idx], y[best_idx],
            x[percentile_idx], y[percentile_idx]))
        best_params[column][measure_name] = x[best_idx]
        percentile_params[column][measure_name] = x[percentile_idx]

### calc competition for given params

In [None]:
# Copeland competition at the previously selected parameters.
# For every (n_nodes, n_classes, p_out) column, generate 50 graphs; on each
# graph, cluster with every measure at best_params[column][measure name],
# score the clustering with the adjusted Rand index, turn the per-graph scores
# into Copeland deltas and accumulate them in results[column][measure name].
# NOTE: rebinding `results` discards the sweep data of the earlier cell;
# `best_params` must already have been computed from it.
results = defaultdict(lambda: defaultdict(int))
for column in tqdm(list(product([100, 200], [2, 4], [0.1, 0.15]))):
    # Unpack the current column explicitly; without this the loop would reuse
    # whatever n_nodes / n_classes / p_out were left over from an earlier cell.
    n_nodes, n_classes, p_out = column
    graphs, info = StochasticBlockModel(n_nodes, n_classes, 0.3, p_out).generate_graphs(50)
    for nodes, edges in tqdm(graphs, desc=str(column)):
        single_competition_best = {}
        for measure_class in H_kernels_plus_RSP_FE:
            best_param = best_params[column][measure_class.name]
            kernel = measure_class(edges)
            param = kernel.scaler.scale(best_param)
            K = kernel.get_K(param)
            # NOTE(review): assumes Ward is constructed with the number of
            # clusters — confirm against py_graphs.cluster.ward.
            y_pred = Ward(n_classes).predict(K)
            ari = adjusted_rand_score(nodes, y_pred)
            # Key by measure name so the keys match those used in best_params.
            single_competition_best[measure_class.name] = ari
        single_competition_score = copeland(single_competition_best.items())
        for measure_name, delta in single_competition_score.items():
            results[column][measure_name] += delta

In [None]:
results