# Do logarithmic proximity measures outperform plain ones in graph clustering?

In [1]:
import os
# Restrict OpenBLAS to a single thread. This is set here, before numpy is
# imported in a later cell.
# NOTE(review): presumably meant to avoid thread oversubscription when the
# experiments below run with n_jobs > 1 -- confirm.
os.environ["OPENBLAS_NUM_THREADS"] = "1"
import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
import sys
# Make the directory two levels up importable.
# NOTE(review): presumably that is where the `pygraphs` package lives -- confirm.
sys.path.append('../..')

In [2]:
from collections import defaultdict
from itertools import product
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import adjusted_rand_score
from scipy import stats

In [3]:
from pygraphs.graphs.generator import StochasticBlockModel
from pygraphs.measure import kernels
from pygraphs.cluster import KWard
from pygraphs.scenario import ParallelByGraphs
from pygraphs.scorer import copeland
from pygraphs.util import load_or_calc_and_save, ddict2dict

## 3. Competition by Copeland’s score
The competition has been performed on random graphs generated with the $G(N,(m)p_{in}, p_{out})$ model and the following parameters: $N \in \{100, 200\}$, the number of classes $m \in \{2, 4\}$, $p_{in} = 0.3$, $p_{out} \in \{0.1, 0.15\}$. For every combination of parameters, we generated 50 graphs and for each of them we computed the best ARIs the measure families reached. The results are presented in Table 1(a).

### find best params and 90th-percentile params

In [4]:
# KKMeans - results/3_best_params_3_100_ward_2.pkl
@load_or_calc_and_save('results/3_best_params_3_100_ward_3.pkl')
def calc():
    """Sweep every kernel's parameter per graph setting and pick two values.

    For each (n_nodes, n_classes, p_out) combination, 100 graphs are generated
    and every kernel in ``kernels`` is evaluated with ``KWard`` over 31 evenly
    spaced parameter values in [0, 1], scored by ``adjusted_rand_score``.

    From each sweep two parameter values are kept:

    * the one at ``np.argmax(y)``;
    * the one at the first position whose score equals the 90th percentile
      of ``y`` (``interpolation='lower'``, so the percentile is an actual
      element of ``y``).

    One line per (column, kernel) is printed with both choices.

    Returns:
        tuple: ``(best_params, percentile_params)``, each the result of
        ``ddict2dict`` applied to a ``{column: {kernel_name: param}}``
        mapping.
    """
    # Stage 1: collect the raw parameter sweeps.
    sweeps = defaultdict(lambda: defaultdict(int))
    settings = list(product([100, 200], [2, 4], [0.1, 0.15]))
    for column in tqdm(settings):
        n_nodes, n_classes, p_out = column
        model = StochasticBlockModel(n_nodes, n_classes, p_in=0.3, p_out=p_out)
        graphs, _ = model.generate_graphs(100)
        sweep = ParallelByGraphs(adjusted_rand_score, np.linspace(0, 1, 31), progressbar=False)
        for measure_class in tqdm(kernels, desc=str(column)):
            sweeps[column][measure_class.name] = sweep.perform(
                KWard, measure_class, graphs, n_classes, n_jobs=10)

    # Stage 2: reduce every sweep to the two parameter choices.
    best_params = defaultdict(lambda: defaultdict(int))
    percentile_params = defaultdict(lambda: defaultdict(int))
    row_template = '{}\t{}\t{:0.2f} ({:0.2f})\t{:0.2f} ({:0.2f})'
    for column, per_measure in sweeps.items():
        # Each sweep unpacks into three sequences.
        # NOTE(review): presumably x = parameter grid, y = score per grid
        # point, error = its spread -- confirm against ParallelByGraphs.perform.
        for measure_name, (x, y, error) in per_measure.items():
            best_idx = np.argmax(y)
            y_p90 = np.percentile(y, 90, interpolation='lower')
            percentile_idx = list(y).index(y_p90)
            x_best = x[best_idx]
            x_p90 = x[percentile_idx]

            print(row_template.format(
                column, measure_name.ljust(8, ' '),
                x_best, y[best_idx], x_p90, y[percentile_idx]))
            best_params[column][measure_name] = x_best
            percentile_params[column][measure_name] = x_p90
    return (ddict2dict(best_params), ddict2dict(percentile_params))

# `calc` is wrapped by load_or_calc_and_save(...); the recorded output below
# ("File exist! Skip calculations") suggests the values came from the pickle
# rather than from a fresh computation.
best_params, percentile_params = calc()

File exist! Skip calculations


### calc competition for given params

In [5]:
def calc_competitions(best_params, n_graphs=600, n_successes=200):
    """Run Copeland competitions between all kernels with fixed parameters.

    For every (n_nodes, n_classes, p_out) combination, ``n_graphs`` graphs are
    generated. On each graph every kernel in ``kernels`` is built, its
    parameter from ``best_params`` is scaled through ``kernel.scaler.scale``,
    the graph is clustered with ``KWard`` and scored by
    ``adjusted_rand_score``. The per-graph ARIs are turned into Copeland
    deltas by ``copeland`` and added both to the column's totals and to the
    cross-column ``'sum'`` totals.

    A graph on which any kernel (or ``copeland``) raises is skipped as a
    whole: the exception is printed and nothing is accumulated for that
    graph. Processing of a column stops once ``n_successes`` graphs have been
    scored.

    Args:
        best_params: ``{column: {kernel_name: param}}`` mapping, indexed as
            ``best_params[column][kernel_class.name]``.
        n_graphs: number of graphs generated per column (pool of candidates).
        n_successes: number of successfully scored graphs after which a
            column is considered complete.

    Returns:
        defaultdict: ``{column_or_'sum': {kernel_name: accumulated_delta}}``;
        missing kernel names read as 0.
    """
    results = defaultdict(lambda: defaultdict(int))
    for column in tqdm(list(product([100, 200], [2, 4], [0.1, 0.15]))):
        n_nodes, n_classes, p_out = column
        graphs, _ = StochasticBlockModel(n_nodes, n_classes, p_in=0.3, p_out=p_out).generate_graphs(n_graphs)
        success = 0
        for edges, nodes in tqdm(graphs, desc=str(column)):
            try:
                single_competition_best = {}
                for kernel_class in kernels:
                    best_param = best_params[column][kernel_class.name]
                    kernel = kernel_class(edges)
                    param = kernel.scaler.scale(best_param)
                    K = kernel.get_K(param)
                    y_pred = KWard(n_classes).fit_predict(K)
                    ari = adjusted_rand_score(nodes, y_pred)
                    single_competition_best[kernel_class.name] = ari
                single_competition_score = copeland(single_competition_best.items())
            except Exception as e:
                # Deliberate best-effort: numerical failures on a single graph
                # must not abort the experiment. FloatingPointError is a
                # subclass of Exception, so it is covered here. (The previous
                # `Exception or FloatingPointError` evaluated to `Exception`.)
                print(e)
            else:
                # Accumulate only after the whole graph was scored, so a
                # failing graph never contributes partial results.
                for measure_name, delta in single_competition_score.items():
                    results[column][measure_name] += delta
                    results['sum'][measure_name] += delta
                success += 1
            if success >= n_successes:
                break
        if success < n_successes:
            # Columns with fewer competitions carry less weight in 'sum';
            # make the shortfall visible instead of failing silently.
            print('{}: only {} of {} requested competitions succeeded'.format(
                column, success, n_successes))
    return results

def print_results(results):
    """Print a table of kernel ranks, one row per kernel, one column per setting.

    Each column of ``results`` is ranked independently with
    ``stats.rankdata(..., 'min')`` on the negated scores, so the highest score
    gets rank 1 and tied kernels share the lowest rank of their group.

    Args:
        results: ``{column: {kernel_name: score}}`` mapping; it must contain
            all eight parameter columns and ``'sum'`` (keys are matched by
            their ``str()`` form).
    """
    column_order = (
        '(100, 2, 0.1)',
        '(100, 2, 0.15)',
        '(100, 4, 0.1)',
        '(100, 4, 0.15)',
        '(200, 2, 0.1)',
        '(200, 2, 0.15)',
        '(200, 4, 0.1)',
        '(200, 4, 0.15)',
        'sum',
    )

    # Rank vectors are aligned with the order of `kernels`.
    ranks_by_column = {
        str(column): stats.rankdata([-scores[kernel.name] for kernel in kernels], 'min')
        for column, scores in results.items()
    }

    print('\t'.join(column_order))
    for row, kernel in enumerate(kernels):
        cells = [str(ranks_by_column[column][row]) for column in column_order]
        print(kernel.name, '\t', '\t'.join(cells))

#### best

In [6]:
# Competition using, for every kernel, the parameter stored in `best_params`.
results = calc_competitions(best_params)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 2, 0.1)', max=600), HTML(value='')))

divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by 

divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by 

divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by 

divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by 

HBox(children=(IntProgress(value=0, description='(100, 2, 0.15)', max=600), HTML(value='')))

Exception in thread Thread-5:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.6/dist-packages/tqdm/_monitor.py", line 62, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



HBox(children=(IntProgress(value=0, description='(100, 4, 0.1)', max=600), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 4, 0.15)', max=600), HTML(value='')))

Singular matrix
Singular matrix
Singular matrix
Singular matrix
Singular matrix
Singular matrix
Singular matrix
Singular matrix
Singular matrix
Singular matrix
Singular matrix
Singular matrix
Singular matrix
Singular matrix
Singular matrix


HBox(children=(IntProgress(value=0, description='(200, 2, 0.1)', max=600), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 2, 0.15)', max=600), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 4, 0.1)', max=600), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 4, 0.15)', max=600), HTML(value='')))

Singular matrix
Singular matrix
Singular matrix
Singular matrix



In [7]:
# Dump the raw accumulated Copeland score of every kernel, one line per column.
for column_name, column_results in results.items():
    scores = [(kernel.name, column_results[kernel.name]) for kernel in kernels]
    print(column_name, scores)

(100, 2, 0.1) [('pWalk', -10), ('Walk', -2), ('For', -19), ('logFor', 7), ('Comm', 7), ('logComm', 7), ('Heat', 7), ('logHeat', -2), ('NHeat', 17), ('logNHeat', 17), ('SCT', -2), ('SCCT', 17), ('RSP', -16), ('FE', 7), ('PPR', -6), ('logPPR', -10), ('ModifPPR', -19), ('logModifPPR', -10), ('HeatPPR', 17), ('logHeatPPR', 7), ('SP-CT', -14)]
sum [('pWalk', 3941), ('Walk', -6680), ('For', -5822), ('logFor', -5164), ('Comm', 9775), ('logComm', 14948), ('Heat', 6301), ('logHeat', 256), ('NHeat', 14232), ('logNHeat', 13658), ('SCT', -776), ('SCCT', 9918), ('RSP', -6836), ('FE', -5128), ('PPR', 5626), ('logPPR', -5130), ('ModifPPR', -27103), ('logModifPPR', -2209), ('HeatPPR', 19874), ('logHeatPPR', -13118), ('SP-CT', -20563)]
(100, 2, 0.15) [('pWalk', -88), ('Walk', -372), ('For', -3605), ('logFor', 88), ('Comm', 325), ('logComm', 3168), ('Heat', -2256), ('logHeat', 135), ('NHeat', 1580), ('logNHeat', 3018), ('SCT', 90), ('SCCT', 1658), ('RSP', -187), ('FE', 97), ('PPR', -48), ('logPPR', 78),

In [8]:
# Rank table for the `results` currently bound (the best_params run).
print_results(results)

(100, 2, 0.1)	(100, 2, 0.15)	(100, 4, 0.1)	(100, 4, 0.15)	(200, 2, 0.1)	(200, 2, 0.15)	(200, 4, 0.1)	(200, 4, 0.15)	sum
pWalk 	 15	14	5	1	13	17	15	1	9
Walk 	 11	16	14	16	16	16	14	17	17
For 	 20	20	5	1	20	20	17	1	16
logFor 	 5	10	15	15	17	13	8	13	15
Comm 	 5	6	5	1	8	8	7	1	6
logComm 	 5	1	1	11	4	1	1	8	2
Heat 	 5	19	5	1	3	6	17	1	7
logHeat 	 11	7	11	17	6	7	6	15	10
NHeat 	 1	5	5	1	5	4	4	1	3
logNHeat 	 1	3	3	12	1	3	2	9	4
SCT 	 11	9	18	8	9	10	13	10	11
SCCT 	 1	4	4	8	7	5	5	10	5
RSP 	 19	15	17	18	11	14	16	18	18
FE 	 5	8	16	13	12	15	9	16	13
PPR 	 14	13	5	1	10	9	10	1	8
logPPR 	 15	11	12	14	14	11	11	14	14
ModifPPR 	 20	21	21	21	21	21	21	21	21
logModifPPR 	 15	11	12	10	14	11	11	12	12
HeatPPR 	 1	2	2	1	2	2	3	1	1
logHeatPPR 	 5	17	19	19	18	18	19	19	19
SP-CT 	 18	18	20	20	19	19	20	20	20


#### percentile

In [9]:
# Same competition, now with `percentile_params`. This rebinds `results`,
# so the scores of the best_params run are no longer available afterwards.
results = calc_competitions(percentile_params)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 2, 0.1)', max=600), HTML(value='')))

Exception in thread Thread-6:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.6/dist-packages/tqdm/_monitor.py", line 62, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



HBox(children=(IntProgress(value=0, description='(100, 2, 0.15)', max=600), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 4, 0.1)', max=600), HTML(value='')))

Singular matrix
Singular matrix
Singular matrix
Singular matrix
Singular matrix


HBox(children=(IntProgress(value=0, description='(100, 4, 0.15)', max=600), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 2, 0.1)', max=600), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 2, 0.15)', max=600), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 4, 0.1)', max=600), HTML(value='')))

Singular matrix
Singular matrix
Singular matrix
Singular matrix


HBox(children=(IntProgress(value=0, description='(200, 4, 0.15)', max=600), HTML(value='')))




In [10]:
# Dump the raw accumulated Copeland score of every kernel, one line per column.
for column_name, column_results in results.items():
    scores = [(kernel.name, column_results[kernel.name]) for kernel in kernels]
    print(column_name, scores)

(100, 2, 0.1) [('pWalk', 48), ('Walk', -273), ('For', -2522), ('logFor', 138), ('Comm', 1200), ('logComm', 3037), ('Heat', 2087), ('logHeat', 1471), ('NHeat', 0), ('logNHeat', 834), ('SCT', 327), ('SCCT', 2287), ('RSP', -1834), ('FE', 23), ('PPR', -25), ('logPPR', 208), ('ModifPPR', -3513), ('logModifPPR', 208), ('HeatPPR', 526), ('logHeatPPR', -1411), ('SP-CT', -2816)]
sum [('pWalk', 1462), ('Walk', 2061), ('For', -27862), ('logFor', 3222), ('Comm', 7591), ('logComm', 26549), ('Heat', -6285), ('logHeat', 8100), ('NHeat', 4764), ('logNHeat', 7815), ('SCT', 1769), ('SCCT', 18469), ('RSP', -9226), ('FE', 2273), ('PPR', 1632), ('logPPR', 1631), ('ModifPPR', -29361), ('logModifPPR', 1576), ('HeatPPR', 9311), ('logHeatPPR', -6737), ('SP-CT', -18754)]
(100, 2, 0.15) [('pWalk', 427), ('Walk', 432), ('For', -3608), ('logFor', 568), ('Comm', 715), ('logComm', 3439), ('Heat', -2060), ('logHeat', 623), ('NHeat', 671), ('logNHeat', 1143), ('SCT', 216), ('SCCT', 2000), ('RSP', -344), ('FE', 364), (

In [11]:
# Rank table for the `results` currently bound (the percentile_params run).
print_results(results)

(100, 2, 0.1)	(100, 2, 0.15)	(100, 4, 0.1)	(100, 4, 0.15)	(200, 2, 0.1)	(200, 2, 0.15)	(200, 4, 0.1)	(200, 4, 0.15)	sum
pWalk 	 12	10	15	10	12	12	13	13	15
Walk 	 16	9	8	6	16	11	11	11	10
For 	 19	21	20	20	19	20	20	20	20
logFor 	 11	8	6	13	11	13	7	7	8
Comm 	 5	5	7	5	5	8	6	5	6
logComm 	 1	1	1	1	1	1	1	1	1
Heat 	 3	19	19	19	2	3	17	19	16
logHeat 	 4	7	9	8	4	4	3	14	4
NHeat 	 14	6	5	3	10	7	12	6	7
logNHeat 	 6	3	4	9	7	5	4	4	5
SCT 	 8	13	14	7	8	10	15	15	11
SCCT 	 2	2	2	2	3	2	2	2	2
RSP 	 18	17	17	17	18	18	18	17	18
FE 	 13	11	10	11	13	16	8	8	9
PPR 	 15	12	11	16	9	9	14	12	12
logPPR 	 9	14	12	12	14	14	9	10	13
ModifPPR 	 21	20	21	21	20	21	21	21	21
logModifPPR 	 9	14	12	15	14	14	9	9	14
HeatPPR 	 7	4	3	4	6	6	5	3	3
logHeatPPR 	 17	16	16	14	17	17	16	16	17
SP-CT 	 20	18	18	18	21	19	19	18	19
