# Do logarithmic proximity measures outperform plain ones in graph clustering?

In [1]:
import os
# Cap OpenBLAS at one thread. This is set before numpy is imported further down;
# presumably to avoid oversubscription with the n_jobs parallelism used later
# -- TODO confirm.
os.environ["OPENBLAS_NUM_THREADS"] = "1"
import warnings
# NOTE(review): this silences *all* warnings for the whole notebook, including
# numerical ones raised while computing kernels.
warnings.filterwarnings("ignore")
import sys
# Make the repository root (two levels up) importable so `pygraphs` resolves.
sys.path.append('../..')

In [31]:
from collections import defaultdict
from itertools import product
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import adjusted_rand_score
from scipy import stats

In [4]:
from pygraphs.graphs.generator import StochasticBlockModel
from pygraphs.measure import kernels
from pygraphs.cluster import KKMeans
from pygraphs.scenario import ParallelByGraphs
from pygraphs.scorer import copeland
from pygraphs.util import load_or_calc_and_save, ddict2dict

## 3. Competition by Copeland’s score
The competition has been performed on random graphs generated with the $G(N,(m)p_{in}, p_{out})$ model and the following parameters: $N \in \{100, 200\}$, the number of classes $m \in \{2, 4\}$, $p_{in} = 0.3$, $p_{out} \in \{0.1, 0.15\}$. For every combination of parameters, we generated 50 graphs and for each of them we computed the best ARI the measure families reached. The results are presented in Table 1(a).

### find best params and 90th-percentile params

In [7]:
# KKMeans - results/3_best_params_3_100_ward_2.pkl
@load_or_calc_and_save('results/3_best_params_3_100_ward_2.pkl')
def calc():
    """Pick, per setup and per measure, the best and the 90th-percentile parameter.

    Stage 1 sweeps every kernel over a 31-point grid on [0, 1] for each
    (n_nodes, n_classes, p_out) setup, using 100 generated graphs per setup.
    Stage 2 reads each resulting ``(x, y, error)`` triple and records two grid
    values: the one where ``y`` is maximal, and the one where ``y`` equals its
    90th percentile (``interpolation='lower'`` makes the percentile an actual
    element of ``y``, so it can be located with ``list.index``).

    One tab-separated line is printed per (setup, measure):
    setup, name, best x (best y), percentile x (percentile y).

    NOTE(review): the decorator presumably caches the return value in the
    given pickle file -- confirm against ``pygraphs.util``.

    Returns:
        Tuple ``(best_params, percentile_params)``; each is the result of
        ``ddict2dict`` applied to a ``{setup: {measure_name: x_value}}`` mapping.
    """
    row_template = '{}\t{}\t{:0.2f} ({:0.2f})\t{:0.2f} ({:0.2f})'
    setups = list(product([100, 200], [2, 4], [0.1, 0.15]))

    # Stage 1: quality curve of every measure on every setup.
    curves = defaultdict(lambda: defaultdict(lambda: 0))
    for setup in tqdm(setups):
        n_nodes, n_classes, p_out = setup
        generator = StochasticBlockModel(n_nodes, n_classes, p_in=0.3, p_out=p_out)
        graphs, _ = generator.generate_graphs(100)
        sweep = ParallelByGraphs(adjusted_rand_score, np.linspace(0, 1, 31), progressbar=False)
        for kernel_class in tqdm(kernels, desc=str(setup)):
            curves[setup][kernel_class.name] = sweep.perform(
                KKMeans, kernel_class, graphs, n_classes, n_jobs=12)

    # Stage 2: reduce every curve to two representative parameter values.
    best_params = defaultdict(lambda: defaultdict(lambda: 0))
    percentile_params = defaultdict(lambda: defaultdict(lambda: 0))
    for setup, per_measure in curves.items():
        for name, (x, y, _error) in per_measure.items():
            top = np.argmax(y)
            p90_value = np.percentile(y, 90, interpolation='lower')
            p90 = list(y).index(p90_value)

            print(row_template.format(
                setup, name.ljust(8, ' '), x[top], y[top], x[p90], y[p90]))
            best_params[setup][name] = x[top]
            percentile_params[setup][name] = x[p90]

    plain_best = ddict2dict(best_params)
    plain_percentile = ddict2dict(percentile_params)
    return (plain_best, plain_percentile)

# calc() returns a pair: arg-max parameters and 90th-percentile parameters,
# each keyed by setup tuple and then by measure name.
best_params, percentile_params = calc()

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 2, 0.1)', max=21), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 2, 0.15)', max=21), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 4, 0.1)', max=21), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 4, 0.15)', max=21), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 2, 0.1)', max=21), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 2, 0.15)', max=21), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 4, 0.1)', max=21), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 4, 0.15)', max=21), HTML(value='')))


(100, 2, 0.1)	pWalk   	1.00 (0.01)	0.00 (-0.00)
(100, 2, 0.1)	Walk    	0.03 (0.97)	0.33 (0.97)
(100, 2, 0.1)	For     	0.00 (0.00)	0.63 (-0.00)
(100, 2, 0.1)	logFor  	0.27 (0.97)	0.20 (0.97)
(100, 2, 0.1)	Comm    	0.33 (0.74)	0.40 (0.44)
(100, 2, 0.1)	logComm 	0.10 (0.97)	0.23 (0.97)
(100, 2, 0.1)	Heat    	0.50 (0.94)	0.70 (0.92)
(100, 2, 0.1)	logHeat 	0.03 (0.97)	0.17 (0.96)
(100, 2, 0.1)	NHeat   	0.93 (0.93)	0.97 (0.05)
(100, 2, 0.1)	logNHeat	0.07 (0.97)	0.43 (0.97)
(100, 2, 0.1)	SCT     	0.90 (0.92)	0.87 (0.83)
(100, 2, 0.1)	SCCT    	0.70 (0.96)	0.67 (0.96)
(100, 2, 0.1)	RSP     	0.83 (0.77)	0.87 (0.70)
(100, 2, 0.1)	FE      	0.60 (0.57)	0.70 (0.50)
(100, 2, 0.1)	PPR     	0.00 (0.00)	0.17 (-0.00)
(100, 2, 0.1)	logPPR  	0.03 (0.97)	0.13 (0.97)
(100, 2, 0.1)	ModifPPR	0.00 (0.00)	0.10 (-0.00)
(100, 2, 0.1)	logModifPPR	0.07 (0.97)	0.20 (0.97)
(100, 2, 0.1)	HeatPPR 	0.90 (0.59)	0.97 (0.00)
(100, 2, 0.1)	logHeatPPR	0.03 (0.97)	0.20 (0.97)
(100, 2, 0.1)	SP-CT   	1.00 (0.16)	0.97 (0.14)
(10

In [70]:
# Tab-separated table of the selected parameter per kernel (rows) and setup
# (columns). The header row lists only the setups; every cell, including the
# last one on a line, is followed by a tab.
print(''.join(f"{setup}\t" for setup in best_params))
for kernel in kernels:
    cells = [str(kernel.name)]
    cells.extend(f"{best_params[setup][kernel.name]:.2f}" for setup in best_params)
    print('\t'.join(cells) + '\t')

(100, 2, 0.1)	(100, 2, 0.15)	(100, 4, 0.1)	(100, 4, 0.15)	(200, 2, 0.1)	(200, 2, 0.15)	(200, 4, 0.1)	(200, 4, 0.15)	
pWalk	1.00	0.00	0.00	0.00	1.00	0.00	0.03	0.07	
Walk	0.03	0.30	0.30	0.07	0.03	0.47	0.07	0.17	
For	0.00	0.00	0.00	0.13	0.00	0.67	0.00	0.00	
logFor	0.27	0.03	0.03	0.07	0.03	0.03	0.03	0.03	
Comm	0.33	0.37	0.43	0.37	0.23	0.23	0.27	0.23	
logComm	0.10	0.67	0.30	0.73	0.03	0.10	0.10	0.67	
Heat	0.50	0.63	0.63	0.47	0.43	0.33	0.47	0.47	
logHeat	0.03	0.17	0.30	0.17	0.03	0.07	0.07	0.10	
NHeat	0.93	0.93	0.93	0.93	0.93	0.93	0.93	0.93	
logNHeat	0.07	0.33	0.97	0.97	0.03	0.10	0.67	0.97	
SCT	0.90	0.93	0.93	0.97	0.93	0.97	0.97	0.97	
SCCT	0.70	0.90	0.90	0.93	0.03	0.80	0.97	0.90	
RSP	0.83	0.83	0.87	0.60	0.67	0.67	0.80	0.77	
FE	0.60	0.47	0.23	0.37	0.20	0.47	0.43	0.17	
PPR	0.00	0.00	1.00	1.00	1.00	0.00	0.00	0.03	
logPPR	0.03	0.23	0.17	0.20	0.03	0.07	0.10	0.23	
ModifPPR	0.00	1.00	0.00	0.23	1.00	1.00	0.00	1.00	
logModifPPR	0.07	0.33	0.27	0.33	0.03	0.27	0.03	0.03	
HeatPPR	0.90	0.93	0.90	0.90	0.90	0

### calc competition for given params

In [49]:
def calc_competitions(best_params, n_graphs=600, n_success=200):
    """Run a Copeland competition between all kernels on every SBM setup.

    For each (n_nodes, n_classes, p_out) setup, graphs are generated and every
    kernel in ``kernels`` is evaluated on each graph at its own parameter from
    ``best_params``: the kernel matrix is clustered with ``KKMeans`` and scored
    with ARI against the true labels. The per-graph ARI values are passed to
    ``copeland`` and the returned deltas are accumulated per setup and in an
    overall ``'sum'`` entry.

    Evaluation is best-effort: a graph on which any kernel (or the scoring)
    raises is reported with ``print`` and skipped entirely, so it contributes
    nothing to the totals. Processing of a setup stops once ``n_success``
    graphs have been scored.

    Args:
        best_params: mapping ``{setup: {kernel_name: param}}``; ``param`` is
            passed through ``kernel.scaler.scale`` before building the kernel.
        n_graphs: number of graphs generated per setup, i.e. the maximum
            number of attempts (default 600, the previously hard-coded value).
        n_success: number of successfully scored graphs after which a setup is
            finished (default 200, the previously hard-coded value).

    Returns:
        Nested ``defaultdict`` ``{setup_or_'sum': {kernel_name: score}}``.
    """
    results = defaultdict(lambda: defaultdict(lambda: 0))
    for column in tqdm(list(product([100, 200], [2, 4], [0.1, 0.15]))):
        n_nodes, n_classes, p_out = column
        graphs, _ = StochasticBlockModel(n_nodes, n_classes, p_in=0.3, p_out=p_out).generate_graphs(n_graphs)
        success = 0
        for edges, nodes in tqdm(graphs, desc=str(column)):
            try:
                single_competition_best = {}
                for kernel_class in kernels:
                    best_param = best_params[column][kernel_class.name]
                    kernel = kernel_class(edges)
                    param = kernel.scaler.scale(best_param)
                    K = kernel.get_K(param)
                    y_pred = KKMeans(n_classes).fit_predict(K)
                    ari = adjusted_rand_score(nodes, y_pred)
                    single_competition_best[kernel_class.name] = ari
                single_competition_score = copeland(single_competition_best.items())
            except Exception as e:
                # The former `except Exception or FloatingPointError` evaluated
                # to plain `Exception`; FloatingPointError is a subclass of
                # Exception, so it is still caught here.
                print(e)
                continue

            # Accumulate only after the whole graph has been scored, so a
            # failure can never leave a partially counted graph behind.
            for measure_name, delta in single_competition_score.items():
                results[column][measure_name] += delta
                results['sum'][measure_name] += delta
            success += 1
            if success >= n_success:
                break

        if success < n_success:
            # Totals of this setup are based on fewer graphs than the others.
            print('{}: only {} of {} requested graphs were scored'.format(column, success, n_success))
    return results

def print_results(results):
    """Print a tab-separated rank table of the kernels for every setup.

    Scores are negated before ranking with the ``'min'`` tie rule, so rank 1
    is the kernel with the highest accumulated score and tied kernels share
    the lowest rank of their group. Rows follow the order of ``kernels``;
    columns follow the fixed setup order below, ending with ``'sum'``.

    Args:
        results: mapping ``{setup_or_'sum': {kernel_name: score}}`` as
            produced by ``calc_competitions``. Every column listed below must
            be present, otherwise a ``KeyError`` is raised.
    """
    ranks_by_column = {
        str(column): stats.rankdata([-scores[k.name] for k in kernels], 'min')
        for column, scores in results.items()
    }

    ordered_columns = [
        '(100, 2, 0.1)', '(100, 2, 0.15)',
        '(100, 4, 0.1)', '(100, 4, 0.15)',
        '(200, 2, 0.1)', '(200, 2, 0.15)',
        '(200, 4, 0.1)', '(200, 4, 0.15)',
        'sum',
    ]

    print('\t'.join(ordered_columns))
    for position, kernel in enumerate(kernels):
        cells = (str(ranks_by_column[column][position]) for column in ordered_columns)
        print(kernel.name, '\t', '\t'.join(cells))

#### best

In [None]:
# Competition with every kernel evaluated at its arg-max parameter.
results = calc_competitions(best_params)

In [47]:
# Raw accumulated Copeland scores per column, listed in `kernels` order.
for column_name, column_results in results.items():
    print(column_name, [(kernel_cls.name, column_results[kernel_cls.name]) for kernel_cls in kernels])

(100, 2, 0.1) [('pWalk', -3067), ('Walk', 1460), ('For', -3180), ('logFor', 1392), ('Comm', 170), ('logComm', 1433), ('Heat', 628), ('logHeat', 1490), ('NHeat', 953), ('logNHeat', 1541), ('SCT', 1023), ('SCCT', 1165), ('RSP', 405), ('FE', -651), ('PPR', -2984), ('logPPR', 1539), ('ModifPPR', -3017), ('logModifPPR', 1454), ('HeatPPR', -976), ('logHeatPPR', 1484), ('SP-CT', -2262)]
sum [('pWalk', -25526), ('Walk', 13011), ('For', -25821), ('logFor', 12646), ('Comm', -4896), ('logComm', 17847), ('Heat', 2255), ('logHeat', 15200), ('NHeat', 742), ('logNHeat', 19591), ('SCT', 4496), ('SCCT', 13577), ('RSP', -2535), ('FE', -8554), ('PPR', -25056), ('logPPR', 13015), ('ModifPPR', -23788), ('logModifPPR', 13181), ('HeatPPR', -12359), ('logHeatPPR', 14570), ('SP-CT', -11596)]
(100, 2, 0.15) [('pWalk', -2853), ('Walk', 2068), ('For', -2649), ('logFor', 2140), ('Comm', -810), ('logComm', 2373), ('Heat', -181), ('logHeat', 2527), ('NHeat', -376), ('logNHeat', 2274), ('SCT', -911), ('SCCT', 2192), 

In [50]:
# Rank table for the current `results` (rank 1 = highest Copeland score).
print_results(results)

(100, 2, 0.1)	(100, 2, 0.15)	(100, 4, 0.1)	(100, 4, 0.15)	(200, 2, 0.1)	(200, 2, 0.15)	(200, 4, 0.1)	(200, 4, 0.15)	sum
pWalk 	 20	21	19	18	21	19	20	19	20
Walk 	 5	9	8	7	1	8	4	7	8
For 	 21	19	20	21	20	21	21	20	21
logFor 	 8	8	7	6	5	6	8	9	9
Comm 	 14	12	13	14	14	14	14	14	14
logComm 	 7	3	2	2	5	7	1	2	2
Heat 	 12	10	11	12	10	12	11	10	11
logHeat 	 3	1	3	4	5	5	6	4	3
NHeat 	 11	11	10	11	12	13	12	13	12
logNHeat 	 1	4	1	1	1	4	5	1	1
SCT 	 10	13	12	10	3	9	10	11	10
SCCT 	 9	5	5	3	13	10	9	3	5
RSP 	 13	15	14	13	11	11	13	12	13
FE 	 15	16	16	15	15	16	15	15	15
PPR 	 18	20	18	19	19	20	19	21	19
logPPR 	 2	6	9	8	5	3	3	8	7
ModifPPR 	 19	17	21	20	18	17	18	18	18
logModifPPR 	 6	6	6	9	5	2	7	6	6
HeatPPR 	 16	14	15	17	17	18	16	17	17
logHeatPPR 	 4	2	4	5	4	1	2	5	4
SP-CT 	 17	18	17	16	16	15	17	16	16


#### percentile

In [51]:
# Competition with every kernel evaluated at its 90th-percentile parameter.
# Note: this rebinds `results`, so scores from any earlier run are replaced.
results = calc_competitions(percentile_params)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 2, 0.1)', max=600), HTML(value='')))

Exception in thread Thread-11:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.6/dist-packages/tqdm/_monitor.py", line 62, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



HBox(children=(IntProgress(value=0, description='(100, 2, 0.15)', max=600), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 4, 0.1)', max=600), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 4, 0.15)', max=600), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 2, 0.1)', max=600), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 2, 0.15)', max=600), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 4, 0.1)', max=600), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 4, 0.15)', max=600), HTML(value='')))




In [52]:
# Raw accumulated Copeland scores per column, listed in `kernels` order.
for column_name, column_results in results.items():
    print(column_name, [(kernel_cls.name, column_results[kernel_cls.name]) for kernel_cls in kernels])

(100, 2, 0.1) [('pWalk', -2738), ('Walk', 1869), ('For', -2989), ('logFor', 1803), ('Comm', -800), ('logComm', 1737), ('Heat', 1000), ('logHeat', 1839), ('NHeat', -2016), ('logNHeat', 1968), ('SCT', 837), ('SCCT', 1515), ('RSP', 772), ('FE', -941), ('PPR', -3026), ('logPPR', 1982), ('ModifPPR', -2886), ('logModifPPR', 1912), ('HeatPPR', -2014), ('logHeatPPR', 1865), ('SP-CT', -1689)]
sum [('pWalk', -22595), ('Walk', 15349), ('For', -23630), ('logFor', 12131), ('Comm', -9759), ('logComm', 19443), ('Heat', 5183), ('logHeat', 17352), ('NHeat', -20380), ('logNHeat', 17790), ('SCT', 1083), ('SCCT', 15972), ('RSP', 1643), ('FE', -4699), ('PPR', -23014), ('logPPR', 15182), ('ModifPPR', -22399), ('logModifPPR', 15204), ('HeatPPR', -19896), ('logHeatPPR', 17106), ('SP-CT', -7066)]
(100, 2, 0.15) [('pWalk', -2409), ('Walk', 2157), ('For', -2644), ('logFor', 1796), ('Comm', -1281), ('logComm', 2501), ('Heat', 297), ('logHeat', 2623), ('NHeat', -2460), ('logNHeat', 2618), ('SCT', -1418), ('SCCT', 

In [53]:
# Rank table for the current `results` (rank 1 = highest Copeland score).
print_results(results)

(100, 2, 0.1)	(100, 2, 0.15)	(100, 4, 0.1)	(100, 4, 0.15)	(200, 2, 0.1)	(200, 2, 0.15)	(200, 4, 0.1)	(200, 4, 0.15)	sum
pWalk 	 18	17	17	20	19	17	19	20	19
Walk 	 4	7	7	5	2	3	7	7	6
For 	 20	21	21	21	21	18	21	21	21
logFor 	 7	9	9	9	2	7	8	10	9
Comm 	 13	12	13	15	15	15	14	15	15
logComm 	 8	4	3	1	2	8	2	1	1
Heat 	 10	10	10	11	10	11	10	9	10
logHeat 	 6	2	2	3	1	6	5	4	3
NHeat 	 17	19	16	17	17	16	17	17	17
logNHeat 	 2	3	1	6	6	1	1	5	2
SCT 	 11	13	11	10	11	12	12	12	12
SCCT 	 9	5	5	2	11	9	9	2	5
RSP 	 12	11	12	12	9	10	11	11	11
FE 	 14	15	14	13	13	13	13	14	13
PPR 	 21	20	20	18	20	21	20	18	20
logPPR 	 1	8	7	8	6	5	6	6	8
ModifPPR 	 19	18	18	19	18	19	16	16	18
logModifPPR 	 3	6	6	7	2	2	4	8	7
HeatPPR 	 16	14	19	16	16	20	18	19	16
logHeatPPR 	 5	1	4	4	6	4	3	3	4
SP-CT 	 15	16	15	14	14	14	15	13	14
