# Do logarithmic proximity measures outperform plain ones in graph clustering?

In [1]:
import os
# Pin OpenBLAS to a single thread. This is set before NumPy is imported in a
# later cell (presumably so BLAS threads do not oversubscribe the cores when
# work is fanned out with n_jobs -- TODO confirm).
os.environ["OPENBLAS_NUM_THREADS"] = "1"
import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
import sys
# Add the directory two levels up to the import path (presumably where the
# pygraphs package imported below lives -- verify against the repo layout).
sys.path.append('../..')

In [2]:
from collections import defaultdict
from itertools import product
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import adjusted_rand_score

In [3]:
from pygraphs.graphs.generator import StochasticBlockModel
from pygraphs.measure import H_kernels_plus_RSP_FE
from pygraphs.cluster.ward import Ward
from pygraphs.scenario import ParallelByGraphs, measures_right_order
from pygraphs.scorer import copeland
from pygraphs.util import load_or_calc_and_save, ddict2dict

## 3. Competition by Copeland’s score
The competition has been performed on random graphs generated with the $G(N,(m)p_{in}, p_{out})$ model and the following parameters: $N \in \{100, 200\}$, the number of classes $m \in \{2, 4\}$, $p_{in} = 0.3$, $p_{out} \in \{0.1, 0.15\}$. For every combination of parameters, we generated 50 graphs and for each of them we computed the best ARIs the measure families reached. The results are presented in Table 1(a).

### Find best params and 90th percentile

In [4]:
@load_or_calc_and_save('results/3_best_params_3_100.pkl')
def calc():
    """Sweep every measure over a parameter grid and keep two parameters each.

    For each (n_nodes, n_classes, p_out) setting, 100 graphs are generated
    from a StochasticBlockModel with p_in = 0.3.  Every measure class in
    ``H_kernels_plus_RSP_FE`` is then handed to ``ParallelByGraphs.perform``
    together with ``Ward``, using ``adjusted_rand_score`` and 31 evenly
    spaced values in [0, 1].  ``perform`` returns a triple that is unpacked
    as ``(x, y, error)``; ``x`` is treated as the parameter values and ``y``
    as the matching scores.

    Per setting and measure, two entries of ``x`` are kept and printed:
    the one at ``argmax(y)`` and the one whose ``y`` equals the 90th
    percentile of ``y`` (``interpolation='lower'``, so the percentile is an
    actual element of ``y``; the first matching position is used).

    Returns:
        ``(best_params, percentile_params)``, each a
        ``{setting: {measure_name: parameter}}`` mapping passed through
        ``ddict2dict``.
    """
    row_format = '{}\t{}\t{:0.2f} ({:0.2f})\t{:0.2f} ({:0.2f})'
    settings = list(product([100, 200], [2, 4], [0.1, 0.15]))

    # Stage 1: collect the sweep result of every measure on every setting.
    sweeps = defaultdict(lambda: defaultdict(lambda: 0))
    for setting in tqdm(settings):
        n_nodes, n_classes, p_out = setting
        generator = StochasticBlockModel(n_nodes, n_classes, 0.3, p_out)
        graphs, _ = generator.generate_graphs(100)
        sweep = ParallelByGraphs(adjusted_rand_score, np.linspace(0, 1, 31), progressbar=False)
        for measure in tqdm(H_kernels_plus_RSP_FE, desc=str(setting)):
            sweeps[setting][measure.name] = sweep.perform(Ward, measure, graphs, n_classes, n_jobs=12)

    # Stage 2: reduce every sweep to its arg-max and 90th-percentile parameter.
    best_params = defaultdict(lambda: defaultdict(lambda: 0))
    percentile_params = defaultdict(lambda: defaultdict(lambda: 0))
    for setting, per_measure in sweeps.items():
        for name, (x, y, _) in per_measure.items():
            top = np.argmax(y)
            # 'lower' never interpolates, so the value can be looked up in y.
            p90_value = np.percentile(y, 90, interpolation='lower')
            p90 = list(y).index(p90_value)

            print(row_format.format(setting, name.ljust(8, ' '), x[top], y[top], x[p90], y[p90]))
            best_params[setting][name] = x[top]
            percentile_params[setting][name] = x[p90]
    return (ddict2dict(best_params), ddict2dict(percentile_params))

best_params, percentile_params = calc()

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 2, 0.1)', max=13), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 2, 0.15)', max=13), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 4, 0.1)', max=13), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 4, 0.15)', max=13), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 2, 0.1)', max=13), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 2, 0.15)', max=13), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 4, 0.1)', max=13), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 4, 0.15)', max=13), HTML(value='')))


(100, 4, 0.1)	pWalk H 	0.00 (0.40)	0.67 (0.32)
(100, 4, 0.1)	FE K    	0.93 (0.34)	0.80 (0.33)
(100, 4, 0.1)	Comm H  	0.00 (0.40)	0.33 (0.33)
(100, 4, 0.1)	logFor H	0.53 (0.34)	0.47 (0.34)
(100, 4, 0.1)	logComm H	0.67 (0.47)	0.70 (0.45)
(100, 4, 0.1)	Walk H  	0.53 (0.35)	0.60 (0.34)
(100, 4, 0.1)	For H   	0.00 (0.40)	0.87 (0.01)
(100, 4, 0.1)	SCT H   	0.90 (0.33)	0.83 (0.32)
(100, 4, 0.1)	Heat H  	0.00 (0.40)	0.90 (0.09)
(100, 4, 0.1)	logHeat H	0.27 (0.35)	0.17 (0.34)
(100, 4, 0.1)	SCCT H  	0.83 (0.40)	0.87 (0.39)
(100, 4, 0.1)	SP-CT H 	0.27 (0.23)	0.33 (0.23)
(100, 4, 0.1)	RSP K   	0.97 (0.33)	0.87 (0.29)
(100, 2, 0.1)	pWalk H 	0.77 (0.82)	0.90 (0.81)
(100, 2, 0.1)	FE K    	0.93 (0.84)	0.80 (0.83)
(100, 2, 0.1)	Comm H  	0.33 (0.87)	0.27 (0.84)
(100, 2, 0.1)	logFor H	0.47 (0.85)	0.43 (0.83)
(100, 2, 0.1)	logComm H	0.57 (0.96)	0.70 (0.95)
(100, 2, 0.1)	Walk H  	0.70 (0.84)	0.77 (0.83)
(100, 2, 0.1)	For H   	0.97 (0.45)	0.87 (0.43)
(100, 2, 0.1)	SCT H   	0.43 (0.84)	0.67 (0.83)
(100, 2, 

### calc competition for given params

In [5]:
# Copeland competition in which every measure uses its entry from best_params.
#
# For each (n_nodes, n_classes, p_out) column up to 200 graphs are generated
# and processed until `required_successes` of them have been scored without
# an exception.  On each graph every measure in H_kernels_plus_RSP_FE is
# clustered with Ward and scored with adjusted_rand_score; copeland() turns
# the per-measure scores into per-measure deltas, which are accumulated both
# under the column key and under the extra 'sum' key.
required_successes = 50
results = defaultdict(lambda: defaultdict(lambda: 0))
for column in tqdm(list(product([100, 200], [2, 4], [0.1, 0.15]))):
    n_nodes, n_classes, p_out = column
    graphs, info = StochasticBlockModel(n_nodes, n_classes, 0.3, p_out).generate_graphs(200)
    success = 0
    for edges, nodes in tqdm(graphs, desc=str(column)):
        try:
            single_competition_best = {}
            for kernel_class in H_kernels_plus_RSP_FE:
                best_param = best_params[column][kernel_class.name]
                kernel = kernel_class(edges)
                param = kernel.scaler.scale(best_param)
                K = kernel.get_K(param)
                y_pred = Ward(n_classes).predict(K)
                ari = adjusted_rand_score(nodes, y_pred)
                single_competition_best[kernel_class.name] = ari
            # Scores are only accumulated once every measure succeeded on this
            # graph, so a failing graph contributes nothing to any measure.
            single_competition_score = copeland(single_competition_best.items())
            for measure_name, delta in single_competition_score.items():
                results[column][measure_name] += delta
                results['sum'][measure_name] += delta
            success += 1
        except Exception as e:
            # Best effort: report the failure and try the next graph.  The
            # former `Exception or FloatingPointError` evaluated to plain
            # `Exception`; FloatingPointError is a subclass of Exception and
            # is therefore still caught here.
            print(e)
        if success == required_successes:
            break
    if success < required_successes:
        # Ran out of graphs: this column (and 'sum') is built from fewer
        # competitions than the others, so say so instead of staying silent.
        print('{}: only {} of {} graphs were scored'.format(column, success, required_successes))

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 2, 0.1)', max=200), HTML(value='')))

divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by 

divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars


HBox(children=(IntProgress(value=0, description='(100, 2, 0.15)', max=200), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 4, 0.1)', max=200), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 4, 0.15)', max=200), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 2, 0.1)', max=200), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 2, 0.15)', max=200), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 4, 0.1)', max=200), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 4, 0.15)', max=200), HTML(value='')))

In [6]:
# Print the accumulated scores as a tab-separated table: one header line
# with the measure names, then one line per key of `results`.
print('\t'.join(measures_right_order))
for row_label, row_scores in results.items():
    # Plain indexing is intentional: when the rows are defaultdicts (as built
    # by the preceding cell), measures without a recorded score print as 0.
    row_cells = [row_label] + [row_scores[name] for name in measures_right_order]
    for cell in row_cells:
        print('{}\t'.format(cell), end=" ")
    print()

pWalk H	Walk H	For H	logFor H	Comm H	logComm H	Heat H	logHeat H	SCT H	SCCT H	RSP K	FE K	SP-CT H	SP K	CT H
sum	 647	 -1141	 -968	 -635	 1952	 2471	 884	 169	 151	 1694	 -969	 -907	 -3348	 0	 0	 
(100, 2, 0.1)	 -3	 4	 -10	 -8	 4	 10	 10	 4	 -6	 10	 -3	 0	 -12	 0	 0	 
(200, 2, 0.15)	 -41	 -144	 -568	 3	 244	 580	 136	 202	 -39	 289	 -107	 -97	 -458	 0	 0	 
(200, 2, 0.1)	 -106	 -221	 -546	 -83	 297	 333	 332	 332	 69	 238	 -1	 -102	 -542	 0	 0	 
(200, 4, 0.15)	 442	 -346	 442	 -262	 442	 152	 442	 -292	 76	 76	 -336	 -294	 -542	 0	 0	 
(100, 4, 0.15)	 448	 -316	 448	 -298	 448	 -122	 448	 -294	 148	 148	 -358	 -294	 -406	 0	 0	 
(100, 4, 0.1)	 86	 -40	 86	 -70	 86	 406	 86	 -16	 -106	 188	 -132	 -104	 -470	 0	 0	 
(200, 4, 0.1)	 -138	 -98	 -260	 50	 246	 600	 -260	 166	 -42	 458	 -124	 -4	 -594	 0	 0	 
(100, 2, 0.15)	 -41	 20	 -560	 33	 185	 512	 -310	 67	 51	 287	 92	 -12	 -324	 0	 0	 


In [7]:
# Copeland competition in which every measure uses its entry from
# percentile_params.
#
# For each (n_nodes, n_classes, p_out) column up to 100 graphs are generated
# and processed until `required_successes` of them have been scored without
# an exception.  On each graph every measure in H_kernels_plus_RSP_FE is
# clustered with Ward and scored with adjusted_rand_score; copeland() turns
# the per-measure scores into per-measure deltas, which are accumulated both
# under the column key and under the extra 'sum' key.
required_successes = 50
results = defaultdict(lambda: defaultdict(lambda: 0))
for column in tqdm(list(product([100, 200], [2, 4], [0.1, 0.15]))):
    n_nodes, n_classes, p_out = column
    graphs, info = StochasticBlockModel(n_nodes, n_classes, 0.3, p_out).generate_graphs(100)
    success = 0
    for edges, nodes in tqdm(graphs, desc=str(column)):
        try:
            single_competition_best = {}
            for kernel_class in H_kernels_plus_RSP_FE:
                best_param = percentile_params[column][kernel_class.name]
                kernel = kernel_class(edges)
                param = kernel.scaler.scale(best_param)
                K = kernel.get_K(param)
                y_pred = Ward(n_classes).predict(K)
                ari = adjusted_rand_score(nodes, y_pred)
                single_competition_best[kernel_class.name] = ari
            # Scores are only accumulated once every measure succeeded on this
            # graph, so a failing graph contributes nothing to any measure.
            single_competition_score = copeland(single_competition_best.items())
            for measure_name, delta in single_competition_score.items():
                results[column][measure_name] += delta
                results['sum'][measure_name] += delta
            success += 1
        except Exception as e:
            # Best effort: report the failure and try the next graph.  The
            # former `Exception or FloatingPointError` evaluated to plain
            # `Exception`; FloatingPointError is a subclass of Exception and
            # is therefore still caught here.
            print(e)
        if success == required_successes:
            break
    if success < required_successes:
        # Ran out of graphs: this column (and 'sum') is built from fewer
        # competitions than the others, so say so instead of staying silent.
        print('{}: only {} of {} graphs were scored'.format(column, success, required_successes))

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 2, 0.1)'), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 2, 0.15)'), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 4, 0.1)'), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 4, 0.15)'), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 2, 0.1)'), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 2, 0.15)'), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 4, 0.1)'), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(200, 4, 0.15)'), HTML(value='')))

In [8]:
print('\t'.join(measures_right_order))
for column_name, column_results in results.items():
    print('{}\t'.format(column_name), end=" ")
    for measure_name in measures_right_order:
        measure_results = column_results[measure_name]
        print('{}\t'.format(measure_results), end=" ")
    print()

pWalk H	Walk H	For H	logFor H	Comm H	logComm H	Heat H	logHeat H	SCT H	SCCT H	RSP K	FE K	SP-CT H	SP K	CT H
sum	 -1	 264	 -4363	 444	 1068	 3866	 -1015	 1055	 84	 2602	 -1402	 266	 -2868	 0	 0	 
(100, 2, 0.1)	 45	 -79	 -391	 -23	 109	 440	 296	 236	 -27	 263	 -313	 -74	 -482	 0	 0	 
(200, 2, 0.15)	 -31	 -4	 -587	 44	 60	 565	 249	 11	 -6	 397	 -268	 30	 -460	 0	 0	 
(200, 2, 0.1)	 -62	 -84	 -495	 -126	 198	 398	 386	 391	 52	 295	 -338	 -30	 -585	 0	 0	 
(200, 4, 0.15)	 26	 76	 -578	 94	 158	 582	 -472	 98	 30	 370	 -84	 52	 -352	 0	 0	 
(100, 4, 0.15)	 10	 102	 -561	 116	 120	 340	 -445	 32	 -32	 184	 14	 120	 0	 0	 0	 
(100, 4, 0.1)	 20	 114	 -590	 148	 106	 466	 -492	 80	 4	 360	 -16	 50	 -250	 0	 0	 
(200, 4, 0.1)	 -10	 20	 -599	 134	 168	 574	 -215	 148	 36	 438	 -280	 38	 -452	 0	 0	 
(100, 2, 0.15)	 1	 119	 -562	 57	 149	 501	 -322	 59	 27	 295	 -117	 80	 -287	 0	 0	 
