# Do logarithmic proximity measures outperform plain ones in graph clustering?

In [1]:
import os
os.environ["OPENBLAS_NUM_THREADS"] = "1"
import warnings
warnings.filterwarnings("ignore")
import sys
sys.path.append('../..')

In [2]:
from collections import defaultdict
from itertools import product
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import adjusted_rand_score

In [3]:
from pygraphs.graphs.generator import StochasticBlockModel
from pygraphs.measure import ALL_kernels
from pygraphs.cluster import KernelWardSklearn
from pygraphs.scenario import ParallelByGraphs, measures_right_order
from pygraphs.scorer import copeland
from pygraphs.util import load_or_calc_and_save, ddict2dict

In [4]:
# Column order used when the competition tables are printed further down.
# NOTE: this rebinds the `measures_right_order` name imported from
# pygraphs.scenario, so the list below is the one actually in effect.
measures_right_order = [
    # "H"-suffixed measures
    'pWalk H', 'Walk H',
    'For H', 'logFor H',
    'Comm H', 'logComm H',
    'Heat H', 'logHeat H',
    'SCT H', 'SCCT H',
    # "K"-suffixed measures
    'RSP K', 'FE K',
    'SP-CT H',
    # "R"-suffixed measures
    'NormalizedHeat R',
    'PersonalizedPageRank R',
    'ModifiedPersonalizedPageRank R',
    'HeatPersonalizedPageRank R',
]

## 3. Competition by Copeland’s score
The competition has been performed on random graphs generated with the $G(N,(m)p_{in}, p_{out})$ model and the following parameters: $N \in \{100, 200\}$, the number of classes $m \in \{2, 4\}$, $p_{in} = 0.3$, $p_{out} \in \{0.1, 0.15\}$. For every combination of parameters, we generated 50 graphs and for each of them we computed the best ARI each measure family reached. The results are presented in Table 1(a).

### Find the best params and the 90th-percentile params

In [5]:
@load_or_calc_and_save('results/3_best_params_3_100_ward.pkl')
def calc():
    """Choose, per graph setting and measure, a best and a 90th-percentile parameter.

    For each (n_nodes, n_classes, p_out) setting, 100 graphs are generated and
    every kernel in ALL_kernels is run through ParallelByGraphs.perform with
    KernelWardSklearn over 31 evenly spaced values in [0, 1].  Each run yields
    a triple that is unpacked as (x, y, error).  Two x values are recorded per
    measure: the one at argmax(y), and the one at the first position where y
    equals its 90th percentile (computed with 'lower' interpolation, so the
    percentile is an actual element of y).  One summary line is printed per
    (setting, measure).

    Returns:
        (best_params, percentile_params): both keyed by setting, then by
        measure name, after being passed through ddict2dict.
    """
    # Stage 1: collect the raw (x, y, error) curves for every setting/measure.
    curves = {}
    for setting in tqdm(list(product([100, 200], [2, 4], [0.1, 0.15]))):
        n_nodes, n_classes, p_out = setting
        sbm = StochasticBlockModel(n_nodes, n_classes, p_in=0.3, p_out=p_out)
        graphs, _info = sbm.generate_graphs(100)
        runner = ParallelByGraphs(adjusted_rand_score, np.linspace(0, 1, 31), progressbar=False)
        per_measure = curves.setdefault(setting, {})
        for measure_class in tqdm(ALL_kernels, desc=str(setting)):
            per_measure[measure_class.name] = runner.perform(
                KernelWardSklearn, measure_class, graphs, n_classes, n_jobs=12)

    # Stage 2: reduce each curve to the two parameter picks.
    best_params = defaultdict(lambda: defaultdict(lambda: 0))
    percentile_params = defaultdict(lambda: defaultdict(lambda: 0))
    row_template = '{}\t{}\t{:0.2f} ({:0.2f})\t{:0.2f} ({:0.2f})'
    for setting, per_measure in curves.items():
        for name, (grid, scores, _error) in per_measure.items():
            top = np.argmax(scores)
            # 'lower' interpolation guarantees the value occurs in `scores`,
            # so the .index() lookup below can find it.
            threshold = np.percentile(scores, 90, interpolation='lower')
            near_top = list(scores).index(threshold)

            print(row_template.format(
                setting, name.ljust(8, ' '),
                grid[top], scores[top], grid[near_top], scores[near_top]))
            best_params[setting][name] = grid[top]
            percentile_params[setting][name] = grid[near_top]
    return (ddict2dict(best_params), ddict2dict(percentile_params))

best_params, percentile_params = calc()

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 2, 0.1)', max=17, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='(100, 2, 0.15)', max=17, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='(100, 4, 0.1)', max=17, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='(100, 4, 0.15)', max=17, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='(200, 2, 0.1)', max=17, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='(200, 2, 0.15)', max=17, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='(200, 4, 0.1)', max=17, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='(200, 4, 0.15)', max=17, style=ProgressStyle(description_widt…


(100, 2, 0.1)	pWalk H 	0.73 (0.84)	0.60 (0.82)
(100, 2, 0.1)	Walk H  	0.27 (0.92)	0.50 (0.91)
(100, 2, 0.1)	For H   	0.97 (0.64)	0.90 (0.63)
(100, 2, 0.1)	logFor H	0.10 (0.92)	0.13 (0.91)
(100, 2, 0.1)	Comm H  	0.20 (0.87)	0.13 (0.84)
(100, 2, 0.1)	logComm H	0.20 (0.93)	0.10 (0.92)
(100, 2, 0.1)	Heat H  	0.57 (0.79)	0.43 (0.79)
(100, 2, 0.1)	logHeat H	0.40 (0.94)	0.27 (0.94)
(100, 2, 0.1)	SCT H   	0.87 (0.93)	0.73 (0.92)
(100, 2, 0.1)	SCCT H  	0.07 (0.96)	0.03 (0.95)
(100, 2, 0.1)	RSP K   	0.80 (0.93)	0.77 (0.92)
(100, 2, 0.1)	FE K    	0.60 (0.92)	0.67 (0.92)
(100, 2, 0.1)	SP-CT H 	0.03 (0.85)	0.97 (0.85)
(100, 2, 0.1)	NormalizedHeat R	0.87 (0.94)	0.70 (0.93)
(100, 2, 0.1)	PersonalizedPageRank R	0.43 (0.92)	0.47 (0.92)
(100, 2, 0.1)	ModifiedPersonalizedPageRank R	0.43 (0.92)	0.47 (0.92)
(100, 2, 0.1)	HeatPersonalizedPageRank R	0.87 (0.95)	0.80 (0.93)
(100, 2, 0.15)	pWalk H 	0.70 (0.45)	0.50 (0.43)
(100, 2, 0.15)	Walk H  	0.10 (0.55)	0.03 (0.54)
(100, 2, 0.15)	For H   	0.00 (0.04)	0.70

### calc competition for given params

In [6]:
# Copeland competition using, for every measure, the parameter stored in
# `best_params` for the current graph setting.
#
# For each (n_nodes, n_classes, p_out) setting up to 600 graphs are generated,
# and graphs are consumed until 200 of them have been processed without an
# error.  On each graph every kernel is clustered with KernelWardSklearn and
# scored with adjusted_rand_score; copeland() turns the per-measure ARIs into
# per-measure deltas, which are accumulated both per setting and under 'sum'.
results = defaultdict(lambda: defaultdict(lambda: 0))
for column in tqdm(list(product([100, 200], [2, 4], [0.1, 0.15]))):
    n_nodes, n_classes, p_out = column
    graphs, info = StochasticBlockModel(n_nodes, n_classes, p_in=0.3, p_out=p_out).generate_graphs(600)
    success = 0
    for edges, nodes in tqdm(graphs, desc=str(column)):
        try:
            single_competition_best = {}
            for kernel_class in ALL_kernels:
                best_param = best_params[column][kernel_class.name]
                kernel = kernel_class(edges)
                # Map the chosen parameter through the kernel's own scaler
                # before building the kernel matrix.
                param = kernel.scaler.scale(best_param)
                K = kernel.get_K(param)
                y_pred = KernelWardSklearn(n_classes).predict(K)
                ari = adjusted_rand_score(nodes, y_pred)
                single_competition_best[kernel_class.name] = ari
            single_competition_score = copeland(single_competition_best.items())
            # Scores are only accumulated once every kernel succeeded on this
            # graph, so a failing graph contributes nothing.
            for measure_name, delta in single_competition_score.items():
                results[column][measure_name] += delta
                results['sum'][measure_name] += delta
            success += 1
        except Exception as e:
            # Was `except Exception or FloatingPointError`: the `or` expression
            # evaluates to plain `Exception`, and FloatingPointError is already
            # an Exception subclass, so this catches exactly the same errors.
            # Best effort by design: report the failure and try the next graph.
            print(e)
        if success == 200:
            break

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 2, 0.1)', max=600, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='(100, 2, 0.15)', max=600, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='(100, 4, 0.1)', max=600, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='(100, 4, 0.15)', max=600, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='(200, 2, 0.1)', max=600, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='(200, 2, 0.15)', max=600, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='(200, 4, 0.1)', max=600, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='(200, 4, 0.15)', max=600, style=ProgressStyle(description_wid…

In [7]:
# Header row: measure names in display order.
print('\t'.join(measures_right_order))
# One row per key of `results` (including the 'sum' key): the key followed by
# that row's score for each measure.  Every cell ends with a tab and cells are
# separated by a single space, with one trailing space before the newline.
for column_name, column_results in results.items():
    cells = ['{}\t'.format(column_name)]
    cells += ['{}\t'.format(column_results[name]) for name in measures_right_order]
    print(*cells, end=' \n')

pWalk H	Walk H	For H	logFor H	Comm H	logComm H	Heat H	logHeat H	SCT H	SCCT H	RSP K	FE K	SP-CT H	NormalizedHeat R	PersonalizedPageRank R	ModifiedPersonalizedPageRank R	HeatPersonalizedPageRank R
(100, 2, 0.1)	 -1870	 -148	 -2231	 318	 -1016	 243	 529	 998	 491	 1184	 309	 312	 -1536	 556	 381	 381	 1099	 
sum	 -9561	 -1260	 -17703	 1782	 -4456	 279	 -6769	 6010	 5280	 14648	 4388	 1787	 -10827	 4146	 2013	 2013	 8230	 
(100, 2, 0.15)	 -1223	 -88	 -2906	 400	 -819	 97	 -2066	 983	 725	 2099	 576	 219	 -893	 771	 428	 428	 1269	 
(100, 4, 0.1)	 -1077	 -76	 -2520	 516	 -795	 -20	 -2520	 642	 1206	 1994	 828	 810	 -970	 256	 390	 390	 946	 
(100, 4, 0.15)	 2288	 -1106	 2288	 -796	 2288	 -1008	 2288	 -892	 -464	 176	 -608	 -912	 -1286	 -794	 -588	 -588	 -286	 
(200, 2, 0.1)	 -1674	 274	 -3056	 360	 -281	 345	 819	 800	 569	 791	 586	 283	 -1858	 541	 380	 380	 741	 
(200, 2, 0.15)	 -2375	 8	 -3174	 -78	 -1321	 196	 -479	 1550	 741	 2664	 705	 145	 -1830	 1060	 280	 280	 1628	 
(200, 4, 0.1)	

In [8]:
# Copeland competition using, for every measure, the parameter stored in
# `percentile_params` for the current graph setting.
#
# For each (n_nodes, n_classes, p_out) setting up to 600 graphs are generated,
# and graphs are consumed until 200 of them have been processed without an
# error.  On each graph every kernel is clustered with KernelWardSklearn and
# scored with adjusted_rand_score; copeland() turns the per-measure ARIs into
# per-measure deltas, which are accumulated both per setting and under 'sum'.
# NOTE: this rebinds `results`, replacing any earlier contents.
results = defaultdict(lambda: defaultdict(lambda: 0))
for column in tqdm(list(product([100, 200], [2, 4], [0.1, 0.15]))):
    n_nodes, n_classes, p_out = column
    graphs, info = StochasticBlockModel(n_nodes, n_classes, p_in=0.3, p_out=p_out).generate_graphs(600)
    success = 0
    for edges, nodes in tqdm(graphs, desc=str(column)):
        try:
            single_competition_best = {}
            for kernel_class in ALL_kernels:
                # Despite the name, this is the percentile-based parameter.
                best_param = percentile_params[column][kernel_class.name]
                kernel = kernel_class(edges)
                # Map the chosen parameter through the kernel's own scaler
                # before building the kernel matrix.
                param = kernel.scaler.scale(best_param)
                K = kernel.get_K(param)
                y_pred = KernelWardSklearn(n_classes).predict(K)
                ari = adjusted_rand_score(nodes, y_pred)
                single_competition_best[kernel_class.name] = ari
            single_competition_score = copeland(single_competition_best.items())
            # Scores are only accumulated once every kernel succeeded on this
            # graph, so a failing graph contributes nothing.
            for measure_name, delta in single_competition_score.items():
                results[column][measure_name] += delta
                results['sum'][measure_name] += delta
            success += 1
        except Exception as e:
            # Was `except Exception or FloatingPointError`: the `or` expression
            # evaluates to plain `Exception`, and FloatingPointError is already
            # an Exception subclass, so this catches exactly the same errors.
            # Best effort by design: report the failure and try the next graph.
            print(e)
        if success == 200:
            break

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, description='(100, 2, 0.1)', max=600, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='(100, 2, 0.15)', max=600, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='(100, 4, 0.1)', max=600, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='(100, 4, 0.15)', max=600, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='(200, 2, 0.1)', max=600, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='(200, 2, 0.15)', max=600, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='(200, 4, 0.1)', max=600, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='(200, 4, 0.15)', max=600, style=ProgressStyle(description_wid…

In [9]:
# Header row: measure names in display order.
print('\t'.join(measures_right_order))
# One row per key of `results` (including the 'sum' key).  Each cell is the
# value followed by a tab; cells are joined by single spaces and the row ends
# with one extra space before the newline.
for column_name, column_results in results.items():
    row_values = [column_name]
    row_values.extend(column_results[name] for name in measures_right_order)
    line = ' '.join('{}\t'.format(value) for value in row_values)
    print(line + ' ')

pWalk H	Walk H	For H	logFor H	Comm H	logComm H	Heat H	logHeat H	SCT H	SCCT H	RSP K	FE K	SP-CT H	NormalizedHeat R	PersonalizedPageRank R	ModifiedPersonalizedPageRank R	HeatPersonalizedPageRank R
(100, 2, 0.1)	 -1911	 240	 -2398	 492	 -1381	 297	 402	 949	 466	 1459	 575	 561	 -1590	 458	 281	 281	 819	 
sum	 -10901	 1816	 -23923	 3944	 -9787	 -6280	 -10406	 6939	 5623	 16726	 6470	 4865	 -7093	 6367	 4122	 4122	 7396	 
(100, 2, 0.15)	 -1331	 315	 -3033	 511	 -1320	 -194	 -1878	 930	 574	 2347	 695	 531	 -586	 749	 531	 531	 628	 
(100, 4, 0.1)	 -1024	 235	 -3128	 594	 -788	 -326	 -2724	 508	 891	 1984	 1012	 810	 -596	 710	 620	 620	 602	 
(100, 4, 0.15)	 -183	 284	 -3108	 114	 -45	 -515	 -2568	 521	 648	 1445	 554	 376	 222	 583	 434	 434	 804	 
(200, 2, 0.1)	 -1691	 325	 -3029	 335	 -529	 400	 850	 850	 307	 925	 303	 451	 -1602	 728	 425	 425	 527	 
(200, 2, 0.15)	 -1743	 17	 -2914	 422	 -2060	 -2700	 643	 1251	 719	 2843	 900	 468	 -1297	 1055	 513	 513	 1370	 
(200, 4, 0.1)	 -1892	