# Do logarithmic proximity measures outperform plain ones in graph clustering?

In [1]:
import os
# Restrict OpenBLAS to one thread. This is set in the first cell, before
# numpy is imported further down.
# NOTE(review): presumably this avoids thread oversubscription when the
# experiments run with n_jobs=6 workers -- confirm.
os.environ["OPENBLAS_NUM_THREADS"] = "1"

In [13]:
import sys
from pylab import *
from collections import defaultdict
from itertools import combinations, product
import pickle
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import adjusted_rand_score

In [3]:
import warnings
# Silence every warning raised through the `warnings` module for the rest of
# the session. This keeps cell output short but also hides deprecation and
# numerical warnings.
warnings.filterwarnings("ignore")

In [4]:
sys.path.append('..')
from py_graphs.graphs.generator import StochasticBlockModel
from py_graphs.measure import *
from py_graphs.measure import H_kernels_plus_RSP_FE
from py_graphs.cluster.ward import Ward
from py_graphs.colors import d3
from py_graphs.scenario import *
from py_graphs.scorer import copeland

In [5]:
def load_or_calc_and_save(filename):
    """Build a decorator that caches a zero-argument function in a pickle file.

    Calling the decorated function returns the unpickled contents of
    ``filename`` when that file exists. Otherwise the original function is
    run, its result is pickled to ``filename`` and then returned.

    The parent directory of ``filename`` is created before the computation
    starts, so a missing directory cannot cause the result of a long run to
    be lost at write time.

    NOTE: the cache is read with ``pickle.load``; only point ``filename`` at
    files you trust.

    :param filename: path of the pickle file used as the cache.
    :return: a decorator for functions that take no arguments.
    """
    import functools  # local import: not among the notebook's top-level imports

    def my_decorator(func):
        @functools.wraps(func)  # keep the name/docstring of the wrapped function
        def wrapped():
            if os.path.exists(filename):
                print('File exist! Skip calculations')
                with open(filename, 'rb') as f:
                    return pickle.load(f)

            # Fail fast on an unwritable location instead of after computing.
            directory = os.path.dirname(filename)
            if directory:
                os.makedirs(directory, exist_ok=True)

            result = func()
            with open(filename, 'wb') as f:
                pickle.dump(result, f)
            return result
        return wrapped
    return my_decorator

In [6]:
def ddict2dict(d):
    """Return a plain ``dict`` copy of a (possibly nested) dict-like mapping.

    Every value that is itself a ``dict`` instance (which includes
    ``defaultdict``) is converted recursively; all other values are kept
    as they are. The input mapping is left unmodified.

    This is used to turn nested ``defaultdict`` objects built with lambda
    factories into plain dicts before they are pickled.

    :param d: mapping to convert.
    :return: a new ``dict`` with the same keys and converted values.
    """
    return {
        key: ddict2dict(value) if isinstance(value, dict) else value
        for key, value in d.items()
    }

In [16]:
# Column order used when printing the score tables below. Plain measures are
# listed next to their logarithmic counterparts where one exists.
measures_right_order = [
    'pWalk H', 'Walk H',
    'For H', 'logFor H',
    'Comm H', 'logComm H',
    'Heat H', 'logHeat H',
    'SCT H', 'SCCT H',
    'RSP K', 'FE K',
    'SP-CT H',
]

## 3. Competition by Copeland’s score
The competition has been performed on random graphs generated with the $G(N,(m)p_{in}, p_{out})$ model and the following parameters: $N \in \{100, 200\}$, the number of classes $m \in \{2, 4\}$, $p_{in} = 0.3$, $p_{out} \in \{0.1, 0.15\}$. For every combination of parameters, we generated 50 graphs and, for each of them, computed the best ARI that each measure family reached. The results are presented in Table 1(a).

### find best params and 95th percentile

In [7]:
@load_or_calc_and_save('results/3_best_params.pkl')
def calc():
    """Find, per setting and measure, the best and the 95th-percentile parameter.

    For every (n_nodes, n_classes, p_out) combination, 50 stochastic block
    model graphs are generated with p_in = 0.3. Each measure in
    ``H_kernels_plus_RSP_FE`` is then evaluated with ``ParallelByGraphs``
    (Ward clustering, adjusted Rand score) over 31 evenly spaced parameter
    values in [0, 1].

    The triple returned by ``perform`` is unpacked as ``(x, y, error)``.
    For each curve the function records ``x`` at ``argmax(y)`` and ``x`` at
    the position of the 95th percentile of ``y`` (``interpolation='lower'``,
    so the percentile is an actual element of ``y``). One summary line is
    printed per (setting, measure).

    :return: ``(best_params, percentile_params)``, two plain nested dicts
        indexed as ``[column][measure_name]``.
    """
    # Stage 1: score curves for every setting and measure.
    curves = defaultdict(dict)
    for column in tqdm(list(product([100, 200], [2, 4], [0.1, 0.15]))):
        n_nodes, n_classes, p_out = column
        generator = StochasticBlockModel(n_nodes, n_classes, 0.3, p_out)
        graphs, _ = generator.generate_graphs(50)
        runner = ParallelByGraphs(adjusted_rand_score, np.linspace(0, 1, 31), progressbar=False)
        for measure_class in tqdm(H_kernels_plus_RSP_FE, desc=str(column)):
            curves[column][measure_class.name] = runner.perform(
                Ward, measure_class, graphs, n_classes, n_jobs=6)

    # Stage 2: pick the parameters of interest from each curve.
    row_format = '{}\t{}\t{:0.2f} ({:0.2f})\t{:0.2f} ({:0.2f})'
    best_params = defaultdict(dict)
    percentile_params = defaultdict(dict)
    for column, per_measure in curves.items():
        for measure_name, (x, y, _) in per_measure.items():
            best_idx = np.argmax(y)
            p95_value = np.percentile(y, 95, interpolation='lower')
            percentile_idx = list(y).index(p95_value)

            print(row_format.format(
                column, measure_name.ljust(8, ' '),
                x[best_idx], y[best_idx],
                x[percentile_idx], y[percentile_idx]))
            best_params[column][measure_name] = x[best_idx]
            percentile_params[column][measure_name] = x[percentile_idx]
    return (ddict2dict(best_params), ddict2dict(percentile_params))

best_params, percentile_params = calc()

File exist! Skip calculations


### calc competition for given params

In [14]:
# Copeland competition with every measure evaluated at its best parameter.
# results[column][measure] accumulates the Copeland score for one setting;
# results['sum'][measure] accumulates it over all settings.
results = defaultdict(lambda: defaultdict(lambda: 0))
for column in tqdm(list(product([100, 200], [2, 4], [0.1, 0.15]))):
    n_nodes, n_classes, p_out = column
    # Generate 100 candidate graphs; the loop below stops once 50 of them
    # have been scored without an error.
    graphs, info = StochasticBlockModel(n_nodes, n_classes, 0.3, p_out).generate_graphs(100)
    success = 0
    for edges, nodes in tqdm(graphs, desc=str(column)):
        try:
            # ARI of every measure on this single graph.
            single_competition_best = {}
            for kernel_class in H_kernels_plus_RSP_FE:
                best_param = best_params[column][kernel_class.name]
                kernel = kernel_class(edges)
                param = kernel.scaler.scale(best_param)
                K = kernel.get_K(param)
                y_pred = Ward(n_classes).predict(K)
                ari = adjusted_rand_score(nodes, y_pred)
                single_competition_best[kernel_class.name] = ari
            # Scores are only accumulated when all measures succeeded.
            single_competition_score = copeland(single_competition_best.items())
            for measure_name, delta in single_competition_score.items():
                results[column][measure_name] += delta
                results['sum'][measure_name] += delta
            success += 1
        except Exception as e:
            # Best effort: report the failure and move on to the next graph.
            # FloatingPointError is a subclass of Exception, so it is covered
            # here; the former `Exception or FloatingPointError` expression
            # evaluated to plain `Exception` anyway.
            print(e)
        if success == 50:
            break

Exception in thread Thread-4:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.5/dist-packages/tqdm/_tqdm.py", line 148, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by zero encountered in double_scalars
divide by 

KeyboardInterrupt: 

In [19]:
# Tab-separated score table: a header row, then one row per key of `results`
# (the settings plus the 'sum' row). The header starts with an empty cell so
# that the measure names line up with their columns; the first column of each
# data row holds the row label.
print('\t'.join([''] + measures_right_order))
for column_name, column_results in results.items():
    row_cells = [column_name] + [column_results[name] for name in measures_right_order]
    # Every cell is followed by a tab and a space.
    print(''.join('{}\t '.format(cell) for cell in row_cells))

pWalk H	Walk H	For H	logFor H	Comm H	logComm H	Heat H	logHeat H	SCT H	SCCT H	RSP K	FE K	SP-CT H
(100, 4, 0.1)	 56	 78	 -456	 -24	 308	 476	 -456	 180	 78	 166	 62	 -44	 -424	 
(100, 2, 0.1)	 -81	 -90	 -387	 -92	 249	 383	 221	 308	 -74	 316	 -96	 -202	 -455	 
sum	 165	 -560	 -2568	 -611	 1953	 2798	 -137	 1503	 151	 1922	 -449	 -972	 -3195	 
(200, 2, 0.15)	 -155	 -125	 -574	 -92	 212	 565	 205	 321	 -24	 392	 -85	 -169	 -471	 
(200, 2, 0.1)	 -105	 -149	 -525	 -63	 291	 301	 295	 301	 47	 293	 -32	 -135	 -519	 
(100, 4, 0.15)	 418	 -222	 418	 -264	 418	 -66	 418	 -264	 44	 44	 -272	 -224	 -448	 
(200, 4, 0.1)	 6	 -26	 -478	 -32	 325	 592	 -478	 343	 44	 412	 -30	 -134	 -544	 
(100, 2, 0.15)	 26	 -26	 -566	 -44	 150	 547	 -342	 314	 36	 299	 4	 -64	 -334	 


In [20]:
# Copeland competition with every measure evaluated at its 95th-percentile
# parameter. results[column][measure] accumulates the Copeland score for one
# setting; results['sum'][measure] accumulates it over all settings.
results = defaultdict(lambda: defaultdict(lambda: 0))
for column in tqdm(list(product([100, 200], [2, 4], [0.1, 0.15]))):
    n_nodes, n_classes, p_out = column
    # Generate 100 candidate graphs; the loop below stops once 50 of them
    # have been scored without an error.
    graphs, info = StochasticBlockModel(n_nodes, n_classes, 0.3, p_out).generate_graphs(100)
    success = 0
    for edges, nodes in tqdm(graphs, desc=str(column)):
        try:
            # ARI of every measure on this single graph.
            single_competition_best = {}
            for kernel_class in H_kernels_plus_RSP_FE:
                best_param = percentile_params[column][kernel_class.name]
                kernel = kernel_class(edges)
                param = kernel.scaler.scale(best_param)
                K = kernel.get_K(param)
                y_pred = Ward(n_classes).predict(K)
                ari = adjusted_rand_score(nodes, y_pred)
                single_competition_best[kernel_class.name] = ari
            # Scores are only accumulated when all measures succeeded.
            single_competition_score = copeland(single_competition_best.items())
            for measure_name, delta in single_competition_score.items():
                results[column][measure_name] += delta
                results['sum'][measure_name] += delta
            success += 1
        except Exception as e:
            # Best effort: report the failure and move on to the next graph.
            # FloatingPointError is a subclass of Exception, so it is covered
            # here; the former `Exception or FloatingPointError` expression
            # evaluated to plain `Exception` anyway.
            print(e)
        if success == 50:
            break

Exception in thread Thread-5:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.5/dist-packages/tqdm/_tqdm.py", line 148, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration






In [21]:
# Tab-separated score table: a header row, then one row per key of `results`
# (the settings plus the 'sum' row). The header starts with an empty cell so
# that the measure names line up with their columns; the first column of each
# data row holds the row label.
print('\t'.join([''] + measures_right_order))
for column_name, column_results in results.items():
    row_cells = [column_name] + [column_results[name] for name in measures_right_order]
    # Every cell is followed by a tab and a space.
    print(''.join('{}\t '.format(cell) for cell in row_cells))

pWalk H	Walk H	For H	logFor H	Comm H	logComm H	Heat H	logHeat H	SCT H	SCCT H	RSP K	FE K	SP-CT H
(100, 4, 0.1)	 54	 132	 -588	 56	 222	 448	 -502	 290	 46	 136	 -8	 34	 -320	 
(100, 2, 0.1)	 -125	 -84	 -390	 -114	 168	 413	 299	 318	 58	 269	 -151	 -198	 -463	 
sum	 -295	 -5	 -4377	 34	 1768	 3881	 -1472	 1846	 499	 2271	 -521	 -411	 -3218	 
(200, 2, 0.15)	 -79	 -85	 -583	 -115	 258	 568	 154	 273	 45	 391	 -179	 -186	 -462	 
(200, 2, 0.1)	 -163	 -140	 -490	 -55	 286	 332	 340	 340	 26	 340	 -138	 -120	 -558	 
(200, 4, 0.15)	 -14	 66	 -583	 88	 178	 598	 -515	 98	 104	 360	 -16	 32	 -396	 
(100, 4, 0.15)	 74	 86	 -588	 110	 172	 356	 -490	 142	 90	 78	 78	 120	 -228	 
(200, 4, 0.1)	 -2	 30	 -591	 4	 333	 598	 -417	 202	 38	 423	 -106	 -66	 -446	 
(100, 2, 0.15)	 -40	 -10	 -564	 60	 151	 568	 -341	 183	 92	 274	 -1	 -27	 -345	 
