# Do logarithmic proximity measures outperform plain ones in graph clustering?

In [1]:
import os
# Set OPENBLAS_NUM_THREADS to "1" before numpy is imported further down.
# NOTE(review): presumably this avoids thread oversubscription when the
# experiments are fanned out with Parallel(n_jobs=...) — confirm.
os.environ["OPENBLAS_NUM_THREADS"] = "1"

In [2]:
import sys
import inspect
from collections import defaultdict
from contextlib import contextmanager
from itertools import combinations, product
import pickle
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import adjusted_rand_score

In [3]:
import warnings
# Install an "ignore" filter that matches every warning, so no warnings are
# shown by the cells that follow.
warnings.filterwarnings("ignore")

In [4]:
# Add the parent directory to the module search path.
# NOTE(review): presumably the py_graphs package imported below lives there — confirm.
sys.path.append('..')
from py_graphs.graphs.generator import StochasticBlockModel
from py_graphs.graphs.dataset import football, polbooks, zachary, news_2cl_1, news_2cl_2, news_2cl_3, news_3cl_1, news_3cl_2, news_3cl_3
from py_graphs.measure import *
from py_graphs.measure import H_kernels_plus_RSP_FE
from py_graphs.cluster.ward import Ward
from py_graphs.colors import d3, d3_category20
from py_graphs.scenario import *
from py_graphs.scorer import copeland

In [5]:
def load_or_calc_and_save(filename):
    """Build a decorator that caches a zero-argument function's result on disk.

    When the decorated function is called and ``filename`` already exists,
    the pickled result is loaded and returned without running the function.
    Otherwise the function is run, its result is pickled to ``filename``
    (creating the parent directory if needed) and returned.

    Args:
        filename: Path of the pickle file used as the cache.

    Returns:
        A decorator that accepts a zero-argument callable and returns its
        disk-cached, zero-argument replacement.
    """
    # Local import: functools is not among the imports visible in this notebook.
    import functools

    def my_decorator(func):
        @functools.wraps(func)  # keep the decorated function's name and docstring
        def wrapped():
            if os.path.exists(filename):
                print('File exist! Skip calculations')
                # NOTE: unpickling can execute arbitrary code; only point this
                # at cache files produced by this notebook itself.
                with open(filename, 'rb') as f:
                    return pickle.load(f)

            result = func()
            # Create the cache directory on demand so that a finished
            # computation is not lost to a FileNotFoundError on save.
            target_dir = os.path.dirname(filename)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
            with open(filename, 'wb') as f:
                pickle.dump(result, f)
            return result
        return wrapped
    return my_decorator

In [6]:
def ddict2dict(d):
    """Return a plain-``dict`` copy of a nested mapping such as a defaultdict.

    Every value that is a ``dict`` (or a subclass, e.g. ``defaultdict``) is
    converted recursively; all other values are kept as they are. The input
    mapping is left untouched.

    Args:
        d: Mapping with an ``items()`` method, possibly nested.

    Returns:
        A new ``dict`` with the same keys, whose nested dict values are
        plain ``dict`` objects as well.
    """
    return {
        key: ddict2dict(value) if isinstance(value, dict) else value
        for key, value in d.items()
    }

## 6. Cluster analysis on several classical datasets

For each dataset and each measure family, we sorted the 55 values of the family parameter in descending order of the
corresponding ARI. Fig. 9 shows ARI plotted against the rank of the family parameter value.

In [11]:
# Names of the 13 measures in a fixed order; the same strings appear as keys
# of the per-dataset results printed further down.
# NOTE(review): presumably the intended display order — this list is not
# referenced by the other cells visible here.
measures_right_order = [
    'pWalk H',
    'Walk H',
    'For H',
    'logFor H',
    'Comm H',
    'logComm H',
    'Heat H',
    'logHeat H',
    'SCT H',
    'SCCT H',
    'RSP K',
    'FE K',
    'SP-CT H'
]
# Datasets processed by calc(); perform() unpacks each entry as (graphs, info).
# The commented-out entries are disabled: polblogs and news_5cl_* are not among
# the names imported from py_graphs.graphs.dataset in this notebook, while
# news_3cl_* are imported but currently skipped.
all_datasets = [
    football,
    polbooks,
    # polblogs,
    zachary,
    news_2cl_1, news_2cl_2, news_2cl_3,
#     news_3cl_1, news_3cl_2, news_3cl_3,
#     news_5cl_1, news_5cl_2, news_5cl_3
]


In [12]:
def perform(classic_plot, dataset):
    """Score every measure in ``H_kernels_plus_RSP_FE`` on a single dataset.

    Args:
        classic_plot: Object whose ``perform(...)`` call returns a triple
            ``(x, y, error)`` for a clustering class, a measure class and
            the dataset's graphs.
        dataset: ``(graphs, info)`` pair; ``info['name']`` labels both the
            progress bar and the returned result.

    Returns:
        ``(name, scores)`` where ``scores`` maps each measure's ``name`` to
        ``(best_y, mean_y)`` — the maximum and the mean of the ``y`` values.
    """
    graphs, info = dataset
    dataset_results = {}
    for measure_class in tqdm(H_kernels_plus_RSP_FE, desc=info['name']):
        # NOTE(review): the literal 2 is presumably the number of clusters and
        # is applied to every dataset — confirm it is right for all of them.
        _, y, _ = classic_plot.perform(Ward, measure_class, graphs, 2, n_jobs=1)
        # Only the best score is needed, so take the maximum directly instead
        # of sorting all (x, y) pairs.
        dataset_results[measure_class.name] = (max(y), np.mean(y))
    return info['name'], dataset_results


@load_or_calc_and_save('results/6_1.pkl')
def calc():
    """Run perform() for every entry of ``all_datasets`` and collect the results.

    A single ParallelByGraphs object is built with ``adjusted_rand_score`` and
    51 evenly spaced values on [0, 1], then shared by all jobs. The jobs are
    dispatched through ``Parallel(n_jobs=9)``, and the ``(name, scores)``
    pairs they return are gathered into a dict keyed by dataset name.
    """
    scenario = ParallelByGraphs(adjusted_rand_score, np.linspace(0, 1, 51), progressbar=False)
    jobs = (delayed(perform)(scenario, dataset) for dataset in all_datasets)
    runner = Parallel(n_jobs=9)
    return dict(runner(jobs))

# Loaded from 'results/6_1.pkl' when that file exists, computed otherwise.
results = calc()




















In [13]:
# NOTE(review): this cell looks like an unfinished placeholder — the loop
# variable is never used and plt.plot() is called without any data, so
# nothing from `results` is actually drawn.
for result in results:
    plt.plot()

{'football': {'Comm H': (0.14197724019447658, 0.12438597991220736),
  'FE K': (0.12841915388744732, 0.12811841333567556),
  'For H': (0.14107416881713133, 0.12772682230672402),
  'Heat H': (0.14107416881713133, 0.12747759890966803),
  'RSP K': (0.12841915388744732, 0.060206619279147625),
  'SCCT H': (0.12841915388744732, 0.12597753726863148),
  'SCT H': (0.12841915388744732, 0.12597753726863148),
  'SP-CT H': (0.14666565762831613, 0.1287769284706016),
  'Walk H': (0.1467775645582882, 0.12903942239945781),
  'logComm H': (0.14660373425393092, 0.12940169007242025),
  'logFor H': (0.12841915388744732, 0.12841915388744732),
  'logHeat H': (0.12841915388744732, 0.12821124629646688),
  'pWalk H': (0.14197724019447658, 0.13169438510514886)},
 'news_2cl_1': {'Comm H': (0.40830925407204083, 0.02906726568347924),
  'FE K': (0.6881211627231513, 0.529622905090953),
  'For H': (0.40830925407204083, 0.008166185081440817),
  'Heat H': (0.40830925407204083, 0.008166185081440817),
  'RSP K': (0.5539244