In [1]:
import sys
import numpy as np
import pickle
import json
import networkx as nx
import pandas as pd
import powerlaw
from collections import defaultdict, Counter
from networkx.algorithms.community.quality import modularity as nx_modularity


sys.path.append('../../pygkernels')
from pygkernels.data import Datasets
from pygkernels.measure import kernels, Kernel

In [2]:
def np2nx(A: np.ndarray, partition: np.ndarray):
    """Build a NetworkX graph from an adjacency matrix and label its nodes.

    Args:
        A: square adjacency matrix; node ``i`` corresponds to row/column ``i``.
        partition: sequence whose ``i``-th item is the community label of node ``i``.

    Returns:
        The graph built from ``A`` where every node carries its label in the
        ``'community'`` node attribute.
    """
    # `nx.from_numpy_matrix` was removed in NetworkX 3.0; `from_numpy_array`
    # is its replacement. `np.asarray` keeps legacy `np.matrix` inputs working.
    G = nx.from_numpy_array(np.asarray(A))
    nx.set_node_attributes(G, dict(enumerate(partition)), 'community')
    return G

def partition2communities(partition):
    """Convert a per-node label sequence into a list of communities.

    Args:
        partition: iterable whose ``i``-th item is the label of node ``i``.

    Returns:
        A list of lists of node indices, one inner list per distinct label,
        ordered by the first appearance of each label.
    """
    members_by_label = {}
    for node_index, label in enumerate(partition):
        members_by_label.setdefault(label, []).append(node_index)
    return [members for members in members_by_label.values()]

def power_law(values, maxval=200):
    """Estimate the power-law exponent of ``values``, capped at ``maxval``.

    Args:
        values: sample passed to ``powerlaw.Fit``.
        maxval: value returned instead of the fitted exponent when the fit
            yields NaN or something larger than ``maxval``.

    Returns:
        The fitted ``alpha``, or ``maxval`` when the fit is NaN or exceeds it.
    """
    fitted_alpha = powerlaw.Fit(values, verbose=False).alpha
    fit_is_usable = not np.isnan(fitted_alpha) and not fitted_alpha > maxval
    return fitted_alpha if fit_is_usable else maxval

def feature_inv(x, power):
    """Return ``1 - x ** (-power)`` written as ``1 - 1 / x**power``.

    Args:
        x: value to transform.
        power: exponent applied to ``x`` before taking the reciprocal.
    """
    reciprocal = 1 / x**power
    return 1 - reciprocal

In [3]:
# Root directory of the cached k-means results read later in the notebook.
CACHE_ROOT = '/data/phd/pygkernels/cache/kkmeans_init_datasets'

# Dataset names grouped by family; the flattened order is the processing order.
_dataset_groups = (
    ['dolphins', 'football', 'karate'],
    ['news_2cl1_0.1', 'news_2cl2_0.1', 'news_2cl3_0.1'],
    ['news_3cl1_0.1', 'news_3cl2_0.1', 'news_3cl3_0.1'],
    ['news_5cl1_0.1', 'news_5cl2_0.1', 'news_5cl3_0.1'],
    ['polblogs', 'polbooks'],
    ['sp_school_day_1', 'sp_school_day_2'],
    ['cora_DB', 'cora_EC', 'cora_HA', 'cora_HCI', 'cora_IR', 'cora_Net'],
    ['eu-core', 'eurosis'],
)
dataset_names = [name for group in _dataset_groups for name in group]

In [4]:
# (feature name, min, max) used for min-max scaling of dataset features.
# NOTE(review): presumably these are the feature ranges of the LFR grid — confirm.
_PARAM_RANGES = (
    ('tau1|sqrtinv', 0.27, 0.94),
    ('avg_degree|log', 0.86, 7.07),
    ('modularity', -0.46, 0.84),
)

# Per-feature bounds together with the precomputed range width.
param_norms = {
    feature: {'min': lower, 'max': upper, 'width': upper - lower}
    for feature, lower, upper in _PARAM_RANGES
}


def normalize(param, param_name):
    """Min-max scale ``param`` with the bounds stored for ``param_name``.

    Args:
        param: raw feature value.
        param_name: key of ``param_norms`` selecting the bounds to use.

    Returns:
        ``(param - min) / (max - min)``; values outside the stored bounds
        map outside ``[0, 1]``.
    """
    bounds = param_norms[param_name]
    return (param - bounds['min']) / bounds['width']

# Сравнение результатов на датасетах

Шаги:
* Формируем список всех датасетов и мер, а также их параметров: tau1, avg degree, modularity
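* Загружаем все ранее посчитанные результаты (кэш запусков для каждой пары «датасет — мера»)
* Находим для каждого датасета лучший ARI каждой меры (максимум по параметру меры от среднего по запускам)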
* Стратегия 1: выбираем для каждого датасета лучшую меру (upper bound)
* Стратегия 2: выбираем для всех датасетов общую лучшую меру (SCT или SCCT)
* Стратегия 3: выбираем для каждого датасета меру согласно его параметрам

In [5]:
# LFR result grid; later cells read its 'out_x' and 'out_y' entries.
# NOTE: pickle can execute arbitrary code on load — only open trusted files.
LFR_GRID_PATH = '../paper_reproducible_results/lfr_result_grid.pkl'
with open(LFR_GRID_PATH, 'rb') as grid_file:
    grid = pickle.load(grid_file)

NameError: name 'json' is not defined

In [None]:
# Per-dataset record keyed by dataset name; the inner dict is created on first
# access and filled in by the dataset loop in the next cell.
data = defaultdict(dict)

In [None]:
# Number of cached runs per (dataset, kernel); files are suffixed _00 ... _04.
N_RUNS = 5
# Kernels that take part in the restricted "top6" upper bound.
TOP6_KERNEL_NAMES = ('SCCT', 'logComm', 'logDF', 'RSP', 'Comm', 'NHeat')
# Features compared against grid['out_x'], in the order of its coordinates.
LFR_FEATURE_NAMES = ('tau1|sqrtinv', 'avg_degree|log', 'modularity')


def _closest_lfr_recommendation(lfr_grid, target):
    """Find the LFR grid point nearest (Euclidean) to ``target``.

    Args:
        lfr_grid: mapping with parallel sequences ``'out_x'`` (points with at
            least three coordinates) and ``'out_y'`` (value stored per point).
        target: three normalized feature values, ordered as LFR_FEATURE_NAMES.

    Returns:
        ``(distance, out_y)`` of the nearest point, or ``(np.inf, None)`` when
        the grid is empty.
    """
    closest_dist, closest_recomendation = np.inf, None
    for out_x, out_y in zip(lfr_grid['out_x'], lfr_grid['out_y']):
        diff1 = out_x[0] - target[0]
        diff2 = out_x[1] - target[1]
        diff3 = out_x[2] - target[2]
        dist = np.sqrt(diff1 * diff1 + diff2 * diff2 + diff3 * diff3)
        if dist < closest_dist:
            closest_dist, closest_recomendation = dist, out_y
    return closest_dist, closest_recomendation


def _best_mean_ari(dataset_name, kernel_name):
    """Return the best run-averaged ARI of one kernel on one dataset.

    For every cached run and every kernel parameter, the initialization with
    the highest ``'modularity'`` is selected and its ``'score_ari'`` recorded
    (0 when the parameter has no results). Scores are averaged over runs per
    parameter and the maximum of these averages is returned.

    Raises:
        OSError: if a cache file cannot be opened.
        ValueError: if the cache files contain no parameters at all.
    """
    ari_by_param = defaultdict(list)
    for i in range(N_RUNS):
        # The cache is a locally produced, trusted artifact; pickle.load must
        # never be pointed at untrusted files.
        with open(f'{CACHE_ROOT}/by_column_and_kernel/{dataset_name}_{kernel_name}_results_0{i}.pkl', 'rb') as f:
            run_results = pickle.load(f)
        for param, param_results in run_results.items():
            if len(param_results) > 0:
                # max() returns the first maximal item, which is the same
                # element a stable descending sort would put first.
                best_init = max(param_results, key=lambda x: x['modularity'])
                ari_by_param[param].append(best_init['score_ari'])
            else:
                ari_by_param[param].append(0)
    return np.max([np.mean(scores) for scores in ari_by_param.values()])


def _track_best(best_name, best_ari, kernel_name, ari):
    """Update a running (name, ari) maximum, collecting ties into a list.

    Returns:
        The new ``(best_name, best_ari)``. ``best_name`` is a single name while
        the maximum is unique and a list of names once kernels are tied.
    """
    if ari > best_ari:
        return kernel_name, ari
    if ari == best_ari:
        if isinstance(best_name, list):
            best_name.append(kernel_name)
            return best_name, best_ari
        return [best_name, kernel_name], best_ari
    return best_name, best_ari


datasets_holder = Datasets()
for dataset_name in dataset_names:
    (A, partition), info = datasets_holder[dataset_name]
    G = np2nx(A, partition)

    # Parameters
    avg_degree = 2 * G.number_of_edges() / G.number_of_nodes()
    node_degrees = [d for n, d in G.degree()]
    tau1 = power_law(node_degrees, 100)
    modularity = nx_modularity(G, partition2communities(partition))
    data[dataset_name]['params'] = {
        'tau1': tau1,
        'tau1|sqrtinv': feature_inv(tau1, 0.5),
        'avg_degree': avg_degree,
        'avg_degree|log': np.log(avg_degree),
        'modularity': modularity
    }

    # Best measure according LFR: normalize the dataset features once, then
    # look up the nearest grid point.
    lfr_target = [
        normalize(data[dataset_name]['params'][feature_name], feature_name)
        for feature_name in LFR_FEATURE_NAMES
    ]
    closest_dist, closest_recomendation = _closest_lfr_recommendation(grid, lfr_target)
    data[dataset_name]['lfr_recomendation'] = {
        'closest_dist': closest_dist,
        'closest_recomendation': closest_recomendation
    }

    # Results: best run-averaged ARI per kernel, plus the running maxima over
    # all kernels and over the top6 subset.
    data[dataset_name]['results'] = {}
    best_dataset_kernel_name, best_dataset_kernel_ari = None, -1
    best_dataset_kernel_name_top6, best_dataset_kernel_ari_top6 = None, -1
    for kernel in kernels:
        best_ari = _best_mean_ari(dataset_name, kernel.name)
        data[dataset_name]['results'][kernel.name] = best_ari

        best_dataset_kernel_name, best_dataset_kernel_ari = _track_best(
            best_dataset_kernel_name, best_dataset_kernel_ari, kernel.name, best_ari)

        if kernel.name in TOP6_KERNEL_NAMES:
            best_dataset_kernel_name_top6, best_dataset_kernel_ari_top6 = _track_best(
                best_dataset_kernel_name_top6, best_dataset_kernel_ari_top6, kernel.name, best_ari)

    data[dataset_name]['best_measure'] = {
        'name': best_dataset_kernel_name,
        'ari': best_dataset_kernel_ari
    }

    data[dataset_name]['best_measure_top6'] = {
        'name': best_dataset_kernel_name_top6,
        'ari': best_dataset_kernel_ari_top6
    }

In [None]:
# Spot-check: display the full record collected for a single dataset.
data['dolphins']

In [None]:
# Collect, for every measure, its best ARI on each dataset.
group_by_measure = defaultdict(list)
for dataset_record in data.values():
    for measure_name, measure_ari in dataset_record['results'].items():
        group_by_measure[measure_name].append(measure_ari)

# Average ARI of each measure across all datasets.
mean_by_measure = {}
for measure_name, ari_values in group_by_measure.items():
    mean_by_measure[measure_name] = np.mean(ari_values)

In [None]:
# One row per measure with its dataset-averaged ARI, shown best-first.
df = pd.DataFrame({
    'measure': list(mean_by_measure.keys()),
    'ari': list(mean_by_measure.values()),
})
df.sort_values(by='ari', ascending=False)

In [None]:
def _strategy_row(name, record):
    """Build one comparison row (fixed SCCT, LFR pick, upper bounds) for a dataset."""
    ari_by_kernel = record['results']
    lfr_pick = record['lfr_recomendation']['closest_recomendation']
    top6_best = record['best_measure_top6']
    overall_best = record['best_measure']
    return {
        'dataset_name': name,
        'best_measure name': 'SCCT',
        'best_measure ari': ari_by_kernel['SCCT'],
        'lfr_recomendation name': lfr_pick,
        'lfr_recomendation ari': ari_by_kernel[lfr_pick],
        'upper_bound_top6 name': top6_best['name'],
        'upper_bound_top6 ari': top6_best['ari'],
        'upper_bound name': overall_best['name'],
        'upper_bound ari': overall_best['ari'],
    }


# One row per dataset, in the insertion order of `data`.
strategies_table = [_strategy_row(name, record) for name, record in data.items()]

In [None]:
# Export the per-dataset strategy comparison to an Excel file in the working directory.
pd.DataFrame(strategies_table).to_excel('strategies_table.xlsx')