In [1]:
import os
import sys
import pickle
import json
from collections import defaultdict
from tqdm.notebook import tqdm
import numpy as np
from itertools import product
import random
from joblib import Parallel, delayed

sys.path.append('../../..')
from pygraphs.measure import kernels

In [2]:
def shuffle(x):
    """Return a new list holding the items of ``x`` in random order.

    Each item is sorted by a fresh ``random.random()`` draw, so the input is
    left untouched and exactly one random number is consumed per item.
    """
    def _random_key(_item):
        # the item itself is ignored; only the random draw decides the order
        return random.random()

    return sorted(x, key=_random_key)

# kkmeans init experiments, results

Final goal: compare both the different initialization strategies (one, all, k-means++, any) and the initialization quality measures (inertia, modularity).

In [3]:
# Directory with the cached experiment results: one pickle per (column, kernel)
# pair, named '<n>_<k>_<p_in>_<p_out>_<kernel>_results.pkl'.
# The location is machine-specific, so it can be overridden with the
# KKMEANS_INIT_CACHE_ROOT environment variable; the original path is the default.
CACHE_ROOT = os.environ.get(
    'KKMEANS_INIT_CACHE_ROOT',
    '/home/illusionww/Documents/HDD/phd/pygraphs/kkmeans_init_sbm/by_column_and_kernel'
)

# Graph generation setups, each a tuple (n, k, p_in, p_out).
# NOTE(review): presumably n nodes, k clusters and the intra-/inter-cluster edge
# probabilities of a stochastic block model -- confirm against the generator.
columns = [
    (100, 2, 0.2, 0.05),
    (100, 2, 0.3, 0.05),
    (100, 2, 0.3, 0.1),
    (100, 2, 0.3, 0.15),
    (102, 3, 0.3, 0.1),
    (100, 4, 0.3, 0.1),
    (100, 4, 0.3, 0.15),
    (200, 2, 0.3, 0.05),
    (200, 2, 0.3, 0.1),
    (200, 2, 0.3, 0.15),
    (201, 3, 0.3, 0.1),
    (200, 4, 0.3, 0.1),
    (200, 4, 0.3, 0.15)
]

# Kernel names; each one is part of a cache file name and a row of the output tables.
kernels_names = [
    'pWalk', 'Walk',
    'For', 'logFor',
    'Comm', 'logComm',
    'Heat', 'logHeat',
    'NHeat', 'logNHeat',
    'SCT', 'SCCT',
    'RSP', 'FE',
    'PPR', 'logPPR',
    'ModifPPR', 'logModifPPR',
    'HeatPR', 'logHeatPR',
    'DF', 'logDF',
    'Abs', 'logAbs',
    'SP-CT'
]

# Initialization groups compared in the analysis (the keys produced by group_inits).
init_names = ['one', 'all', 'kmp', 'any', 'any2', 'best']

## Инициализация, дающая наилучший результат

Для каждого графа определяем наилучшую инициализацию **для каждого параметра** (по inertia, ARI и NMI). После этого считаем статистику внутри каждого графа, сколько побед у каждого типа инициализаций (one, all, k-means++), выбираем победителя для графа. После этого считаем статистику для всех графов, выбираем победителя. Делаем это для каждой меры и каждого сетапа генерации графов.

In [4]:
def group_inits(param_results):
    """
    Select the best run of every initialization group for one kernel parameter.

    * First the runs are grouped by initialization: one, all, k-means++, plus the
      mixed pools 'any' / 'any2' and the single run with the highest 'score_ari'.
    * Then, inside each group, the best run is chosen twice: by lowest inertia
      and by highest modularity.

    Returns None when the one, all or k-means++ group is empty; otherwise a dict
    ``{group: {'inertia': run, 'modularity': run}}``.
    """
    def runs_with(init_label):
        # shuffling randomizes which runs land in the slices below and which run wins ties
        return shuffle([run for run in param_results if run['init'] == init_label])

    one_inits = runs_with('one')
    all_inits = runs_with('all')
    kmp_inits = runs_with('k-means++')
    if not (one_inits and all_inits and kmp_inits):
        return None

    # run with the highest ARI over the three basic groups (the first one wins ties)
    best_ari_init = max(one_inits + all_inits + kmp_inits, key=lambda run: run['score_ari'])

    pools = {
        'one': one_inits,
        'all': all_inits,
        'kmp': kmp_inits,
        'any': one_inits[:10] + all_inits[:10] + kmp_inits[:10],
        'any2': all_inits[:15] + kmp_inits[:15],
        'best': [best_ari_init],
    }

    bestby = {}
    for group, runs in pools.items():
        lowest_inertia = highest_modularity = runs[0]
        for run in runs:
            if run['inertia'] < lowest_inertia['inertia']:  # choose best by inertia
                lowest_inertia = run
            if run['modularity'] > highest_modularity['modularity']:  # choose best by modularity
                highest_modularity = run
        bestby[group] = {'inertia': lowest_inertia, 'modularity': highest_modularity}
    return bestby

In [5]:
def perform_column_kernel_internal(data, score_name='score_ari'):
    """
    Aggregate clustering quality over all graphs of one (column, kernel) pair.

    For every graph and every setup "<unsup>_<init>" (unsup is 'inertia' or
    'modularity', init is one of ``init_names``) the best ``score_name`` over all
    kernel parameters is taken from the runs selected by ``group_inits``.

    :param data: iterable of per-graph dicts; each needs a 'results' entry mapping
        a kernel parameter to the list of runs for that parameter.
    :param score_name: key of the score read from each selected run.
    :return: ``(reaching_value, mean_scores)``; ``reaching_value[setup]`` counts the
        graphs on which the setup's best score equals the maximum over all setups,
        ``mean_scores[setup]`` is the mean of the setup's per-graph best scores.
    :raises Exception: whatever reading a score raised, after printing its context.
    """
    neg_inf = float('-inf')
    setups = [f'{unsup}_{init_name}' for unsup, init_name in product(['inertia', 'modularity'], init_names)]

    reaching_value = {setup: 0 for setup in setups}
    best_scores = {setup: [] for setup in setups}
    for graph_idx, graph_results in enumerate(data):
        graph_results = graph_results['results']
        # Start below every real score. Seeding the maximum with 0 would turn each
        # negative score (ARI can be negative) into 0 and make all such setups
        # spuriously tie at the maximum.
        best_score = {setup: neg_inf for setup in setups}
        # choose best score across kernel param
        for param, param_results in graph_results.items():
            bestby = group_inits(param_results)
            if bestby is None:
                continue
            for unsup, init_name in product(['inertia', 'modularity'], init_names):
                try:
                    score = bestby[init_name][unsup][score_name]
                except Exception:
                    print(f'graph_idx: {graph_idx}, param: {param}, init: {init_name}, unsup: {unsup}')
                    print(f'best_init: {bestby[init_name][unsup]}')
                    raise  # bare raise keeps the original traceback
                setup = f'{unsup}_{init_name}'
                if score > best_score[setup]:
                    best_score[setup] = score
        for setup in setups:
            # no parameter produced a usable score: keep the previous convention of 0
            if best_score[setup] == neg_inf:
                best_score[setup] = 0
            best_scores[setup].append(best_score[setup])

        # add 1 if setup reaches maximum
        max_value = max(best_score.values())
        for setup in setups:
            reaching_value[setup] += best_score[setup] == max_value

    mean_scores = {setup: np.mean(best_scores[setup]) for setup in setups}

    return reaching_value, mean_scores

In [6]:
def perform_column_kernel(column, kernel_name):
    """
    Load the cached results of one (column, kernel) pair and summarize them.

    :param column: tuple ``(n, k, p_in, p_out)`` identifying the graph setup.
    :param kernel_name: kernel name; part of the cache file name.
    :return: ``((column_str, kernel_name), summary)`` where ``summary`` holds the
        'reaching_value' and 'mean_scores' dicts computed by
        ``perform_column_kernel_internal``.
    """
    n, k, p_in, p_out = column
    column_str = f'{n}_{k}_{p_in:.1f}_{p_out:.2f}'
    cache_path = f'{CACHE_ROOT}/{column_str}_{kernel_name}_results.pkl'
    # NOTE: unpickling can execute arbitrary code; only point this at trusted cache files
    with open(cache_path, 'rb') as cache_file:
        cached = pickle.load(cache_file)
    reaching_value, mean_scores = perform_column_kernel_internal(cached)
    summary = dict(reaching_value=reaching_value, mean_scores=mean_scores)
    return (column_str, kernel_name), summary

# Summarize every (column, kernel) pair with n_jobs=2 and index the summaries
# as results[column_str][kernel_name].
raw_results = Parallel(n_jobs=2)(
    delayed(perform_column_kernel)(column, kernel_name)
    for column, kernel_name in tqdm(list(product(columns, kernels_names)))
)
results = defaultdict(dict)
for (column_str, kernel_name), result in raw_results:
    results[column_str][kernel_name] = result

HBox(children=(IntProgress(value=0, max=325), HTML(value='')))




In [7]:
# Export the 'reaching_value' counters as a tab-separated table.
# Layout: header row 1 repeats each column tuple once per setup, header row 2
# lists the setup names, then one row per kernel. Every cell is followed by a
# tab, so each row ends with a trailing tab.

_setups = [f'{unsup}_{x}' for unsup in ('inertia', 'modularity') for x in init_names]

with open('reaching_value.tsv', 'w') as f:
    f.write('\t' + ''.join(f'{column}\t' for column in columns for _ in _setups) + '\n')
    f.write('\t' + ''.join(f'{setup}\t' for _ in columns for setup in _setups) + '\n')
    for kernel_name in kernels_names:
        cells = []
        for n, k, p_in, p_out in columns:
            per_kernel = results[f'{n}_{k}_{p_in:.1f}_{p_out:.2f}'][kernel_name]
            cells.extend(f'{per_kernel["reaching_value"][setup]}\t' for setup in _setups)
        f.write(f'{kernel_name}\t' + ''.join(cells) + '\n')

In [8]:
# Export the 'mean_scores' values as a tab-separated table.
# Layout: header row 1 repeats each column tuple once per setup, header row 2
# lists the setup names, then one row per kernel. Every cell is followed by a
# tab, so each row ends with a trailing tab.

_setups = [f'{unsup}_{x}' for unsup in ('inertia', 'modularity') for x in init_names]

with open('mean_scores.tsv', 'w') as f:
    f.write('\t' + ''.join(f'{column}\t' for column in columns for _ in _setups) + '\n')
    f.write('\t' + ''.join(f'{setup}\t' for _ in columns for setup in _setups) + '\n')
    for kernel_name in kernels_names:
        cells = []
        for n, k, p_in, p_out in columns:
            per_kernel = results[f'{n}_{k}_{p_in:.1f}_{p_out:.2f}'][kernel_name]
            cells.extend(f'{per_kernel["mean_scores"][setup]}\t' for setup in _setups)
        f.write(f'{kernel_name}\t' + ''.join(cells) + '\n')