In [1]:
import os
import sys
import pickle
import json
from collections import defaultdict
from tqdm.notebook import tqdm
import numpy as np
from itertools import product
import random
from joblib import Parallel, delayed

sys.path.append('../../..')
from pygraphs.measure import kernels

In [2]:
def shuffle(x):
    """Return the items of ``x`` as a new list in random order.

    Every item is sorted by a fresh ``random.random()`` key, so the input
    itself is left untouched and any iterable is accepted.
    """
    def random_key(_item):
        return random.random()

    return sorted(x, key=random_key)

# kkmeans init experiments, results

Final goal: compare both the different initialization strategies (one, all, k-means++, any) and the initialization quality measures (inertia, modularity).

In [3]:
# Directory holding the cached '<column>_<kernel>_results.pkl' files.
# The original machine-specific location stays the default; set the
# KKMEANS_INIT_CACHE_ROOT environment variable to run the notebook elsewhere.
CACHE_ROOT = os.environ.get(
    'KKMEANS_INIT_CACHE_ROOT',
    '/home/illusionww/Documents/HDD/phd/pygraphs/kkmeans_init_sbm/by_column_and_kernel',
)

# Graph generation setups, one tuple per experiment column: (n, k, p_in, p_out).
columns = [
    (100, 2, 0.2, 0.05),
    (100, 2, 0.3, 0.05),
    (100, 2, 0.3, 0.1),
    (100, 2, 0.3, 0.15),
    (102, 3, 0.3, 0.1),
    (100, 4, 0.3, 0.1),
    (100, 4, 0.3, 0.15),
    (200, 2, 0.3, 0.05),
    (200, 2, 0.3, 0.1),
    (200, 2, 0.3, 0.15),
    (201, 3, 0.3, 0.1),
    (200, 4, 0.3, 0.1),
    (200, 4, 0.3, 0.15)
]

# Kernel names; each one is combined with every column to form a cache file name.
kernels_names = [
    'pWalk', 'Walk',
    'For', 'logFor',
    'Comm', 'logComm',
    'Heat', 'logHeat',
    'NHeat', 'logNHeat',
    'SCT', 'SCCT',
    'RSP', 'FE',
    'PPR', 'logPPR',
    'ModifPPR', 'logModifPPR',
    'HeatPR', 'logHeatPR',
    'DF', 'logDF',
    'Abs', 'logAbs',
    'SP-CT'
]

# Labels of the initialization strategies.
init_names = ['one', 'all', 'kmp', 'any', 'any2', 'best']

## Инициализация, дающая наилучший результат

Для каждого графа определяем наилучшую инициализацию **для каждого параметра** (по inertia, ARI и NMI). После этого считаем статистику внутри каждого графа, сколько побед у каждого типа инициализаций (one, all, k-means++), выбираем победителя для графа. После этого считаем статистику для всех графов, выбираем победителя. Делаем это для каждой меры и каждого сетапа генерации графов.

In [4]:
def group_inits(param_results):
    """
    Pick a single run for one kernel parameter, ranked by modularity.

    The runs are split by their ``'init'`` field into the 'one', 'all' and
    'k-means++' groups, and each group is put into random order with
    ``shuffle``. If any of the three groups is empty, ``None`` is returned.
    Otherwise the first (up to) 10 runs of every group are pooled and the run
    with the highest ``'modularity'`` is returned; on ties the earliest run in
    the pool (one, then all, then k-means++) wins.
    """
    groups = [
        shuffle([run for run in param_results if run['init'] == init_type])
        for init_type in ('one', 'all', 'k-means++')
    ]
    if not all(groups):  # at least one init type has no runs at all
        return None

    # max() keeps the first maximal element, so ties resolve to the earliest run
    pool = [run for group in groups for run in group[:10]]
    return max(pool, key=lambda run: run['modularity'])

In [5]:
def perform_column_kernel_internal(data, score_name='score_ari'):
    """
    Compute the best score over kernel parameters for every graph.

    :param data: iterable of per-graph records; each record's ``'results'``
        entry maps a kernel parameter to the list of runs for that parameter.
    :param score_name: key of the score read from the run that
        ``group_inits`` selects for a parameter.
    :return: list with one value per graph, the highest selected score across
        all parameters, or -1 when no parameter produced a selection.
    """
    def best_over_params(results_by_param):
        top = -1  # value reported when nothing beats it or nothing was selected
        for runs in results_by_param.values():
            picked = group_inits(runs)
            if picked is None:
                continue
            top = max(top, picked[score_name])
        return top

    return [best_over_params(record['results']) for record in data]

In [6]:
def perform_column_kernel(column, kernel_name):
    """
    Load the cached results for one (column, kernel) pair and score them.

    :param column: ``(n, k, p_in, p_out)`` tuple used to build the cache file name.
    :param kernel_name: kernel name used to build the cache file name.
    :return: ``((column_str, kernel_name), scores)`` where ``scores`` is the
        list returned by ``perform_column_kernel_internal``.
    """
    n, k, p_in, p_out = column
    column_str = f'{n}_{k}_{p_in:.1f}_{p_out:.2f}'
    filename = f'{column_str}_{kernel_name}_results.pkl'
    # NOTE: pickle.load can execute arbitrary code; only point CACHE_ROOT at
    # cache files you produced yourself.
    with open(f'{CACHE_ROOT}/{filename}', 'rb') as cache_file:
        cached_runs = pickle.load(cache_file)
    result_key = (column_str, kernel_name)
    return result_key, perform_column_kernel_internal(cached_runs)

# results[column_str][kernel_name] -> list of best scores, one per graph
results = defaultdict(dict)
# Score every (column, kernel) combination in 6 parallel jobs; tqdm wraps the
# list of combinations that is fed to the workers.
raw_results = Parallel(n_jobs=6)(
    delayed(perform_column_kernel)(column, kernel_name)
    for column, kernel_name in tqdm(list(product(columns, kernels_names)))
)
for (column_str, kernel_name), result in raw_results:
    results[column_str][kernel_name] = result

HBox(children=(IntProgress(value=0, max=325), HTML(value='')))




In [10]:
# Dump the scores as a long-format CSV table: one row per
# (kernel, graph setup, graph) with the score in the last column.
with open('sbm_all_data_for_cd2.csv', 'w') as f:
    f.write('classifier_name,dataset_name,accuracy\n')
    f.writelines(
        f'{measure_name},{dataset_name},{trial_ari}\n'
        for dataset_name, dataset_data in results.items()
        for measure_name, measure_data in dataset_data.items()
        for trial_ari in measure_data
    )