In [1]:
import os
import sys
import pickle
import json
from collections import defaultdict
from tqdm.notebook import tqdm
import numpy as np
from itertools import product
import random

sys.path.append('../../..')
from pygraphs.measure import kernels

In [2]:
def shuffle(x):
    """Return a new list holding the items of ``x`` in random order.

    Every item is sorted by a freshly drawn ``random.random()`` key, so the
    order depends on the state of the global ``random`` generator and the
    input itself is not modified.
    """
    return sorted(x, key=lambda _item: random.random())

# kkmeans init experiments, results

Final goal: compare both the initialization strategies (one, all, k-means++, any) and the initialization quality measures (inertia, modularity).

In [3]:
# Directory holding the cached '<column>_<kernel>_results.pkl' files.
# The default is the original author's local path; set the environment variable
# KKMEANS_INIT_CACHE_ROOT to point the notebook at another location.
CACHE_ROOT = os.environ.get(
    'KKMEANS_INIT_CACHE_ROOT',
    '/home/illusionww/Documents/HDD/phd/pygraphs/kkmeans_init_sbm/by_column_and_kernel',
)

# Graph-generation setups, one (n, k, p_in, p_out) tuple per table column.
# NOTE(review): presumably graph size, number of clusters and the intra-/inter-cluster
# edge probabilities of the SBM generator -- confirm against the code that built the cache.
columns = [
    (100, 2, 0.2, 0.05),
    (100, 2, 0.3, 0.05),
    (100, 2, 0.3, 0.1),
    (100, 2, 0.3, 0.15),
    (102, 3, 0.3, 0.1),
    (100, 4, 0.3, 0.1),
    (100, 4, 0.3, 0.15),
    (200, 2, 0.3, 0.05),
    (200, 2, 0.3, 0.1),
    (200, 2, 0.3, 0.15),
    (201, 3, 0.3, 0.1),
    (200, 4, 0.3, 0.1),
    (200, 4, 0.3, 0.15)
]

# Kernel names; each one is part of a cache file name and becomes a table row.
kernels_names = [
    'pWalk', 'Walk',
    'For', 'logFor',
    'Comm', 'logComm',
    'Heat', 'logHeat',
    'NHeat', 'logNHeat',
    'SCT', 'SCCT',
    'RSP', 'FE',
    'PPR', 'logPPR',
    'ModifPPR', 'logModifPPR',
    'HeatPPR', 'logHeatPPR',
    'DF', 'logDF',
    'Abs', 'logAbs',
    'SP-CT'
]

In [4]:
# Smoke test: load the cached results of a single (column, kernel) pair and show
# how many entries are stored for kernel parameter 0.0 of the first graph.
# NOTE: pickle.load can execute arbitrary code -- only open cache files you trust.
kernel_name = 'Comm'
n, k, p_in, p_out = columns[0]
column_str = f'{n}_{k}_{p_in:.1f}_{p_out:.2f}'
with open(f'{CACHE_ROOT}/{column_str}_{kernel_name}_results.pkl', 'rb') as f:
    data = pickle.load(f)
first_graph = data[0]
len(first_graph['results'][0.0])

90

## Инициализация, дающая наилучший результат

Для каждого графа определяем наилучшую инициализацию **для каждого параметра** (по inertia, ARI и NMI). После этого считаем статистику внутри каждого графа, сколько побед у каждого типа инициализаций (one, all, k-means++), выбираем победителя для графа. После этого считаем статистику для всех графов, выбираем победителя. Делаем это для каждой меры и каждого сетапа генерации графов.

In [5]:
def group_inits(param_results):
    """
    Pick the best run of every initialization group for one kernel parameter.

    * First the runs are grouped by their 'init' field (one, all, k-means++) and
      each group is shuffled; a mixed group 'any' is built from up to 10 runs of
      each shuffled group.
    * Then, inside every group, the run with the lowest 'inertia' and the run with
      the highest 'modularity' are selected.

    Returns None when at least one of the three basic groups has no runs, otherwise
    a dict {group: {'inertia': run, 'modularity': run}} with the groups
    'one', 'all', 'kmp' and 'any'. On ties the run that comes first in the
    shuffled order wins.
    """
    by_group = {
        'one': shuffle([run for run in param_results if run['init'] == 'one']),
        'all': shuffle([run for run in param_results if run['init'] == 'all']),
        'kmp': shuffle([run for run in param_results if run['init'] == 'k-means++']),
    }
    # an empty basic group makes the comparison between groups meaningless
    if not all(by_group.values()):
        return None
    by_group['any'] = by_group['one'][:10] + by_group['all'][:10] + by_group['kmp'][:10]

    # min()/max() keep the first extreme element, just like a strict </> scan would
    return {
        group: {
            'inertia': min(runs, key=lambda run: run['inertia']),
            'modularity': max(runs, key=lambda run: run['modularity']),
        }
        for group, runs in by_group.items()
    }

In [6]:
def perform_column_kernel(data, score_name='score_nmi'):
    """
    Summarise how well every (selection measure, init group) setup performs.

    ``data`` is an iterable of per-graph records; each record's 'results' entry maps
    a kernel parameter to the list of runs handed to ``group_inits``. For every graph
    and every setup '<unsup>_<init>' (unsup in inertia/modularity, init in
    one/all/kmp/any) the best ``score_name`` over all kernel parameters is taken.

    The running maximum starts at -inf rather than 0, so scores that may be negative
    are not silently clamped to 0. Setups for which no score was recorded (every
    parameter skipped by ``group_inits``, or only NaN scores) keep the historical
    fallback value 0.

    Returns a tuple ``(reaching_value, mean_scores)``:
    * reaching_value[setup] -- number of graphs on which the setup's best score equals
      the maximum over all setups of that graph (ties count for every tied setup);
    * mean_scores[setup] -- ``np.mean`` of the per-graph best scores (NaN for empty ``data``).

    Re-raises any exception hit while reading a score, after printing which
    graph/parameter/setup caused it.
    """
    unsup_names = ['inertia', 'modularity']
    init_names = ['one', 'all', 'kmp', 'any']
    setups = [f'{unsup}_{init_name}' for unsup, init_name in product(unsup_names, init_names)]
    not_seen = float('-inf')

    reaching_value = dict.fromkeys(setups, 0)
    best_scores = {setup: [] for setup in setups}
    for graph_idx, graph_results in enumerate(data):
        graph_results = graph_results['results']
        best_score = dict.fromkeys(setups, not_seen)
        # choose best score across kernel params
        for param, param_results in graph_results.items():
            bestby = group_inits(param_results)
            if bestby is None:
                continue
            for unsup, init_name in product(unsup_names, init_names):
                try:
                    score = bestby[init_name][unsup][score_name]
                except Exception:
                    print(f'graph_idx: {graph_idx}, param: {param}, init: {init_name}, unsup: {unsup}')
                    print(f'best_init: {bestby[init_name][unsup]}')
                    raise  # bare raise keeps the original traceback
                if score > best_score[f'{unsup}_{init_name}']:
                    best_score[f'{unsup}_{init_name}'] = score
        for setup in setups:
            if best_score[setup] == not_seen:
                best_score[setup] = 0  # nothing recorded for this setup on this graph
            best_scores[setup].append(best_score[setup])

        # add 1 if setup reaches maximum
        max_value = max(best_score.values())
        for setup in setups:
            reaching_value[setup] += best_score[setup] == max_value

    mean_scores = {setup: np.mean(best_scores[setup]) for setup in setups}

    return reaching_value, mean_scores

In [16]:
# Evaluate every (graph setup, kernel) pair from its cached pickle and collect the
# summaries as results[column_str][kernel_name].
# NOTE: pickle.load can execute arbitrary code -- only open cache files you trust.
results = defaultdict(dict)
for column, kernel_name in tqdm(list(product(columns, kernels_names))):
    n, k, p_in, p_out = column
    column_str = f'{n}_{k}_{p_in:.1f}_{p_out:.2f}'
    print(f'{column_str}, {kernel_name}')
    filename = f'{column_str}_{kernel_name}_results.pkl'
    with open(f'{CACHE_ROOT}/{filename}', 'rb') as f:
        data = pickle.load(f)
    reaching_value, mean_scores = perform_column_kernel(data)
    results[column_str][kernel_name] = dict(
        reaching_value=reaching_value,
        mean_scores=mean_scores,
    )

HBox(children=(IntProgress(value=0, max=325), HTML(value='')))

100_2_0.2_0.05, pWalk
100_2_0.2_0.05, Walk
100_2_0.2_0.05, For
100_2_0.2_0.05, logFor
100_2_0.2_0.05, Comm
100_2_0.2_0.05, logComm
100_2_0.2_0.05, Heat
100_2_0.2_0.05, logHeat
100_2_0.2_0.05, NHeat
100_2_0.2_0.05, logNHeat
100_2_0.2_0.05, SCT
100_2_0.2_0.05, SCCT
100_2_0.2_0.05, RSP
100_2_0.2_0.05, FE
100_2_0.2_0.05, PPR
100_2_0.2_0.05, logPPR
100_2_0.2_0.05, ModifPPR
100_2_0.2_0.05, logModifPPR
100_2_0.2_0.05, HeatPPR
100_2_0.2_0.05, logHeatPPR
100_2_0.2_0.05, DF
100_2_0.2_0.05, logDF
100_2_0.2_0.05, Abs
100_2_0.2_0.05, logAbs
100_2_0.2_0.05, SP-CT
100_2_0.3_0.05, pWalk
100_2_0.3_0.05, Walk
100_2_0.3_0.05, For
100_2_0.3_0.05, logFor
100_2_0.3_0.05, Comm
100_2_0.3_0.05, logComm
100_2_0.3_0.05, Heat
100_2_0.3_0.05, logHeat
100_2_0.3_0.05, NHeat
100_2_0.3_0.05, logNHeat
100_2_0.3_0.05, SCT
100_2_0.3_0.05, SCCT
100_2_0.3_0.05, RSP
100_2_0.3_0.05, FE
100_2_0.3_0.05, PPR
100_2_0.3_0.05, logPPR
100_2_0.3_0.05, ModifPPR
100_2_0.3_0.05, logModifPPR
100_2_0.3_0.05, HeatPPR
100_2_0.3_0.05, logHe

In [19]:
# Persist the aggregated summaries; they are converted to a plain dict first so the
# stored object does not carry the defaultdict's default factory.
with open('results.pkl', mode='wb') as f:
    pickle.dump(dict(results), f)

In [24]:
# Write the 'reaching_value' counters as a TSV table: one row per kernel and one
# column per (graph setup, selection setup) pair. Every cell, including the last
# one of a row, is followed by a tab.

setup_names = ['inertia_one', 'inertia_all', 'inertia_kmp', 'inertia_any',
               'modularity_one', 'modularity_all', 'modularity_kmp', 'modularity_any']
with open('reaching_value.tsv', 'w') as f:
    # header row 1: the graph setup tuple, repeated above each of its setup columns
    f.write('\t' + ''.join(f'{column}\t' for column in columns for _ in setup_names))
    # header row 2: the setup names, repeated for every graph setup
    f.write('\n\t' + ''.join(f'{init_name}\t' for _ in columns for init_name in setup_names))
    f.write('\n')
    for kernel_name in kernels_names:
        row = [f'{kernel_name}']
        for n, k, p_in, p_out in columns:
            column_str = f'{n}_{k}_{p_in:.1f}_{p_out:.2f}'
            counters = results[column_str][kernel_name]["reaching_value"]
            row.extend(f'{counters[init_name]}' for init_name in setup_names)
        f.write('\t'.join(row) + '\t\n')

In [25]:
# Write the 'mean_scores' values as a TSV table: one row per kernel and one column
# per (graph setup, selection setup) pair. Every cell, including the last one of a
# row, is followed by a tab.
score_setups = ['inertia_one', 'inertia_all', 'inertia_kmp', 'inertia_any',
                'modularity_one', 'modularity_all', 'modularity_kmp', 'modularity_any']
with open('mean_scores.tsv', 'w') as f:
    # header row 1: each graph setup tuple spans all of its setup columns
    top_header = ''.join(f'{column}\t' * len(score_setups) for column in columns)
    # header row 2: the same run of setup names under every graph setup
    sub_header = ''.join(f'{init_name}\t' for init_name in score_setups) * len(columns)
    f.write('\t' + top_header + '\n\t' + sub_header + '\n')
    for kernel_name in kernels_names:
        cells = [f'{kernel_name}\t']
        for n, k, p_in, p_out in columns:
            column_str = f'{n}_{k}_{p_in:.1f}_{p_out:.2f}'
            means = results[column_str][kernel_name]["mean_scores"]
            for init_name in score_setups:
                cells.append(f'{means[init_name]}\t')
        f.write(''.join(cells) + '\n')