In [11]:
import os
import sys
import pickle
import json
from collections import defaultdict, Counter
from tqdm.notebook import tqdm
import numpy as np
from itertools import product
import random

sys.path.append('../../..')
from pygraphs.measure import kernels

In [2]:
def shuffle(x):
    """Return a new list holding the items of ``x`` in random order; ``x`` itself is not modified."""
    # One random sort key is drawn per item from the global ``random`` state.
    return sorted(x, key=lambda _item: random.random())

In [3]:
# Directory with the cached result pickles, one file per
# '<column>_<kernel>_results_<NN>.pkl'. The location can be overridden through the
# PYGRAPHS_CACHE_ROOT environment variable so the notebook also runs on other machines;
# without the variable the original path is used.
CACHE_ROOT = os.environ.get(
    'PYGRAPHS_CACHE_ROOT',
    '/home/illusionww/Documents/HDD/phd/pygraphs/kkmeans_init_datasets_modularity/by_column_and_kernel'
)

# First component of every cache file name (presumably one graph dataset each — TODO confirm).
columns = [
    'cora_DB', 'cora_EC', 'cora_HA', 'cora_HCI', 'cora_IR', 'cora_Net',
    'dolphins',
    'eu-core',
    'eurosis',
    'football',
    'karate',
    'news_2cl_1', 'news_2cl_2', 'news_2cl_3',
    'news_3cl_1', 'news_3cl_2', 'news_3cl_3',
    'news_5cl_1', 'news_5cl_2', 'news_5cl_3',
    'polblogs',
    'polbooks',
    'sp_school_day_1', 'sp_school_day_2',
]

# Second component of every cache file name: the kernel / proximity measure identifier.
kernels_names = [
    'Katz', 'logKatz',
    'For', 'logFor',
    'Comm', 'logComm',
    'Heat', 'logHeat',
    'NHeat', 'logNHeat',
    'SCT', 'SCCT',
    'RSP', 'FE',
    'PPR', 'logPPR',
    'ModifPPR', 'logModifPPR',
    'HeatPR', 'logHeatPR',
    'DF', 'logDF',
    'Abs', 'logAbs',
    'SP-CT'
]

# kkmeans init experiments, results

Final goal: compare both the initialization strategies (one, all, k-means++, any) and the initialization quality measures (inertia, modularity).

In [5]:
# test
# Sanity check: load one cached result file (first entry of `columns`, 'Comm' kernel,
# file index 00) and display the first run stored under kernel-parameter key 0.0.
# NOTE(review): pickle.load can execute arbitrary code — only load cache files you trust.
column_str = columns[0]
kernel_name = 'Comm'
with open(f'{CACHE_ROOT}/{column_str}_{kernel_name}_results_00.pkl', 'rb') as f:
    data = pickle.load(f)
# Last expression of the cell: shown as the cell output.
data[0.0][0]

{'labels': array([3, 3, 3, ..., 3, 3, 3]),
 'inertia': array(7029.0024, dtype=float32),
 'modularity': array([-6.830985e-06], dtype=float32),
 'init': 'one',
 'score_ari': 0.00046596517909929035,
 'score_nmi': 0.039346568477725836}

In [12]:
# Tally how many runs of each initialization type are stored under parameter key 0.0.
Counter(run['init'] for run in data[0.0])

Counter({'one': 30, 'all': 30, 'k-means++': 30})

## Инициализация, дающая наилучший результат

Для каждого графа определяем наилучшую инициализацию **для каждого параметра** (по inertia, ARI и NMI). После этого считаем статистику внутри каждого графа: сколько побед у каждого типа инициализации (one, all, k-means++), и выбираем победителя для графа. Затем считаем статистику по всем графам и выбираем победителя. Делаем это для каждой меры и каждого сетапа генерации графов.

In [6]:
def group_inits(param_results):
    """
    Pick the preferred run of every initialization group for one kernel parameter.

    * First the runs are bucketed by their 'init' field: one, all, k-means++ ('kmp'),
      plus two mixed pools built after shuffling each bucket:
      'any' (up to 10 runs of each type) and 'any2' (up to 15 'all' + up to 15 'k-means++' runs);
    * Then, inside every bucket, the run with the lowest inertia and the run with the
      highest modularity are selected (on ties the earliest run in the bucket wins).

    Returns ``{group: {'inertia': run, 'modularity': run}}`` for the groups
    'one', 'all', 'kmp', 'any', 'any2', or ``None`` when any of the three
    single-type buckets is empty.
    """
    # Shuffle order (one, all, k-means++) matters for the global random state.
    buckets = {}
    for group, init_tag in (('one', 'one'), ('all', 'all'), ('kmp', 'k-means++')):
        buckets[group] = shuffle([run for run in param_results if run['init'] == init_tag])

    # An empty base bucket means this parameter cannot be evaluated; the mixed pools
    # are non-empty whenever all three base buckets are.
    if not all(buckets.values()):
        return None

    buckets['any'] = buckets['one'][:10] + buckets['all'][:10] + buckets['kmp'][:10]
    buckets['any2'] = buckets['all'][:15] + buckets['kmp'][:15]

    return {
        group: {
            'inertia': min(runs, key=lambda run: run['inertia']),  # choose best by inertia
            'modularity': max(runs, key=lambda run: run['modularity']),  # choose best by modularity
        }
        for group, runs in buckets.items()
    }

In [7]:
def perform_column_kernel(data, score_name='score_ari', unsup='modularity', init_name='any2'):
    """
    Collect one score per kernel parameter for a single cached result file.

    For every parameter value in ``data`` the runs are grouped with ``group_inits``;
    from the group ``init_name`` the run selected by the unsupervised measure ``unsup``
    is taken and its ``score_name`` value is recorded. Parameters for which
    ``group_inits`` returns ``None`` are skipped.

    :param data: mapping ``{kernel parameter: list of run dicts}``
    :param score_name: key of the score to read from the selected run (e.g. 'score_ari', 'score_nmi')
    :param unsup: selection measure, 'inertia' or 'modularity' (default keeps the previous behavior)
    :param init_name: group name: 'one', 'all', 'kmp', 'any' or 'any2' (default keeps the previous behavior)
    :return: list of scores in the iteration order of ``data``
    """
    scores = []
    # one score per kernel parameter; no reduction across parameters happens here
    for param_results in data.values():
        bestby = group_inits(param_results)
        if bestby is not None:
            scores.append(bestby[init_name][unsup][score_name])
    return scores

In [9]:
# results[column][kernel] -> list returned by perform_column_kernel for that cache file
results = defaultdict(dict)
# Cache files are numbered with a two-digit suffix; only index 0 ('..._results_00.pkl')
# is read, one file per (column, kernel) pair.
# NOTE(review): if the other indices (01..04) should contribute as well, aggregate them here — confirm intent.
run_index = 0
for column_str, kernel_name in tqdm(list(product(columns, kernels_names))):
    filename = f'{column_str}_{kernel_name}_results_{run_index:02d}.pkl'
    with open(f'{CACHE_ROOT}/{filename}', 'rb') as f:
        data = pickle.load(f)
    results[column_str][kernel_name] = perform_column_kernel(data)

HBox(children=(IntProgress(value=0, max=600), HTML(value='')))




In [10]:
# Persist the collected scores to disk as a plain dict (not the defaultdict wrapper).
plain_results = dict(results)
with open('results.pkl', 'wb') as dump_file:
    pickle.dump(plain_results, dump_file)

In [11]:
# Export a kernel x (column, measure_init) table as TSV with a two-row header.
# Column keys are '<measure>_<init>' in this fixed order: all inertia_* first, then all modularity_*.
score_keys = [
    f'{unsup}_{init}'
    for unsup in ('inertia', 'modularity')
    for init in ('one', 'all', 'kmp', 'any', 'any2')
]
# NOTE(review): this cell indexes results[column][kernel]["mean_scores"][key]; the
# perform_column_kernel visible in this notebook returns a plain list of scores, so confirm
# that `results` was produced by a version that stores a "mean_scores" mapping.
with open('all_scores.tsv', 'w') as f:
    # Header row 1: the column name repeated once per score key (leading empty cell for the kernel column).
    f.write('\t' + ''.join(f'{column}\t' for column in columns for _ in score_keys))
    # Header row 2: the score keys, repeated for every column.
    f.write('\n\t' + ''.join(f'{key}\t' for _ in columns for key in score_keys))
    f.write('\n')
    # One row per kernel; every cell is followed by a tab, so rows end with a trailing tab.
    for kernel_name in kernels_names:
        f.write(f'{kernel_name}\t')
        for column in columns:
            for key in score_keys:
                f.write(f'{results[column][kernel_name]["mean_scores"][key]}\t')
        f.write('\n')