In [1]:
import os
import sys
import pickle
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
import numpy as np

sys.path.append('/home/illusionww/Documents/GitHub/pygraphs')
from pygraphs.measure import kernels

# kkmeans init experiments, results

In [2]:
# Directory that holds the pickled result files, one per (dataset, kernel) pair.
CACHE_ROOT = './kkmeans_init_datasets_modularity/by_column_and_kernel'

# Datasets to report on, in output-column order: four named graphs followed by
# the 'news_<k>cl_<i>' family (k in 2, 3, 5; i in 1..3).
dataset_names = ['dolphins', 'football', 'karate', 'polbooks'] + [
    f'news_{k}cl_{i}'
    for k in (2, 3, 5)
    for i in (1, 2, 3)
]

In [3]:
# Load one cached result file (first dataset, first kernel) to inspect its layout.
dataset = dataset_names[0]
kernel = kernels[0]
sample_path = f'{CACHE_ROOT}/{dataset}_{kernel.name}_results.pkl'
# NOTE: pickle.load can execute arbitrary code; only open caches produced by this project.
with open(sample_path, 'rb') as f:
    data = pickle.load(f)

In [4]:
# Show a single cached record: first graph, parameter value 0.5, first run.
data[0][0.5][0]

{'labels': array([0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
        1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
        1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'inertia': -0.1579842569518608,
 'init': 'one',
 'score_ari': -0.04697883948486392,
 'score_nmi': 0.01958606105761375}

## Experiment 2: инициализация, дающая наилучший результат

Для каждого графа определяем наилучшую инициализацию **для каждого параметра** (по inertia, ARI и NMI). После этого считаем статистику внутри каждого графа, сколько побед у каждого типа инициализаций (one, all, k-means++), выбираем победителя для графа. После этого считаем статистику для всех графов, выбираем победителя. Делаем это для каждой меры и каждого сетапа генерации графов.

In [5]:
def choose_init(param_results, choose_by_measure='inertia', request_measure='score_ari'):
    """Pick, per initialisation, the run that is best by ``choose_by_measure``.

    Parameters
    ----------
    param_results : list of dict
        Run records; each provides ``'init'``, ``choose_by_measure`` and
        ``request_measure`` entries.
    choose_by_measure : str
        Key used to rank runs. ``'inertia'`` is minimised, any other key is
        maximised.
    request_measure : str
        Key whose value is reported for the winning run.

    Returns
    -------
    dict
        Maps ``'one'``, ``'all'``, ``'k-means++'`` and ``'any'`` (all runs,
        regardless of init) to the ``request_measure`` value of the selected
        run, or NaN when no run qualified. Ties keep the earliest run.
    """
    minimise = choose_by_measure == 'inertia'
    selected = {}
    for init_name in ('one', 'all', 'k-means++', 'any'):
        if init_name == 'any':
            candidates = param_results
        else:
            candidates = [run for run in param_results if run['init'] == init_name]

        best_rank = np.inf if minimise else -np.inf
        best_value = np.nan
        for run in candidates:
            rank = run[choose_by_measure]
            value = run[request_measure]
            # Runs reporting a requested score above 1 are ignored
            # (presumably invalid values -- TODO confirm).
            if value > 1:
                continue
            is_better = rank < best_rank if minimise else rank > best_rank
            if is_better:
                best_rank, best_value = rank, value
        selected[init_name] = best_value
    return selected

In [12]:
def choose_param(graph_results, init_name='one', choose_by_measure='inertia', request_measure='score_nmi'):
    """Return the best ``request_measure`` one initialisation reaches on a graph.

    For every parameter value in ``graph_results`` (a mapping from parameter to
    a list of run records) the run is selected by ``choose_init`` using
    ``choose_by_measure``; the largest resulting ``request_measure`` value over
    all parameters is returned, ignoring NaNs.

    ``init_name`` is ``'one'``, ``'all'``, ``'k-means++'`` or ``'any'``
    (``'any'`` considers every run regardless of its init).
    """
    per_param_scores = []
    for param_runs in graph_results.values():
        if init_name != 'any':
            param_runs = [run for run in param_runs if run['init'] == init_name]
        chosen = choose_init(param_runs, choose_by_measure, request_measure)
        per_param_scores.append(chosen[init_name])
    return np.nanmax(per_param_scores)
        
def exp3(dataset_name, kernel):
    """Summarise the cached results of one (dataset, kernel) pair.

    Loads ``{CACHE_ROOT}/{dataset_name}_{kernel.name}_results.pkl`` and, for
    each graph stored in it, asks ``choose_param`` for the best ``score_nmi``
    each initialisation ('one', 'all', 'k-means++', 'any') reaches when runs
    are selected by inertia.

    Parameters
    ----------
    dataset_name : str
        Dataset part of the cache file name.
    kernel : object
        Kernel whose ``name`` attribute forms the second part of the file name.

    Returns
    -------
    all_graphs_stat : defaultdict
        init name -> number of graphs on which that init reached the highest
        score. Tied inits each receive a vote.
    all_graphs_avg : defaultdict
        init name -> mean of the per-graph best scores, ignoring NaNs.
    """
    choose_by_measure, request_measure = 'inertia', 'score_nmi'
    init_names = ('one', 'all', 'k-means++', 'any')

    # NOTE: pickle.load can execute arbitrary code; only point CACHE_ROOT at
    # caches produced by this project.
    with open(f'{CACHE_ROOT}/{dataset_name}_{kernel.name}_results.pkl', 'rb') as f:
        data = pickle.load(f)

    all_graphs_stat = defaultdict(int)
    per_graph_scores = defaultdict(list)
    for graph_results in data:
        graph_stat = {
            init_name: choose_param(graph_results, init_name=init_name,
                                    choose_by_measure=choose_by_measure,
                                    request_measure=request_measure)
            for init_name in init_names
        }
        for init_name in init_names:
            per_graph_scores[init_name].append(graph_stat[init_name])

        # The built-in max() is order dependent with NaN (max(nan, x) is nan),
        # so a NaN score for the first init would cancel every vote for this
        # graph. Take the maximum over the valid scores only.
        valid_scores = [score for score in graph_stat.values() if not np.isnan(score)]
        max_val = max(valid_scores) if valid_scores else np.nan
        for init_name in init_names:
            # NaN never equals max_val, so inits without a score get no vote.
            all_graphs_stat[init_name] += int(graph_stat[init_name] == max_val)

    all_graphs_avg = defaultdict(list)
    for init_name in init_names:
        all_graphs_avg[init_name] = np.nanmean(per_graph_scores[init_name])

    return all_graphs_stat, all_graphs_avg

In [13]:
# In-memory memo of experiment results for the current kernel session, keyed by
# (experiment function identity, dataset name, kernel name).
cached = {}

def request_cache(dataset_name, kernel, func=exp3):
    """Return ``func(dataset_name, kernel)``, computing it at most once.

    Parameters
    ----------
    dataset_name : str
        Dataset to evaluate.
    kernel : object
        Kernel to evaluate; its ``name`` attribute is part of the cache key.
    func : callable
        Experiment to run, called as ``func(dataset_name, kernel)``.

    Returns
    -------
    Whatever ``func`` returned for this (dataset, kernel) pair.
    """
    # The function identity is part of the key so that two different
    # experiments on the same dataset/kernel never share a cache slot.
    func_id = getattr(func, '__qualname__', None) or repr(func)
    key = (func_id, dataset_name, kernel.name)

    if key not in cached:
        cached[key] = func(dataset_name, kernel)

    return cached[key]

In [14]:
# Write the exp3 summaries as two tab-separated tables with the same layout:
# one row per kernel and four columns (one / all / k-means++ / any) per dataset.
# 'exp3_vote.csv' takes the first element returned by request_cache (win
# counts), 'exp3_ari.csv' the second (averaged scores).
# NOTE(review): exp3 requests 'score_nmi', so the averages written to
# 'exp3_ari.csv' are NMI values despite the file name -- confirm the intended name.
init_columns = ('one', 'all', 'k-means++', 'any')
for table_path, stat_index in (('exp3_vote.csv', 0), ('exp3_ari.csv', 1)):
    with open(table_path, 'w') as f:
        # Header line 1: the selection measure, then each dataset name padded
        # to span its four columns.
        f.write('Inertia\t')
        f.write(''.join(f'{dataset_name}\t\t\t\t' for dataset_name in dataset_names))
        # Header line 2: the init names, repeated once per dataset.
        f.write('\n\t')
        f.write('one\tall\tk-means++\tany\t' * len(dataset_names))
        f.write('\n')

        for kernel in tqdm(kernels):
            f.write(f'{kernel.name}\t')
            for dataset_name in dataset_names:
                vote, ari = request_cache(dataset_name, kernel)
                stat = (vote, ari)[stat_index]
                f.write(''.join(f'{stat[column]}\t' for column in init_columns))
            f.write('\n')
            # Flush after every kernel so partial results survive an interruption.
            f.flush()

HBox(children=(IntProgress(value=0, max=21), HTML(value='')))




HBox(children=(IntProgress(value=0, max=21), HTML(value='')))


