In [1]:
%matplotlib inline

In [2]:
import json
import os
import sys
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
from scipy.stats import rankdata
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

sys.path.append('../../pygkernels')
from pygkernels.measure import kernels

In [3]:
def calc_avranks(results):  # {dataset: {classifier: accuracy}}
    ranks = defaultdict(list)
    for _, classifier_accuracy in results.items():
        classifiers, accuracies = zip(*list(classifier_accuracy.items()))
        for classifier, rank in zip(classifiers, rankdata(-np.array(accuracies), method='min')):
            ranks[classifier].append(rank)
    ranks = {k: np.mean(v) for k, v in sorted(ranks.items(), key=lambda x: x[0])}
    return list(ranks.values()), list(ranks.keys()), len(results)

In [4]:
kernels_order = [x.name for x in kernels]
list(enumerate(kernels_order))

[(0, 'Katz'),
 (1, 'logKatz'),
 (2, 'For'),
 (3, 'logFor'),
 (4, 'Comm'),
 (5, 'logComm'),
 (6, 'Heat'),
 (7, 'logHeat'),
 (8, 'NHeat'),
 (9, 'logNHeat'),
 (10, 'SCT'),
 (11, 'SCCT'),
 (12, 'RSP'),
 (13, 'FE'),
 (14, 'PPR'),
 (15, 'logPPR'),
 (16, 'ModifPPR'),
 (17, 'logModifPPR'),
 (18, 'HeatPR'),
 (19, 'logHeatPR'),
 (20, 'DF'),
 (21, 'logDF'),
 (22, 'Abs'),
 (23, 'logAbs'),
 (24, 'SP-CT')]

# Leaderboards

In [5]:
with open('filtered_dataset.json', 'r') as f:
    dataset = json.load(f)
full_size = len(dataset)
print(len(dataset))

# dataset = [item for item in dataset if 1 <= item['estimated_params']['tau1'] <= 4 and item['estimated_params']['modularity'] > 0.0]
# print(len(dataset) / full_size)

7396


In [14]:
associative_dataset = [x for x in dataset if x['estimated_params']['modularity'] >= 0]
dissociative_dataset = [x for x in dataset if x['estimated_params']['modularity'] < 0]
len(associative_dataset), len(dissociative_dataset)

(2556, 4840)

In [19]:
lb = {}

# Leaderboard by ranks ">="
ranks, names, n_experiments = calc_avranks(dict(enumerate([data['measure_best_results'] for data in associative_dataset])))
ranks_ge = dict(zip(names, ranks))
lb['ranks(>=)'] = ranks_ge

# Leaderboard by wins ">", maxari < 1
wins_ge = {kernel_name: 0 for kernel_name in kernels_order}
meanari = {kernel_name: (0, 0) for kernel_name in kernels_order}
for kernel_name in kernels_order:
    for data in associative_dataset:
        item = data['measure_best_results']
        if item[kernel_name] >= np.max([v for k, v in item.items() if k != kernel_name]):
            wins_ge[kernel_name] += 1
        
        meanari[kernel_name] = (meanari[kernel_name][0] + item[kernel_name], meanari[kernel_name][1] + 1)
    meanari[kernel_name] = meanari[kernel_name][0] / meanari[kernel_name][1]
            
lb['wins(>=)'] = {k: v/full_size*100 for k, v in wins_ge.items()}
lb['meanari'] = meanari
# pd.DataFrame(Counter(ranks_ge).most_common(), columns=['measure', 'rank']).sort_values('rank')

df = pd.DataFrame(lb)
df.sort_values('ranks(>=)', ascending=True)

Unnamed: 0,ranks(>=),wins(>=),meanari
RSP,4.220657,14.440238,0.680692
SCCT,4.56338,19.429421,0.700559
logNHeat,5.280908,13.290968,0.667493
logHeatPR,5.296557,13.290968,0.667439
FE,5.582942,13.547864,0.66804
logKatz,5.758998,13.710114,0.667096
logComm,6.136541,15.156842,0.643482
logPPR,6.197574,13.31801,0.661114
logModifPPR,6.507042,13.128718,0.658402
SP-CT,6.548513,13.061114,0.659287


In [20]:
df.to_excel('associative_leaderboard.xlsx')

In [21]:
lb = {}

# Leaderboard by ranks ">="
ranks, names, n_experiments = calc_avranks(dict(enumerate([data['measure_best_results'] for data in dissociative_dataset])))
ranks_ge = dict(zip(names, ranks))
lb['ranks(>=)'] = ranks_ge

# Leaderboard by wins ">", maxari < 1
wins_ge = {kernel_name: 0 for kernel_name in kernels_order}
meanari = {kernel_name: (0, 0) for kernel_name in kernels_order}
for kernel_name in kernels_order:
    for data in dissociative_dataset:
        item = data['measure_best_results']
        if item[kernel_name] >= np.max([v for k, v in item.items() if k != kernel_name]):
            wins_ge[kernel_name] += 1
        
        meanari[kernel_name] = (meanari[kernel_name][0] + item[kernel_name], meanari[kernel_name][1] + 1)
    meanari[kernel_name] = meanari[kernel_name][0] / meanari[kernel_name][1]
            
lb['wins(>=)'] = {k: v/full_size*100 for k, v in wins_ge.items()}
lb['meanari'] = meanari
# pd.DataFrame(Counter(ranks_ge).most_common(), columns=['measure', 'rank']).sort_values('rank')

df = pd.DataFrame(lb)
df.sort_values('ranks(>=)', ascending=True)

Unnamed: 0,ranks(>=),wins(>=),meanari
SCCT,3.972314,40.684154,0.637098
RSP,7.811983,8.504597,0.385274
NHeat,8.198347,6.368307,0.336386
SCT,8.541942,7.030827,0.380617
SP-CT,8.953306,7.936723,0.374131
Katz,9.160124,2.46079,0.291131
Comm,9.351033,4.77285,0.285586
logNHeat,9.529339,6.003245,0.31681
FE,9.784504,6.828015,0.349098
logHeatPR,9.971074,6.003245,0.315509


In [22]:
df.to_excel('dissociative_leaderboard.xlsx')

In [8]:
df.to_excel('leaderboard.xlsx')