In [1]:
%matplotlib inline

In [2]:
import json
import os
import sys
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
from scipy.stats import rankdata
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

sys.path.append('../../pygkernels')
from pygkernels.measure import kernels

In [3]:
def calc_avranks(results):  # {dataset: {classifier: accuracy}}
    ranks = defaultdict(list)
    for _, classifier_accuracy in results.items():
        classifiers, accuracies = zip(*list(classifier_accuracy.items()))
        for classifier, rank in zip(classifiers, rankdata(-np.array(accuracies), method='min')):
            ranks[classifier].append(rank)
    ranks = {k: np.mean(v) for k, v in sorted(ranks.items(), key=lambda x: x[0])}
    return list(ranks.values()), list(ranks.keys()), len(results)

In [4]:
kernels_order = [x.name for x in kernels]
list(enumerate(kernels_order))

[(0, 'Katz'),
 (1, 'logKatz'),
 (2, 'For'),
 (3, 'logFor'),
 (4, 'Comm'),
 (5, 'logComm'),
 (6, 'Heat'),
 (7, 'logHeat'),
 (8, 'NHeat'),
 (9, 'logNHeat'),
 (10, 'SCT'),
 (11, 'SCCT'),
 (12, 'RSP'),
 (13, 'FE'),
 (14, 'PPR'),
 (15, 'logPPR'),
 (16, 'ModifPPR'),
 (17, 'logModifPPR'),
 (18, 'HeatPR'),
 (19, 'logHeatPR'),
 (20, 'DF'),
 (21, 'logDF'),
 (22, 'Abs'),
 (23, 'logAbs'),
 (24, 'SP-CT')]

# Leaderboards

In [5]:
with open('filtered_dataset.json', 'r') as f:
    dataset = json.load(f)
    
for item in dataset:
    if 'several' in item['measure_best_results']:
        del item['measure_best_results']['several']
    
full_size = len(dataset)
print(len(dataset))

# dataset = [item for item in dataset if 1 <= item['estimated_params']['tau1'] <= 4 and item['estimated_params']['modularity'] > 0.0]
# print(len(dataset) / full_size)

9167


In [6]:
associative_dataset = [x for x in dataset if x['estimated_params']['modularity'] >= 0]
dissociative_dataset = [x for x in dataset if x['estimated_params']['modularity'] < 0]
len(associative_dataset), len(dissociative_dataset)

(5062, 4105)

In [7]:
lb = {}

# Leaderboard by ranks ">="
ranks, names, n_experiments = calc_avranks(dict(enumerate([data['measure_best_results'] for data in associative_dataset])))
ranks_ge = dict(zip(names, ranks))
lb['ranks(>=)'] = ranks_ge

# Leaderboard by wins ">", maxari < 1
wins_ge = {kernel_name: 0 for kernel_name in kernels_order}
meanari = {kernel_name: (0, 0) for kernel_name in kernels_order}
for kernel_name in kernels_order:
    for data in associative_dataset:
        item = data['measure_best_results']
        if item[kernel_name] >= np.max([v for k, v in item.items() if k != kernel_name]):
            wins_ge[kernel_name] += 1
        
        meanari[kernel_name] = (meanari[kernel_name][0] + item[kernel_name], meanari[kernel_name][1] + 1)
    meanari[kernel_name] = meanari[kernel_name][0] / meanari[kernel_name][1]
            
lb['wins(>=)'] = {k: v/full_size*100 for k, v in wins_ge.items()}
lb['meanari'] = meanari
# pd.DataFrame(Counter(ranks_ge).most_common(), columns=['measure', 'rank']).sort_values('rank')

df = pd.DataFrame(lb)
df.sort_values('ranks(>=)', ascending=True)

Unnamed: 0,ranks(>=),wins(>=),meanari
RSP,4.174437,22.101015,0.681007
SCCT,5.155077,27.577179,0.68871
logNHeat,5.259384,18.992037,0.665982
logHeatPR,5.268471,19.166576,0.666156
FE,5.493086,19.679284,0.665833
logKatz,5.577637,21.71921,0.669678
logComm,6.076847,22.548271,0.656212
logPPR,6.174437,19.188393,0.656736
logModifPPR,6.43955,18.970219,0.654337
SCT,7.201896,19.766554,0.650236


In [8]:
df.to_excel('associative_leaderboard.xlsx')

In [9]:
lb = {}

# Leaderboard by ranks ">="
ranks, names, n_experiments = calc_avranks(dict(enumerate([data['measure_best_results'] for data in dissociative_dataset])))
ranks_ge = dict(zip(names, ranks))
lb['ranks(>=)'] = ranks_ge

# Leaderboard by wins ">", maxari < 1
wins_ge = {kernel_name: 0 for kernel_name in kernels_order}
meanari = {kernel_name: (0, 0) for kernel_name in kernels_order}
for kernel_name in kernels_order:
    for data in dissociative_dataset:
        item = data['measure_best_results']
        if item[kernel_name] >= np.max([v for k, v in item.items() if k != kernel_name]):
            wins_ge[kernel_name] += 1
        
        meanari[kernel_name] = (meanari[kernel_name][0] + item[kernel_name], meanari[kernel_name][1] + 1)
    meanari[kernel_name] = meanari[kernel_name][0] / meanari[kernel_name][1]
            
lb['wins(>=)'] = {k: v/full_size*100 for k, v in wins_ge.items()}
lb['meanari'] = meanari
# pd.DataFrame(Counter(ranks_ge).most_common(), columns=['measure', 'rank']).sort_values('rank')

df = pd.DataFrame(lb)
df.sort_values('ranks(>=)', ascending=True)

Unnamed: 0,ranks(>=),wins(>=),meanari
SCCT,4.132278,26.104505,0.653922
RSP,6.686967,8.683321,0.454491
SCT,7.678928,6.861569,0.457158
SP-CT,7.904994,8.061525,0.443853
FE,8.807552,6.905203,0.40321
logHeatPR,9.018758,5.956147,0.360776
logNHeat,9.340317,5.748882,0.351566
logPPR,9.708161,5.781608,0.344514
NHeat,9.838246,3.17443,0.313808
Abs,10.180024,6.654303,0.348622


In [10]:
df.to_excel('dissociative_leaderboard.xlsx')

In [11]:
df.to_excel('leaderboard.xlsx')