In [1]:
import pickle
from collections import defaultdict

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from networkx.algorithms.approximation import clique
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import Normalizer
from tqdm.notebook import tqdm

from utils import SBM_Data, Datasets_Data, load_or_calc_and_save, ytrue_to_partition, calc_avranks, RFE, RFE_LOO

In [2]:
# Entry point to the precalculated experiment results (helper class imported from utils).
data_hub = Datasets_Data()
# load_precalculated() is unpacked into exactly three values; the first one is discarded.
# results_modularity_any3 is consumed by the cells below as nested mappings:
#   column -> graph index -> measure name -> score.
# NOTE(review): the scores are presumably ARI values (later cells label them 'mean ari') — confirm in utils.
_, results_modularity_any3, modularity_results = data_hub.load_precalculated()

# Baseline: the single best measure over all columns and graphs

In [3]:
# Pool every score of each measure over all columns and all graphs.
pooled_scores = defaultdict(list)
for column_results in results_modularity_any3.values():
    for graph_results in column_results.values():
        for kernel_name, kernel_result in graph_results.items():
            pooled_scores[kernel_name].append(kernel_result)

# Collapse each pooled list to its mean: one number per measure.
by_kernel_results = {
    kernel_name: np.mean(scores) for kernel_name, scores in pooled_scores.items()
}

df = pd.DataFrame({
    'measure': list(by_kernel_results.keys()),
    'mean ari': list(by_kernel_results.values()),
})
# sort_values returns a sorted copy for display only; df itself keeps its original row order.
df.sort_values(by='mean ari', ascending=False)

Unnamed: 0,measure,mean ari
10,SCT,0.686819
11,SCCT,0.686724
12,RSP,0.667269
24,SP-CT,0.663687
15,logPPR,0.662223
23,logAbs,0.661906
7,logHeat,0.66153
13,FE,0.66083
17,logModifPPR,0.660433
3,logFor,0.660237


In [4]:
# Rows are picked by position in the (unsorted) frame; in the recorded output
# positions 7 and 10 correspond to logHeat and SCT.
tuple(list(df.iloc[position]) for position in (7, 10))

(['logHeat', 0.6615296144644105], ['SCT', 0.6868189236793265])

# Upper bound: the best measure chosen separately for every column

In [5]:
# For each column, find the measure whose mean score over that column's graphs is highest.
by_column_results = defaultdict(list)
for column_name, column_results in results_modularity_any3.items():
    # Per-column pool: measure name -> scores over this column's graphs.
    by_kernel_results = defaultdict(list)
    for graph_results in column_results.values():
        for kernel_name, kernel_result in graph_results.items():
            by_kernel_results[kernel_name].append(kernel_result)

    column_means = [
        (kernel_name, np.mean(scores)) for kernel_name, scores in by_kernel_results.items()
    ]
    # Stable sort on the negated mean: highest mean first, ties keep insertion order.
    column_means.sort(key=lambda pair: -pair[1])
    best_measure, best_ari = column_means[0]
    by_column_results[column_name] = (best_measure, best_ari)

df = pd.DataFrame(
    [
        (column, measure, mean_ari)
        for column, (measure, mean_ari) in by_column_results.items()
    ],
    columns=['column', 'best_measure', 'mean ari'],
)
# Average of the per-column best means.
np.mean(df['mean ari'].tolist())

0.7060681799642478

# Upper bound: best measure for every graph

In [6]:
# For every graph keep only the highest score that any measure achieved on it.
results = [
    np.max(list(graph_results.values()))
    for column_results in results_modularity_any3.values()
    for graph_results in column_results.values()
]

# Average of the per-graph best scores.
np.mean(results)

0.7102334917627724