In [1]:
%matplotlib inline

In [2]:
import os
import pickle
import random
import sys
from collections import defaultdict

import networkx as nx
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from networkx.algorithms.approximation import clique
from scipy.stats import rankdata
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from tqdm.notebook import tqdm

sys.path.append('../../pygkernels')
from pygkernels.data import Datasets

from sbm_neighbour_score import sbm_neighbour_score

In [3]:
kernels_names = [
    'Katz', 'logKatz',
    'For', 'logFor',
    'Comm', 'logComm',
    'Heat', 'logHeat',
    'NHeat', 'logNHeat',
    'SCT', 'SCCT',
    'RSP', 'FE',
    'PPR', 'logPPR',
    'ModifPPR', 'logModifPPR',
    'HeatPR', 'logHeatPR',
    'DF', 'logDF',
    'Abs', 'logAbs',
    'SP-CT'
]

shuffle = lambda x: sorted(x, key=lambda k: random.random())

def dict_argmax(dct, score_key):
    best_key = list(dct.keys())[0]
    best_val = dct[best_key]
    for k, v in dct.items():
        if v[score_key] > best_val[score_key]:
            best_key, best_val = k, v
    return best_key, best_val

CACHE_ROOT = '../../cache/cache'

def load_or_calc_and_save(filename, force_calc=False, ignore_if_exist=False):
    def decorator(func):
        def wrapped(*args, **kwargs):
            if os.path.exists(filename) and not force_calc:
                print(f'{func.__name__}: cache file {filename} found! Skip calculations')
                if not ignore_if_exist:
                    with open(filename, 'rb') as f:
                        result = pickle.load(f)
                else:
                    result = None
            else:
                print(f'{func.__name__}: RECALC {filename}.\nargs: {", ".join(args)}, kwargs: {", ".join([f"{k}={v}" for k, v in kwargs.items()])}')
                result = func(*args, **kwargs)
                with open(filename, 'wb') as f:
                    pickle.dump(result, f)
            return result
        return wrapped
    return decorator

def calc_avranks(results):  # {dataset: {classifier: accuracy}}
    ranks = defaultdict(list)
    for dataset, classifier_accuracy in results.items():
        if type(dataset) == tuple:
            dataset = '_'.join([str(x) for x in dataset])
        classifiers, accuracies = zip(*list(classifier_accuracy.items()))
        for classifier, rank in zip(classifiers, rankdata(accuracies)):
            ranks[classifier].append(rank)
    ranks = {k: np.mean(v) for k, v in sorted(ranks.items(), key=lambda x: x[0])}
    return list(ranks.values()), list(ranks.keys()), len(results)

def ytrue_to_partition(y_true):
    partition = defaultdict(list)
    for idx, class_ in enumerate(y_true):
        partition[class_].append(idx)
    return list(partition.values())

In [4]:
DATASETS_RESULTS_ROOT = '../../cache/kkmeans_init_datasets'
datasets = [
    'cora_DB', 'cora_EC', 'cora_HA', 'cora_HCI', 'cora_IR', 'cora_Net',
    'dolphins',
    'eu-core',
    'eurosis',
    'football',
    'karate',
    'news_2cl1_0.1', 'news_2cl2_0.1', 'news_2cl3_0.1',
    'news_3cl1_0.1', 'news_3cl2_0.1', 'news_3cl3_0.1',
    'news_5cl1_0.1', 'news_5cl2_0.1', 'news_5cl3_0.1',
    'polblogs',
    'polbooks',
    'sp_school_day_1', 'sp_school_day_2'
]

In [5]:
with open(f'{CACHE_ROOT}/datasets_inits_bestparam_byari_individual_0.1.pkl', 'rb') as f:
    results = pickle.load(f)
with open(f'{CACHE_ROOT}/datasets_modularity_0.1.pkl', 'rb') as f:
    modularity_results = pickle.load(f)
    
for key in list(results.keys()):
    if key[0] not in datasets:
        del results[key]
        
for key in list(results.keys()):
    if key[0] not in datasets:
        del modularity_results[key]

In [6]:
results_modularity_any3 = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))  # {dataset: {graphidx: {kernel_name: best_ari}}}
for (dataset, kernel_name, graph_idx), si_ari in results.items():
    results_modularity_any3[dataset][graph_idx][kernel_name] = si_ari['modularity_any3']

In [7]:
def extract_feature(dataset_name, feature, G, partition, sp, max_clique):
    # graph-independent features
    n, k, p_in, p_out = dataset_name
    if feature == 'n':
        return n
    elif feature == 'k':
        return k
    elif feature == 'p_in':
        return p_in
    elif feature == 'p_out':
        return p_out
    elif feature == 'n/k':
        return n / k
    elif feature == 'p_in/p_out':
        return p_in / p_out
    
    elif feature == 'log(n)':
        return n
    elif feature == 'log(k)':
        return k
    elif feature == 'log(p_in)':
        return p_in
    elif feature == 'log(p_out)':
        return p_out
    elif feature == 'log(n/k)':
        return n / k
    elif feature == 'log(p_in/p_out)':
        return p_in / p_out
    
    elif feature == 'n/k * p_in/p_out':
        return (n / k) * (p_in / p_out)
    elif feature == 'log(n)/k * p_in/p_out':
        return np.log(n) / k * (p_in / p_out)
    elif feature == 'log(n/k) * p_in/p_out':
        return np.log(n / k) * (p_in / p_out)
    elif feature == 'log(n/k * p_in/p_out)':
        return np.log((n / k) * (p_in / p_out))
    
    elif feature == 'sbm_neighbour_score':
        return sbm_neighbour_score(int(n), int(k), p_in, p_out)
    
    # graph-dependant features
    elif feature == 'modularity':
        return nx.community.modularity(G, partition)
    elif feature == 'diameter':
        return nx.diameter(G)
    elif feature == 'density':
        return nx.density(G)
    elif feature == 'avg_deg':
        return np.mean(G.degree)
    elif feature == 'std_deg':
        return np.std(G.degree)
    elif feature == 'avg(deg | deg > avg_deg)':
        deg = np.array(G.degree)
        return np.mean(deg[deg > np.mean(deg)])
    elif feature == 'median_deg':
        return np.median(G.degree)
    elif feature == 'max_deg':
        return np.max(G.degree)
    elif feature == 'avg_sp':
        return nx.average_shortest_path_length(G)
    elif feature == 'std_sp':
        return np.std(sp)
    elif feature == 'median_sp':
        return np.median(sp)
    elif feature == 'max_sp':
        return np.max(sp)
    elif feature == 'max_clique':
        return max_clique
    elif feature == 'max_clique/(n/k)':
        return max_clique/(n/k)
    else:
        raise Exception()

# Feature importance

In [8]:
feature_names = [
    'n', 'k', 'p_in', 'p_out', 'n/k', 'p_in/p_out',
    'log(n)/k * p_in/p_out', 'n/k * p_in/p_out', 'log(n/k) * p_in/p_out', 'log(n/k * p_in/p_out)',
    'sbm_neighbour_score',
    'modularity', 'diameter', 'density', 
    'avg_deg', 'std_deg', 'avg(deg | deg > avg_deg)', 'median_deg', 'max_deg',
    'avg_sp', 'std_sp', 'median_sp', 'max_sp', 
    'max_clique', 'max_clique/(n/k)'
]

feature_names.extend([f'kernel_{name}' for name in kernels_names])

In [9]:
def prepare_column(column):
    @load_or_calc_and_save(f'{CACHE_ROOT}/feature_importance/{column}.pkl')
    def wrapper():
        X, ya, yr = [], [], []
        (A, partition), info = Datasets()[column]
        n, k, p_in, p_out = info['n'], info['k'], info['p_in'], info['p_out']
        G = nx.from_numpy_matrix(A)
        partition = ytrue_to_partition(partition) 
        sp = [l for u in G for v, l in nx.single_source_shortest_path_length(G, u).items()]
        max_clique = len(clique.max_clique(G))
        features = [extract_feature((n, k, p_in, p_out), feature_name, G, partition, sp, max_clique) for feature_name in feature_names]
        for graph_idx in range(7):
            graph_ari = [v for k, v in sorted(list(results_modularity_any3[column][graph_idx].items()), key=lambda x: x[0])]
            graph_ranks = calc_avranks({0: results_modularity_any3[column][graph_idx]})[0]

            X.append(features)
            ya.append(graph_ari)
            yr.append(graph_ranks)
        return X, ya, yr
    
    return wrapper()
    
Xy_list = Parallel(n_jobs=1)(delayed(prepare_column)(column) for column in tqdm(results_modularity_any3.keys()))

X, y, X_train, y_train, X_val, y_val = [], [], [], [], [], []
for Xi, _, yi in Xy_list:
    yi = yi > (np.max(yi, axis=1, keepdims=True) - 0.05)
    for i in range(25):
        feature_onehot = np.zeros((7, 25))
        feature_onehot[:, i] = 1
        Xif = np.concatenate([Xi, feature_onehot], axis=1)
        X.extend(Xif)
        y.extend(yi[:, i])
        X_train.extend(Xif[:5])
        y_train.extend(yi[:5, i])
        X_val.extend(Xif[5:])
        y_val.extend(yi[5:, i])
    
X, y, X_train, y_train, X_val, y_val = np.array(X), np.array(y), np.array(X_train), np.array(y_train), np.array(X_val), np.array(y_val)

HBox(children=(IntProgress(value=0, max=24), HTML(value='')))

wrapper: cache file ../../cache/cache/feature_importance/cora_DB.pkl found! Skip calculations
wrapper: cache file ../../cache/cache/feature_importance/cora_EC.pkl found! Skip calculations
wrapper: cache file ../../cache/cache/feature_importance/cora_HA.pkl found! Skip calculations
wrapper: cache file ../../cache/cache/feature_importance/cora_HCI.pkl found! Skip calculations
wrapper: cache file ../../cache/cache/feature_importance/cora_IR.pkl found! Skip calculations
wrapper: cache file ../../cache/cache/feature_importance/cora_Net.pkl found! Skip calculations
wrapper: cache file ../../cache/cache/feature_importance/dolphins.pkl found! Skip calculations
wrapper: cache file ../../cache/cache/feature_importance/eu-core.pkl found! Skip calculations
wrapper: cache file ../../cache/cache/feature_importance/eurosis.pkl found! Skip calculations
wrapper: cache file ../../cache/cache/feature_importance/football.pkl found! Skip calculations
wrapper: cache file ../../cache/cache/feature_importance

In [10]:
X_train[0]

array([1.00600000e+03, 7.00000000e+00, 2.28103279e-02, 2.95176881e-03,
       1.43714286e+02, 7.72768107e+00, 7.63245103e+00, 1.11057816e+03,
       3.83897842e+01, 7.01263603e+00, 7.63662712e-01, 4.30491183e-01,
       1.30000000e+01, 6.24116001e-03, 2.54386183e+02, 3.22101079e+02,
       6.30000000e+02, 2.55000000e+01, 1.00500000e+03, 4.86076971e+00,
       1.61001160e+00, 5.00000000e+00, 1.30000000e+01, 7.00000000e+00,
       4.87077535e-02, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00])

In [23]:
estimator = LogisticRegression(max_iter=10000)
estimator.fit(X_train, y_train)
pd.DataFrame([{'feature': k, 'importance': v} for k, v in sorted(zip(feature_names, estimator.coef_[0]), key=lambda x: -np.abs(x[1]))])

Unnamed: 0,feature,importance
0,median_sp,-1.72107
1,avg(deg | deg > avg_deg),-1.595722
2,kernel_FE,1.296688
3,kernel_RSP,1.084001
4,kernel_logFor,1.084001
5,sbm_neighbour_score,0.832644
6,log(n/k * p_in/p_out),0.820211
7,kernel_HeatPR,0.744296
8,kernel_Comm,-0.689058
9,kernel_logHeat,-0.689058


In [12]:
estimator = LogisticRegression(max_iter=10000)
selector = RFE(estimator, n_features_to_select=2, verbose=1)
selector = selector.fit(X_train, y_train)
print(list(zip(feature_names, selector.support_)))
print(list(zip(feature_names, selector.ranking_)))

Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 fe

In [13]:
pd.DataFrame(zip(feature_names, selector.support_, selector.ranking_), columns=['feature', 'to choose', 'rank']).sort_values('rank')

Unnamed: 0,feature,to choose,rank
21,median_sp,True,1
20,std_sp,True,1
10,sbm_neighbour_score,False,2
13,density,False,3
38,kernel_FE,False,4
37,kernel_RSP,False,5
28,kernel_logFor,False,6
43,kernel_HeatPR,False,7
49,kernel_SP-CT,False,8
42,kernel_logModifPPR,False,9


In [14]:
estimator = LogisticRegression()
estimator.fit(X_train[:, selector.support_], y_train)

LogisticRegression()

In [22]:
print('\n'.join([f'{k}\t{v:.3f}' for k, v in sorted(zip(np.array(feature_names)[selector.support_], estimator.coef_[0]), key=lambda x: -np.abs(x[1]))]))

std_sp	4.232
median_sp	-2.444


In [17]:
y_pred = estimator.predict(X_val[:, selector.support_])

In [18]:
accuracy_score(y_val.ravel(), y_pred.ravel())

0.9341666666666667

In [19]:
f1_score(y_val.ravel(), y_pred.ravel())

0.6550218340611353