In [1]:
import sys
import pickle
from collections import defaultdict

from matplotlib import colors
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from networkx.algorithms.approximation import clique
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import accuracy_score, f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR
from sklearn.preprocessing import Normalizer
from tqdm.notebook import tqdm
from sklearn.model_selection import LeaveOneOut

from helpers import (
    SBM_Data,
    Datasets_Data,
    load_or_calc_and_save,
    ytrue_to_partition,
    calc_avranks,
    RFE,
    RFE_LOO,
    OneVsRest_custom,
    OneHotEncoding_custom,
)

sys.path.append("../../pygkernels")
from pygkernels.scenario import d3_category20

In [2]:
sbm_data_hub, datasets_data_hub = SBM_Data(), Datasets_Data()

X_trainval, ari_trainval = sbm_data_hub.make_dataset(return_clf=False)
X_test, ari_test = datasets_data_hub.make_dataset(return_clf=False)
_, y_trainval = sbm_data_hub.make_dataset(return_clf=True)
_, y_test = datasets_data_hub.make_dataset(return_clf=True)

X_trainval_flat = X_trainval.reshape(-1, X_trainval.shape[2])
X_test_flat = X_test.reshape(-1, X_test.shape[2])
y_trainval_flat = y_trainval.reshape(-1, y_trainval.shape[2])
y_test_flat = y_test.reshape(-1, y_test.shape[2])
ari_trainval_flat = ari_trainval.reshape(-1, ari_trainval.shape[2])
ari_test_flat = ari_test.reshape(-1, ari_test.shape[2])

feature_names = sbm_data_hub.allowed_features_list

prepare columns:   0%|          | 0/11 [00:00<?, ?it/s]

wrapper: RECALC ../../cache/cache/feature_importance/dataset2sbm_sp_school_day_1.pkl. args: , kwargs: 
Dataset sp_school_day_1 not in cache; reload


prepare columns:  45%|████▌     | 5/11 [00:32<00:39,  6.56s/it]

wrapper: RECALC ../../cache/cache/feature_importance/dataset2sbm_sp_school_day_2.pkl. args: , kwargs: 
Dataset sp_school_day_2 not in cache; reload


prepare columns:  55%|█████▍    | 6/11 [01:04<01:11, 14.25s/it]

wrapper: RECALC ../../cache/cache/feature_importance/dataset2sbm_news_2cl1_0.1.pkl. args: , kwargs: 
Dataset news_2cl1_0.1 not in cache; reload


prepare columns:  64%|██████▎   | 7/11 [03:31<03:35, 53.88s/it]

wrapper: RECALC ../../cache/cache/feature_importance/dataset2sbm_news_2cl2_0.1.pkl. args: , kwargs: 
Dataset news_2cl2_0.1 not in cache; reload


prepare columns:  73%|███████▎  | 8/11 [06:01<04:08, 82.81s/it]

wrapper: RECALC ../../cache/cache/feature_importance/dataset2sbm_news_2cl3_0.1.pkl. args: , kwargs: 
Dataset news_2cl3_0.1 not in cache; reload


prepare columns:  82%|████████▏ | 9/11 [08:13<03:15, 97.67s/it]

wrapper: RECALC ../../cache/cache/feature_importance/dataset2sbm_news_3cl1_0.1.pkl. args: , kwargs: 
Dataset news_3cl1_0.1 not in cache; reload


prepare columns:  91%|█████████ | 10/11 [15:55<03:26, 206.83s/it]

wrapper: RECALC ../../cache/cache/feature_importance/dataset2sbm_news_3cl2_0.1.pkl. args: , kwargs: 
Dataset news_3cl2_0.1 not in cache; reload


prepare columns: 100%|██████████| 11/11 [23:09<00:00, 126.35s/it]
prepare columns:   0%|          | 0/11 [00:00<?, ?it/s]

wrapper: RECALC ../../cache/cache/feature_importance/news_2cl1_0.1.pkl. args: , kwargs: 
Dataset news_2cl1_0.1 not in cache; reload


prepare columns:  36%|███▋      | 4/11 [00:13<00:23,  3.40s/it]

wrapper: RECALC ../../cache/cache/feature_importance/news_2cl2_0.1.pkl. args: , kwargs: 
Dataset news_2cl2_0.1 not in cache; reload


prepare columns:  45%|████▌     | 5/11 [00:27<00:39,  6.52s/it]

wrapper: RECALC ../../cache/cache/feature_importance/news_2cl3_0.1.pkl. args: , kwargs: 
Dataset news_2cl3_0.1 not in cache; reload


prepare columns:  55%|█████▍    | 6/11 [00:38<00:39,  7.99s/it]

wrapper: RECALC ../../cache/cache/feature_importance/news_3cl1_0.1.pkl. args: , kwargs: 
Dataset news_3cl1_0.1 not in cache; reload


prepare columns:  64%|██████▎   | 7/11 [01:21<01:13, 18.31s/it]

wrapper: RECALC ../../cache/cache/feature_importance/news_3cl2_0.1.pkl. args: , kwargs: 
Dataset news_3cl2_0.1 not in cache; reload


prepare columns:  73%|███████▎  | 8/11 [02:01<01:14, 24.93s/it]

wrapper: RECALC ../../cache/cache/feature_importance/sp_school_day_1.pkl. args: , kwargs: 
Dataset sp_school_day_1 not in cache; reload


prepare columns:  91%|█████████ | 10/11 [02:04<00:17, 17.96s/it]

wrapper: RECALC ../../cache/cache/feature_importance/sp_school_day_2.pkl. args: , kwargs: 
Dataset sp_school_day_2 not in cache; reload


prepare columns: 100%|██████████| 11/11 [02:08<00:00, 11.67s/it]
prepare columns: 100%|██████████| 11/11 [00:00<00:00, 2434.56it/s]
prepare columns: 100%|██████████| 11/11 [00:00<00:00, 2698.25it/s]


In [3]:
chosen_feature_names = ["n", "k", "p_in", "p_out"]

chosen_features = []
for chosen_feature in chosen_feature_names:
    chosen_features.append(sbm_data_hub.allowed_features_list.index(chosen_feature))
chosen_features

[0, 1, 2, 3]

In [4]:
support_kernels = np.array([False, True, True, True, False, True, True, True, False,
                            False, True, True, True, False, False, False, True, True,
                            True, True, False, False, False, False, True])

# Baseline 1: the best measure for all

In [5]:
baseline1_kernel_idx = np.argmax(np.mean(ari_trainval_flat, axis=0))
baseline1_trainval_ari = np.mean(ari_trainval_flat[:, baseline1_kernel_idx])
baseline1_test_ari = np.mean(ari_test_flat[:, baseline1_kernel_idx])

baseline1_kernel_name = sbm_data_hub.kernel_names[baseline1_kernel_idx]
print(f"baseline 1. best: {baseline1_kernel_name} ({baseline1_kernel_idx}), "
      f"trainval: {baseline1_trainval_ari:.3f}, test: {baseline1_test_ari:.3f}")

baseline 1. best: HeatPR (18), trainval: 0.952, test: 0.827


# Upper bound 1: the best measure for all (by test)

In [6]:
upperbound1_kernel_idx = np.argmax(np.mean(ari_test_flat, axis=0))
upperbound1_trainval_ari = np.mean(ari_trainval_flat[:, upperbound1_kernel_idx])
upperbound1_test_ari = np.mean(ari_test_flat[:, upperbound1_kernel_idx])

upperbound1_kernel_name = sbm_data_hub.kernel_names[upperbound1_kernel_idx]
print(f'upper bound 1. best: {upperbound1_kernel_name} ({upperbound1_kernel_idx}), '
      f'trainval: {upperbound1_trainval_ari:.3f}, test: {upperbound1_test_ari:.3f}')

upper bound 1. best: SCT (10), trainval: 0.952, test: 0.834


# Upper bound 2: the best measure for every graph

In [7]:
upperbound2_trainval_ari = np.mean(np.max(ari_trainval_flat, axis=1))
upperbound2_test_ari = np.mean(np.max(ari_test_flat, axis=1))
print(f'upper bound 2. trainval: {upperbound2_trainval_ari:.3f}, test: {upperbound2_test_ari:.3f}')

upper bound 2. trainval: 0.973, test: 0.851


# Ours PRELIMINARY

In [8]:
estimator = OneVsRest_custom(SVR(), weight_samples=True)
estimator.fit(X_trainval_flat[:, chosen_features], ari_trainval_flat[:, support_kernels])
y_pred = estimator.predict(X_test_flat[:, chosen_features])
ours7 = np.mean(ari_test_flat[:, support_kernels][range(y_pred.shape[0]), np.argmax(y_pred, axis=1)])
ours7

0.8192357951064864

In [9]:
estimator = OneVsRest_custom(SVC(), weight_samples=False)
selector = RFE(estimator, feature_names, max_features=4, n_jobs=12)
selector = selector.fit(X_trainval_flat, y_trainval_flat, X_test_flat, y_test_flat, ari_test_flat)

  0%|          | 0/19 [00:00<?, ?it/s]

all features, acc=-1.000, f1=-1.000, ari=0.813


100%|██████████| 19/19 [00:00<00:00, 41.44it/s]
100%|██████████| 171/171 [00:00<00:00, 882.56it/s]

1 features, acc=-1.000, f1=-1.000, ari=0.821, set=('avg_sp',)



  6%|▌         | 60/969 [00:00<00:01, 473.35it/s]

2 features, acc=-1.000, f1=-1.000, ari=0.832, set=('n', 'sbm_neighbour_score')


100%|██████████| 969/969 [00:05<00:00, 179.65it/s]
  2%|▏         | 60/3876 [00:00<00:08, 475.71it/s]

3 features, acc=-1.000, f1=-1.000, ari=0.837, set=('p_in/p_out', 'modularity', 'std_sp')


100%|██████████| 3876/3876 [00:23<00:00, 162.24it/s]


4 features, acc=-1.000, f1=-1.000, ari=0.832, set=('n', 'k', 'p_in', 'sbm_neighbour_score')


In [10]:
estimator = OneVsRest_custom(SVR(), weight_samples=True)
selector = RFE(estimator, feature_names, max_features=4, n_jobs=12)
selector = selector.fit(X_trainval_flat, ari_trainval_flat, X_test_flat, ari_test_flat, ari_test_flat)

100%|██████████| 19/19 [00:00<00:00, 2378.01it/s]

all features, acc=-1.000, f1=-1.000, ari=0.615



 21%|██        | 36/171 [00:00<00:00, 316.05it/s]

1 features, acc=-1.000, f1=-1.000, ari=0.834, set=('diameter',)


100%|██████████| 171/171 [00:00<00:00, 185.37it/s]
  4%|▎         | 36/969 [00:00<00:02, 339.61it/s]

2 features, acc=-1.000, f1=-1.000, ari=0.836, set=('n/k', 'avg_deg')


100%|██████████| 969/969 [00:08<00:00, 116.33it/s]
  1%|          | 40/3876 [00:00<00:11, 346.53it/s]

3 features, acc=-1.000, f1=-1.000, ari=0.837, set=('k', 'n/k', 'std_deg')


100%|██████████| 3876/3876 [00:35<00:00, 108.17it/s]


4 features, acc=-1.000, f1=-1.000, ari=0.837, set=('p_in/p_out', 'modularity', 'diameter', 'std_sp')


# Ours MAIN

In [11]:
kernel_ari = defaultdict(list)
for train_index, test_index in LeaveOneOut().split(X_trainval):
    X_train, X_val = X_trainval[train_index], X_trainval[test_index]
    y_train, y_val = y_trainval[train_index], y_trainval[test_index]
    
    estimator = OneVsRest_custom(SVR(), weight_samples=True)
    estimator.fit(X_train[:, chosen_features], ari_train[:, support_kernels])
    y_pred = estimator.predict(X_val[:, chosen_features])
    ours7 = np.mean(ari_val[:, support_kernels][range(y_pred.shape[0]), np.argmax(y_pred, axis=1)])
    

NameError: name 'ari_train' is not defined