In [1]:
%matplotlib inline

In [2]:
import os
import pickle
import sys
from collections import Counter, defaultdict
from copy import deepcopy
from itertools import combinations, product
import pandas as pd
import json
import random

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import powerlaw
from joblib import Parallel, delayed
from matplotlib import colors
from networkx.algorithms.community.quality import modularity as nx_modularity
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.utils.testing import ignore_warnings
from tqdm.notebook import tqdm

import torch
from torch import nn
from torch import optim
import torch.nn.functional as F

sys.path.append('../../pygkernels')
from pygkernels.measure import kernels
from pygkernels.scenario import d3_colors



# Localization by SVM

In [3]:
# Kernel names, in the iteration order of `kernels` imported from pygkernels.measure.
kernels_order = [kernel.name for kernel in kernels]
# Index -> name listing; the trailing ';' keeps the notebook from echoing it.
list(enumerate(kernels_order));

In [4]:
# Disabled alternative loader: reads every file found under `root`, one JSON document per file.
# root = '/media/illusionww/68949C3149F4E819/phd/pygkernels/montecarlo_lfr_simple'
# dataset = []
# for fn in tqdm(os.listdir(root)):
#     with open(f'{root}/{fn}', 'rb') as f:
#         dataset.append(json.load(f))

with open('filtered_dataset.json', 'r') as f:
    dataset = json.load(f)

# Positions of the items whose maximal score is reached by more than one measure,
# i.e. items for which "the best measure" is not unique.
idx_several_answers = {
    idx
    for idx, d in enumerate(dataset)
    for scores in [np.array(list(d['measure_best_results'].values()))]
    if np.sum(scores == np.max(scores)) > 1
}

# Keep only the items with a single best measure.
dataset = [item for position, item in enumerate(dataset) if position not in idx_several_answers]

In [5]:
# Feature name sets. All four currently hold the same entries; 'avg_density' is
# switched off in each of them.
lfr_feature_names = [
    'n',
    'tau1|sqrtinv',
    'tau2|sqrtinv',
#     'avg_density',
    'avg_degree|log',
    'modularity'
]

graph_feature_names = [
    'n',
    'tau1|sqrtinv',
    'tau2|sqrtinv',
#     'avg_density',
    'avg_degree|log',
    'modularity'
]

all_feature_names = [
    'n',
    'tau1|sqrtinv',
    'tau2|sqrtinv',
#     'avg_density',
    'avg_degree|log',
    'modularity'
]

top_feature_names = [
    'n',
    'tau1|sqrtinv',
    'tau2|sqrtinv',
#     'avg_density',
    'avg_degree|log',
    'modularity'
]

# Per-feature range of the estimated parameters over the whole dataset.
feature_min = {fn: np.min([data['estimated_params'][fn] for data in dataset]) for fn in all_feature_names}
feature_max = {fn: np.max([data['estimated_params'][fn] for data in dataset]) for fn in all_feature_names}
feature_width = {fn: feature_max[fn] - feature_min[fn] for fn in all_feature_names}


def feature_normalize(x, fn):
    """Min-max scale ``x`` with the range of feature ``fn`` observed in ``dataset``.

    Args:
        x: raw feature value (scalar or array).
        fn: feature name, a key of ``feature_min`` / ``feature_width``.

    Returns:
        ``(x - min) / (max - min)``; the dataset minimum maps to 0 and the maximum to 1.
        For a feature that is constant over the dataset the width is 0, so the
        divisor falls back to 1 and in-dataset values map to 0 instead of NaN.
    """
    return (x - feature_min[fn]) / (feature_width[fn] or 1.0)


# Sanity check: raw range of every feature and its image after normalisation.
for fn in all_feature_names:
    print(f'{fn}: {feature_min[fn]:.2f}, {feature_max[fn]:.2f} | '
          f'{feature_normalize(feature_min[fn], fn):.2f}, {feature_normalize(feature_max[fn], fn):.2f}')

n: 10.00, 1499.00 | 0.00, 1.00
tau1|sqrtinv: 0.27, 0.94 | 0.00, 1.00
tau2|sqrtinv: 0.19, 0.97 | 0.00, 1.00
avg_degree|log: 0.86, 7.07 | 0.00, 1.00
modularity: -0.46, 0.84 | 0.00, 1.00


In [6]:
class AffineModel(nn.Module):
    """Soft membership score for the region ``||W x + b|| <= 1``.

    The input is passed through a learnable ``ndim -> ndim`` affine layer and each
    row is scored by how far its image lies from the unit sphere.

    Args:
        ndim: number of input features (the layer is square).
    """

    def __init__(self, ndim):
        super().__init__()
        self.layer = nn.Linear(ndim, ndim)

    def forward(self, X):
        """Return one score per row of ``X``.

        The score is ``(clamp(1 - ||h||, -10, 1) + 1) / 2`` with ``h = layer(X)``:
        1 at the centre (``h == 0``), 0.5 on the border (``||h|| == 1``) and
        never below -4.5 because of the lower clamp bound.
        """
        h = self.layer(X)
        # Tensor.norm instead of sqrt(sum(h ** 2)): same value, but the gradient at
        # h == 0 is 0 rather than NaN, so a sample on the centre cannot poison training.
        radius = h.norm(p=2, dim=1)
        # 1 - radius never exceeds 1, so only the lower bound of the clamp is ever active.
        return (torch.clamp(1 - radius, -10, 1) + 1) / 2

In [7]:
def mse(y_pred, y_true):
    """Class-balanced mean squared error for binary targets.

    Every sample is weighted by ``n / (2 * n_class)``, where ``n_class`` is the
    number of samples that share its label, so both classes contribute equally
    regardless of imbalance.

    Args:
        y_pred: tensor of predicted scores, one per sample.
        y_true: tensor of 0/1 labels with the same length as ``y_pred``.

    Returns:
        Scalar tensor with the weighted mean of the squared errors.
    """
    n_samples = float(y_true.shape[0])
    # clamp(min=1): an absent class would otherwise get an infinite weight, and
    # 0 * inf = NaN would spread to the whole loss. Its weight is multiplied by 0
    # for every sample anyway, so the clamped value does not matter.
    n_pos = torch.sum(y_true == 1).float().clamp(min=1)
    n_neg = torch.sum(y_true == 0).float().clamp(min=1)
    pos_weight = n_samples / n_pos / 2
    neg_weight = n_samples / n_neg / 2
    weights = y_true * pos_weight + (1 - y_true) * neg_weight
    return torch.mean(weights * (y_pred - y_true) ** 2)

In [8]:
class EllipsoidEstimator:
    """Binary classifier built on :class:`AffineModel`.

    The model is trained full-batch with Adam on the class-balanced ``mse`` loss.
    ``predict`` returns the raw scores; callers threshold them at 0.5.

    Args:
        ndim: number of input features.
        device: torch device (or CUDA ordinal) to train on. When a CUDA device is
            requested but CUDA is unavailable, the CPU is used instead.
    """

    def __init__(self, ndim, device=0):
        # A bare int has always meant "CUDA ordinal" here; do not crash on CPU-only machines.
        wants_cuda = isinstance(device, int) or torch.device(device).type == 'cuda'
        if wants_cuda and not torch.cuda.is_available():
            device = 'cpu'
        self.device = device
        self.model = AffineModel(ndim).to(device)

        # Start from the identity map with every bias set to 0.5.
        # NOTE(review): with h = x + 0.5 the initial unit ball is centred at x = -0.5;
        # confirm whether a bias of -0.5 (centre of the [0, 1] feature range) was intended.
        nn.init.eye_(self.model.layer.weight)
        nn.init.constant_(self.model.layer.bias, 0.5)

    def fit(self, X, y, n_epochs=10000, lr=0.001):
        """Train the model and keep the weights that reached the lowest loss.

        Args:
            X: numpy array of shape (n_samples, ndim).
            y: numpy array of 0/1 (or boolean) labels, one per sample.
            n_epochs: number of full-batch optimisation steps.
            lr: Adam learning rate.

        Returns:
            self
        """
        X = torch.from_numpy(X).float().to(self.device)
        y_true = torch.from_numpy(y).long().to(self.device)

        best_state, min_loss = None, float('inf')
        optimizer = optim.Adam(params=self.model.parameters(), lr=lr)
        for _ in range(n_epochs):
            optimizer.zero_grad()
            loss = mse(self.model(X), y_true)

            # Snapshot BEFORE the optimiser step: these are the weights that actually
            # produced `loss`. (A NaN loss never compares as smaller, so it is never stored.)
            loss_value = loss.item()
            if loss_value < min_loss:
                min_loss = loss_value
                best_state = {name: value.detach().clone()
                              for name, value in self.model.state_dict().items()}

            loss.backward()
            optimizer.step()

        # best_state stays None when no epoch ran or every loss was NaN; keep the current weights then.
        if best_state is not None:
            self.model.load_state_dict(best_state)
        return self

    def predict(self, X):
        """Return the model scores for the rows of ``X`` as a numpy array."""
        # .float() mirrors fit(): float64 input would otherwise clash with the float32 weights
        X = torch.from_numpy(X).float().to(self.device)
        with torch.no_grad():
            y_pred = self.model(X)
        return y_pred.cpu().numpy()

    def fit_predict(self, X, y):
        """Fit on ``(X, y)`` and return the scores for the same ``X``."""
        self.fit(X, y)
        return self.predict(X)

In [9]:
# Feature matrix: one row per dataset item, one normalised column per name in all_feature_names.
tx = np.array([
    [feature_normalize(entry['estimated_params'][feature], feature) for feature in all_feature_names]
    for entry in dataset
]).astype(np.float32)
# Score matrix: one row per dataset item, one column per kernel in kernels_order.
ty = np.array([
    [entry['measure_best_results'][kernel_name] for kernel_name in kernels_order]
    for entry in dataset
]).astype(np.float32)
print(tx.shape, ty.shape)

# Disabled experiment: fit one all-feature EllipsoidEstimator per kernel and report its F1.
# for kernel_name in ['SCCT', 'NHeat', 'Comm', 'RSP', 'logComm', 'SP-CT', 'DF', 'Katz', 'SCT', 'logDF', 'logKatz']:
#     measure_idx = kernels_order.index(kernel_name)
#     y_true = np.argmax(ty, axis=1) == measure_idx
    
#     estimator = EllipsoidEstimator(tx.shape[1])
#     y_pred = estimator.fit_predict(tx, y_true)
#     f1 = f1_score(y_true, y_pred > 0.5)
    
#     print(f'{kernel_name}\t({measure_idx})\t{np.sum(y_true)}\t{f1:.2f}')

(5360, 5) (5360, 25)


## Best 2d projection

In [10]:
class RFE:
    """Scores feature subsets with an :class:`EllipsoidEstimator`.

    ``fit`` currently evaluates the full feature set only. The exhaustive search over
    subsets of ``max_features`` features (run through joblib with ``n_jobs`` workers)
    is disabled, so those two arguments are stored but not used.

    Attributes:
        results: dict keyed by number of features; each value is a dict with the
            keys ``'set'``, ``'f1'`` and ``'estimator'``.
    """

    def __init__(self, feature_names, max_features=2, n_jobs=4):
        self.feature_names = feature_names
        self.max_features = max_features
        self.n_jobs = n_jobs
        self.results = {}

    @ignore_warnings(category=ConvergenceWarning)
    def score_one(self, X, y_true, set_feat_names, device=0):
        """Fit an ellipsoid on the columns named in ``set_feat_names`` and score it.

        Args:
            X: feature matrix whose columns follow ``self.feature_names``.
            y_true: boolean/0-1 labels, one per row of ``X``.
            set_feat_names: names of the features to use.
            device: device passed on to :class:`EllipsoidEstimator`.

        Returns:
            Tuple ``(set_feat_names, f1, estimator)`` where ``f1`` is the training-set
            F1 score of the predictions thresholded at 0.5.
        """
        # Plain `bool`: the `np.bool` alias was removed in NumPy 1.24.
        support = np.array([x in set_feat_names for x in self.feature_names], dtype=bool)
        estimator = EllipsoidEstimator(ndim=int(np.sum(support)), device=device)
        y_pred = estimator.fit_predict(X[:, support], y_true)
        f1 = f1_score(y_true, y_pred > 0.5)
        return set_feat_names, f1, estimator

    def fit(self, X, y_true):
        """Score the full feature set, print its F1 and store the fitted estimator.

        Returns:
            self
        """
        set_feat_names, f1_all, estimator_all = self.score_one(X, y_true, self.feature_names)
        print(f'all features, f1={f1_all:.3f}')

        # Keep the fitted model instead of discarding it, so it can be inspected or plotted later.
        self.results[len(self.feature_names)] = {
            'set': set_feat_names,
            'f1': f1_all,
            'estimator': estimator_all,
        }
        return self

In [11]:
# Rebuild the feature and score matrices so this cell does not depend on earlier state of tx / ty.
tx = np.array([
    [feature_normalize(entry['estimated_params'][feature], feature) for feature in all_feature_names]
    for entry in dataset
]).astype(np.float32)
ty = np.array([
    [entry['measure_best_results'][name] for name in kernels_order]
    for entry in dataset
]).astype(np.float32)
print(tx.shape, ty.shape)

# Fitted RFE object per kernel name.
all_estimators = {}

for kernel_name in (
    'SCCT',
    'NHeat',
    'Comm',
    'RSP',
    'logComm',
    'SP-CT',
    'SCT',
    'logDF',
    'logKatz',
):
    print(kernel_name)
    measure_idx = kernels_order.index(kernel_name)
    # One-vs-rest target: True where this kernel has the highest score of the row.
    y_true = ty.argmax(axis=1) == measure_idx

    # RFE.fit returns the RFE instance itself.
    estimator = RFE(all_feature_names, max_features=5).fit(tx, y_true)
    all_estimators[kernel_name] = estimator

    print()

(5360, 5) (5360, 25)
SCCT
all features, f1=0.782

NHeat
all features, f1=0.434

Comm
all features, f1=0.246

RSP
all features, f1=0.122

logComm
all features, f1=0.301

SP-CT
all features, f1=0.081

SCT
all features, f1=0.100

logDF
all features, f1=0.222

logKatz
all features, f1=0.125



In [12]:
def draw2d(self, X, y_true):
    """Draw the decision region of a two-feature estimator with the samples on top.

    Written in method style: ``self`` must provide ``feature_names`` and
    ``results[2]``, a dict holding the selected feature names under ``'set'`` and
    the fitted estimator under ``'estimator'``.

    Args:
        X: feature matrix whose columns follow ``self.feature_names``; the plot
            assumes values normalised to [0, 1].
        y_true: labels, one per row of ``X`` (used for the F1 score and point colours).

    Raises:
        ValueError: if the stored feature set does not select exactly two columns.
    """
    result = self.results[2]
    set_feat_names, estimator = result['set'], result['estimator']
    # Plain `bool` / `int`: the `np.bool` and `np.int` aliases were removed in NumPy 1.24.
    support = np.array([x in set_feat_names for x in self.feature_names], dtype=bool)
    support_idx = np.flatnonzero(support)
    if support_idx.size != 2:
        raise ValueError(f'draw2d needs exactly 2 selected features, got {support_idx.size}')

    y_pred = estimator.predict(X[:, support])
    f1 = f1_score(y_true, y_pred > 0.5)
    print(f'f1: {f1:.3f}')

    # Evaluate the estimator on a 101 x 101 grid covering [0, 1] x [0, 1].
    ticks = np.arange(101) / 100
    grid = np.array(list(product(ticks, ticks)), dtype=np.float32)
    # product() varies its first factor slowest, so inside[i, j] belongs to x = i/100, y = j/100.
    inside = (estimator.predict(grid) > 0.5).reshape(101, 101)

    # White canvas, grey where the estimator says "inside"; image rows are y and columns are x.
    background = np.full((101, 101, 3), 255, dtype=np.uint8)
    background[inside.T] = 128

    fig, ax = plt.subplots(1, 1, figsize=(8, 8))
    ax.imshow(background)

    ax.scatter(X[:, support_idx[0]] * 100, X[:, support_idx[1]] * 100, s=1, c=y_true)

    # Label by column position: the plotted columns follow self.feature_names order,
    # which is not necessarily the order of the names inside set_feat_names.
    ax.set_xlabel(self.feature_names[support_idx[0]])
    ax.set_ylabel(self.feature_names[support_idx[1]])

    plt.tight_layout()