In [1]:
import sys
import pickle
from collections import defaultdict

from matplotlib import colors
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from networkx.algorithms.approximation import clique
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import accuracy_score, f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR
from sklearn.preprocessing import Normalizer
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.optim.lr_scheduler import StepLR

from utils import SBM_Data, Datasets_Data, load_or_calc_and_save, ytrue_to_partition, calc_avranks, RFE, RFE_LOO, OneVsRest_custom, OneHotEncoding_custom

sys.path.append('../../pygkernels')
from pygkernels.scenario import d3_category20

In [2]:
# Load the SBM benchmark and its precalculated results; the first item returned
# by load_precalculated() is not needed here and is discarded.
data_hub = SBM_Data()
_, results_modularity_any3, modularity_results = data_hub.load_precalculated()

# Two 3-D arrays: features (X_3d) and per-kernel ARI scores (ari_3d).
# Presumably axis 0 is the SBM "column" (parameter setting) and axis 1 the graph
# sample within it -- TODO confirm against SBM_Data.make_dataset.
X_3d, ari_3d = data_hub.make_dataset(return_clf=False)

# Split along axis 1: the first 80 entries form trainval, the remainder is test.
# (SBMDataset further divides trainval into train [:60] and val [60:80].)
X_trainval_3d = X_3d[:, :80]
X_test_3d = X_3d[:, 80:]
ari_trainval_3d = ari_3d[:, :80]
ari_test_3d = ari_3d[:, 80:]

# 2-D versions: the first two axes are merged into rows, the last axis is kept.
X = X_3d.reshape(-1, X_3d.shape[2])
ari = ari_3d.reshape(-1, ari_3d.shape[2])
X_trainval = X_trainval_3d.reshape(-1, X_3d.shape[2])
ari_trainval = ari_trainval_3d.reshape(-1, ari_3d.shape[2])
X_test = X_test_3d.reshape(-1, X_3d.shape[2])
ari_test = ari_test_3d.reshape(-1, ari_3d.shape[2])

feature_names = data_hub.allowed_features_list

prepare columns: 100%|██████████| 87/87 [00:00<00:00, 651.25it/s]


In [3]:
# Position of the 'SCT' kernel within data_hub.kernel_names (displayed as the cell output).
data_hub.kernel_names.index('SCT')

10

# Baseline 1: the best measure for all

In [4]:
# Baseline 1: one kernel for everything -- the kernel whose ARI, averaged over
# all trainval rows, is the highest. That single choice is scored on both splits.
baseline1_kernel_idx = ari_trainval.mean(axis=0).argmax()
baseline1_kernel_name = data_hub.kernel_names[baseline1_kernel_idx]
baseline1_trainval_ari = ari_trainval[:, baseline1_kernel_idx].mean()
baseline1_test_ari = ari_test[:, baseline1_kernel_idx].mean()
print(
    f'baseline 1. best: {baseline1_kernel_name} ({baseline1_kernel_idx}), '
    f'trainval: {baseline1_trainval_ari:.3f}, test: {baseline1_test_ari:.3f}'
)

baseline 1. best: logHeat (7), trainval: 0.682, test: 0.680


# Baseline 2: best measure for every column (based on trainval)

In [5]:
# Baseline 2: one kernel per entry along axis 0 ("column"). ARI is averaged over
# axis 1 of each split; the kernel is chosen on the trainval averages and that
# fixed choice is then scored on both trainval and test.
baseline2_trainval_col_means = ari_trainval_3d.mean(axis=1)
baseline2_test_col_means = ari_test_3d.mean(axis=1)
baseline2_kernel_indices = baseline2_trainval_col_means.argmax(axis=1)

# Row selector pairing every column with its chosen kernel index.
baseline2_rows = np.arange(len(baseline2_kernel_indices))
baseline2_trainval_ari = baseline2_trainval_col_means[baseline2_rows, baseline2_kernel_indices].mean()
baseline2_test_ari = baseline2_test_col_means[baseline2_rows, baseline2_kernel_indices].mean()
print(f'baseline 2. trainval: {baseline2_trainval_ari:.3f}, test: {baseline2_test_ari:.3f}')

baseline 2. trainval: 0.698, test: 0.698


# Upper bound 1: best measure for every column (oracle: chosen on the same split it is evaluated on)

In [6]:
# Upper bound 1: per-"column" oracle. The best kernel is picked separately on
# each split and scored on that same split, so the test figure uses the test
# ARI for its own selection.
upperbound1_trainval_col_means = np.mean(ari_trainval_3d, axis=1)
upperbound1_kernel_indices = np.argmax(upperbound1_trainval_col_means, axis=1)
upperbound1_trainval_ari = np.mean(
    upperbound1_trainval_col_means[np.arange(len(upperbound1_kernel_indices)), upperbound1_kernel_indices]
)

# The index variable is re-bound here: after this cell it holds the test-split choice.
upperbound1_test_col_means = np.mean(ari_test_3d, axis=1)
upperbound1_kernel_indices = np.argmax(upperbound1_test_col_means, axis=1)
upperbound1_test_ari = np.mean(
    upperbound1_test_col_means[np.arange(len(upperbound1_kernel_indices)), upperbound1_kernel_indices]
)
print(f'upper bound 1. trainval: {upperbound1_trainval_ari:.3f}, test: {upperbound1_test_ari:.3f}')

upper bound 1. trainval: 0.698, test: 0.704


# Upper bound 2: best measure for every graph

In [7]:
# Upper bound 2: per-row oracle -- for every row take the highest ARI across
# kernels, then average those maxima over the split.
upperbound2_trainval_ari = ari_trainval.max(axis=1).mean()
upperbound2_test_ari = ari_test.max(axis=1).mean()
print(f'upper bound 2. trainval: {upperbound2_trainval_ari:.3f}, test: {upperbound2_test_ari:.3f}')

upper bound 2. trainval: 0.736, test: 0.738


# Our approach: NN

In [8]:
class SBMDataset(Dataset):
    """Torch dataset of (selected graph features, per-kernel ARI) rows.

    The 3-D arrays returned by ``SBM_Data.make_dataset`` are sliced along
    axis 1 according to ``part`` and then flattened so that every item is a
    pair ``(features, ari)`` of 1-D float tensors.

    Args:
        part: which split to expose -- ``'train'`` (axis-1 entries ``[:60]``),
            ``'val'`` (``[60:80]``) or ``'test'`` (``[80:]``).

    Raises:
        ValueError: if ``part`` is not one of the three supported names.
    """

    # Axis-1 slice that defines each split.
    _PART_SLICES = {
        'train': slice(None, 60),
        'val': slice(60, 80),
        'test': slice(80, None),
    }

    def __init__(self, part='train'):
        super().__init__()
        # Validate before the (comparatively slow) data load; previously an
        # unknown part surfaced only afterwards as an UnboundLocalError.
        if part not in self._PART_SLICES:
            raise ValueError(
                f'unknown part {part!r}; expected one of {sorted(self._PART_SLICES)}'
            )
        data_hub = SBM_Data()

        # Alternative feature subsets, kept for reference:
        #   ['n', 'k', 'p_in', 'p_out']
        #   ['log(n)/k * p_in/p_out', 'avg_sp']
        #   ['sbm_neighbour_score', 'modularity']
        #   ['log(n)/k * p_in/p_out', 'median_deg', 'max_clique/(n/k)']
        chosen_feature_names = ['sbm_neighbour_score', 'modularity', 'avg_sp', 'std_sp']

        # Translate feature names into positions along the feature axis.
        chosen_features = [
            data_hub.allowed_features_list.index(feature_name)
            for feature_name in chosen_feature_names
        ]

        X_3d, ari_3d = data_hub.make_dataset(return_clf=False)
        X_3d = X_3d[:, :, chosen_features]

        part_slice = self._PART_SLICES[part]
        X = X_3d[:, part_slice].reshape(-1, X_3d.shape[2])
        y = ari_3d[:, part_slice].reshape(-1, ari_3d.shape[2])
        assert X.shape[0] == y.shape[0]
        self.X = torch.from_numpy(X).float()
        self.y = torch.from_numpy(y).float()
        print(f'{part}: {self.X.shape[0]}')

    def __len__(self):
        """Number of rows in the selected split."""
        return self.X.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, ari)`` tensor pair stored at ``index``."""
        return self.X[index], self.y[index]

In [9]:
class PolicyNet(nn.Module):
    """Fully connected network mapping a feature vector to a probability
    distribution over ``out_channels`` choices.

    Six linear layers (128-256-512-256-128-out) with ReLU between them and a
    softmax on the output. The layer attribute names ``fc1`` .. ``fc6`` are part
    of the saved ``state_dict`` layout and must not be renamed.

    Args:
        input_channels: size of the input feature vector.
        out_channels: number of output probabilities.
    """

    def __init__(self, input_channels, out_channels):
        super().__init__()

        self.fc1 = nn.Linear(input_channels, 128)
        self.fc2 = nn.Linear(128, 256)
        self.fc3 = nn.Linear(256, 512)
        self.fc4 = nn.Linear(512, 256)
        self.fc5 = nn.Linear(256, 128)
        self.fc6 = nn.Linear(128, out_channels)

    def forward(self, x):
        """Return softmax probabilities over the last dimension of the output.

        Accepts a batch ``(..., input_channels)`` as well as a single unbatched
        vector ``(input_channels,)``.
        """
        for hidden in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            x = F.relu(hidden(x))
        logits = self.fc6(x)
        # nn.Linear always acts on the last dimension, so that is the class axis.
        # For the usual 2-D batch this is identical to the former axis=1.
        return torch.softmax(logits, dim=-1)

class RMSELoss(torch.nn.Module):
    """Root-mean-squared-error loss: ``sqrt(MSE(x, y) + eps)``.

    Args:
        eps: small constant added under the square root. The derivative of
            ``sqrt`` is infinite at 0, so without it a batch with zero error
            produces NaN gradients. The default is small enough not to change
            the reported loss noticeably.
    """

    def __init__(self, eps=1e-8):
        super().__init__()
        self.criterion = nn.MSELoss()
        self.eps = eps

    def forward(self, x, y):
        """Return the scalar RMSE between prediction ``x`` and target ``y``."""
        return torch.sqrt(self.criterion(x, y) + self.eps)

In [10]:
# Training configuration.
# Use the first GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
n_epoch = 2000
batch_size = 50

# Only the training loader shuffles and drops the last incomplete batch;
# validation and test iterate in order with small fixed-size batches.
train_dataset = SBMDataset(part='train')
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
val_dataset = SBMDataset(part='val')
val_dataloader = DataLoader(val_dataset, batch_size=10)
test_dataset = SBMDataset(part='test')
test_dataloader = DataLoader(test_dataset, batch_size=10)

# One sample is enough to read off the network's input and output widths.
sampleX, sampley = val_dataset[0]

model = PolicyNet(sampleX.shape[0], sampley.shape[0]).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0012)
# Halve the learning rate every 170 scheduler steps (the training loop steps once per epoch).
scheduler = StepLR(optimizer, step_size=170, gamma=0.5)
criterion = RMSELoss()

prepare columns: 100%|██████████| 87/87 [00:00<00:00, 692.05it/s]


train: 5220


prepare columns: 100%|██████████| 87/87 [00:00<00:00, 438.12it/s]


val: 1740


prepare columns: 100%|██████████| 87/87 [00:00<00:00, 688.33it/s]


test: 1740


In [11]:
# Train the policy network and keep the checkpoint with the lowest validation loss.
#
# Training objective: the network outputs a probability per kernel; the
# prediction is the probability-weighted ARI of a row and the target is the
# row's best achievable ARI (max over kernels).
# Validation: the kernel with the highest probability is picked and its ARI is
# compared against the same target.
#
# Batch variables are named X_batch / ari_batch so the loop neither overwrites
# the notebook-level `X` / `ari` arrays nor re-binds `ari` to a scalar.
writer = SummaryWriter()

best_val_loss = float('inf')
try:
    for epoch in tqdm(range(n_epoch)):
        writer.add_scalar('train/lr', scheduler.get_last_lr()[0], epoch)

        for idx, (X_batch, ari_batch) in enumerate(train_dataloader):
            X_batch, ari_batch = X_batch.to(device), ari_batch.to(device)
            y_true = torch.max(ari_batch, dim=1, keepdim=True)[0]
            optimizer.zero_grad()
            output = model(X_batch)
            y_pred = torch.sum(ari_batch * output, dim=1, keepdim=True)
            loss = criterion(y_pred, y_true)
            loss.backward()
            optimizer.step()
            # x-axis of the training curves: nominal number of samples seen so far.
            global_step = epoch * len(train_dataset) + idx * batch_size
            writer.add_scalar('train/loss', loss.item(), global_step)
            writer.add_scalar('train/ari', y_pred.detach().mean().item(), global_step)

        # Accumulate plain Python floats instead of a tensor living on `device`.
        val_loss, val_ari, counter = 0.0, 0.0, 0
        with torch.no_grad():
            for X_batch, ari_batch in val_dataloader:
                X_batch, ari_batch = X_batch.to(device), ari_batch.to(device)
                y_true = torch.max(ari_batch, dim=1, keepdim=True)[0]
                output = model(X_batch)
                chosen_kernel = torch.max(output, dim=1)[1]
                y_pred = ari_batch[range(len(output)), chosen_kernel].unsqueeze(1)
                loss = criterion(y_pred, y_true)
                val_loss += loss.item()
                val_ari += y_pred.mean().item()
                counter += 1
        # Average of the per-batch values.
        val_loss = val_loss / counter
        val_ari = val_ari / counter
        writer.add_scalar('val/loss', val_loss, epoch)
        writer.add_scalar('val/ari', val_ari, epoch)
        writer.flush()
        scheduler.step()

        if best_val_loss > val_loss:
            print(f'new best: {epoch}, loss={val_loss:.3f}, ari={val_ari:.3f}')
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'model_best.pth')
finally:
    # Release the event-file handle even when training is interrupted.
    writer.close()

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

new best: 0, loss=0.080, ari=0.674
new best: 1, loss=0.071, ari=0.683
new best: 3, loss=0.064, ari=0.690
new best: 8, loss=0.064, ari=0.690
new best: 9, loss=0.064, ari=0.691
new best: 14, loss=0.063, ari=0.691
new best: 18, loss=0.062, ari=0.692
new best: 43, loss=0.062, ari=0.693
new best: 60, loss=0.061, ari=0.693
new best: 80, loss=0.061, ari=0.693
new best: 166, loss=0.061, ari=0.692
new best: 176, loss=0.060, ari=0.693
new best: 186, loss=0.060, ari=0.693
new best: 198, loss=0.060, ari=0.693
new best: 202, loss=0.060, ari=0.693
new best: 326, loss=0.059, ari=0.693
new best: 348, loss=0.059, ari=0.694
new best: 359, loss=0.059, ari=0.694
new best: 365, loss=0.059, ari=0.694
new best: 369, loss=0.059, ari=0.694
new best: 404, loss=0.058, ari=0.694



In [12]:
# Rebuild the network and restore the best checkpoint written by the training loop.
# map_location remaps the stored tensors onto `device`, so a checkpoint saved on
# a GPU can also be loaded on a CPU-only machine (and vice versa).
model = PolicyNet(sampleX.shape[0], sampley.shape[0]).to(device)
model.load_state_dict(torch.load('model_best.pth', map_location=device))
model.eval()

PolicyNet(
  (fc1): Linear(in_features=4, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=512, bias=True)
  (fc4): Linear(in_features=512, out_features=256, bias=True)
  (fc5): Linear(in_features=256, out_features=128, bias=True)
  (fc6): Linear(in_features=128, out_features=25, bias=True)
)

In [13]:
# Evaluate the restored model on the test split: for every row take the kernel
# with the highest predicted probability and compare its ARI with the row's best
# achievable ARI.
#
# Batch variables are named X_batch / ari_batch so the notebook-level `X` / `ari`
# arrays are left intact, and the metrics are accumulated as Python floats rather
# than as a tensor living on `device`.
test_loss, test_ari, counter = 0.0, 0.0, 0
with torch.no_grad():
    for X_batch, ari_batch in test_dataloader:
        X_batch, ari_batch = X_batch.to(device), ari_batch.to(device)
        y_true = torch.max(ari_batch, dim=1, keepdim=True)[0]
        output = model(X_batch)
        chosen_kernel = torch.max(output, dim=1)[1]
        y_pred = ari_batch[range(len(output)), chosen_kernel].unsqueeze(1)
        loss = criterion(y_pred, y_true)
        test_loss += loss.item()
        test_ari += y_pred.mean().item()
        counter += 1
# Average of the per-batch values.
test_loss = test_loss / counter
test_ari = test_ari / counter
print(f'test loss: {test_loss:.3f}, ari: {test_ari:.3f}')

test loss: 0.059, ari: 0.696
