In [1]:
# file specific libraries
import torch
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import math, random, torch, collections, time, torch.nn.functional as F, networkx as nx, matplotlib.pyplot as plt, numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Linear
from torch_geometric.nn import GCNConv
from IPython.display import clear_output
from torch_geometric.utils import to_networkx
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from functools import wraps

%matplotlib inline
import sys,os
from models import *
from train_utils import *


In [2]:
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures
import torch
import torch_geometric as tg
import pandas as pd

dataset_name = 'Cora'
dataset = Planetoid(root='../data/Planetoid', name=dataset_name, transform=NormalizeFeatures())
###
print()
print(f'Dataset: {dataset}:')
print('======================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[0]  # Get the first graph object.

print()
print(data)
print('===========================================================================================================')

# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Number of training nodes: {data.train_mask.sum()}')
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Contains isolated nodes: {data.has_isolated_nodes()}')
print(f'Contains self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')



Dataset: Cora():
Number of graphs: 1
Number of features: 1433
Number of classes: 7

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
Number of nodes: 2708
Number of edges: 10556
Average node degree: 3.90
Number of training nodes: 140
Training node label rate: 0.05
Contains isolated nodes: False
Contains self-loops: False
Is undirected: True


In [14]:
#### Simple model

model =GNN(input_dim = data.num_features, hidden_dim=252,
           output_dim = dataset.num_classes, n_layers=2,
           activation ='relu', slope=.1,
           device ='cpu',
           alpha_res =0., alpha=0.5,
           beta=1.,
           normalize=False)
criterion = torch.nn.CrossEntropyLoss() 
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4) #didn't include weight decay
train_acc_list, test_acc_list, loss_list, misclassified, predictions = train(500, model, criterion, optimizer, 
                                                 x= data.x, edge_index= data.edge_index, y=data.y, 
                                                m=mask(data.train_mask, data.test_mask),
                                                scatter_size=30, plotting=False)

Final test accuracy: 0.77


In [37]:
u = model(data.x, data.edge_index)

In [38]:
 F.mse_loss(u, u)

tensor(0., grad_fn=<MseLossBackward0>)

In [53]:
loo_pipeline(model, dataset, data,data.train_mask,
                 data.test_mask, 1,
                 10,
                 original_output=None,
                 compute_y_differences=True,
                 task='classfication',
                 loss_function=torch.nn.CrossEntropyLoss(),
                 lr=0.001)

Final test accuracy: 0.77


(tensor([[-1.6147, -0.6425, -1.1681,  ..., -1.2312, -2.9105, -0.9876],
         [-0.0281, -2.0336, -1.6827,  ...,  4.0344, -1.7742, -1.6099],
         [-1.4779, -2.1740, -0.3025,  ...,  4.4120, -3.9713, -3.5023],
         ...,
         [ 0.5894,  0.5385, -2.2205,  ..., -1.4522,  0.6031, -0.3671],
         [-0.7702, -0.6123, -1.8835,  ..., -0.7678, -1.8863, -2.7292],
         [-1.1804, -0.3707, -1.4391,  ..., -0.8305, -2.1603, -3.3164]],
        grad_fn=<AddmmBackward0>),
 0.07619693,
 array([ 1.9922217e-08, -5.9752682e-07, -1.0459329e-06, ...,
         4.6757538e-08,  1.5225369e-08,  8.1113596e-09], dtype=float32),
 0.772)

In [57]:
import copy
from scipy.special import rel_entr

def loo_pipeline(model, dataset, data, train_mask,
                 test_mask, which_node,
                 n_epochs=200,
                 original_output=None,
                 compute_y_differences=False,
                 task='classfication',
                 loss_function=torch.nn.CrossEntropyLoss(),
                 lr=0.001):
    """
    model should be the trained model/
    original_output = output using the original data, optional
    indicate = Boolean
    compute_y_differences = Boolean

    returns: prediction by the given data,
            y_differences (y - y_hat) if compute_y_differences set True,
            accuracy of the model trained using the given data
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if original_output is None:
        model.eval()
        original_output  = model(data.x, data.edge_index)
    _, original_predictions = torch.max(original_output.detach(),1)
    length = len(data.y[train_mask])
    original_accuracy = (original_predictions[test_mask] == data.y[test_mask].detach()).sum().item()/length
    original_misclassified = (original_predictions[test_mask] != data.y[test_mask]).numpy()

    #### Mask a node, and retrain on the data for each node
    new_model = copy.deepcopy(model)
    optimizer = torch.optim.Adam(new_model.parameters(), lr=lr, weight_decay=5e-4) 
    new_mask = copy.deepcopy(train_mask)
    new_mask[which_node] = False
    train_acc_list, test_acc_list, loss_list, misclassified, predictions = train(n_epochs, new_model, loss_function, optimizer, 
                                                     x= data.x, edge_index= data.edge_index, y=data.y, 
                                                     m=mask(new_mask, test_mask),
                                                     scatter_size=30, plotting=False)
    loo_output  = new_model(data.x, data.edge_index)
    _, loo_predictions = torch.max(loo_output.detach(),1)
    length = len(data.y[test_mask])
    loo_accuracy = (loo_predictions[test_mask] == data.y[test_mask].detach()).sum().item()/length
    loo_misclassified = (loo_predictions[test_mask] != data.y[test_mask]).numpy()
    if compute_y_differences:
        if len(loo_output) != len(original_output):
            y_differences = None
            print(HERE)
        else:
            original_scores = torch.nn.functional.softmax(original_output, dim=0).detach().numpy()
            loo_scores = torch.nn.functional.softmax(loo_output, dim=0).detach().numpy()
            kl = np.mean(rel_entr(original_scores, loo_scores),1)
            #print(f"kl divergence: {kl}")
            y_differences = np.mean(np.linalg.norm(loo_scores - original_scores))
            # softmax(original_output) - softmax(new_output) -> difference of the score
        # compare two differences / nonnegative version
        # KL divergence, take MSE loss

        # deeplearning & nonlinear model - embedding solution is unique
        # score is more identifiable, as sums up to 1, comparable
        # way the model train is end up in diff local minimum
        # for x,y,z, to be unique, needs another constraint for ex alpha = 0
    else:
        y_differences = None
        kl = None

    return loo_output, y_differences, kl, loo_accuracy


def check_pipeline(model, dataset, data, train_mask,
                   test_mask,
                   n_epochs=200,
                   original_output=None,
                   indicate=False, \
                   return_prediction=False,
                   compute_y_differences=False,
                   dimension=32,
                   task='classification',
                   loss_function=torch.nn.CrossEntropyLoss(),
                   lr=0.001):
    """
    model should be the trained model/
    original_output = output using the original data, optional
    indicate = Boolean
    compute_y_differences = Boolean

    returns: prediction by the given data,
            y_differences (y - y_hat) if compute_y_differences set True,
            accuracy of the model trained using the given data
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if original_output is None:
        model.eval()
        original_output  = model(data.x, data.edge_index)
    _, original_predictions = torch.max(original_output.detach(),1)
    length = len(data.y[train_mask])
    original_accuracy = (original_predictions[test_mask] == data.y[test_mask].detach()).sum().item()/length
    original_misclassified = (original_predictions[test_mask] != data.y[test_mask]).numpy()
    y_differences = []
    kl = []
    #### Mask a node, and retrain on the data for each node
    for i in torch.where(data.train_mask)[0]:
        loo_output, y_prime, kl_prime, loo_accuracy_prime = loo_pipeline(model, dataset, data, train_mask,
                 test_mask, which_node=i,
                 n_epochs=n_epochs,
                 original_output=original_output,
                 compute_y_differences=compute_y_differences,
                 task=task,
                 loss_function=loss_function,
                 lr=lr)
        y_differences += [y_prime]
        kl += [kl_prime]


    return new_output, y_differences, kl, new_accuracy

