In [37]:
# file specific libraries
import torch
import pandas as pd 
import numpy as np

In [38]:
import sys
sys.path.append('../')
from GNN import *
from train_mask import *
from data import *

## training

In [39]:
# Build three differently-masked versions of the datasets for the experiments below.
# NOTE(review): cora_A, cora_P, random_train_mask, create_train_mask and mask_a_node are
# not defined in this notebook; presumably they come from the star imports of `data`
# and `train_mask` above — confirm.
cora_A_random_data = random_train_mask(cora_A)
# NOTE(review): the meaning of the list and of 0.3 is set by create_train_mask — TODO confirm.
cora_P_data_manual = create_train_mask(cora_P, [1,2,3,4,5,6,7], 0.3)
# mask_a_node is called with a list of two indices here, but with a single index later in the notebook.
cora_P_data_one = mask_a_node(cora_P, [0, 1])

In [36]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): `accuracy` and `GraphSAGE` are presumably provided by `from GNN import *` — confirm.
cora_P2 = accuracy(GraphSAGE, device, cora_P, cora_P_data_manual)
cora_P2.train(50)  # presumably the number of training epochs — TODO confirm
evalP = cora_P2.evaluate()

## note: an "accuracy object is not subscriptable" error most likely means that a variable
## was given the same name as the dataset and now shadows it; rename the device, data, etc.
## variables so that none of them reuses the dataset name.

Training the model
Evaluating the model
Accuracy: 0.1655



In [50]:
def return_accuracy(model, dataset, data, epoch_size, indicate):
    """
    Train a fresh GNN.accuracy runner on `data` and return what its evaluate() gives back.

    model      -- model passed straight through to GNN.accuracy
    dataset    -- dataset passed straight through to GNN.accuracy
    data       -- data object passed straight through to GNN.accuracy
    epoch_size -- argument handed to the runner's train() call
    indicate   -- forwarded to GNN.accuracy as the `indicate` keyword

    returns: the result of the runner's evaluate() call
    """
    # Prefer CUDA when it is available, otherwise run on the CPU.
    backend = 'cuda' if torch.cuda.is_available() else 'cpu'
    target_device = torch.device(backend)

    runner = GNN.accuracy(model, target_device, dataset, data, indicate=indicate)
    runner.train(epoch_size)
    return runner.evaluate()
    
#     if compute_gradient: 
#         output, accuracy = new_class.evaluate(return_prediction=compute_gradient)
#         accuracy = round(accuracy,3)
#         out
    
#     return accuracy, gradients

In [None]:
def return_accuracy(model, dataset, data, epoch_size, indicate, original_output=None, compute_gradient=False):
    """
    Train a fresh GNN.accuracy runner on `data` and return what its evaluate() gives back.

    `indicate` is the fifth positional parameter so that existing calls of the form
    return_accuracy(model, dataset, data, epoch_size, indicate) keep working; the
    optional parameters follow it (a parameter without a default may not come after
    one with a default).

    model            = model passed through to GNN.accuracy
    dataset          = dataset passed through to GNN.accuracy
    data             = data object passed through to GNN.accuracy
    epoch_size       = argument handed to the runner's train() call
    indicate         = Boolean, forwarded to GNN.accuracy as the `indicate` keyword
    original_output  = output using the original data, optional (currently unused)
    compute_gradient = Boolean; the gradient path is not implemented yet

    returns: the result of the runner's evaluate() call
    raises: NotImplementedError if compute_gradient is truthy
    """
    if compute_gradient:
        # The draft of this branch returned an undefined `gradients` value, so it could
        # only ever fail with a NameError after the (expensive) training had finished.
        # Fail fast, before training, until the gradient computation is written.
        raise NotImplementedError(
            "compute_gradient=True is not implemented: the gradient computation "
            "against original_output has not been written yet"
        )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    new_class = GNN.accuracy(model, device, dataset, data, indicate=indicate)
    new_class.train(epoch_size)
    return new_class.evaluate()

In [53]:
import time, math
from copy import deepcopy

class find_the_influential_class():
    """
    Leave-one-out influence search over the training nodes of a dataset.

    Every node that is set in the current train mask is masked in turn (via
    mask_a_node), a model is trained and evaluated through return_accuracy, and the
    node is recorded in `self.influential` whenever the resulting accuracy is at
    least as high as the best accuracy seen so far.

    note: the dataset must already have a preexisting train_mask tensor
    note: will only mask the points in the current train_mask

    Attributes set by __init__:
        model, dataset, epoch_size -- the constructor arguments, stored unchanged
        init_accuracy -- return_accuracy(...) result for the unmodified dataset[0]
        accuracy      -- best accuracy seen so far (starts at init_accuracy)
        influential   -- dict: node index -> accuracy (percentages after to_percentage())
    """

    def __init__(self, model, dataset, epoch_size):
        self.model = model
        self.dataset = dataset
        self.epoch_size = epoch_size
        self.init_accuracy = return_accuracy(model, dataset, dataset[0], epoch_size, False)
        # Plain assignment is enough: self.accuracy is only ever rebound to a new value,
        # never modified in place, so init_accuracy cannot change through it.
        self.accuracy = self.init_accuracy
        self.influential = {}

    def influential_accuracy(self, indicate=False, record_time=True, index_range=True, compute_gradient=False):
        """
        Mask each training node in turn and record the ones that match or beat the running best.

        indicate         -- print a line per masked node and forward the flag to return_accuracy
        record_time      -- print the elapsed wall-clock time when done
        index_range      -- print the initial accuracy, the best accuracy and their difference
        compute_gradient -- accepted for interface compatibility; currently unused

        returns: self.influential (node index -> accuracy reached with that node masked)
        """
        start_time = time.time()
        data = self.dataset[0]

        ## the initial accuracy was computed in __init__ from the original data in the dataset
        print(f"\ninitial accuracy given the initial data: {self.init_accuracy}")

        # calculate new accuracy and update "influential" if it matches or improves the best so far
        train_mask = data.train_mask.numpy()
        node_count = len(data.x)

        for i in range(node_count):
            if not train_mask[i]:
                continue  # only nodes in the current train mask are candidates
            if indicate: print(f"--Masking: Excluding point number {i}--")
            masked_data = mask_a_node(self.dataset, i)
            new_accuracy = return_accuracy(self.model, self.dataset, masked_data, self.epoch_size, indicate)
            # ">=" means a node that merely ties the running best is recorded as well.
            if new_accuracy >= self.accuracy:
                self.accuracy = new_accuracy
                self.influential[i] = new_accuracy

        if record_time: print("--- %s seconds ---" % round((time.time() - start_time), 1))
        if index_range: print(f"lowest accuracy: {self.init_accuracy}, highest accuracy: {self.accuracy}, range: {self.accuracy-self.init_accuracy}")
        return self.influential

    #post processing
    def top25(self):
        """
        Print the last quarter of the recorded nodes, in insertion order.

        Within one influential_accuracy() run a node is only recorded when it matches or
        beats the running best, so later entries never have a lower accuracy than earlier
        ones and the tail of the dict holds the highest values.
        """
        leng = len(self.influential)
        # rounding down so that at least one node is shown when there are only one or two
        cutoff = math.floor(leng * 0.75)
        influence_index = list(self.influential.keys())[cutoff:]
        print(f"{leng} influential points, 75th percentiles: ")
        for i in influence_index:
            print(f"node {i}, accuracy: {self.influential[i]}")

    def to_percentage(self):
        """
        Replace every recorded accuracy by its relative gain over init_accuracy, in percent.

        The gain is (value - init_accuracy) / value, i.e. relative to the NEW accuracy, not
        to the initial one (this is not a softmax). The values in self.influential are
        overwritten, so calling this twice recomputes from percentages and gives
        meaningless numbers -- call it once per influential_accuracy() run.
        """
        for key, value in self.influential.items():
            if value == 0:
                # Avoid dividing by zero. Recorded values are never below init_accuracy,
                # so an accuracy of 0 means nothing was gained.
                gain = 0.0
            else:
                gain = round((value - self.init_accuracy) / value, 2) * 100
            # Only existing keys are reassigned, so updating during iteration is safe.
            self.influential[key] = gain
            print(f"masking node {key} increases the prediction accuracy by {self.influential[key]}%")

In [54]:
# Run the leave-one-out search with GraphSAGE on cora_P (epoch_size=1), then convert the
# recorded accuracies to percentages; to_percentage overwrites new.influential in place.
new = find_the_influential_class(GraphSAGE, cora_P, 1)
new.influential_accuracy()
new.to_percentage()


initial accuracy given the initial data: 0.08956692913385826
--- 48.5 seconds ---
lowest accuracy: 0.08956692913385826, highest accuracy: 0.3229873908826382, range: 0.23342046174877995
masking node 0 increases the prediction accuracy by 0.0%
masking node 2 increases the prediction accuracy by 32.0%
masking node 5 increases the prediction accuracy by 72.0%
masking node 15 increases the prediction accuracy by 72.0%
masking node 19 increases the prediction accuracy by 72.0%
masking node 28 increases the prediction accuracy by 72.0%
masking node 30 increases the prediction accuracy by 72.0%


In [56]:
# Values are percentages at this point because to_percentage() was already called on `new`.
print(new.influential)

{0: 0.0, 2: 32.0, 5: 72.0, 15: 72.0, 19: 72.0, 28: 72.0, 30: 72.0}


In [55]:
def multiple_testing(test_size, model, dataset, epoch_size):
    """
    Repeat the influential-node search `test_size` times and sum the results per node.

    Each repetition builds a new find_the_influential_class(model, dataset, epoch_size)
    and runs its influential_accuracy(); the value recorded for a node is added to that
    node's running total.

    test_size  -- number of repetitions
    model, dataset, epoch_size -- forwarded to find_the_influential_class

    returns: dict mapping node index -> sum of the values recorded for it over all runs
             (the dict is also printed)
    """
    record = {}

    for _ in range(test_size):
        # The search lives in find_the_influential_class; its influential_accuracy()
        # returns the {node index: accuracy} dict that is aggregated here.
        influential = find_the_influential_class(model, dataset, epoch_size).influential_accuracy()
        for key, value in influential.items():
            record[key] = record.get(key, 0) + value

    print(record)
    return record

In [None]:
# Five repetitions of the influential-node search with GraphSAGE on cora_P, epoch_size=1.
multiple_testing(5, GraphSAGE, cora_P, 1)

# new todo
1. plot the distribution 
    1. plot a Zipf-style (rank-frequency) plot: rank the nodes by the number of times they appear 
    2. cumulative distribution histogram
    3. magnitude - average magnitude per points 
    4. print the degree, betweenness and centrality of each node (copy networkX library) 
https://networkx.org/documentation/networkx-1.10/reference/generated/networkx.algorithms.centrality.betweenness_centrality.html
https://networkx.org/documentation/networkx-1.10/reference/algorithms.centrality.html
    5. degree vs influence
    
    **question: is the topological component of a node influential?**

2. spatial relationships between the points 
    1. remove several points at a time - a point & neighbourhoods 
    2. top 25% - connected to each other? spatially distanced? 
    
    
3. see if the database is stable - notion of sensitivity 
4. approximate the influential points? cannot use iteration - we can maybe use gradients 
5. correlate an influence of a point to higher/lower impact on low degree node

### difference calculation (2): average KL difference
- y original - y_i
- diff = ||Y_original - Y_i||^2
- diff2 = $\operatorname{mean}_j \, \mathrm{KL}\left(Y^{(original)}_j \,\Vert\, Y^{(-i)}_j\right)$ (average KL divergence over $j$)

## 3. bootstrapping - node feature influence
by swapping classes of one node 
instead of deleting a point from the training set, switch its label and assess the participation that it leaves


In [57]:
# define a manual train class function 
def random_train_class(dataset, which_class):
    """
    Apply RemoveTrainingClasses(which_class) to the first item of `dataset` and return the result.

    note: dataset must have a preexisting train_mask
    which_class (List[int]) – The classes to remove from the training set.
    """
    # RemoveTrainingClasses builds a callable transform; it is applied to dataset[0] right away.
    return RemoveTrainingClasses(which_class)(dataset[0])

## 4. test the amount of separation we have between clusters 
suppose linear classifier, two sets of points and separating the two clusters (fit a line between two clusters) 
look at other types of loss ex. hinge lost 
hinge - maximise margin (Tries to look for good margin) 
 (switching the cross entropy loss) 
 https://pytorch.org/docs/stable/generated/torch.nn.HingeEmbeddingLoss.html
 https://pytorch.org/docs/stable/generated/torch.nn.MultiMarginLoss.html (use this instead of the above one!)
  needs an additional line of code as one vs a bunch of clusters 
  
  use a for loop, just assessing a small dataset so won't take too long

want to be in the middle of two groups 
assess the separating margins between the two classes 

additional post-processing
take the prediction score and compute clustering coefficients 
- https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation

**robustness of individual points, separating margins**
**separation of clusters** - the intuition is not robust, take in y^hat and y^true data points

**question on GPU**