In [1]:
from library.Accessor import Accessor
from library.attributionUtils import get_attributes,adversarial_detection_set
from library.attributions import multiply_attributed_with_input,number_of_active_nodes
from library.train import binary_acc

import torch
import numpy as np

In [2]:
# Experiment configuration: these three values are used below to build the
# directory paths from which the stored activations are loaded.
dataset = 'mnist' # can be mnist, cifar10, cuckoo, ember
model_name = 'mnist_1' # can be 'cifar10_1','cuckoo_1','Ember_2','mnist_1','mnist_2','mnist_3'
attack= 'FGSM' # can be 'FGSM','CW','PGD',"CKO",'EMBER'

In [3]:
# Accessors over the stored activations for the configured dataset / model / attack.
test_benign_accessor = Accessor(f'./Benign/{dataset}/{model_name}/')
test_adv_accessor = Accessor(f'./Adversarial/{dataset}/{attack}/{model_name}/')
# (Ground-truth activations under './Ground_truth/' are not used in this analysis.)

print('Loading Benign testing activations...')
test_benign_act = test_benign_accessor.get_all()
print('Loading Adversarial testing activations...')
test_adv_act = test_adv_accessor.get_all()

# Transform the activations into a detection data set:
#   X = [activationA, activationB, ...]   Y = [1, 0, 1, ...]
# Adversarial samples get the label 1.0, benign samples the label 0.0.
X_adv, Y_adv = adversarial_detection_set(test_adv_act, label=torch.tensor(1.0))
X_ben, Y_ben = adversarial_detection_set(test_benign_act, label=torch.tensor(0.0))



Loading Benign testing activations...
Loaded all activations for ./Benign/mnist/mnist_1/
Loading Adversarial testing activations...
Loaded all activations for ./Adversarial/mnist/FGSM/mnist_1/


In [4]:
## Testing model on test data
# We sample an equal number of adversarial and benign rows so the test set is balanced.
shape_min = min(X_adv.shape[0], X_ben.shape[0])
X_test = torch.cat((X_adv[:shape_min], X_ben[:shape_min]))
Y_test = torch.cat((Y_adv[:shape_min], Y_ben[:shape_min]))

print('Shape of testing adv activations:',X_adv.shape)
print('Shape of testing ben activations:',X_ben.shape)

# Run the model and its inputs on the same device; moving only the model to
# CUDA (as before) raises a device-mismatch error on GPU machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The checkpoint follows the configured model instead of a hard-coded name, so
# the detector always matches the activations loaded above.
# NOTE(review): assumes checkpoints are named '<model_name>_graph.pt' — confirm.
# NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
# NOTE(review): recent PyTorch releases default to weights_only=True, which may
# reject a fully pickled model — pass weights_only=False there if needed.
model = torch.load(f'./models/{model_name}_graph.pt', map_location=device)
model = model.to(device)
model.eval()

X_test = X_test.float().to(device)
Y_test = Y_test.float().to(device)

# Inference only: no autograd graph is needed to compute the accuracy.
with torch.no_grad():
    y_pred = model(X_test)

acc = float(binary_acc(y_pred, Y_test.unsqueeze(1)))
print('Model accuracy: ',acc)

Shape of testing adv activations: torch.Size([9834, 1204])
Shape of testing ben activations: torch.Size([10000, 1204])
Model accuracy:  100.0


In [5]:
## Performing adv and benign data feature attribution on the model
# Attribution is performed with respect to the label 0 (benign):
#   - a positive attribute means the node is directed to the label 0,
#   - otherwise it is directed to the label 1.

adv_mul, adv_attr = multiply_attributed_with_input(X_adv, Y_adv, model)
ben_mul, ben_attr = multiply_attributed_with_input(X_ben, Y_ben, model)

# One average per entry of the attribution results, then an overall average below.
avg_adv = list(map(np.average, adv_attr))
avg_ben = list(map(np.average, ben_attr))

print(f'Average Attributes of Adversarial samples:{np.average(avg_adv)} \n Average Attributes of Benign samples: {np.average(avg_ben)}')


torch.Size([9834, 1204])
torch.Size([10000, 1204])
Average Attributes of Adversarial samples:1.4938300500204521e-07 
 Average Attributes of Benign samples: -0.000830439291968719


In [None]:
from library.attributionUtils import get_nodes_data

# NOTE(review): judging by the variable names, get_nodes_data returns per-node
# activation values, per-node attributions, and the per-node averages of both —
# confirm against the helper's implementation.

# nodes characteristics of adv data
adv_weights,adv_atts, adv_avg_weights, adv_avg_atts = get_nodes_data(X_adv, adv_attr)

# nodes characteristics of Benign data
ben_weights,ben_atts, ben_avg_weights, ben_avg_atts = get_nodes_data(X_ben, ben_attr)

In [None]:
import matplotlib.pyplot as plt

x_axis = range(X_adv.shape[1]) # Nodes: one x position per column of X_adv

def bar_plot(x_axis, y_ben, y_adv, x_label, y_label, title, xticks=None):
    """Draw benign and adversarial values as side-by-side bars and show the figure.

    Args:
        x_axis: iterable of numeric x positions, one per bar pair.
        y_ben: bar heights for the benign series (drawn in green, left bar).
        y_adv: bar heights for the adversarial series (drawn in red, right bar).
        x_label: label of the x axis.
        y_label: label of the y axis.
        title: figure title.
        xticks: optional tick labels, one per position; drawn vertically.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # set width of bar
    bar_width = 0.25
    # plt.subplots returns a (figure, axes) pair; only the axes is needed here.
    _, ax = plt.subplots(figsize=(12, 8))

    # Materialise the positions once so any iterable can be reused below.
    positions = list(x_axis)

    # Shift each series by half a bar so the pair is centred on its position.
    x_ben = [x - bar_width / 2 for x in positions]
    x_adv = [x + bar_width / 2 for x in positions]

    ax.bar(x_ben, y_ben, color='green', width=bar_width,
           edgecolor='green', label='Benign')
    ax.bar(x_adv, y_adv, color='r', width=bar_width,
           edgecolor='red', label='Adv')
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    # `is not None` instead of `!= None`: the latter compares element-wise on
    # arrays and then fails when used as a condition.
    if xticks is not None:
        ax.set_xticks(positions)
        ax.set_xticklabels(xticks, rotation='vertical')
    ax.legend()
    plt.show()

In [None]:
# Average activation weight of every node, benign next to adversarial.
bar_plot(
    x_axis,
    ben_avg_weights,
    adv_avg_weights,
    x_label='Nodes',
    y_label='Avg Activation Weights',
    title='Avg Activation weights of each Node across all benign/Adversarial samples',
)

Nodes have larger average activation weights when the samples are benign.

**Hypothesis: When node activations are very low, this could be a sign that the model is under attack and that its output could be wrong.**

In [None]:
# Average attribute of every node, benign next to adversarial.
bar_plot(
    x_axis,
    ben_avg_atts,
    adv_avg_atts,
    x_label='Nodes',
    y_label='Avg Attributes',
    title='Avg Attributes of each Node across all benign/Adversarial samples',
)

In [None]:
# Average attribute of every node for the adversarial samples only:
# the benign series is replaced by zeros so only the adversarial bars are visible.
bar_plot(
    x_axis,
    [0] * len(ben_avg_atts),
    adv_avg_atts,
    x_label='Nodes',
    y_label='Avg Attributes',
    title='Avg Attributes of each Node across all benign/Adversarial samples',
)

To get a clearer look at the most relevant nodes, we filter out those that are irrelevant to the prediction. More precisely, we discard nodes whose average attribute value is close to 0.

In [None]:
# Select the thresholds: a node is kept as relevant when the absolute value of
# its average attribute exceeds the threshold of the corresponding class.
# NOTE(review): both values look hand-tuned — confirm how they were chosen.
threshold_ben = 0.01
threshold_Adv = 0.00001

In [None]:
# Flag every node whose average benign attribute is large enough in magnitude ...
ben_node_filter = [np.abs(avg_att) > threshold_ben for avg_att in ben_avg_atts]
# ... and collect the flagged nodes.
ben_nodes = [node for idx, node in enumerate(x_axis) if ben_node_filter[idx]]
print('Nodes that are relevant to the benign samples are:\n',ben_nodes)
print('\n len = ',len(ben_nodes))

In [None]:
# Flag every node whose average adversarial attribute is large enough in magnitude ...
adv_node_filter = [np.abs(avg_att) > threshold_Adv for avg_att in adv_avg_atts]
# ... and collect the flagged nodes.
adv_nodes = [node for idx, node in enumerate(x_axis) if adv_node_filter[idx]]
print('Nodes that are relevant to the Adversarial samples are:\n',adv_nodes)
print('\n len = ',len(adv_nodes))

In [None]:
# Merge all relevant nodes of both classes: duplicates removed, ascending order.
nodes = sorted(set(adv_nodes + ben_nodes))
print('studied nodes are : \n',nodes)
print('\n len = ',len(nodes))

In [None]:
# Keep only the average attributes of the studied nodes.
# `nodes` is sorted, duplicate-free and taken from x_axis, so indexing by it
# directly gives the same values in the same order as scanning every index and
# testing `i in nodes`, without the repeated linear list search.
ben_avg_atts_filtered = [ben_avg_atts[i] for i in nodes]
adv_avg_atts_filtered = [adv_avg_atts[i] for i in nodes]

In [None]:
# Average attribute of the studied (filtered) nodes only; ticks show the node ids.
bar_plot(
    range(len(nodes)),
    ben_avg_atts_filtered,
    adv_avg_atts_filtered,
    x_label='Nodes',
    y_label='Avg Attributes',
    title='Avg Attributes of each Node across all benign/Adversarial samples',
    xticks=nodes,
)

In [None]:
## Rescaling adversarial attributions so both types of data can be plotted together.
# The factor is named and shown in the title; without it the figure would
# silently present scaled adversarial values as if they were the raw averages.
ADV_PLOT_SCALE = 1000
bar_plot(
    range(len(nodes)),
    ben_avg_atts_filtered,
    [avg_att * ADV_PLOT_SCALE for avg_att in adv_avg_atts_filtered],
    x_label='Nodes',
    y_label='Avg Attributes',
    title=('Avg Attributes of each Node across all benign/Adversarial samples'
           f' (adversarial values x{ADV_PLOT_SCALE})'),
    xticks=nodes,
)

In [None]:
# Keep only the attributes of the studied nodes.
# `nodes` is sorted, duplicate-free and taken from x_axis, so indexing by it
# directly gives the same values in the same order as scanning every index and
# testing `i in nodes`, without the repeated linear list search.
ben_atts_filtered = [ben_atts[i] for i in nodes]
adv_atts_filtered = [adv_atts[i] for i in nodes]

In [None]:
def boxplot(x_axis, y_ben, y_adv, x_label, y_label, title, xticks=None):
    """Draw paired benign / adversarial box plots on twin y-axes and show the figure.

    Benign boxes (green) use the left y-axis; adversarial boxes (red) use a
    second y-axis created with ``twinx``, so each class keeps its own scale.

    Args:
        x_axis: iterable of numeric x positions, one per box pair.
        y_ben: per-position benign data, passed to ``Axes.boxplot``.
        y_adv: per-position adversarial data, passed to ``Axes.boxplot``.
        x_label: label of the x axis.
        y_label: base label of the y axes; ' Benign' / ' Adversarial' is appended.
        title: figure title.
        xticks: optional tick labels, one per position; drawn rotated by 90 degrees.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # set width of box
    box_width = 0.25
    _, ax_ben = plt.subplots()

    # Materialise the positions once so any iterable can be reused below.
    positions = list(x_axis)

    # Shift each class by half a box so the pair is centred on its position.
    x_ben = [x - box_width / 2 for x in positions]
    x_adv = [x + box_width / 2 for x in positions]

    ax_ben.set_title(title)
    ax_ben.set_xlabel(x_label)

    def _set_ticks(ax):
        # boxplot() moves the ticks to the box positions, so they are reset to
        # the pair centres after each call. Text keyword arguments are only
        # passed together with labels, because newer Matplotlib versions reject
        # them when labels is None.
        if xticks is None:
            ax.set_xticks(positions)
            ax.tick_params(axis='x', labelrotation=90)
        else:
            ax.set_xticks(positions, labels=xticks, rotation=90)

    # Benign boxes on the left y-axis
    bx_ben = ax_ben.boxplot(y_ben, notch=True, whis=2, positions=x_ben,
                            widths=box_width, patch_artist=True, showfliers=False)
    _set_ticks(ax_ben)
    ax_ben.set_ylabel(y_label + ' Benign')
    plt.setp(bx_ben["boxes"], facecolor='green')

    # Adversarial boxes on the right y-axis
    ax_adv = ax_ben.twinx()
    bx_adv = ax_adv.boxplot(y_adv, notch=True, whis=2, positions=x_adv,
                            widths=box_width, patch_artist=True, showfliers=False)
    _set_ticks(ax_adv)
    ax_adv.set_ylabel(y_label + ' Adversarial')
    plt.setp(bx_adv["boxes"], facecolor='red')

    # One legend entry per class (first box of each group) instead of one
    # label per box; skipped for a class that has no boxes.
    handles, names = [], []
    for boxes, name in ((bx_ben["boxes"], 'Benign'), (bx_adv["boxes"], 'Adversarial')):
        if boxes:
            handles.append(boxes[0])
            names.append(name)
    if handles:
        ax_adv.legend(handles, names)

    # show plot
    plt.show()

In [None]:
# Distribution of the attributes of each studied node, benign vs. adversarial.
boxplot(
    range(len(nodes)),
    ben_atts_filtered,
    list(adv_atts_filtered),
    x_label='Nodes',
    y_label='Attributes',
    title='Attributes of each Node across all benign/Adversarial samples',
    xticks=nodes,
)

In [None]:
# Keep only the activation weights of the studied nodes.
# `nodes` is sorted, duplicate-free and taken from x_axis, so indexing by it
# directly gives the same values in the same order as scanning every index and
# testing `i in nodes`, without the repeated linear list search.
ben_weights_filtered = [ben_weights[i] for i in nodes]
adv_weights_filtered = [adv_weights[i] for i in nodes]

In [None]:
# Distribution of the activation weights of each studied node, benign vs. adversarial.
boxplot(
    range(len(nodes)),
    ben_weights_filtered,
    list(adv_weights_filtered),
    x_label='Nodes',
    y_label='Activation Weights',
    title='Activation Weights of each Node across all benign/Adversarial samples',
    xticks=nodes,
)