In [None]:
"""
Notation:
  the class of node k: c(k)
  the class i: c_i
  all neighbors of k: N(k)
  all neighbors of k in the class: N(k)=c_i

build a model:
	Input: features of N(k)=c_i   ,  for any i
  Try to predict: c(k)

if use GCN, loss = preds - c(k); if use logistic regression, calc probability. But 作用相同，以下用loss举例

loss results example:
	一个node(eg: A)有10个neighbors，这些neighbors有3类class label，用这三类分别去predict A的label,
  																																	得到有loss_1, loss_2, loss_3，loss_1最小，需要算下avg(loss_2+loss_3)
	如果avg(loss_2+loss_3)是top 50大的，这个node是我们要攻击的
  如果被攻击，因为loss_1最小, 所以class为1的neighbors是我们要抢的

intuition:

选择抢的neighbors:
  the smaller loss means this class of neighbors provide more info to predict c(k)
  既然作用大，那就抢它
  then, we changes edges (this class of neighbors, k) to (this class of neighbors, fake node of k)
  注意cora的edge是要注意方向的

选择attack的node:
  neighbors里面已经抢掉了最小loss的class，剩下的class的loss的average越大，代表剩下的这些neighbors对于确定c(k)作用越小，
  那么我们就攻击这个
   -> 再想一下这个思路，剩下的loss的avg不一定最好，万一原本就很差呢？或许看最小loss有多小会更好（i.e.抢完neighbors损失的信息更多）？
   		或者看avg(loss_1+loss_2+loss_3)-avg(loss_2+loss_3)会更好？ 可以分别试一下看看结果！
  创建一个fake node with different class label （怎么去选这个label？随机？）
"""

In [None]:
!pip install stellargraph

In [2]:
import stellargraph as sg
from stellargraph.mapper import FullBatchNodeGenerator
from stellargraph.layer import GCN

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm import tqdm, trange

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import pandas as pd
import json
import os
import warnings
from copy import deepcopy
from tqdm import tqdm
import tensorflow_addons as tfa
warnings.filterwarnings("ignore")

2022-12-12 06:39:06.975401: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [3]:
class NeighborModel(nn.Module):
    def __init__(self, num_features, num_classes):
        super().__init__()
        self.ff1 = nn.Sequential(
            nn.BatchNorm1d(num_features),
            nn.Dropout(0.5),
            nn.Linear(num_features, 32),
            nn.GELU()
        )
        self.ff2 = nn.Sequential(
            nn.BatchNorm1d(32),
            nn.Dropout(0.5),
            nn.Linear(32, 32),
            nn.GELU()
        )
        self.ff3 = nn.Sequential(
            nn.BatchNorm1d(32),
            nn.Dropout(0.5),
            nn.Linear(32, 32),
            nn.GELU()
        )
        self.top = nn.Linear(32, num_classes)
        

    def forward(self, x):
        x = self.ff1(x)
        x = F.normalize(x)
        x = self.ff2(x) + x
        x = self.ff3(x) + x
        x = self.top(x)
        return x

In [4]:
def test(G, node_subjects, classifier_name, attack_num=0):
    train_subjects, val_subjects = model_selection.train_test_split(
        node_subjects, train_size=0.8, test_size=None, stratify=node_subjects,
        random_state=12345, shuffle=True
    )
    test_subjects = node_subjects[:-attack_num] if attack_num else node_subjects
    target_encoding = preprocessing.LabelBinarizer()

    train_targets = target_encoding.fit_transform(train_subjects)
    val_targets = target_encoding.transform(val_subjects)
    test_targets = target_encoding.transform(test_subjects)


    def setup_SGC():
        generator = FullBatchNodeGenerator(G, method="sgc", k=2)
        classifier_model = GCN(
            layer_sizes=[train_targets.shape[1]],
            generator=generator,
            bias=True,
            dropout=0.5,
            activations=["softmax"],
            kernel_regularizer=regularizers.l2(5e-4),
        )
        return generator, classifier_model


    def setup_GCN():
        generator = FullBatchNodeGenerator(G, method="gcn")
        classifier_model = GCN(
            layer_sizes=[32, 32], activations=["relu", "relu"], generator=generator, dropout=0.5
        )
        return generator, classifier_model


    def setup_GraphSAGE():
        generator = GraphSAGENodeGenerator(G, 50, [10,5])
        classifier_model = GraphSAGE(
            layer_sizes=[32, 32], generator=generator, bias=False, dropout=0.5,
        )
        return generator, classifier_model

    def setup_GAT():
        generator = FullBatchNodeGenerator(G, method="gat")
        classifier_model = GAT(
            layer_sizes=[8, train_targets.shape[1]],
            activations=["elu", "softmax"],
            attn_heads=8,
            generator=generator,
            in_dropout=0.5,
            attn_dropout=0.5,
            normalize=None,
        )
        return generator, classifier_model

    if classifier_name == "GCN":
        generator, classifier_model = setup_GCN()
        es_callback = EarlyStopping(monitor="val_acc", patience=50, restore_best_weights=True)
    elif classifier_name == "GraphSAGE":
        generator, classifier_model = setup_GraphSAGE()
    elif classifier_name == "GAT":
        generator, classifier_model = setup_GAT()
        if not os.path.isdir("logs"):
            os.makedirs("logs")
        es_callback = EarlyStopping(
            monitor="val_acc", patience=50
        )  # patience is the number of epochs to wait before early stopping in case of no further improvement
        mc_callback = ModelCheckpoint(
            "logs/best_model.h5", monitor="val_acc", save_best_only=True, save_weights_only=True
        )
    elif classifier_name == "SGC":
        generator, classifier_model = setup_SGC()
        if not os.path.isdir("logs"):
            os.makedirs("logs")
        es_callback = EarlyStopping(
            monitor="val_acc", patience=50
        )  # patience is the number of epochs to wait before early stopping in case of no further improvement
        mc_callback = ModelCheckpoint(
            "logs/best_model.h5", monitor="val_acc", save_best_only=True, save_weights_only=True
        )



    train_gen = generator.flow(train_subjects.index, train_targets)

    x_inp, x_out = classifier_model.in_out_tensors()

    predictions = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

    val_gen = generator.flow(val_subjects.index, val_targets)

    test_gen = generator.flow(test_subjects.index, test_targets)

    tqdm_callback = tfa.callbacks.TQDMProgressBar(show_epoch_progress=False)

    if classifier_name == "GAT":
        model = Model(inputs=x_inp, outputs=x_out)
        model.compile(
            optimizer=optimizers.Adam(lr=0.005),
            loss=losses.categorical_crossentropy,
            metrics=["acc"],
        )
        history = model.fit(
            train_gen,
            epochs=100,
            validation_data=val_gen,
            verbose=0,
            shuffle=False,  # this should be False, since shuffling data means shuffling the whole graph
            callbacks=[es_callback, mc_callback, tqdm_callback],
        )
    elif classifier_name == "GCN":
        model = Model(inputs=x_inp, outputs=predictions)
        model.compile(
            optimizer=optimizers.Adam(lr=0.01),
            loss=losses.categorical_crossentropy,
            metrics=["acc"],
        )
        history = model.fit(
            train_gen,
            epochs=500,
            validation_data=val_gen,
            verbose=0,
            shuffle=False,  # this should be False, since shuffling data means shuffling the whole graph
            callbacks=[es_callback, tqdm_callback],
        )
    elif classifier_name == "GraphSAGE":
        model = Model(inputs=x_inp, outputs=predictions)
        model.compile(
            optimizer=optimizers.Adam(lr=0.2),
            loss=losses.categorical_crossentropy,
            metrics=["acc"],
        )
        history = model.fit(
            train_gen, epochs=20, validation_data=test_gen, verbose=2, shuffle=False
        )
    elif classifier_name == "SGC":
        model = Model(inputs=x_inp, outputs=x_out)
        model.compile(
            optimizer=optimizers.Adam(lr=0.2),
            loss=losses.categorical_crossentropy,
            metrics=["acc"],
        )
        history = model.fit(
            train_gen,
            epochs=200,
            validation_data=val_gen,
            verbose=0,
            shuffle=False,  # this should be False, since shuffling data means shuffling the whole graph
            callbacks=[es_callback, mc_callback, tqdm_callback],
        )


#     sg.utils.plot_history(history)

    test_metrics = model.evaluate(test_gen)
#     print("\nTest Set Metrics:")
#     for name, val in zip(model.metrics_names, test_metrics):
#         print("\t{}: {:0.4f}".format(name, val))
    del model
    return test_metrics[1]

In [5]:
class AttackingAlgo():
    def __init__(self, node_ids, features, edges, targets):
        paper_idx = {name: idx for idx, name in enumerate(node_ids)}
        self.node_ids = [paper_idx[i] for i in node_ids]
        self.edges = [[paper_idx[n1], paper_idx[n2]] for n1,n2 in edges]
        
        self.features = features
        
        class_values = sorted(targets.unique())
        class_idx = {name: id for id, name in enumerate(class_values)}
        targets = targets.apply(lambda value: class_idx[value])
        self.targets = targets.to_numpy()
        
        self.num_classes = len(class_values)
        
        self.find_neighbors()
    
    def calc_loss(self, neighbors_features, node_targets):
        neighbors_features = torch.from_numpy(neighbors_features).float()
        targets = torch.from_numpy(node_targets).float().type(torch.LongTensor)
        
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

        model = NeighborModel(neighbors_features.shape[1], self.num_classes)
        model.to(device)

        criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=1e-2)

        for epoch in range(1, 101):
            running_loss = 0.0
            for i in range(0, len(neighbors_features), 256):
                # get the inputs; data is a list of [inputs, labels]
                inputs, labels = neighbors_features[i:i+256].to(device), targets[i:i+256].to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward + backward + optimize
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()
        
        node_targets = F.one_hot(torch.from_numpy(node_targets))
        
        loss = []
        with torch.no_grad():
            for i in range(0, len(neighbors_features), 256):
                # get the inputs; data is a list of [inputs, labels]
                inputs = neighbors_features[i:i+256].to(device)
                # calculate outputs by running images through the network
                outputs = model(inputs)
                loss.extend(-np.sum(node_targets[i:i+256].numpy() * np.ma.log(outputs.cpu().numpy()).filled(0), axis=1))
        
        loss_mat = np.zeros((len(self.node_ids), self.num_classes))
        index = 0
        for i in range(len(self.node_ids)):
            for j in range(self.num_classes):
                if (i, j) in self.pos:
                    loss_mat[i, j] = loss[index]
                    index += 1
                else:
                    loss_mat[i, j] = None
        return loss_mat
    
    def generate_inputs(self):
        self.pos = []
        neighbors_features = []
        node_targets = []
        for i in range(self.num_classes):
            for j in self.node_ids:
                temp = []
                for n in self.neighbors[j]:
                    if self.targets[n] == i:
                        temp.append(self.features[n])
                if len(temp) != 0:
                    self.pos.append((j, i))
                    node_targets.append(self.targets[j])
                    neighbors_features.append(np.concatenate((self.features[j], np.mean(temp, axis=0))))
            
        return np.array(neighbors_features), np.array(node_targets)
    
    def find_neighbors(self):
        self.neighbors = {i:[] for i in self.node_ids}
        for n1, n2 in self.edges:
            self.neighbors[n1].append(n2)
            
    def test(self):
        neighbors_features, node_targets = self.generate_inputs()
        return self.calc_loss(neighbors_features, node_targets)
    
    def attack(self, attack_num=50, loss_function=2):
        # loss function
        # 1. AVE without lowest loss
        # 2. Minimum loss
        # 3. ave(all loss) - ave(loss without lowest)
        
        neighbors_features, node_targets = self.generate_inputs()
        loss_mat = self.calc_loss(neighbors_features, node_targets)
        # temp loss mat for coding
        k = len(self.node_ids)
        
        
        node_loss = []
        node_neigh_class = [] # the class with min_loss
        node_heighest_loss_class = []
        for each_node in loss_mat:
    
            valid_loss = each_node[~np.isnan(each_node)]
            if len(valid_loss) < 2:
                node_loss.append(10**10)
                node_neigh_class.append(-1)
                node_heighest_loss_class.append(-1)
                continue
            min_loss = min(valid_loss)
            max_loss = max(valid_loss)
            min_class = np.where(valid_loss==min_loss)[0][0]
            max_class = np.where(valid_loss==max_loss)[0][0]
            ave_loss_without_min = (sum(valid_loss)-min_loss)/len(valid_loss-1)
            node_heighest_loss_class.append(max_class)
            
            # 1. AVE without lowest loss
            if loss_function ==1:
                node_loss.append(ave_loss_without_min)
            
            # 2. Minimum loss
            elif loss_function == 2:
                node_loss.append(min_loss)
            
            # 3. ave(all loss) - ave( loss without lowest)
            else:
                ave_diff = sum(valid_loss)/len(valid_loss) - ave_loss_without_min
                node_loss.append(ave_diff)
             
            node_neigh_class.append(min_class)


        node_loss = np.array(node_loss)
        node_neigh_class = np.array(node_neigh_class)
        node_heighest_loss_class = np.array(node_heighest_loss_class)
        sorted_index = np.argsort(node_loss)
        
        
        # 1. AVE without lowest loss
        if loss_function == 1:
            top_50_node = sorted_index[k-attack_num:k]
        # 2. Minimum loss
        # 3. ave(all loss) - ave( loss without lowest)
        else:
            top_50_node = sorted_index[0:attack_num]
        
        
        # top_50_node: node to attack
        # node_neigh_class: the neighbor class with the lowest loss; the class to change neighbor edge for each node
        
        edge_set = set()
        for [n1,n2] in self.edges:
            edge_set.add((n1,n2))
        # perform attack according to information above.
        for node_index in top_50_node:
            change_neighbor_class = node_neigh_class[node_index]
            my_target = self.targets[node_index]
            my_features = self.features[node_index]
            
            ## Another way of selecting new node target (based on highest loss)
            new_node_target = node_heighest_loss_class[node_index]
            
#             new_node_target = np.random.randint(self.num_classes, size=1)[0]
#             while new_node_target == my_target:
#                 new_node_target = np.random.randint(self.num_classes, size=1)[0]
                
            new_node_features = np.copy(my_features)
            new_node_index = len(self.targets)
            
            self.features = np.append(self.features, np.array([my_features]), axis=0)
            self.targets = np.append(self.targets, np.array([new_node_target]), axis=0)
            
            my_neighbors = self.neighbors[node_index]
            for neighbor_index in my_neighbors:
                if self.targets[neighbor_index] == node_neigh_class[node_index]:
                    edge_set.remove((node_index, neighbor_index))
                    edge_set.add((new_node_index, neighbor_index))
            edge_set.add((node_index, new_node_index))
            edge_set.add((new_node_index, node_index))
        
        updated_edge_list = []
        for (n1,n2) in edge_set:
            updated_edge_list.append([n1,n2])
        self.edges = np.array(updated_edge_list)
        return self.features, self.edges, self.targets

In [6]:
import random
def set_global_determinism(seed=12345):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)

set_global_determinism()

In [7]:
import stellargraph as sg
import tensorflow as tf
from tensorflow import keras
import json
import os
import warnings
import pandas as pd
import numpy as np



import pandas as pd
import os

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import stellargraph as sg
from stellargraph.mapper import FullBatchNodeGenerator
from stellargraph.layer import GCN
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from stellargraph.layer import GAT
from stellargraph.mapper import DirectedGraphSAGENodeGenerator, GraphSAGENodeGenerator
from stellargraph.layer import DirectedGraphSAGE, GraphSAGE
from stellargraph import datasets
from tensorflow.keras import layers, optimizers, losses, metrics, Model
from sklearn import preprocessing, model_selection
from IPython.display import display, HTML
import matplotlib.pyplot as plt
%matplotlib inline
from tensorflow.keras import layers, optimizers, losses, metrics, Model, regularizers

def loadData(features, edge_list, labels):
    node_features = pd.DataFrame(features)
    source = edge_list[:,0]
    target = edge_list[:,1]
    edges = pd.DataFrame({"source": source, "target": target})
    G = sg.StellarDiGraph(node_features, edges, node_type_default="paper", edge_type_default="cites")
    node_subjects = pd.Series(labels)
    return G, node_subjects


classifier_name = "GCN"


dataset = "cora"  # can also select 'pubmed'

if dataset == "cora":
    G, node_subjects = datasets.Cora().load(directed=True)
elif dataset == "pubmed":
    d = datasets.PubMedDiabetes()
    display(HTML(d.description))
    G, node_subjects = d.load()
    
#     G, node_subjects = loadData(np.array(G.node_features()), np.array(G.edges), node_subjects)

test(G, node_subjects, classifier_name)

Using GCN (local pooling) filters...


Training:   0%|           0/500 ETA: ?s,  ?epochs/s

2022-12-12 06:39:11.098199: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)




0.9272525906562805

In [8]:
G.nodes()

Int64Index([  31336, 1061127, 1106406,   13195,   37879, 1126012, 1107140,
            1102850,   31349, 1106418,
            ...
             626531, 1131180, 1130454, 1131184, 1128974, 1128975, 1128977,
            1128978,  117328,   24043],
           dtype='int64', length=2708)

In [9]:
ata = AttackingAlgo(G.nodes(), G.node_features(), G.edges(), node_subjects)
attacked_features, attacked_edges, attacked_targets = ata.attack(attack_num=90, loss_function=3)
new_G, new_node_subjects = loadData(attacked_features, np.array(attacked_edges), attacked_targets)

accs = []
for i in range(10):
    accs.append(test(new_G, new_node_subjects, classifier_name, attack_num=90))
    keras.backend.clear_session()
    
sum(accs) / 10

Using GCN (local pooling) filters...


Training:   0%|           0/500 ETA: ?s,  ?epochs/s

Using GCN (local pooling) filters...


Training:   0%|           0/500 ETA: ?s,  ?epochs/s

Using GCN (local pooling) filters...


Training:   0%|           0/500 ETA: ?s,  ?epochs/s

Using GCN (local pooling) filters...


Training:   0%|           0/500 ETA: ?s,  ?epochs/s

Using GCN (local pooling) filters...


Training:   0%|           0/500 ETA: ?s,  ?epochs/s

Using GCN (local pooling) filters...


Training:   0%|           0/500 ETA: ?s,  ?epochs/s

Using GCN (local pooling) filters...


Training:   0%|           0/500 ETA: ?s,  ?epochs/s

Using GCN (local pooling) filters...


Training:   0%|           0/500 ETA: ?s,  ?epochs/s

Using GCN (local pooling) filters...


Training:   0%|           0/500 ETA: ?s,  ?epochs/s

Using GCN (local pooling) filters...


Training:   0%|           0/500 ETA: ?s,  ?epochs/s



0.9252584993839263