<a href="https://colab.research.google.com/github/zuobinxiong/CS789/blob/main/Lab6_Membership_Inference_Attack_student.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Running membership inference attacks on the Nursery data

In this tutorial we will show how to run black-box membership attacks. This will be demonstrated on the Nursery dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/datasets/nursery).

In [None]:
!pip install adversarial-robustness-toolbox

We have already preprocessed the dataset such that all categorical features are one-hot encoded, and the data was scaled using sklearn's StandardScaler.

In [None]:
import os
import sys
sys.path.insert(0, os.path.abspath('..'))

## Load data

In [None]:
from art.utils import load_nursery
(x_train, y_train), (x_test, y_test), _, _ = load_nursery(test_set=0.5)

## Train random forest model

In [None]:
from sklearn.ensemble import RandomForestClassifier


# Target model whose training-set membership will be attacked later on.
model = ... # Define random forest model using RandomForestClassifier
model.fit(...) # Training: fit the model on the training split
# Evaluation: report the accuracy of the fitted model
print('Base model accuracy: ', ... )

## Attack
### Rule-based attack
The rule-based attack uses a simple rule to determine membership in the training data: if the model's prediction for a sample is correct, the sample is considered a member; otherwise, it is considered a non-member.

In [None]:
import numpy as np
from art.estimators.classification.scikitlearn import ScikitlearnRandomForestClassifier
from art.attacks.inference.membership_inference import MembershipInferenceBlackBoxRuleBased

# wrap the fitted scikit-learn model in ART's ScikitlearnRandomForestClassifier
# so that ART attacks can be run against it
art_classifier = ScikitlearnRandomForestClassifier(model)

# define the attack object using MembershipInferenceBlackBoxRuleBased
attack = ...

# infer membership status: once for the training samples (members) and once
# for the test samples (non-members)
inferred_train = attack.infer(...)
inferred_test = attack.infer(...)

# check accuracy:
#   train_acc - accuracy of the attack on members
#   test_acc  - accuracy of the attack on non-members
#   acc       - overall attack accuracy
train_acc = ...
test_acc = ...
acc = ...
print(f"Members Accuracy: {train_acc:.4f}")
print(f"Non Members Accuracy {test_acc:.4f}")
print(f"Attack Accuracy {acc:.4f}")

This means that on average for 51% of the data, membership status is inferred correctly.

In [None]:
def calc_precision_recall(predicted, actual, positive_value=1):
    """Compute precision and recall of ``predicted`` against ``actual``.

    Args:
        predicted: Indexable sequence of predicted labels.
        actual: Indexable sequence of ground-truth labels; it is read at every
            index of ``predicted``.
        positive_value: Label counted as the positive class (default ``1``).

    Returns:
        A ``(precision, recall)`` tuple. Precision is the share of positive
        predictions that are correct; recall is the share of actual positives
        that were predicted positive. Either value is ``1`` when its
        denominator would be zero (no positive predictions / no actual
        positives).
    """
    true_positives = 0       # predicted positive and actually positive
    predicted_positives = 0  # every positive prediction
    actual_positives = 0     # every positive ground-truth label

    for idx, guess in enumerate(predicted):
        truth = actual[idx]
        guessed_positive = guess == positive_value
        if guessed_positive:
            predicted_positives += 1
        if truth == positive_value:
            actual_positives += 1
        if guess == truth and guessed_positive:
            true_positives += 1

    # By convention an empty denominator yields a perfect score.
    precision = 1 if predicted_positives == 0 else true_positives / predicted_positives
    recall = 1 if actual_positives == 0 else true_positives / actual_positives

    return precision, recall

# rule-based
# use the inferred membership values to calculate the precision and recall of
# the membership attack; calc_precision_recall takes the predictions first and
# the ground-truth labels second
print(calc_precision_recall(...))

### Black-box attack
The black-box attack trains an additional classifier (called the attack model) to predict the membership status of a sample. Depending on the type of model and the provided configuration, the attack model can take probabilities/logits or losses as its input.
#### Train attack model

In [None]:
from art.attacks.inference.membership_inference import MembershipInferenceBlackBox

# Ratios applied to the train (member) and test (non-member) sets to size the
# attack data; presumably the portion used to fit the attack model - confirm
# against how bb_attack.fit is filled in below.
attack_train_ratio = 0.5
attack_test_ratio = 0.5
attack_train_size = int(len(x_train) * attack_train_ratio)
attack_test_size = int(len(x_test) * attack_test_ratio)

# Define the black-box membership inference attack
bb_attack = ... # using the MembershipInferenceBlackBox class

# train attack model
bb_attack.fit(...)

#### Infer membership and check accuracy

In [None]:
# get inferred values (membership predictions of the black-box attack for
# training-set and test-set samples)
inferred_train_bb = ...
inferred_test_bb = ...

# check accuracy:
#   train_acc - accuracy of the attack on members
#   test_acc  - accuracy of the attack on non-members
#   acc       - overall attack accuracy
train_acc = ...
test_acc = ...
acc = ...
print(f"Members Accuracy: {train_acc:.4f}")
print(f"Non Members Accuracy {test_acc:.4f}")
print(f"Attack Accuracy {acc:.4f}")

Achieves much better results than the rule-based attack.

In [None]:
# black-box
# precision and recall of the black-box attack; calc_precision_recall takes the
# predictions first and the ground-truth labels second
print(calc_precision_recall(...))

## Train neural network model

In [None]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
from art.estimators.classification.pytorch import PyTorchClassifier

# Reduce the size of the training set (and, symmetrically, the test set) to
# make the attack slightly better.
train_set_size = 500
x_train = x_train[:train_set_size]
y_train = y_train[:train_set_size]
x_test = x_test[:train_set_size]
y_test = y_test[:train_set_size]
# Recompute the attack split sizes for the truncated data. Each set uses its
# own ratio (the test size previously reused attack_train_ratio by mistake).
attack_train_size = int(len(x_train) * attack_train_ratio)
attack_test_size = int(len(x_test) * attack_test_ratio)


class NurseryDataset(Dataset):
    """Torch ``Dataset`` over NumPy feature rows and optional integer labels.

    Features are stored as a float32 tensor and labels as an int64 tensor.
    When no labels are supplied, an all-zero int64 placeholder with one entry
    per sample is stored so that indexing always yields an ``(x, y)`` pair.
    """

    def __init__(self, x, y=None):
        """Convert the NumPy inputs to tensors.

        Args:
            x: NumPy array of features; the first axis indexes the samples.
            y: Optional NumPy array of class labels aligned with ``x``.
        """
        self.x = torch.from_numpy(x.astype(np.float32))

        if y is not None:
            # Cast straight to int64: going through int8 would silently wrap
            # any class id above 127.
            self.y = torch.from_numpy(y.astype(np.int64))
        else:
            # Placeholder labels share the dtype of real labels.
            self.y = torch.zeros(x.shape[0], dtype=torch.long)

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``.

        Raises:
            IndexError: If ``idx`` is at or beyond the end of the dataset.
        """
        if idx >= len(self.x):
            raise IndexError("Invalid Index")

        return self.x[idx], self.y[idx]

# Wrap the (truncated) training split so it can be batched by a DataLoader.
train_set = NurseryDataset(x_train, y_train)
train_loader = DataLoader(...)

# Training below runs on the CPU.
device = torch.device("cpu")

In [None]:
# Define the neural network model
class ModelToAttack(nn.Module):
    """Fully connected classifier used as the target of the attacks.

    Four Linear+Tanh stages narrow the input
    (num_features -> 1024 -> 512 -> 256 -> 128) before a final linear layer
    produces one output per class.
    """

    def __init__(self, num_classes, num_features):
        """Build the layers.

        Args:
            num_classes: Size of the output layer.
            num_features: Size of each input vector.
        """
        super().__init__()

        def tanh_stage(n_in, n_out):
            # One hidden stage: affine map followed by a Tanh activation.
            return nn.Sequential(nn.Linear(n_in, n_out), nn.Tanh())

        self.fc1 = tanh_stage(num_features, 1024)
        self.fc2 = tanh_stage(1024, 512)
        self.fc3 = tanh_stage(512, 256)
        self.fc4 = tanh_stage(256, 128)

        self.classifier = nn.Linear(128, num_classes)

    def forward(self, x):
        """Run ``x`` through the four hidden stages and the output layer."""
        hidden = x
        for stage in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = stage(hidden)
        return self.classifier(hidden)

In [None]:
# train the nn model
# ModelToAttack(num_classes, num_features): 4 output classes, 24 input features
mlp_model = ModelToAttack(4, 24)
mlp_model = torch.nn.DataParallel(mlp_model)
# fill in the loss function and optimizer
criterion = ...
optimizer = ...

for epoch in range(20):
    # NOTE(review): `input` shadows the Python builtin of the same name.
    for (input, targets) in train_loader:
        # NOTE(review): torch.autograd.Variable is deprecated in current
        # PyTorch; tensors can be used directly - confirm before removing.
        input, targets = torch.autograd.Variable(input), torch.autograd.Variable(targets)

        targets = targets.to(device)

        # clear gradients left over from the previous step
        optimizer.zero_grad()
        # get outputs
        outputs = ...
        outputs = outputs.to(device)
        # calculate loss function
        loss = ...

        # backpropagate and update the model parameters
        loss.backward()
        optimizer.step()


# wrap the trained network in ART's PyTorchClassifier so ART attacks can use it;
# input_shape and nb_classes match the ModelToAttack(4, 24) arguments above
mlp_art_model = PyTorchClassifier(model=mlp_model, loss=criterion, optimizer=optimizer, input_shape=(24,), nb_classes=4)

# predicted class = index of the largest output for each sample
train_pred = np.array([np.argmax(arr) for arr in mlp_art_model.predict(x_train.astype(np.float32))])
# calculate the training accuracy
print('Base model Train accuracy: ', ...)
# calculate the testing accuracy
test_pred = np.array([np.argmax(arr) for arr in mlp_art_model.predict(x_test.astype(np.float32))])
print('Base model Test accuracy: ', ...)

### Rule-based attack

In [None]:
# define the rule-based attack against the neural network model
mlp_attack = MembershipInferenceBlackBoxRuleBased(...)

# infer membership for the training samples (members) and the test samples
# (non-members)
mlp_inferred_train = ...
mlp_inferred_test = ...

# check accuracy: on members, on non-members, and overall (printed in that order)
mlp_train_acc = ...
mlp_test_acc = ...
mlp_acc = ...
print(mlp_train_acc)
print(mlp_test_acc)
print(mlp_acc)

# calculate the precision and recall
print(calc_precision_recall(...))

### Black-box attack

In [None]:
# black-box attack against the wrapped neural network; 'rf' selects the attack
# model type (presumably a random forest - confirm in the ART documentation)
mlp_attack_bb = MembershipInferenceBlackBox(mlp_art_model, attack_model_type='rf')

# train attack model
mlp_attack_bb.fit(...)

# infer membership for the training samples (members) and the test samples
# (non-members)
mlp_inferred_train_bb = ...
mlp_inferred_test_bb = ...

# check accuracy: on members, on non-members, and overall
mlp_train_acc_bb = ...
mlp_test_acc_bb = ...
mlp_acc_bb = ...

print(f"Members Accuracy: {mlp_train_acc_bb:.4f}")
print(f"Non Members Accuracy {mlp_test_acc_bb:.4f}")
print(f"Attack Accuracy {mlp_acc_bb:.4f}")

# precision/recall: predictions are members followed by non-members, and the
# ground truth is built to match (1 for each member, 0 for each non-member)
print(calc_precision_recall(np.concatenate((mlp_inferred_train_bb, mlp_inferred_test_bb)),
                            np.concatenate((np.ones(len(mlp_inferred_train_bb)), np.zeros(len(mlp_inferred_test_bb))))))

In [None]:
from sklearn.metrics import classification_report

# Ground truth for the attack: ones shaped like the member predictions and
# zeros shaped like the non-member predictions, in the same order as the
# concatenated predictions.
member_truth = np.ones_like(mlp_inferred_train_bb)
nonmember_truth = np.zeros_like(mlp_inferred_test_bb)
y_train_true = np.concatenate((member_truth, nonmember_truth))
y_train_pred = np.concatenate((mlp_inferred_train_bb, mlp_inferred_test_bb))
print(classification_report(y_true=y_train_true, y_pred=y_train_pred))

For the pytorch target model we were able to achieve slightly better than random attack performance, but not as good as for the random forest model.

## Running membership inference attacks using Shadow Models

## Load data
The data is split in two: 25% is used for training and testing the target model, and the remaining 75% is used as shadow training data.

In [None]:
# With test_set=0.75 the first returned split becomes the target data and the
# second (larger) split becomes the shadow data.
(x_target, y_target), (x_shadow, y_shadow), _, _ = load_nursery(test_set=0.75)

# Half of the target data is used for training the target model.
target_train_size = len(x_target) // 2
# take the first part as the target_train dataset and the second part as the
# target test dataset
x_target_train = ...
y_target_train = ...
x_target_test = ...
y_target_test = ...

## Train random forest model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from art.estimators.classification.scikitlearn import ScikitlearnRandomForestClassifier

# Fit the target random forest on the target training split (fit returns the
# fitted estimator, so construction and training can be chained).
model = RandomForestClassifier().fit(x_target_train, y_target_train)

# Wrap the fitted model so it can be used with ART.
art_classifier = ScikitlearnRandomForestClassifier(model)

# Accuracy of the target model on the held-out target test split.
base_accuracy = model.score(x_target_test, y_target_test)
print('Base model accuracy:', base_accuracy)

## Train shadow models
Five shadow models are trained (`num_shadow_models=5`) and used to generate a meta-dataset of member and non-member predictions.

In [None]:
from art.attacks.inference.membership_inference import ShadowModels
from art.utils import to_categorical

# Create five shadow models based on the target classifier.
shadow_models = ShadowModels(art_classifier, num_shadow_models=5)

# Generate the shadow dataset from the shadow data; labels are passed through
# to_categorical with 4 classes.
shadow_labels = to_categorical(y_shadow, 4)
shadow_dataset = shadow_models.generate_shadow_dataset(x_shadow, shadow_labels)

# The result is a (member, non-member) pair, each holding (x, y, predictions).
member_part, nonmember_part = shadow_dataset
member_x, member_y, member_predictions = member_part
nonmember_x, nonmember_y, nonmember_predictions = nonmember_part

# Accuracy of every shadow model on the target test split.
shadow_accuracies = []
for sm in shadow_models.get_shadow_models():
    shadow_accuracies.append(sm.model.score(x_target_test, y_target_test))
print(shadow_accuracies)

## Attack
Black-box attack

We run a black-box membership inference attack on the meta-dataset generated using the shadow models.

In [None]:
from art.attacks.inference.membership_inference import MembershipInferenceBlackBox
# Fit the attack model on the complete shadow dataset: member samples and
# labels, non-member samples and labels, then the predictions for both groups.
attack = MembershipInferenceBlackBox(
    art_classifier,
    attack_model_type="rf",
)
attack.fit(
    member_x,
    member_y,
    nonmember_x,
    nonmember_y,
    member_predictions,
    nonmember_predictions,
)

In [None]:
# perform attack on member data and non member data
member_infer = attack.infer(...)
nonmember_infer = attack.infer(...)

# Members: share of the target training samples inferred as members (the
# inferred values are summed, so a member is expected to be reported as 1).
member_acc = np.sum(member_infer) / len(x_target_train)
# Non-members: share of the target test samples NOT inferred as members.
nonmember_acc = 1 - np.sum(nonmember_infer) / len(x_target_test)
# Overall accuracy: the two accuracies weighted by the size of each set.
acc = (member_acc * len(x_target_train) + nonmember_acc * len(x_target_test)) / (len(x_target_train) + len(x_target_test))
print('Attack Member Acc:', member_acc)
print('Attack Non-Member Acc:', nonmember_acc)
print('Attack Accuracy:', acc)

In [None]:
# Predictions are members followed by non-members; the ground truth is built
# to match (1 for each member, 0 for each non-member).
shadow_predicted = np.concatenate((member_infer, nonmember_infer))
shadow_actual = np.concatenate((np.ones(len(member_infer)), np.zeros(len(nonmember_infer))))
print(calc_precision_recall(shadow_predicted, shadow_actual))

## Rule-based attack

In [None]:
from art.attacks.inference.membership_inference import MembershipInferenceBlackBoxRuleBased

# Rule-based baseline attack against the same target classifier.
baseline = MembershipInferenceBlackBoxRuleBased(art_classifier)

bl_inferred_train = baseline.infer(x_target_train, y_target_train)
bl_inferred_test = baseline.infer(x_target_test, y_target_test)

n_members = len(bl_inferred_train)
n_nonmembers = len(bl_inferred_test)

# Share of members inferred as members, share of non-members not inferred as
# members, and the size-weighted overall accuracy.
bl_member_acc = np.sum(bl_inferred_train) / n_members
bl_nonmember_acc = 1 - (np.sum(bl_inferred_test) / n_nonmembers)
bl_weighted_hits = bl_member_acc * n_members + bl_nonmember_acc * n_nonmembers
bl_acc = bl_weighted_hits / (n_members + n_nonmembers)
print(bl_member_acc)
print(bl_nonmember_acc)
print('Baseline Accuracy:', bl_acc)

# Precision/recall: predictions are members followed by non-members, with a
# matching ground truth (1 for each member, 0 for each non-member).
bl_predicted = np.concatenate((bl_inferred_train, bl_inferred_test))
bl_actual = np.concatenate((np.ones(n_members), np.zeros(n_nonmembers)))
print(calc_precision_recall(bl_predicted, bl_actual))