In [1]:
# Importing dependencies
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dense, Flatten, Layer
from art.defences.postprocessor import ReverseSigmoid
from art.attacks.extraction import CopycatCNN
from art.estimators.classification import TensorFlowV2Classifier
from art.utils import load_dataset

In [2]:
# Loss and optimizer objects used by the custom training step below,
# which is handed to ART's TensorFlowV2Classifier wrapper class
loss = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam()


# Training step passed to TensorFlowV2Classifier
def train_step(model, inputs, targets):
    """Run one gradient-descent update of `model` on a single batch.

    Args:
        model: Callable model exposing `trainable_variables`.
        inputs: Batch of inputs forwarded to the model.
        targets: Labels passed to the loss as `y_true`.

    Returns:
        None. The model's trainable variables are updated in place.
    """
    # NOTE(review): the module-level `optimizer` is shared by every model that
    # is trained through this function -- confirm that sharing one Adam
    # instance (and its state) across models is intended.

    # Forward pass and loss are recorded on the tape
    with tf.GradientTape() as tape:
        predictions = model(inputs=inputs, training=True)
        batch_loss = loss(y_true=targets, y_pred=predictions)

    # Differentiate the loss with respect to the model's weights
    weights = model.trainable_variables
    gradients = tape.gradient(batch_loss, weights)

    # Let the optimizer update the weights
    optimizer.apply_gradients(zip(gradients, weights))

In [3]:
# Loading data
# The value-range bounds returned by `load_dataset` get descriptive names
# instead of `min`/`max`, so the Python builtins of the same name are not
# shadowed for the rest of the notebook session.
(
    (train_images_original, train_labels_original),
    (test_images_original, test_labels_original),
    min_pixel_value,
    max_pixel_value,
) = load_dataset(name="mnist")

In [4]:
# The first 50,000 training samples are set aside for the original (victim) model;
# everything from index 50,000 onwards is used for the stolen model
train_images_victim, train_images_stolen = (
    train_images_original[:50000],
    train_images_original[50000:],
)
train_labels_victim, train_labels_stolen = (
    train_labels_original[:50000],
    train_labels_original[50000:],
)

In [5]:
# ART's Reverse Sigmoid postprocessor, kept as the reference
# that the custom Keras layer is compared against
postprocessor = ReverseSigmoid(beta=1.0, gamma=0.2)

In [6]:
# Custom TF Keras class that implements ART's Reverse Sigmoid postprocessing defense
class ReverseSigmoidLayer(Layer):
    """Keras layer that perturbs predicted probabilities with Reverse Sigmoid.

    When called with a truthy `training` flag the layer is the identity.
    Otherwise a perturbation scaled by `beta` (and shaped by `gamma`) is
    subtracted from the incoming predictions, the result is clipped to
    [0, 1] and then renormalized.
    """

    def __init__(self, beta, gamma, **kwargs):
        """Store the defense hyper-parameters.

        Args:
            beta: Multiplier applied to the perturbation.
            gamma: Factor applied to the log-odds term inside the sigmoid.
            **kwargs: Forwarded unchanged to the base `Layer` constructor.
        """
        super().__init__(**kwargs)
        self.beta = beta
        self.gamma = gamma

    def call(self, preds, training=None):
        """Forward pass of the layer.

        Args:
            preds: Predictions to be protected; the size of axis 1 selects
                between the multi-column and the single-column formula.
            training: When truthy, `preds` is returned untouched.

        Returns:
            `preds` itself when training, otherwise the perturbed and
            renormalized predictions.
        """
        # Returning unprocessed inputs when training
        if training:
            return preds

        # Keep the predictions away from 0 and 1 before taking log-odds.
        # NOTE(review): if `preds` is float32, `1.0 - 1e-9` presumably rounds
        # to exactly 1.0, so the upper clip may have no effect -- confirm.
        clip_min = 1e-9
        clip_max = 1.0 - clip_min
        preds_clipped = tf.clip_by_value(t=preds, clip_value_min=clip_min, clip_value_max=clip_max)

        if preds.shape[1] > 1:
            # Several output columns: perturb every column, then rescale
            # each row by the inverse of its sum
            preds_perturbed = self._perturb_and_clip(preds, preds_clipped)
            alpha = 1.0 / tf.math.reduce_sum(input_tensor=preds_perturbed, axis=-1, keepdims=True)
            return alpha * preds_perturbed

        # Single output column: build the complementary column (1 - p),
        # perturb both, and return the normalized first column
        preds_perturbed_1 = self._perturb_and_clip(preds, preds_clipped)
        preds_perturbed_2 = self._perturb_and_clip(1.0 - preds, 1.0 - preds_clipped)
        alpha = 1.0 / (preds_perturbed_1 + preds_perturbed_2)
        return alpha * preds_perturbed_1

    def _perturb_and_clip(self, preds, preds_clipped):
        """Subtract the Reverse Sigmoid perturbation from `preds`, clipped to [0, 1].

        The perturbation is computed from `preds_clipped` but subtracted
        from the unclipped `preds`.
        """
        perturbation_r = self.beta * (
            self.sigmoid(z=(-self.gamma * tf.math.log(x=((1.0 - preds_clipped) / preds_clipped)))) - 0.5
        )
        return tf.clip_by_value(t=preds - perturbation_r, clip_value_min=0.0, clip_value_max=1.0)

    # Method for getting layer config when saving model
    def get_config(self):
        """Return the base layer config extended with `beta` and `gamma`."""
        config = super().get_config()
        config.update({
            "beta": self.beta,
            "gamma": self.gamma,
        })
        return config

    # Method to compute standard sigmoid
    def sigmoid(self, z):
        """Compute 1 / (1 + exp(-z)) element-wise."""
        return 1.0 / (1.0 + tf.math.exp(x=-z))

In [7]:
# Function for creating a model with the Functional API
def create_model():
    """Build and compile the CNN used for both the victim and the stolen models.

    Returns:
        A compiled `tf.keras.models.Model` with a (28, 28, 1) input and a
        10-unit softmax output.
    """
    image_input = tf.keras.layers.Input(shape=(28, 28, 1))

    # Convolutional feature extractor
    features = Conv2D(filters=32, kernel_size=3, activation="relu")(image_input)
    features = Conv2D(filters=64, kernel_size=3, activation="relu")(features)
    features = MaxPool2D(pool_size=2)(features)

    # Dense classification head
    flat = Flatten()(features)
    hidden = Dense(units=128, activation="relu")(flat)
    class_probs = Dense(units=10, activation="softmax")(hidden)

    # Assemble the graph into a model and compile it
    cnn = tf.keras.models.Model(inputs=[image_input], outputs=[class_probs])
    cnn.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    return cnn

In [8]:
# The same function but using the Sequential API.
# The whole definition is wrapped in a triple-quoted string, so it is never
# executed: the Functional-API `create_model` stays in effect and this cell
# merely evaluates to (and displays) the string itself.
"""def create_model():
    # Defining the model   
    model = tf.keras.models.Sequential([        
        Conv2D(filters=32, kernel_size=3, activation="relu", input_shape=(28, 28, 1)),
        Conv2D(filters=64, kernel_size=3, activation="relu"),
        MaxPool2D(pool_size=2),        
        Flatten(),
        Dense(units=128, activation="relu"),
        Dense(units=10, activation="softmax")        
    ])

    # Compiling the model
    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy"]
        )   

    # Returning the model
    return model"""

'def create_model():\n    # Defining the model   \n    model = tf.keras.models.Sequential([        \n        Conv2D(filters=32, kernel_size=3, activation="relu", input_shape=(28, 28, 1)),\n        Conv2D(filters=64, kernel_size=3, activation="relu"),\n        MaxPool2D(pool_size=2),        \n        Flatten(),\n        Dense(units=128, activation="relu"),\n        Dense(units=10, activation="softmax")        \n    ])\n\n    # Compiling the model\n    model.compile(\n        optimizer="adam",\n        loss="categorical_crossentropy",\n        metrics=["accuracy"]\n        )   \n\n    # Returning the model\n    return model'

In [9]:
# Building the victim model and fitting it on the victim's share of the data
victim_model = create_model()
victim_model.fit(train_images_victim, train_labels_victim, batch_size=1024, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1a18182eb20>

In [10]:
# Initializing our custom layer and linking it to the model's output.
# beta/gamma are read from ART's `postprocessor` rather than repeated as
# literals, so the custom layer and the ART defense it is compared against
# cannot drift apart.
protected_output = ReverseSigmoidLayer(
    beta=postprocessor.beta,
    gamma=postprocessor.gamma,
)(victim_model.output)

# Creating a new model with the custom layer on top while preserving the old model's weights
protected_model = tf.keras.models.Model(inputs=[victim_model.input], outputs=[protected_output])

In [11]:
# Viewing the victim model's architecture
# (layer types, output shapes and parameter counts)
victim_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 28, 28, 1)]       0         
                                                                 
 conv2d (Conv2D)             (None, 26, 26, 32)        320       
                                                                 
 conv2d_1 (Conv2D)           (None, 24, 24, 64)        18496     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 12, 12, 64)       0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 9216)              0         
                                                                 
 dense (Dense)               (None, 128)               1179776   
                                                             

In [12]:
# Viewing the protected model's architecture
# (the model built from the victim model's input and the custom layer's output)
protected_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 28, 28, 1)]       0         
                                                                 
 conv2d (Conv2D)             (None, 26, 26, 32)        320       
                                                                 
 conv2d_1 (Conv2D)           (None, 24, 24, 64)        18496     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 12, 12, 64)       0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 9216)              0         
                                                                 
 dense (Dense)               (None, 128)               1179776   
                                                           

In [13]:
# Settings shared by every TensorFlowV2Classifier wrapper created in this cell
_victim_wrapper_kwargs = dict(
    nb_classes=10,
    input_shape=(28, 28, 1),
    loss_object=loss,
    train_step=train_step,
)

# Wrapping the unprotected model
classifier_unprotected = TensorFlowV2Classifier(
    model=victim_model,
    **_victim_wrapper_kwargs,
)

# Wrapping the unprotected model
# and adding ART's postprocessing defense to it
classifier_protected = TensorFlowV2Classifier(
    model=victim_model,
    postprocessing_defences=postprocessor,
    **_victim_wrapper_kwargs,
)

# Wrapping the custom protected model
# without adding ART's postprocessing defense
classifier_protected_custom = TensorFlowV2Classifier(
    model=protected_model,
    **_victim_wrapper_kwargs,
)

In [14]:
def _make_copycat(victim_classifier):
    """Create a probabilistic CopycatCNN attack aimed at `victim_classifier`.

    All attacks in this notebook share the same batch sizes, epoch count and
    query budget (one query per image in `train_images_stolen`); only the
    targeted classifier differs.
    """
    return CopycatCNN(
        batch_size_fit=256,
        batch_size_query=256,
        nb_epochs=10,
        nb_stolen=len(train_images_stolen),
        use_probability=True,
        classifier=victim_classifier,
    )


def _make_thief():
    """Create a freshly initialized, ART-wrapped model for the extractor to train.

    NOTE(review): every thief receives the same `train_step`, so whatever
    optimizer that function uses is shared between them -- confirm intended.
    """
    return TensorFlowV2Classifier(
        model=create_model(),
        nb_classes=10,
        input_shape=(28, 28, 1),
        loss_object=loss,
        train_step=train_step,
    )


# Creating the probabilistic "neural net thief" objects that will try to steal
# the unprotected classifier, the classifier protected by ART's Reverse Sigmoid,
# and the classifier protected by the custom layer
copycat_cnn_unprotected_probabilistic = _make_copycat(classifier_unprotected)
copycat_cnn_protected_probabilistic = _make_copycat(classifier_protected)
copycat_cnn_protected_probabilistic_custom = _make_copycat(classifier_protected_custom)

# Initializing base models that will be trained by the model extractor:
# one per victim, created in the same order as the attacks above
model_stolen_unprotected = _make_thief()
model_stolen_protected = _make_thief()
model_stolen_protected_custom = _make_thief()

In [15]:
# Running the extraction attack against the unprotected model
classifier_stolen_unprotected_probabilistic = copycat_cnn_unprotected_probabilistic.extract(
    train_images_stolen, train_labels_stolen, thieved_classifier=model_stolen_unprotected
)

In [16]:
# Running the extraction attack against the classifier protected by ART's postprocessor
classifier_stolen_protected_probabilistic = copycat_cnn_protected_probabilistic.extract(
    train_images_stolen, train_labels_stolen, thieved_classifier=model_stolen_protected
)

  perturbation_r = self.beta * (sigmoid(-self.gamma * np.log((1.0 - preds_clipped) / preds_clipped)) - 0.5)


In [17]:
# Running the extraction attack against the classifier protected by the custom layer
classifier_stolen_protected_probabilistic_custom = copycat_cnn_protected_probabilistic_custom.extract(
    train_images_stolen, train_labels_stolen, thieved_classifier=model_stolen_protected_custom
)

In [18]:
# Evaluating the performance of the victim model and the stolen models.
# The wrapped Keras models are reached through each ART classifier's public
# `model` property rather than the private `_model` attribute.
# Each score is indexed below as [0] = test loss, [1] = test accuracy.
score_victim = classifier_unprotected.model.evaluate(x=test_images_original, y=test_labels_original)
score_stolen_unprotected_probabilistic = classifier_stolen_unprotected_probabilistic.model.evaluate(x=test_images_original, y=test_labels_original)
score_stolen_protected_probabilistic = classifier_stolen_protected_probabilistic.model.evaluate(x=test_images_original, y=test_labels_original)
score_stolen_protected_probabilistic_custom = classifier_stolen_protected_probabilistic_custom.model.evaluate(x=test_images_original, y=test_labels_original)

# Comparing test losses
print("\n------ TEST METRICS, ORIGINAL VS PROBABILISTIC STOLEN MODELS ------\n\n")
print("------ TEST LOSS ------\n")
print(f"Original model: {score_victim[0]:.2f}\n" 
      f"Stolen unprotected model: {score_stolen_unprotected_probabilistic[0]:.2f}\n"
      f"Stolen protected model: {score_stolen_protected_probabilistic[0]:.2f}\n"
      f"Stolen protected custom model: {score_stolen_protected_probabilistic_custom[0]:.2f}\n")

# Comparing test accuracies
print("------ TEST ACCURACY ------\n")
print(f"Original model: {score_victim[1]:.2f}\n" 
      f"Stolen unprotected model: {score_stolen_unprotected_probabilistic[1]:.2f}\n"
      f"Stolen protected model: {score_stolen_protected_probabilistic[1]:.2f}\n"
      f"Stolen protected custom model: {score_stolen_protected_probabilistic_custom[1]:.2f}\n")


------ TEST METRICS, ORIGINAL VS PROBABILISTIC STOLEN MODELS ------


------ TEST LOSS ------

Original model: 0.04
Stolen unprotected model: 0.07
Stolen protected model: 2.30
Stolen protected custom model: 2.30

------ TEST ACCURACY ------

Original model: 0.99
Stolen unprotected model: 0.98
Stolen protected model: 0.11
Stolen protected custom model: 0.11



In [19]:
# Standard (unprotected) probabilities of the victim model
# for the first five test images
preds = victim_model.predict(test_images_original[:5])

print("--- Output probabilities ---\n")
print(preds, "\n\n")

# The predicted class is the index of the largest probability in each row
print("--- Class predictions ---\n")
print(tf.math.argmax(preds, axis=1).numpy())

--- Output probabilities ---

[[2.7468112e-15 1.8481106e-12 1.2033476e-09 5.2852425e-08 1.6848592e-16
  7.7248734e-14 5.5748515e-22 9.9999988e-01 1.4328472e-10 1.1050387e-07]
 [1.1688788e-13 5.0403987e-11 1.0000000e+00 2.3392673e-16 5.9806787e-19
  8.9342873e-21 5.3660982e-12 6.5156306e-20 7.7604608e-15 9.2375143e-14]
 [2.3782047e-09 9.9999833e-01 3.4241248e-09 2.0667263e-11 1.2605947e-06
  2.1213259e-11 2.2180614e-10 4.2062942e-07 2.2162581e-08 4.7198332e-11]
 [9.9999905e-01 8.5247933e-13 2.2508762e-09 1.4310031e-13 3.0353983e-12
  6.0558954e-11 9.9701549e-07 6.5476090e-11 1.2894335e-11 2.6122401e-11]
 [1.0913101e-16 1.5779389e-15 1.1723524e-14 1.4936533e-17 1.0000000e+00
  2.0704192e-18 5.4001573e-15 1.5701909e-14 2.8311672e-12 1.1672593e-09]] 


--- Class predictions ---

[7 2 1 0 4]


In [20]:
# Probabilities produced by the model protected with the custom Reverse Sigmoid layer
# for the same five test images
protected_preds = protected_model.predict(test_images_original[:5])

print("--- Protected output probabilities ---\n")
print(protected_preds, "\n\n")

# The predicted class is the index of the largest probability in each row
print("--- Class predictions ---\n")
print(tf.math.argmax(protected_preds, axis=1).numpy())

--- Protected output probabilities ---

[[0.09973355 0.09973355 0.09961436 0.09597486 0.09973355 0.09973355
  0.09973355 0.11109942 0.09973355 0.09491012]
 [0.09967895 0.09967895 0.10288944 0.09967895 0.09967895 0.09967895
  0.09967895 0.09967895 0.09967895 0.09967895]
 [0.09982257 0.11720216 0.09953831 0.10042316 0.09080638 0.10042316
  0.10042316 0.09321153 0.09772635 0.10042316]
 [0.11470099 0.09942506 0.09887123 0.09942506 0.09942506 0.09942506
  0.09045236 0.09942506 0.09942506 0.09942506]
 [0.09968884 0.09968884 0.09968884 0.09968884 0.10289965 0.09968884
  0.09968884 0.09968884 0.09968884 0.09958959]] 


--- Class predictions ---

[7 2 1 0 4]


In [21]:
# Passing the victim model's output through ART's Reverse Sigmoid
# to make sure that the custom layer has the same output
print("--- Protected output probabilities (ART's Reverse Sigmoid) ---\n")

# The postprocessor is applied once and its result reused for both printouts
art_protected_preds = postprocessor(preds)
print(art_protected_preds, "\n\n")

print("--- Class predictions ---\n")
print(tf.math.argmax(input=art_protected_preds, axis=1).numpy())

--- Protected output probabilities (ART's Reverse Sigmoid) ---

[[0.09973355 0.09973355 0.09961436 0.09597485 0.09973355 0.09973355
  0.09973355 0.11109942 0.09973355 0.09491012]
 [0.09967895 0.09967895 0.10288944 0.09967895 0.09967895 0.09967895
  0.09967895 0.09967895 0.09967895 0.09967895]
 [0.09982257 0.11720216 0.09953831 0.10042316 0.09080638 0.10042316
  0.10042316 0.09321153 0.09772635 0.10042316]
 [0.11470099 0.09942506 0.09887123 0.09942506 0.09942506 0.09942506
  0.09045236 0.09942506 0.09942506 0.09942506]
 [0.09968884 0.09968884 0.09968884 0.09968884 0.10289965 0.09968884
  0.09968884 0.09968884 0.09968884 0.09958959]] 


--- Class predictions ---

[7 2 1 0 4]


In [22]:
# Writing the protected model (custom output layer included) to an .h5 file
# NOTE(review): reloading presumably requires passing the custom layer via
# `custom_objects={"ReverseSigmoidLayer": ReverseSigmoidLayer}` -- confirm.
protected_model.save("postprocessed_model.h5")

