# Implementation on the MNIST DATASET

**Robustness and Privacy**

## Model and endpoint creation with Google Vertex AI

Initialise Google Drive

In [None]:
# Mount Google Drive at /content/drive; later cells read and write files
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Install the latest versions of the Vertex AI SDK and the Google Cloud Storage SDK, then restart the kernel

In [None]:
import os
# The Google Cloud Notebook product has specific requirements
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# Google Cloud Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_GOOGLE_CLOUD_NOTEBOOK:
    USER_FLAG = "--user"
    

# Upgrade the Vertex AI SDK, the Cloud Storage SDK and numpy.
# NOTE(review): no versions are pinned, so a re-run may install different releases.
! pip install {USER_FLAG} --upgrade google-cloud-aiplatform
! pip install {USER_FLAG} --upgrade google-cloud-storage
! pip install {USER_FLAG} --upgrade numpy
# RobustBench is installed straight from GitHub; USER_FLAG is not applied to this line.
! pip install git+https://github.com/RobustBench/robustbench.git

# The restart is skipped when the IS_TESTING environment variable is set.
if not os.getenv("IS_TESTING"):
    # Automatically restart kernel after installs
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

Import all libraries

In [None]:
import os, sys
import gzip
from datetime import datetime
import numpy as np
import pandas as pd
import sklearn
import torch
from google.cloud import storage
from keras.preprocessing.image import array_to_img
from typing import Dict, Optional, Sequence, Tuple, List

In [None]:
# Pick the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device('cuda') if cuda_available else torch.device('cpu')
print(device)
if cuda_available:
  # Report which GPU (index 0) was picked up.
  print(torch.cuda.get_device_name(0))

cpu


Set global variables

In [None]:
# Google Cloud project ID and region; both are passed to aiplatform.init()
# and to bucket/endpoint creation in later cells.
PROJECT = "mnist-automl-350516"  # @param {type:"string"}
REGION = "europe-west4" # @param {type:"string"}

If not trained, set a new timestamp

In [None]:
# Fresh run identifier (YYYYmmddHHMMSS); it is embedded in GCS paths, Drive
# folders and Vertex AI display names throughout the notebook.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
print(TIMESTAMP)

20220531151405


If already trained, get last timestamp

In [None]:
# Reuse the identifier of an earlier run.
# NOTE(review): when both cells are executed ("Run All"), this assignment
# overrides the freshly generated TIMESTAMP from the previous cell.
TIMESTAMP = "20220530154112" # @param {type:"string"}

Authenticate to Google Cloud (if running on Colab)

In [None]:
# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your GCP account. This provides access to your
# Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

# The Google Cloud Notebook product has specific requirements
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# If on Google Cloud Notebooks, then don't execute this code
if not IS_GOOGLE_CLOUD_NOTEBOOK:
    # Colab is detected by the presence of the google.colab module.
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the string below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account.
    # NOTE(review): the value below is an empty placeholder and must be edited
    # before a local run; this branch is skipped when IS_TESTING is set.
    elif not os.getenv("IS_TESTING"):
        %env GOOGLE_APPLICATION_CREDENTIALS ''

Create a bucket to execute our code

**This step is necessary**

Set bucket name

In [None]:
# The same bucket in two spellings: the gs:// URI is used to build object
# URIs and as the staging bucket, the bare name is passed to storage_client.bucket().
# Keep the two values in sync.
BUCKET_NAME = "gs://bucket-mnist-1"  # @param {type:"string"}
NO_PATH_BUCKET_NAME = "bucket-mnist-1"  # @param {type:"string"}

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
from google.cloud import storage


def create_bucket_class_location(bucket_name=None, storage_class="STANDARD"):
    """
    Create a new Cloud Storage bucket in project PROJECT and location REGION.

    Args:
        bucket_name: Name of the bucket without the "gs://" prefix. Defaults
            to NO_PATH_BUCKET_NAME.
        storage_class: Storage class assigned to the bucket before creation.
            Defaults to "STANDARD" (the value this notebook has always used;
            the earlier docstring incorrectly said coldline).

    Returns:
        The bucket object returned by ``storage_client.create_bucket``.
    """
    if bucket_name is None:
        bucket_name = NO_PATH_BUCKET_NAME
    storage_client = storage.Client()

    # The storage class has to be set on the local bucket object before the
    # create call so that it is sent with the creation request.
    bucket = storage_client.bucket(bucket_name)
    bucket.storage_class = storage_class
    new_bucket = storage_client.create_bucket(bucket, project=PROJECT, location=REGION)

    print(
        "Created bucket {} in {} with storage class {}".format(
            new_bucket.name, new_bucket.location, new_bucket.storage_class
        )
    )
    return new_bucket

# Run only when the bucket does not exist yet (see the note above this cell).
create_bucket_class_location()

Initialise AI Platform for our project

In [None]:
from google.cloud import aiplatform

# Set the default project, region and staging bucket for the Vertex AI SDK
# calls made in the rest of the notebook.
aiplatform.init(
    project=PROJECT,
    location=REGION,
    staging_bucket=BUCKET_NAME
)

Define the function that loads the dataset from a directory

In [None]:
def load_mnist(path, kind='train'):
    """Read one gzip-compressed MNIST split stored under `path`.

    Args:
        path: Directory holding `<kind>-labels-idx1-ubyte.gz` and
            `<kind>-images-idx3-ubyte.gz`.
        kind: File-name prefix of the split (for example 'train' or 't10k').

    Returns:
        Tuple `(images, labels)` of uint8 arrays; `images` has shape
        `(len(labels), 784)`.
    """
    def _read_payload(file_name, header_size):
        # Decompress the whole file and skip the fixed-size header.
        with gzip.open(os.path.join(path, file_name), 'rb') as stream:
            return np.frombuffer(stream.read(), dtype=np.uint8, offset=header_size)

    # Labels first: their count fixes the number of image rows.
    labels = _read_payload('%s-labels-idx1-ubyte.gz' % kind, 8)
    pixels = _read_payload('%s-images-idx3-ubyte.gz' % kind, 16)
    images = pixels.reshape(len(labels), 784)
    return images, labels

Load dataset from Google Drive

In [None]:
# Both splits are stored in the same Drive folder.
mnist_dir = '/content/drive/MyDrive/Colab Notebooks/mnist'
# Training split, then the 't10k' test split.
X_train, y_train = load_mnist(path=mnist_dir, kind='train')
X_test, y_test = load_mnist(path=mnist_dir, kind='t10k')

Create a DataFrame that maps every image to its split, file path and label

In [None]:
# Stack training and test data into single arrays
X_data = np.vstack([X_train, X_test])
y_data = np.concatenate([y_train, y_test])

# Dataset placeholder: one row per image. 'part' marks the split, while
# 'file' and 'label' hold dummy strings that the upload cell overwrites.
# Row counts come from the loaded arrays rather than hard-coded MNIST sizes,
# so the frame always lines up with X_data / y_data.
n_train, n_test = len(X_train), len(X_test)
files = pd.DataFrame({'part': np.concatenate([
                                   np.repeat('TRAIN', n_train),
                                   np.repeat('TEST', n_test)
                              ]),
                      'file': np.repeat('file', n_train + n_test),
                      'label': np.repeat('label', n_train + n_test)})

Generate and upload CSV file to Google Cloud Storage (do it once)

In [None]:
storage_client = storage.Client()
bucket = storage_client.bucket(NO_PATH_BUCKET_NAME)

# The local scratch directory and the GCS object prefix share the same name.
local_dir = f"mnist_{TIMESTAMP}"
os.makedirs(local_dir, exist_ok=True)

for i, x in enumerate(X_data):
    # Console print
    if i % 1000 == 0:
        print('Uploading image {image}'.format(image=i))
    image_path = f"{local_dir}/image_{i}.jpg"
    # Reshape and export image
    img = array_to_img(x=x.reshape(28, 28, 1))
    img.save(fp=image_path)
    # Add info to data frame (column 1 = GCS URI, column 2 = label)
    files.iloc[i, 1] = f"{BUCKET_NAME}/{image_path}"
    files.iloc[i, 2] = y_data[i]
    # Upload to GCP under the same relative path
    blob = bucket.blob(image_path)
    blob.upload_from_filename(image_path)
    # Delete image file so the local disk does not fill up
    os.remove(image_path)

# Export CSV file. It is written inside local_dir so that the path handed to
# upload_from_filename actually exists (it used to be written to the working
# directory and uploaded from local_dir, which raised FileNotFoundError).
csv_path = f"{local_dir}/mnist_map.csv"
files.to_csv(path_or_buf=csv_path, header=False, index=False)
blob = bucket.blob(csv_path)
blob.upload_from_filename(csv_path)

Save generated CSV also to Google Drive, in order to have a backup

In [None]:
# Backup folder for this run on Google Drive. makedirs also creates a missing
# parent "MNIST" folder and is a no-op when the folder already exists
# (os.mkdir raised FileNotFoundError when the parent was absent).
drive_run_dir = f"/content/drive/MyDrive/Colab Notebooks/MNIST/{TIMESTAMP}/"
os.makedirs(drive_run_dir, exist_ok=True)
files.to_csv(path_or_buf=f"{drive_run_dir}mnist_map.csv", header=False, index=False)

Set IMPORT_FILE variable

In [None]:
# GCS URI of the mapping CSV uploaded above; used as the import source when
# the Vertex AI image dataset is created.
IMPORT_FILE = f"gs://{NO_PATH_BUCKET_NAME}/mnist_{TIMESTAMP}/mnist_map.csv"

Create a new dataset from CSV file

In [None]:
# Create a Vertex AI image dataset and import the images listed in the CSV
# using the single-label classification import schema.
train_mnist_dataset = aiplatform.ImageDataset.create(
    display_name=f"mnist_dataset_{TIMESTAMP}",
    gcs_source=[IMPORT_FILE],
    import_schema_uri=aiplatform.schema.dataset.ioformat.image.single_label_classification,
)

Creating ImageDataset
Create ImageDataset backing LRO: projects/152908619293/locations/europe-west4/datasets/2840350795847696384/operations/1602647049134669824
ImageDataset created. Resource name: projects/152908619293/locations/europe-west4/datasets/2840350795847696384
To use this ImageDataset in another session:
ds = aiplatform.ImageDataset('projects/152908619293/locations/europe-west4/datasets/2840350795847696384')
Importing ImageDataset data: projects/152908619293/locations/europe-west4/datasets/2840350795847696384
Import ImageDataset data backing LRO: projects/152908619293/locations/europe-west4/datasets/2840350795847696384/operations/8772377655908499456
ImageDataset data imported. Resource name: projects/152908619293/locations/europe-west4/datasets/2840350795847696384


Create a training pipeline

In [None]:
# Define (but do not yet run) an AutoML single-label image classification job.
train_job = aiplatform.AutoMLImageTrainingJob(
    display_name="mnist_job_" + TIMESTAMP,
    prediction_type="classification",
    multi_label=False,
    model_type="CLOUD",
    base_model=None,
)

print(train_job)


<google.cloud.aiplatform.training_jobs.AutoMLImageTrainingJob object at 0x7f4608805710>


Run the training pipeline to train the model

In [None]:
# Launch training and bind the result to `model`.
# No split is passed, so the service applies its default split (see the
# "No dataset split provided" message in the output).
model = train_job.run(
    dataset=train_mnist_dataset,
    model_display_name="mnist_model_" + TIMESTAMP,
    budget_milli_node_hours=8000,
    disable_early_stopping=False,
)

No dataset split provided. The service will use a default split.
View Training:
https://console.cloud.google.com/ai/platform/locations/europe-west4/training/2419275225804832768?project=152908619293
AutoMLImageTrainingJob projects/152908619293/locations/europe-west4/trainingPipelines/2419275225804832768 current state:
PipelineState.PIPELINE_STATE_PENDING
AutoMLImageTrainingJob projects/152908619293/locations/europe-west4/trainingPipelines/2419275225804832768 current state:
PipelineState.PIPELINE_STATE_PENDING
AutoMLImageTrainingJob projects/152908619293/locations/europe-west4/trainingPipelines/2419275225804832768 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLImageTrainingJob projects/152908619293/locations/europe-west4/trainingPipelines/2419275225804832768 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLImageTrainingJob projects/152908619293/locations/europe-west4/trainingPipelines/2419275225804832768 current state:
PipelineState.PIPELINE_STATE_RUNNING
AutoMLImageT

**Only if already done:** retrieve the model by ID

In [None]:
# Load a previously trained model by its full resource name.
# NOTE(review): the resource name is hard-coded, and running this cell after
# the training cell rebinds `model` to this stored model.
model = aiplatform.Model("projects/152908619293/locations/europe-west4/models/8470901463176970240@1")

### Batch prediction test

Get dataframe with GCS path of all images of test set

In [None]:
# Read the header-less mapping CSV back from Drive and keep only the TEST
# rows, re-indexed from 0 and without the split column.
mapping_df = pd.read_csv(
    f"/content/drive/MyDrive/Colab Notebooks/MNIST/{TIMESTAMP}/mnist_map.csv",
    header=None,
    names=["set", "path", "label"],
)
batch_df = (
    mapping_df.loc[mapping_df["set"] == "TEST"]
    .reset_index(drop=True)
    .drop(columns=["set"])
)

Upload to GCS the JSONL file of batch prediction

In [None]:
import json

import tensorflow as tf

# Batch-prediction input: one JSON object per line with the image URI and
# its MIME type, written directly to GCS.
gcs_input_uri = f"{BUCKET_NAME}/mnist_{TIMESTAMP}/batch_test.jsonl"
jsonl_lines = [
    json.dumps({"content": image_uri, "mime_type": "image/jpeg"}) + "\n"
    for image_uri in batch_df["path"]
]
with tf.io.gfile.GFile(gcs_input_uri, "w") as f:
    for jsonl_line in jsonl_lines:
        f.write(jsonl_line)

print(gcs_input_uri)

gs://bucket-mnist-1/mnist_20220530154112/batch_test.jsonl


Create and run batch prediction job

In [None]:
# Run a batch prediction over the JSONL input; results are written under the
# predictions/ prefix of this run. sync=True makes the call block until the
# job has finished.
batch_predict_job = model.batch_predict(
    job_display_name="mnist_batch_prediction_job_" + TIMESTAMP,
    gcs_source=gcs_input_uri,
    gcs_destination_prefix=f"{BUCKET_NAME}/mnist_{TIMESTAMP}/predictions/",
    sync=True,
)

print(batch_predict_job)

Retrieve batch prediction results

In [None]:
# Predictions are cached on Drive so the batch job does not have to be
# re-read (or re-run) in later sessions.
pred_cache_path = f"/content/drive/MyDrive/Colab Notebooks/MNIST/{TIMESTAMP}/batch_pred_test.csv"
if not os.path.exists(pred_cache_path):
    # Keep only the result shards whose file name starts with "prediction".
    prediction_results = [
        f"gs://{blob.bucket.name}/{blob.name}"
        for blob in batch_predict_job.iter_outputs()
        if blob.name.split("/")[-1].startswith("prediction")
    ]

    # Collect plain records and build the frame once; DataFrame.append in a
    # loop is quadratic and no longer exists in pandas 2.x.
    pred_records = []
    skipped_lines = 0
    for gfile_name in prediction_results:
        with tf.io.gfile.GFile(name=gfile_name, mode="r") as gfile:
            for line in gfile.readlines():
                if not line.strip():
                    continue
                line = json.loads(line)
                prediction = line.get("prediction") or {}
                display_names = prediction.get("displayNames") or []
                confidences = prediction.get("confidences") or []
                if not display_names:
                    # No label returned for this instance; it cannot be scored.
                    skipped_lines += 1
                    continue
                # Pick the label with the highest confidence, as the online
                # predictor does; fall back to the first label when the
                # confidences are missing or do not line up with the names.
                best = 0
                if len(confidences) == len(display_names):
                    best = max(range(len(confidences)), key=confidences.__getitem__)
                pred_records.append({
                    "path": line.get("instance").get("content"),
                    "predicted_value": display_names[best],
                })
    if skipped_lines:
        print(f"Skipped {skipped_lines} output line(s) without a predicted label")

    # Explicit columns keep the frame well-formed even when nothing was read.
    pred_df = pd.DataFrame(pred_records, columns=["path", "predicted_value"])
    pred_df = pred_df.sort_values("path")
    pred_df.to_csv(pred_cache_path, index=False)
else: 
    pred_df = pd.read_csv(pred_cache_path)

Check accuracy of the model

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Pair every label with its prediction through the image path instead of
# relying on both frames being in the same row order; images without a
# prediction are dropped from the comparison and reported.
eval_df = batch_df.merge(pred_df, on="path", how="inner")
if len(eval_df) != len(batch_df):
    print(f"Warning: {len(batch_df) - len(eval_df)} image(s) have no matching prediction")

actual_y_test = eval_df["label"].astype(int).to_list()
predicted_y_test = eval_df["predicted_value"].astype(int).to_list()
print(confusion_matrix(actual_y_test,predicted_y_test))
print(f"Accuracy of the CNN: {accuracy_score(actual_y_test,predicted_y_test)}")

[[ 975    0    0    0    0    2    1    0    0    2]
 [   0 1129    1    0    0    0    2    3    0    0]
 [   0    1 1018    5    0    5    3    0    0    0]
 [   0    0    1  947    0   59    0    1    2    0]
 [   0    0    0    0  973    0    1    0    0    8]
 [   1    0    0   10    0  878    2    1    0    0]
 [   1    1    1    0    0    6  947    0    2    0]
 [   0    2    4    0    2    1    0 1018    0    1]
 [   0    0    3    1    0    2    0    0  967    1]
 [   0    0    0    1    5    2    1    2    3  995]]
Accuracy of the CNN: 0.9847


## Robustness

Check the robustness with RobustBench (CUDA required)


In [None]:
X_test = X_test.reshape(10000,28,28,1)
# Build attack inputs: channel-first float images scaled to [0, 1] (the
# attack below uses eps=8/255, which presumes that range) and integer class
# labels (class indices must not be floating point).
# .copy() yields writable arrays; np.frombuffer returns read-only ones.
X_test_tensor = torch.from_numpy(X_test.copy()).permute(0, 3, 1, 2).to(torch.float32) / 255.0
y_test_tensor = torch.from_numpy(y_test.copy()).to(torch.long)

In [None]:
from autoattack import AutoAttack
# Custom AutoAttack run: L-inf budget 8/255, APGD-CE and APGD-DLR only, one restart.
# NOTE(review): `model` in this notebook is the Vertex AI aiplatform.Model
# bound in the training / "retrieve the model" cells, not a local PyTorch
# module; AutoAttack presumably needs a callable that returns logits —
# confirm before relying on this cell.
adversary = AutoAttack(model, norm='Linf', eps=8/255, version='custom', attacks_to_run=['apgd-ce', 'apgd-dlr'])
adversary.apgd.n_restarts = 1
x_adv = adversary.run_standard_evaluation(X_test_tensor, y_test_tensor)

## Privacy

### Deploy and test model

In [None]:
# Create an empty endpoint for this run; the model is deployed to it in a later cell.
endpoint = aiplatform.Endpoint.create(
    display_name=f"mnist_endpoint_{TIMESTAMP}",
    project=PROJECT,
    location=REGION,
)

Creating Endpoint
Create Endpoint backing LRO: projects/152908619293/locations/europe-west4/endpoints/1281960189242638336/operations/3353773247995838464
Endpoint created. Resource name: projects/152908619293/locations/europe-west4/endpoints/1281960189242638336
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/152908619293/locations/europe-west4/endpoints/1281960189242638336')


Manually set endpoint ID, taken from the returned value

In [None]:
# Endpoint ID copied by hand from the creation log above.
# This rebinds `endpoint`, replacing the object returned by Endpoint.create;
# ENDPOINT_ID is also read inside predict_classification_online.
ENDPOINT_ID = "1281960189242638336"
endpoint = aiplatform.Endpoint(ENDPOINT_ID)

In [None]:
# Deploy the trained model to the endpoint.
# The recorded output reports that this model does not support dedicated
# resources, so machine_type is ignored by the service.
model.deploy(endpoint=endpoint,
             machine_type="n1-standard-4")

Deploying model to Endpoint : projects/152908619293/locations/europe-west4/endpoints/1281960189242638336
Model does not support dedicated deployment resources. The machine_type, accelerator_type and accelerator_count,autoscaling_target_accelerator_duty_cycle,autoscaling_target_cpu_utilization parameters are ignored.
Deploy Endpoint model backing LRO: projects/152908619293/locations/europe-west4/endpoints/1281960189242638336/operations/7281756547993042944
Endpoint model deployed. Resource name: projects/152908619293/locations/europe-west4/endpoints/1281960189242638336


<google.cloud.aiplatform.models.Endpoint object at 0x7fa229b43910> 
resource name: projects/152908619293/locations/europe-west4/endpoints/1281960189242638336

In [None]:
import tensorflow as tf
import json
import base64
from google.cloud.aiplatform.gapic.schema import predict

def predict_classification_online(gcs_input_uri):
    """Classify every image listed in a JSONL file through the online endpoint.

    Each line of the file is a JSON object with a "content" key (gs:// URI of
    the image) and a "mime_type" key. The image is downloaded, base64-encoded
    and sent to the endpoint identified by ENDPOINT_ID, one request per image.

    Args:
        gcs_input_uri: Path of the JSONL file (opened through tf.io.gfile).

    Returns:
        List with the display name of the highest-confidence class for each
        prediction returned by the endpoint, in file order.
    """
    endpoint = aiplatform.Endpoint(ENDPOINT_ID)
    storage_client = storage.Client()

    results = []

    with tf.io.gfile.GFile(gcs_input_uri, "r") as f:
        for line in f.readlines():
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            line = json.loads(line)
            # Resolve bucket and object name from the URI itself rather than
            # stripping a hard-coded "gs://bucket-mnist-1/" prefix.
            blob = storage.Blob.from_string(line.get("content"), client=storage_client)
            image_to_encode = blob.download_as_bytes()
            encoded_image = base64.b64encode(image_to_encode).decode("utf-8")
            instance = predict.instance.ImageClassificationPredictionInstance(
                    content=encoded_image,
                    # The JSONL key is "mime_type"; the former "mime/type"
                    # lookup always returned None.
                    mime_type=line.get("mime_type")
            ).to_value()
            instances = [instance]

            response = endpoint.predict(instances=instances)

            for prediction in response.predictions:
                # Index of the highest confidence; the first one wins on ties
                # and index 0 is used when no confidences are returned.
                best_index, _ = max(
                    enumerate(prediction.get("confidences")),
                    key=lambda pair: pair[1],
                    default=(0, None),
                )
                results.append(prediction.get("displayNames")[best_index])

    return results

In [None]:
# Height x width x channel layout, one array per image, as consumed by
# transforms.ToTensor() in the following cells.
X_test = X_test.reshape(10000,28,28,1)
# Number of test images (taken from the start of X_test) used in the noise experiments.
size_noisy_data = 100

Define transformation and transform data

In [None]:
class AddGaussianNoise(object):
    """Transform that adds Gaussian noise to a tensor.

    Calling the instance returns ``tensor + noise * std + mean`` where
    ``noise`` is drawn from a standard normal distribution with the same
    shape as ``tensor``.

    Args:
        mean: Constant offset added to every element.
        std: Scale applied to the standard-normal noise.
        generator: Optional CPU ``torch.Generator`` used to draw the noise,
            which makes the transform reproducible. When ``None`` (default)
            the global PyTorch RNG is used, as before.
    """
    def __init__(self, mean=0., std=1., generator=None):
        self.std = std
        self.mean = mean
        self.generator = generator
        
    def __call__(self, tensor):
        # Draw on the CPU (where the generator lives), then move the noise to
        # the input's device so that non-CPU tensors can be handled as well.
        noise = torch.randn(tensor.size(), generator=self.generator)
        return tensor + noise.to(tensor.device) * self.std + self.mean
    
    def __repr__(self):
        return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std)

### Try with one value of std dev (intensity) of noise

In [None]:
from torchvision.transforms import transforms
# Pipeline: convert the array to a tensor, then add zero-mean noise with std 0.1.
gaussian_noise = transforms.Compose([transforms.ToTensor(), AddGaussianNoise(0., 0.1)])

# Transform the first `size_noisy_data` test images and concatenate the
# per-image tensors along the first dimension.
X_test_trans = torch.cat(
    [gaussian_noise(clean_img) for clean_img in X_test[:size_noisy_data]]
)

Save transformed data to GCS

Create a single file for all the images

In [None]:
# Dataset placeholder: one row per noisy image. 'file' and 'label' hold dummy
# strings that the upload cell below overwrites.
# Note: this rebinds `files`, replacing the mapping frame built for the clean dataset.
files = pd.DataFrame({'part': np.repeat('noisy', size_noisy_data),
                      'file': np.repeat('file', size_noisy_data),
                      'label': np.repeat('label', size_noisy_data)})

In [None]:
# Convert the stacked tensor to a numpy array with one (28, 28, 1) image per row.
# NOTE(review): the name is rebound to an ndarray, so this cell cannot be run
# twice in a row (an ndarray has no .cpu()).
X_test_trans = X_test_trans.cpu().numpy().reshape(size_noisy_data,28,28,1)

Generate and upload CSV file to Google Cloud Storage (do it once)

In [None]:
storage_client = storage.Client()
bucket = storage_client.bucket(NO_PATH_BUCKET_NAME)

# Local scratch folder; images are written here, uploaded, then removed.
if not os.path.isdir(f"mnist_noisy_{TIMESTAMP}"):
    os.mkdir(f"mnist_noisy_{TIMESTAMP}")

for i, x in enumerate(X_test_trans):
    # The same relative path is used locally and as the GCS object name.
    noisy_image_path = f"mnist_noisy_{TIMESTAMP}/image_{str(i)}.jpg"
    # Render the array as an image and save it to disk
    img = array_to_img(x=x.reshape(28, 28, 1))
    img.save(fp=noisy_image_path)
    # Record the GCS URI (column 1) and the true label (column 2)
    files.iloc[i, 1] = f"{BUCKET_NAME}/mnist_noisy_{TIMESTAMP}/image_{str(i)}.jpg"
    files.iloc[i, 2] = y_test[i]
    # Push the image to the bucket
    blob = bucket.blob(noisy_image_path)
    blob.upload_from_filename(noisy_image_path)
    # Remove the local copy
    os.remove(noisy_image_path)

# Write the mapping CSV locally, then upload it under the same relative path
noisy_map_path = f"mnist_noisy_{TIMESTAMP}/mnist_noisy_map.csv"
files.to_csv(path_or_buf=f'mnist_noisy_{TIMESTAMP}/mnist_noisy_map.csv', header=False, index=False)
blob = bucket.blob(noisy_map_path)
blob.upload_from_filename(noisy_map_path)

Save generated CSV to Google Drive

In [None]:
# Run folder on Google Drive. makedirs also creates a missing parent "MNIST"
# folder and is a no-op when the folder already exists (os.mkdir raised
# FileNotFoundError when the parent was absent).
drive_run_dir = f"/content/drive/MyDrive/Colab Notebooks/MNIST/{TIMESTAMP}/"
os.makedirs(drive_run_dir, exist_ok=True)
files.to_csv(path_or_buf=f"{drive_run_dir}mnist_noisy_map.csv", header=False, index=False)

Get dataframe with GCS path of all images of test set

In [None]:
# Read the header-less noisy mapping CSV back from Drive and drop the
# constant split column, leaving `path` and `label`.
batch_df = pd.read_csv(
    f"/content/drive/MyDrive/Colab Notebooks/MNIST/{TIMESTAMP}/mnist_noisy_map.csv",
    header=None,
    names=["set", "path", "label"],
).drop(columns=["set"])

Upload to GCS the JSONL file of batch prediction

In [None]:
import json
import tensorflow as tf

# Prediction input for the noisy images: one JSON object per line with the
# image URI and its MIME type, written directly to GCS.
gcs_input_uri = f"{BUCKET_NAME}/mnist_noisy_{TIMESTAMP}/batch_test.jsonl"
jsonl_lines = [
    json.dumps({"content": image_uri, "mime_type": "image/jpeg"}) + "\n"
    for image_uri in batch_df["path"]
]
with tf.io.gfile.GFile(gcs_input_uri, "w") as f:
    for jsonl_line in jsonl_lines:
        f.write(jsonl_line)

print(gcs_input_uri)

gs://bucket-mnist-1/mnist_noisy_20220530154112/batch_test.jsonl


In [None]:
# Classify the noisy images through the deployed endpoint (one request per
# image). The call has to run here: with it commented out, `pred_results` is
# undefined on a fresh kernel and the next line raises NameError.
pred_results = predict_classification_online(gcs_input_uri)
pred_results = list(map(int, pred_results))
actual_values = batch_df["label"].to_list()
print(confusion_matrix(actual_values,pred_results))
print(f"Accuracy of the CNN: {accuracy_score(actual_values,pred_results)}")

[[ 8  0  0  0  0  0  0  0  0  0]
 [ 0  4  1  0  0  0  0  9  0  0]
 [ 0  0  4  0  0  0  0  1  3  0]
 [ 0  0  0  5  0  0  0  1  5  0]
 [ 0  0  1  0  9  0  0  1  3  0]
 [ 0  0  0  3  0  1  0  0  3  0]
 [ 2  0  0  0  0  0  1  0  7  0]
 [ 0  0  2  1  0  0  0 12  0  0]
 [ 0  0  0  0  0  0  0  0  2  0]
 [ 0  0  0  0  0  0  0  0  2  9]]
Accuracy of the CNN: 0.55


### Repeat the whole procedure to analyze how accuracy changes with the intensity of the noise

In [None]:
import json
import tensorflow as tf
from torchvision.transforms import transforms
from sklearn.metrics import accuracy_score

# Sweep over 14 noise standard deviations between 0 and 0.6. For each value:
# add noise to the first `size_noisy_data` test images, upload them, write
# the JSONL request file, query the online endpoint and print the accuracy.
# NOTE(review): no random seed is set in the visible notebook, so the noise
# (and therefore the accuracies) presumably differ between runs.
size_noisy_data = 100
storage_client = storage.Client()
bucket = storage_client.bucket(NO_PATH_BUCKET_NAME)
for intensity in np.linspace(0., 0.6, 14):
    # Transform for this intensity: to tensor, then zero-mean Gaussian noise.
    gaussian_noise=transforms.Compose([
        transforms.ToTensor(),
        AddGaussianNoise(0., intensity)
    ])

    # Relies on X_test having been reshaped to (10000, 28, 28, 1) in an earlier cell.
    X_test_trans = []
    for img in X_test[:size_noisy_data]:
        img = gaussian_noise(img)
        X_test_trans.append(img)
    X_test_trans = torch.cat(X_test_trans)


    # Dataset placeholder
    files = pd.DataFrame({'part': np.repeat('noisy', size_noisy_data),
                        'file': np.repeat('file', size_noisy_data),
                        'label': np.repeat('label', size_noisy_data)})
    
    X_test_trans = X_test_trans.cpu().numpy().reshape(size_noisy_data,28,28,1)
    
    if not os.path.isdir(f"mnist_noisy_{TIMESTAMP}"):
        os.mkdir(f"mnist_noisy_{TIMESTAMP}")
    # The object names do not include the intensity, so every iteration
    # overwrites the images uploaded by the previous one.
    for i, x in enumerate(X_test_trans):
        # Reshape and export image
        img = array_to_img(x=x.reshape(28, 28, 1))
        img.save(fp=f"mnist_noisy_{TIMESTAMP}/image_{str(i)}.jpg")
        # Add info to data frame
        files.iloc[i, 1] = f"{BUCKET_NAME}/mnist_noisy_{TIMESTAMP}/image_{str(i)}.jpg"
        files.iloc[i, 2] = y_test[i]
        # Upload to GCP
        blob = bucket.blob(f"mnist_noisy_{TIMESTAMP}/image_{str(i)}.jpg")
        blob.upload_from_filename(f"mnist_noisy_{TIMESTAMP}/image_{str(i)}.jpg")
        # Delete image file
        os.remove(f"mnist_noisy_{TIMESTAMP}/image_{str(i)}.jpg")
    # Export CSV file
    files.to_csv(path_or_buf=f'mnist_noisy_{TIMESTAMP}/mnist_noisy_map.csv', header=False, index=False)
    blob = bucket.blob(f"mnist_noisy_{TIMESTAMP}/mnist_noisy_map.csv")
    blob.upload_from_filename(f"mnist_noisy_{TIMESTAMP}/mnist_noisy_map.csv")

    # Keep a copy of the mapping CSV on Drive; it is read back right below.
    if not os.path.isdir(f"/content/drive/MyDrive/Colab Notebooks/MNIST/{TIMESTAMP}/"):
        os.mkdir(f"/content/drive/MyDrive/Colab Notebooks/MNIST/{TIMESTAMP}/")
    files.to_csv(path_or_buf=f"/content/drive/MyDrive/Colab Notebooks/MNIST/{TIMESTAMP}/mnist_noisy_map.csv", header=False, index=False)

    batch_df = pd.read_csv(f"/content/drive/MyDrive/Colab Notebooks/MNIST/{TIMESTAMP}/mnist_noisy_map.csv", header=None)
    batch_df.columns = ["set", "path", "label"]
    batch_df.drop(["set"], axis=1, inplace=True)

    # Request file listing the image URIs, one JSON object per line.
    gcs_input_uri = f"{BUCKET_NAME}/mnist_noisy_{TIMESTAMP}/batch_test.jsonl"
    with tf.io.gfile.GFile(gcs_input_uri, "w") as f:
        for index, row in batch_df.iterrows():
            data = {"content": row["path"], "mime_type": "image/jpeg"}
            f.write(json.dumps(data) + "\n")

    # Online predictions come back in file order, so they are compared with
    # the labels position by position.
    pred_results = predict_classification_online(gcs_input_uri)
    pred_results = list(map(int, pred_results))
    actual_values = batch_df["label"].to_list()
    print(f"Accuracy of the CNN with standard dev of noise {intensity.round(3)}: {accuracy_score(actual_values,pred_results)}")

Accuracy of the CNN with standard dev of noise 0.0: 1.0
Accuracy of the CNN with standard dev of noise 0.046: 0.84
Accuracy of the CNN with standard dev of noise 0.092: 0.63
Accuracy of the CNN with standard dev of noise 0.138: 0.37
Accuracy of the CNN with standard dev of noise 0.185: 0.3
Accuracy of the CNN with standard dev of noise 0.231: 0.16
Accuracy of the CNN with standard dev of noise 0.277: 0.13
Accuracy of the CNN with standard dev of noise 0.323: 0.1
Accuracy of the CNN with standard dev of noise 0.369: 0.1
Accuracy of the CNN with standard dev of noise 0.415: 0.08
Accuracy of the CNN with standard dev of noise 0.462: 0.13
Accuracy of the CNN with standard dev of noise 0.508: 0.11
Accuracy of the CNN with standard dev of noise 0.554: 0.08
Accuracy of the CNN with standard dev of noise 0.6: 0.1


### Create and run batch prediction job

In [None]:
# Batch prediction over the noisy images. `gcs_input_uri` is whatever the
# last executed cell left behind; after the sweep above the bucket holds the
# images of the final intensity, because each iteration reuses the same
# object names.
batch_predict_job = model.batch_predict(
    job_display_name="mnist_noisy_batch_prediction_job_" + TIMESTAMP,
    gcs_source=gcs_input_uri,
    gcs_destination_prefix=f"{BUCKET_NAME}/mnist_noisy_{TIMESTAMP}/predictions/",
    sync=True,
)

print(batch_predict_job)

Creating BatchPredictionJob
<google.cloud.aiplatform.jobs.BatchPredictionJob object at 0x7f883940c5d0> is waiting for upstream dependencies to complete.


Retrieve batch prediction results

In [None]:
# Predictions are cached on Drive so the batch job does not have to be
# re-read (or re-run) in later sessions.
noisy_pred_cache_path = f"/content/drive/MyDrive/Colab Notebooks/MNIST/{TIMESTAMP}/noisy_batch_pred_test.csv"
if not os.path.exists(noisy_pred_cache_path):
    # Keep only the result shards whose file name starts with "prediction".
    prediction_results = [
        f"gs://{blob.bucket.name}/{blob.name}"
        for blob in batch_predict_job.iter_outputs()
        if blob.name.split("/")[-1].startswith("prediction")
    ]

    # Collect plain records and build the frame once; DataFrame.append in a
    # loop is quadratic and no longer exists in pandas 2.x.
    pred_records = []
    skipped_lines = 0
    for gfile_name in prediction_results:
        with tf.io.gfile.GFile(name=gfile_name, mode="r") as gfile:
            for line in gfile.readlines():
                if not line.strip():
                    continue
                line = json.loads(line)
                prediction = line.get("prediction") or {}
                display_names = prediction.get("displayNames") or []
                confidences = prediction.get("confidences") or []
                if not display_names:
                    # No label returned for this instance; it cannot be scored.
                    skipped_lines += 1
                    continue
                # Pick the label with the highest confidence, as the online
                # predictor does; fall back to the first label when the
                # confidences are missing or do not line up with the names.
                best = 0
                if len(confidences) == len(display_names):
                    best = max(range(len(confidences)), key=confidences.__getitem__)
                pred_records.append({
                    "path": line.get("instance").get("content"),
                    "predicted_value": display_names[best],
                })
    if skipped_lines:
        print(f"Skipped {skipped_lines} output line(s) without a predicted label")

    # Explicit columns keep the frame well-formed even when nothing was read.
    pred_df = pd.DataFrame(pred_records, columns=["path", "predicted_value"])
    pred_df = pred_df.sort_values("path")
    pred_df.to_csv(noisy_pred_cache_path, index=False)
else: 
    pred_df = pd.read_csv(noisy_pred_cache_path)

Check accuracy of the model

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Pair every label with its prediction through the image path. Positional
# pairing is wrong here: pred_df is sorted by path as text (image_0, image_1,
# image_10, ...) while batch_df is in numeric image order. Images without a
# prediction are dropped from the comparison and reported.
eval_df = batch_df.merge(pred_df, on="path", how="inner")
if len(eval_df) != len(batch_df):
    print(f"Warning: {len(batch_df) - len(eval_df)} image(s) have no matching prediction")

actual_y_test = eval_df["label"].astype(int).to_list()
predicted_y_test = eval_df["predicted_value"].astype(int).to_list()
print(confusion_matrix(actual_y_test,predicted_y_test))
print(f"Accuracy of the CNN: {accuracy_score(actual_y_test,predicted_y_test)}")

[[1 0 0 0 0 0 0 0]
 [0 2 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0]
 [0 0 0 1 0 0 1 0]
 [0 0 0 0 0 0 1 0]
 [0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 2]]
Accuracy of the CNN: 0.8
