In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import os
from tempfile import gettempdir

import tensorflow as tf

from tensorflow.keras.layers import Dense, Flatten, Conv2D
from tensorflow.keras import Model

from clearml import Task


# Register this run with ClearML by attaching a Task to the current process;
# from this point on everything is logged automatically.
task = Task.init(
    project_name='examples',
    task_name='TensorFlow v2 MNIST with summaries',
)


# Fetch the MNIST train/test splits through the Keras datasets helper.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Divide the pixel values by 255, append a trailing channels axis and
# cast the result to float32.
x_train = (x_train / 255.0)[..., tf.newaxis].astype('float32')
x_test = (x_test / 255.0)[..., tf.newaxis].astype('float32')

# Wrap the arrays in tf.data pipelines: the training data is shuffled
# (buffer size 10000), and both splits are served in batches of 32.
train_ds = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(10000)
    .batch(32)
)
test_ds = (
    tf.data.Dataset.from_tensor_slices((x_test, y_test))
    .batch(32)
)


# Build the tf.keras model using the Keras model subclassing API
class MyModel(Model):
    """Small convolutional classifier: Conv2D -> Flatten -> Dense -> Dense.

    The last Dense layer has 10 units with a softmax activation, so a forward
    pass yields a 10-way probability vector for each input example.
    """

    def __init__(self):
        """Create the layers; they are applied in this same order by ``call``."""
        super(MyModel, self).__init__()
        self.conv1 = Conv2D(filters=32, kernel_size=3, activation='relu', dtype=tf.float32)
        self.flatten = Flatten()
        self.d1 = Dense(units=128, activation='relu', dtype=tf.float32)
        self.d2 = Dense(units=10, activation='softmax', dtype=tf.float32)

    def call(self, x):
        """Run the forward pass on ``x`` and return the output of the last layer."""
        feature_maps = self.conv1(x)
        flat = self.flatten(feature_maps)
        hidden = self.d1(flat)
        return self.d2(hidden)


# Instantiate the network
model = MyModel()

# Loss function and optimizer used by the training step
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam()

# Running metrics for the loss and the accuracy of the model. Each metric
# accumulates every value it is given until it is reset; the training loop
# prints the accumulated result at the end of each epoch.
train_loss = tf.keras.metrics.Mean(
    name='train_loss',
    dtype=tf.float32,
)
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy',
)

test_loss = tf.keras.metrics.Mean(
    name='test_loss',
    dtype=tf.float32,
)
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='test_accuracy',
)


# Use tf.GradientTape to train the model
@tf.function
def train_step(images, labels):
    """Run one optimization step on a batch and update the training metrics.

    The forward pass and the loss are recorded on a ``tf.GradientTape``; the
    gradients of the loss with respect to the model's trainable variables are
    then applied through the module-level ``optimizer``. ``train_loss`` and
    ``train_accuracy`` are updated as a side effect. Returns nothing.
    """
    with tf.GradientTape() as grad_tape:
        outputs = model(images)
        batch_loss = loss_object(labels, outputs)

    trainable = model.trainable_variables
    grads = grad_tape.gradient(batch_loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

    # Fold this batch into the running training metrics
    train_loss(batch_loss)
    train_accuracy(labels, outputs)


# Test the model
@tf.function
def test_step(images, labels):
    """Evaluate the model on one batch and update the test metrics.

    No gradients are computed here; the batch loss and the predictions are
    only fed into ``test_loss`` and ``test_accuracy``. Returns nothing.
    """
    outputs = model(images)
    batch_loss = loss_object(labels, outputs)

    # Fold this batch into the running test metrics
    test_loss(batch_loss)
    test_accuracy(labels, outputs)


# Summary writers: train and test events are written to separate
# log directories under the system temp directory
train_log_dir = os.path.join(
    gettempdir(), 'logs', 'gradient_tape', 'train'
)
test_log_dir = os.path.join(
    gettempdir(), 'logs', 'gradient_tape', 'test'
)
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
test_summary_writer = tf.summary.create_file_writer(test_log_dir)

# Checkpointing: track a step counter, the optimizer and the model, keeping
# at most the 3 most recent checkpoints under <tempdir>/tf_ckpts
ckpt = tf.train.Checkpoint(
    step=tf.Variable(1),
    optimizer=optimizer,
    net=model,
)
manager = tf.train.CheckpointManager(
    ckpt,
    os.path.join(gettempdir(), 'tf_ckpts'),
    max_to_keep=3,
)

# Restore from the latest checkpoint the manager reports, if there is one
ckpt.restore(manager.latest_checkpoint)
if not manager.latest_checkpoint:
    print("Initializing from scratch.")
else:
    print("Restored from {}".format(manager.latest_checkpoint))

# Start training
EPOCHS = 5
# Save a checkpoint whenever the checkpoint step counter is a multiple of this
# value (1 == save after every epoch).
CHECKPOINT_EVERY_N_STEPS = 1

for epoch in range(EPOCHS):
    # Train on every batch; train_step accumulates into train_loss/train_accuracy
    for images, labels in train_ds:
        train_step(images, labels)

    # Write the training summaries once per epoch. They are keyed by
    # step=epoch, so writing them after every batch would emit one event per
    # batch, all at the same step value.
    with train_summary_writer.as_default():
        tf.summary.scalar('loss', train_loss.result(), step=epoch)
        tf.summary.scalar('accuracy', train_accuracy.result(), step=epoch)

    # Advance the checkpoint step counter, then save if a checkpoint is due
    ckpt.step.assign_add(1)
    if int(ckpt.step) % CHECKPOINT_EVERY_N_STEPS == 0:
        save_path = manager.save()
        print("Saved checkpoint for step {}: {}".format(int(ckpt.step), save_path))

    # Evaluate on every test batch; test_step accumulates into test_loss/test_accuracy
    for test_images, test_labels in test_ds:
        test_step(test_images, test_labels)

    # Write the test summaries once per epoch, for the same reason as above
    with test_summary_writer.as_default():
        tf.summary.scalar('loss', test_loss.result(), step=epoch)
        tf.summary.scalar('accuracy', test_accuracy.result(), step=epoch)

    template = 'Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, Test Accuracy: {}'
    print(template.format(epoch+1,
                          train_loss.result(),
                          train_accuracy.result()*100,
                          test_loss.result(),
                          test_accuracy.result()*100))

    # Reset the metrics for the next epoch. Prefer `reset_state()` when the
    # metric provides it and fall back to the older `reset_states()` name
    # otherwise, so the loop runs on both older and newer Keras versions.
    for metric in (train_loss, train_accuracy, test_loss, test_accuracy):
        reset_metric = getattr(metric, 'reset_state', None) or metric.reset_states
        reset_metric()

2023-02-12 23:17:53.621506: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-12 23:17:53.690562: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


ClearML Task: created new task id=4de5086ff80a416cbd25da115868c71c
ClearML results page: http://192.168.0.152:8080/projects/f6b877d87fe442e3a23fe9acec993e99/experiments/4de5086ff80a416cbd25da115868c71c/output/log
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
2023-02-12 23:17:55,231 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: http://192.168.0.152:8080/projects/f6b877d87fe442e3a23fe9acec993e99/experiments/4de5086ff80a416cbd25da115868c71c/output/log
ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring


2023-02-12 23:17:56.540734: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-02-12 23:17:56.540758: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: Ubuntu-CNL-1
2023-02-12 23:17:56.540762: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: Ubuntu-CNL-1
2023-02-12 23:17:56.540804: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: NOT_FOUND: was unable to find libcuda.so DSO loaded into this program
2023-02-12 23:17:56.540822: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 525.78.1
2023-02-12 23:17:56.541075: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions 

Initializing from scratch.
Saved checkpoint for step 2: /tmp/tf_ckpts/ckpt-1
Epoch 1, Loss: 0.13519562780857086, Accuracy: 95.93000030517578, Test Loss: 0.05724003165960312, Test Accuracy: 98.0199966430664
Saved checkpoint for step 3: /tmp/tf_ckpts/ckpt-2
Epoch 2, Loss: 0.042800817638635635, Accuracy: 98.66999816894531, Test Loss: 0.054517652839422226, Test Accuracy: 98.25999450683594
Saved checkpoint for step 4: /tmp/tf_ckpts/ckpt-3
Epoch 3, Loss: 0.02220507524907589, Accuracy: 99.2933349609375, Test Loss: 0.04859881103038788, Test Accuracy: 98.48999786376953
Saved checkpoint for step 5: /tmp/tf_ckpts/ckpt-4
Epoch 4, Loss: 0.013418901711702347, Accuracy: 99.56832885742188, Test Loss: 0.05808817967772484, Test Accuracy: 98.3499984741211
Saved checkpoint for step 6: /tmp/tf_ckpts/ckpt-5
Epoch 5, Loss: 0.009454944171011448, Accuracy: 99.69499969482422, Test Loss: 0.060444824397563934, Test Accuracy: 98.5999984741211


In [2]:
# Explicitly close the ClearML task that was created with Task.init in the first cell
task.close()