In [4]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
import os 
import numpy as np
import glob 
from pathlib import Path
from PIL import Image
import re 
from tensorflow.keras.datasets import mnist

In [5]:
# Synchronous data-parallel strategy; with no arguments it picks up the GPUs
# visible to this process (four in the captured run below).
strategy = tf.distribute.MirroredStrategy()
num_devices = strategy.num_replicas_in_sync
print('Number of devices: {}'.format(num_devices))

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3')
Number of devices: 4


In [3]:
def _to_unit_float_images(pixels):
    """Reshape raw MNIST pixels to (N, 28, 28, 1) float32 and scale them by 1/255."""
    return pixels.reshape(-1, 28, 28, 1).astype("float32") / 255.0


# Load the raw MNIST arrays, then give both splits the same preprocessing.
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = _to_unit_float_images(x_train)
x_test = _to_unit_float_images(x_test)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [4]:
# Training hyper-parameters.
epochs = 30
batch_size_per_replica = 1024
# One global batch is divided among the replicas on every step.
global_batch_size = batch_size_per_replica * strategy.num_replicas_in_sync

# Input pipeline: shuffle over the whole training set, then cut global batches.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(len(x_train))
train_dataset = train_dataset.batch(global_batch_size)

# Wrap the dataset so iterating it yields per-replica values for strategy.run.
train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset)

In [5]:
def create_model():
    """Build the uncompiled CNN classifier for 28x28 single-channel images.

    The network is three 3x3 ReLU convolutions (32, 64 and 64 filters) with
    2x2 max-pooling after the first two, followed by a 64-unit ReLU dense
    layer and a 10-way softmax output.

    Returns:
        A Keras ``Sequential`` model; the caller is responsible for compiling it.
    """
    feature_extractor = [
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
    ]
    classifier_head = [
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(10, activation='softmax'),
    ]
    return models.Sequential(feature_extractor + classifier_head)

In [1]:
# Smoke test for the YOLO model: build it from its YAML config and push one
# random image through it. The next cell inspects `pred`.
#
# Import only the name this cell needs. A wildcard import here could silently
# shadow names already imported by the notebook (e.g. `layers`, `models`, `np`).
from model import Yolo

# NOTE(review): absolute, environment-specific path — consider making it configurable.
yaml_path = "/app/yolo/configs/yolo-l-mish.yaml"
yolo = Yolo(yaml_path)
model = yolo(640)  # presumably 640 is the square input resolution — confirm against Yolo.__call__
# `np` is provided by the notebook's import cell.
img = np.random.rand(1,640,640,3)
pred = model(img)

In [3]:
# Show how many outputs the model returned and the shape of the first one.
print(len(pred), pred[0].shape, sep="\n")

3
(1, 80, 80, 3, 25)


In [6]:
# Everything that owns variables (model weights, optimizer state) must be built
# inside the strategy scope so those variables are created as mirrored
# variables on every replica.
with strategy.scope():
    model = create_model()
    # compile() only configures the loss/metric used by model.evaluate(); the
    # custom training loop below does not use the optimizer passed here.
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Optimizer that actually drives the custom training loop.
    optimizer = tf.keras.optimizers.Adam()

    # Track both objects so a checkpoint captures weights and optimizer state.
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)

# Checkpoints are written as <checkpoint_dir>/ckpt-<n>.
checkpoint_dir = './checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

@tf.function
def distributed_train_step(inputs):
    """Run ``train_step`` on every replica and combine the returned losses.

    Args:
        inputs: One element drawn from the distributed training dataset.

    Returns:
        The SUM-reduction across replicas of the values returned by
        ``train_step``.
    """
    losses_by_replica = strategy.run(train_step, args=(inputs,))
    combined_loss = strategy.reduce(
        tf.distribute.ReduceOp.SUM,
        losses_by_replica,
        axis=None,
    )
    return combined_loss

def train_step(inputs):
    """Run one forward/backward pass on a single replica's share of a batch.

    Args:
        inputs: An ``(images, labels)`` pair holding this replica's slice of
            the global batch.

    Returns:
        This replica's contribution to the global-batch mean loss: the sum of
        its per-example losses divided by ``global_batch_size``.
    """
    images, labels = inputs
    with tf.GradientTape() as tape:
        # training=True keeps train-only layer behaviour correct if layers such
        # as Dropout or BatchNormalization are ever added to the model.
        predictions = model(images, training=True)
        per_example_loss = tf.keras.losses.sparse_categorical_crossentropy(labels, predictions)
        # Gradients and the returned losses are SUMMED across replicas, so each
        # replica must divide by the GLOBAL batch size. Taking a per-replica
        # mean would inflate both by a factor of num_replicas_in_sync.
        # Note: a final, smaller batch is still divided by global_batch_size
        # and is therefore slightly under-weighted.
        loss = tf.nn.compute_average_loss(per_example_loss, global_batch_size=global_batch_size)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss


# Custom training loop: one pass over the distributed dataset per epoch, with a
# step-wise learning-rate schedule and a checkpoint + evaluation every 2nd epoch.
for epoch in range(epochs):
    total_loss = 0.0
    num_batches = 0
    for dist_batch in train_dist_dataset:
        total_loss += distributed_train_step(dist_batch)
        num_batches += 1
    # Average of the per-step losses over the epoch.
    train_loss = total_loss / num_batches

    # Step-wise learning-rate schedule. `learning_rate` is the canonical
    # attribute name; the `lr` alias is not dependable across Keras versions.
    # NOTE(review): 1e-3 is Adam's default rate, so with a default-constructed
    # optimizer the epoch-3 branch changes nothing — confirm the intended schedule.
    if epoch == 3:
        optimizer.learning_rate = 1e-3
    elif epoch == 10:
        optimizer.learning_rate = 1e-4

    # Every second epoch: save a checkpoint, then evaluate on the test set.
    if epoch % 2 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)
        test_loss, test_acc = model.evaluate(x_test, y_test, verbose=2)
        print('\nTest accuracy:', test_acc)

    # Convert the scalar tensor so the log shows a plain number rather than a
    # `tf.Tensor(...)` repr.
    print('Epoch {}, Loss: {}'.format(epoch, float(train_loss)))
    
# Evaluate the model
# test_loss, test_acc = model.evaluate(x_test, y_test, verbose=2)
# print('\nTest accuracy:', test_acc)

Instructions for updating:
Use `tf.data.Iterator.get_next_as_optional()` instead.
INFO:tensorflow:batch_all_reduce: 10 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:batch_all_reduce: 10 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:batch_all_reduce: 10 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:loca

In [19]:
# Shell escape: list the files in the ./checkpoints directory, which is where
# the training loop's checkpoint prefix points.
!ls checkpoints

checkpoint		     ckpt-2.index
ckpt-1.data-00000-of-00001   ckpt-3.data-00000-of-00001
ckpt-1.index		     ckpt-3.index
ckpt-10.data-00000-of-00001  ckpt-4.data-00000-of-00001
ckpt-10.index		     ckpt-4.index
ckpt-11.data-00000-of-00001  ckpt-5.data-00000-of-00001
ckpt-11.index		     ckpt-5.index
ckpt-12.data-00000-of-00001  ckpt-6.data-00000-of-00001
ckpt-12.index		     ckpt-6.index
ckpt-13.data-00000-of-00001  ckpt-7.data-00000-of-00001
ckpt-13.index		     ckpt-7.index
ckpt-14.data-00000-of-00001  ckpt-8.data-00000-of-00001
ckpt-14.index		     ckpt-8.index
ckpt-15.data-00000-of-00001  ckpt-9.data-00000-of-00001
ckpt-15.index		     ckpt-9.index
ckpt-2.data-00000-of-00001


In [16]:
# Scratch expression: remainder of 10 divided by 2.
# NOTE(review): presumably a sanity check for the `epoch % 2 == 0` condition in
# the training loop — consider deleting this cell from the final notebook.
10 % 2 

0