## 1. Without MPI

#### Importing libraries

In [3]:
import time
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd

#### Loading Dataset

In [6]:
fashion_mnist_dataset = keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist_dataset.load_data()
x_train, y_train = train_images, train_labels
x_test, y_test = test_images, test_labels

#### Normalize Pixel Values

In [9]:
x_train = x_train.astype("float32") / 255.0
x_test = x_test.astype("float32") / 255.0

#### Convert Labels to Categorical Format

In [12]:
y_train = keras.utils.to_categorical(y_train, 10)
y_test = keras.utils.to_categorical(y_test, 10)

#### Displaying the shape of Training and Testing dataset

In [15]:
# Reshape the 3D arrays to 2D arrays for Pandas DataFrame
x_train_2d = x_train.reshape(x_train.shape[0], -1)
x_test_2d = x_test.reshape(x_test.shape[0], -1)

# Convert NumPy arrays to Pandas DataFrames
df_train = pd.DataFrame(x_train_2d)
df_test = pd.DataFrame(x_test_2d)

print("Training dataset{}".format(df_train.shape))
print("Testing dataset{}".format(df_test.shape))

Training dataset(60000, 784)
Testing dataset(10000, 784)


#### Define Model Architecture

In [18]:
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(28, 28)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(10, activation='softmax')
])

  super().__init__(**kwargs)


#### Compile and Train the model

In [21]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
start_time = time.time()
history = model.fit(x_train, y_train, epochs=10, validation_data=(x_test, y_test))
end_time = time.time()

Epoch 1/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 762us/step - accuracy: 0.7678 - loss: 0.6681 - val_accuracy: 0.8502 - val_loss: 0.4240
Epoch 2/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 741us/step - accuracy: 0.8508 - loss: 0.4092 - val_accuracy: 0.8534 - val_loss: 0.3970
Epoch 3/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 736us/step - accuracy: 0.8641 - loss: 0.3692 - val_accuracy: 0.8670 - val_loss: 0.3699
Epoch 4/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 713us/step - accuracy: 0.8704 - loss: 0.3556 - val_accuracy: 0.8731 - val_loss: 0.3561
Epoch 5/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 721us/step - accuracy: 0.8833 - loss: 0.3230 - val_accuracy: 0.8691 - val_loss: 0.3615
Epoch 6/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 721us/step - accuracy: 0.8824 - loss: 0.3198 - val_accuracy: 0.8691 - val_loss: 0.3565
Epoc

In [22]:
train_time = end_time - start_time
print("Time taken without MPI: {:.2f} seconds".format(train_time))


Time taken without MPI: 14.09 seconds


## 2.With MPI

In [26]:
from mpi4py import MPI
from tensorflow.keras.models import Sequential 

[Santhoshis-MacBook-Air.local:03433] shmem: mmap: an error occurred while determining whether or not /var/folders/5h/tk2b9hsd5y5fnsdh5fdqzfym0000gn/T//ompi.Santhoshis-MacBook-Air.501/jf.0/3840999424/sm_segment.Santhoshis-MacBook-Air.501.e4f10000.0 could be created.


In [28]:
# Initialize MPI
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

# Split the data across the nodes
x_rank = np.array_split(x_train, size)[rank]
y_rank = np.array_split(y_train, size)[rank]

# Train the model with MPI
start_time = MPI.Wtime()
history = model.fit(x_rank, y_rank, epochs=10, validation_data=(x_test, y_test))
end_time = MPI.Wtime()

# Compute the training time
train_time = end_time - start_time

# Compute the average training time across all nodes
train_time_avg = comm.reduce(train_time, op=MPI.SUM) / size

# Print the training time
if rank == 0:
    print("Time taken with MPI: {:.2f} seconds".format(train_time_avg))

Epoch 1/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 732us/step - accuracy: 0.8986 - loss: 0.2727 - val_accuracy: 0.8821 - val_loss: 0.3301
Epoch 2/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 710us/step - accuracy: 0.8989 - loss: 0.2673 - val_accuracy: 0.8877 - val_loss: 0.3268
Epoch 3/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 716us/step - accuracy: 0.9017 - loss: 0.2607 - val_accuracy: 0.8850 - val_loss: 0.3356
Epoch 4/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 703us/step - accuracy: 0.9016 - loss: 0.2603 - val_accuracy: 0.8798 - val_loss: 0.3389
Epoch 5/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 745us/step - accuracy: 0.9039 - loss: 0.2515 - val_accuracy: 0.8778 - val_loss: 0.3470
Epoch 6/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 742us/step - accuracy: 0.9062 - loss: 0.2468 - val_accuracy: 0.8871 - val_loss: 0.3345
Epoc

## 3.Mirrored Strategy

In [31]:
# Enable XLA
tf.config.optimizer.set_jit(True)

# Batch Processing
batch_size = 64
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=10000).batch(batch_size)

test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_dataset = test_dataset.batch(batch_size)

# TensorFlow Auto-Tuning
AUTOTUNE = tf.data.experimental.AUTOTUNE
train_dataset = train_dataset.prefetch(buffer_size=AUTOTUNE)
test_dataset = test_dataset.prefetch(buffer_size=AUTOTUNE)

# Use MirroredStrategy for multi-GPU training
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Define the model architecture
    model = keras.Sequential([
        keras.layers.Flatten(input_shape=(28, 28)),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(10, activation='softmax')
    ])

    # Compile the model
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

# Split the data across the nodes
size = strategy.num_replicas_in_sync
x_rank = np.array_split(x_train, size)
y_rank = np.array_split(y_train, size)

# Train the model
start_time = time.time()
history = model.fit(train_dataset, epochs=10, validation_data=test_dataset)
end_time = time.time()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
Epoch 1/10


2025-07-19 13:00:35.433146: W tensorflow/core/framework/dataset.cc:993] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.


[1m909/938[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 889us/step - accuracy: 0.7483 - loss: 0.7150

2025-07-19 13:00:36.484523: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]


[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7502 - loss: 0.7097 - val_accuracy: 0.8422 - val_loss: 0.4371
Epoch 2/10
[1m111/938[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 920us/step - accuracy: 0.8494 - loss: 0.4152

2025-07-19 13:00:36.725492: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]


[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8508 - loss: 0.4144 - val_accuracy: 0.8562 - val_loss: 0.3970
Epoch 3/10
[1m113/938[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 903us/step - accuracy: 0.8665 - loss: 0.3843

2025-07-19 13:00:37.893218: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]


[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8650 - loss: 0.3745 - val_accuracy: 0.8642 - val_loss: 0.3794
Epoch 4/10
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8712 - loss: 0.3496 - val_accuracy: 0.8649 - val_loss: 0.3730
Epoch 5/10
[1m112/938[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 913us/step - accuracy: 0.8844 - loss: 0.3144

2025-07-19 13:00:40.090784: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]


[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8806 - loss: 0.3268 - val_accuracy: 0.8749 - val_loss: 0.3490
Epoch 6/10
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8825 - loss: 0.3188 - val_accuracy: 0.8706 - val_loss: 0.3567
Epoch 7/10
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8862 - loss: 0.3061 - val_accuracy: 0.8745 - val_loss: 0.3472
Epoch 8/10
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8893 - loss: 0.2973 - val_accuracy: 0.8727 - val_loss: 0.3591
Epoch 9/10
[1m109/938[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 932us/step - accuracy: 0.8960 - loss: 0.2800

2025-07-19 13:00:44.617331: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]


[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8929 - loss: 0.2882 - val_accuracy: 0.8753 - val_loss: 0.3481
Epoch 10/10
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8953 - loss: 0.2812 - val_accuracy: 0.8831 - val_loss: 0.3271


In [33]:
# Compute the training time
train_time = end_time - start_time

print("Time taken with MirroredStrategy: {:.2f} seconds".format(train_time))

Time taken with MirroredStrategy: 11.54 seconds


## 4.Custom Data Parallelism

In [38]:
# Create a MirroredStrategy
strategy = tf.distribute.MirroredStrategy()

# Define the model architecture
def create_model():
    return keras.Sequential([
        keras.layers.Flatten(input_shape=(28, 28)),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(10, activation='softmax')
    ])

with strategy.scope():
    # Create the model
    model = create_model()

    # Compile the model
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Create the optimizer inside the strategy scope
    optimizer = tf.keras.optimizers.Adam()

# Manually distribute and train the model
start_time = time.time()

# Define a custom training loop using tf.function
@tf.function
def distributed_train_step(inputs):
    per_replica_losses = strategy.run(train_step, args=(inputs, optimizer))
    return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

def train_step(inputs, optimizer):
    x, y = inputs
    with tf.GradientTape() as tape:
        predictions = model(x)
        loss = keras.losses.categorical_crossentropy(y, predictions)
        loss = tf.reduce_mean(loss)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss

# Combine x_train and y_train into a Dataset
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(60000).batch(64)

# Iterate over epochs and batches for training
for epoch in range(10):
    for batch in train_dataset:
        distributed_train_step(batch)

end_time = time.time()

# Compute the training time
train_time = end_time - start_time

print("Time taken with Custom Data Parallelism: {:.2f} seconds".format(train_time))


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


2025-07-19 19:16:59.517774: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Time taken with Custom Data Parallelism: 7.72 seconds
