<a href="https://colab.research.google.com/github/vijaygwu/IntroToDeepLearning/blob/main/RayParallelizedCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install ray

Collecting ray
  Downloading ray-2.35.0-cp310-cp310-manylinux2014_x86_64.whl.metadata (16 kB)
Downloading ray-2.35.0-cp310-cp310-manylinux2014_x86_64.whl (65.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.0/65.0 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ray
Successfully installed ray-2.35.0


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import time
import ray

# Initialize Ray
ray.init(ignore_reinit_error=True)

###############################################
# MNIST Dataset and Transformations
###############################################

# Define the data transformations: Convert to tensor and normalize
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Load the MNIST dataset using torchvision
train_dataset = datasets.MNIST(root='./mnist_data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./mnist_data', train=False, download=True, transform=transform)

# Define the DataLoader for batching
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

###############################################
# Define Neural Network Model
###############################################

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(28*28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        x = x.view(-1, 28*28)  # Flatten the input image
        x = F.relu(self.fc1(x))  # First hidden layer with ReLU activation
        x = F.relu(self.fc2(x))  # Second hidden layer with ReLU activation
        x = self.fc3(x)  # Output layer
        return x

###############################################
# Ray-based Parallel Data Loading
###############################################

# Remote function to load data in parallel using Ray
@ray.remote
def load_batch(batch):
    return batch

# Parallelized data loading function
def load_data_in_parallel(data_loader):
    ray_batches = [load_batch.remote(batch) for batch in data_loader]  # Load all batches in parallel
    return ray.get(ray_batches)  # Retrieve the loaded batches

###############################################
# Training Function with Ray-Parallelized Data Loading
###############################################

def train_model(model, optimizer, criterion, train_loader, epochs):
    for epoch in range(epochs):
        model.train()  # Set the model to training mode
        running_loss = 0.0

        # Load data in parallel using Ray
        batches = load_data_in_parallel(train_loader)

        for data, target in batches:  # Iterate over parallel-loaded batches
            optimizer.zero_grad()  # Clear previous gradients
            output = model(data)  # Forward pass through the network
            loss = criterion(output, target)  # Compute loss
            loss.backward()  # Backward pass to compute gradients
            optimizer.step()  # Update the weights
            running_loss += loss.item()  # Track running loss for the epoch

        # Print the loss after each epoch
        print(f'Epoch {epoch+1}, Training Loss: {running_loss / len(train_loader):.4f}')

###############################################
# Testing Function to Evaluate Model Performance
###############################################

def test_model(model, test_loader):
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0

    # No gradient computation during evaluation
    with torch.no_grad():
        for data, target in test_loader:
            outputs = model(data)
            _, predicted = torch.max(outputs.data, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

    # Calculate and print accuracy
    accuracy = 100 * correct / total
    print(f'Test Accuracy: {accuracy:.2f}%')

###############################################
# Main Function to Train and Test the Model
###############################################

if __name__ == "__main__":
    # Initialize the neural network, loss function, and optimizer
    model = Net()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    # Record start time for training
    start_time = time.time()

    # Train the model with Ray-parallelized data loading
    train_model(model, optimizer, criterion, train_loader, epochs=5)

    # Record end time and print training time
    end_time = time.time()
    print(f"Training Time with Ray: {end_time - start_time:.2f} seconds")

    # Test the model on the test set
    test_model(model, test_loader)

    # Shutdown Ray
    ray.shutdown()


2024-09-04 18:35:44,192	INFO worker.py:1783 -- Started a local Ray instance.


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./mnist_data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:01<00:00, 5085035.91it/s]


Extracting ./mnist_data/MNIST/raw/train-images-idx3-ubyte.gz to ./mnist_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./mnist_data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 132891.55it/s]


Extracting ./mnist_data/MNIST/raw/train-labels-idx1-ubyte.gz to ./mnist_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./mnist_data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:05<00:00, 293793.88it/s]


Extracting ./mnist_data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./mnist_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./mnist_data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 5627925.78it/s]


Extracting ./mnist_data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./mnist_data/MNIST/raw



[36m(load_batch pid=2337)[0m   return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))


Epoch 1, Training Loss: 0.8414
Epoch 2, Training Loss: 0.3130
Epoch 3, Training Loss: 0.2546
Epoch 4, Training Loss: 0.2172
Epoch 5, Training Loss: 0.1891
Training Time with Ray: 78.47 seconds
Test Accuracy: 94.83%


[36m(load_batch pid=2338)[0m   return torch.load(io.BytesIO(b))[32m [repeated 11x across cluster][0m
