In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import time
import os

# Step 1: Load the Dataset with Optimized DataLoader
def load_data(batch_size=64, num_workers=4):
    """Build a shuffled DataLoader over the CIFAR-10 training split.

    The dataset is downloaded to ``./data`` if it is not already present.
    Images are converted to tensors and normalised with mean 0.5 and
    std 0.5 on every channel.

    Args:
        batch_size: Number of samples per batch.
        num_workers: Number of worker processes used for loading.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding ``(images, labels)``.
    """
    transform = transforms.Compose([
        transforms.ToTensor(),
        # One mean/std entry per channel (same constants as before, spelled
        # out explicitly rather than relying on broadcasting).
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])
    train_dataset = torchvision.datasets.CIFAR10(
        root='./data', train=True, download=True, transform=transform
    )
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        # Pinned host memory only helps host-to-GPU copies; requesting it
        # without CUDA just produces a warning.
        pin_memory=torch.cuda.is_available(),
    )
    return train_loader

# Step 2: Define a More Complex Neural Network Model
class SimpleCNN(nn.Module):
    """Two conv/pool stages followed by a two-layer classifier head.

    Both convolutions use 3x3 kernels with padding 1 (spatial size kept) and
    each is followed by a 2x2 max-pool, so the 128 * 8 * 8 input size of
    ``fc1`` corresponds to 3-channel 32x32 images. The network outputs 10
    raw scores (logits) per sample.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(128 * 8 * 8, 256)
        self.fc2 = nn.Linear(256, 10)

    def forward(self, x):
        """Return class logits for ``x``.

        Args:
            x: Image tensor whose last three dimensions are
                (3, 32, 32), with or without a leading batch dimension.

        Returns:
            Tensor of shape ``(batch, 10)``.

        Raises:
            ValueError: If the pooled feature map does not match the input
                size expected by ``fc1``.
        """
        x = torch.max_pool2d(torch.relu(self.conv1(x)), 2)
        x = torch.max_pool2d(torch.relu(self.conv2(x)), 2)

        # Without this check, view(-1, ...) would silently fold any extra
        # spatial elements into the batch dimension for wrongly sized inputs.
        channels, height, width = x.shape[-3:]
        per_sample = channels * height * width
        if per_sample != self.fc1.in_features:
            raise ValueError(
                f"expected {self.fc1.in_features} features per sample after "
                f"pooling, got {per_sample}; input images must be 32x32"
            )

        x = x.view(-1, self.fc1.in_features)
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

# Step 3: Training Function with GPU Synchronization
def train_model(device, model, train_loader, criterion, optimizer, epochs=1):
    """Train ``model`` on ``device`` and return the wall-clock time taken.

    Args:
        device: Target device, as a string (``"cpu"``, ``"cuda"``,
            ``"cuda:0"``) or a ``torch.device``.
        model: Module to train; moved to ``device`` and put in train mode.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer over ``model``'s parameters.
        epochs: Number of passes over ``train_loader``.

    Returns:
        Elapsed training time in seconds (float).
    """
    # Normalise so that "cuda", "cuda:0" and torch.device objects are all
    # recognised as CUDA; a plain string comparison would miss the latter two.
    dev = torch.device(device)
    use_cuda = dev.type == "cuda"

    model.to(dev)
    model.train()

    # CUDA kernels launch asynchronously: drain pending work before starting
    # the clock so earlier operations are not billed to this run.
    if use_cuda:
        torch.cuda.synchronize(dev)
    start_time = time.perf_counter()

    for _ in range(epochs):
        for inputs, labels in train_loader:
            inputs = inputs.to(dev, non_blocking=True)
            labels = labels.to(dev, non_blocking=True)
            optimizer.zero_grad()
            # Mixed precision is enabled only on CUDA.
            # NOTE(review): no gradient scaler is used here; if reduced
            # precision gradients underflow, consider torch.amp.GradScaler.
            with torch.amp.autocast(device_type='cuda', enabled=use_cuda):
                outputs = model(inputs)
                loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    # Wait for all queued GPU work to finish before stopping the clock.
    if use_cuda:
        torch.cuda.synchronize(dev)
    elapsed_time = time.perf_counter() - start_time
    print(f"Training time on {device}: {elapsed_time:.2f} seconds")
    return elapsed_time


# Step 4: Set up and Run the Training Process on Both CPU and GPU
def main():
    """Time one training run on CPU and, if available, on GPU.

    Two independently initialised ``SimpleCNN`` models are trained with the
    same data loader and hyper-parameters; the elapsed times and the
    CPU/GPU ratio are printed.
    """
    batch_size = 128  # Increased batch size
    epochs = 1  # Adjust for benchmarking; increase for detailed analysis
    # os.cpu_count() can return None when the count is undetermined; fall
    # back to 0 workers (loading in the main process) in that case.
    num_workers = os.cpu_count() or 0
    train_loader = load_data(batch_size, num_workers)

    # Initialize the model, loss function, and optimizer
    model_cpu = SimpleCNN()
    model_gpu = SimpleCNN()
    criterion = nn.CrossEntropyLoss()
    optimizer_cpu = optim.SGD(model_cpu.parameters(), lr=0.001, momentum=0.9)
    optimizer_gpu = optim.SGD(model_gpu.parameters(), lr=0.001, momentum=0.9)

    # Train on CPU
    cpu_time = train_model("cpu", model_cpu, train_loader, criterion, optimizer_cpu, epochs)

    # Train on GPU
    if torch.cuda.is_available():
        gpu_time = train_model("cuda", model_gpu, train_loader, criterion, optimizer_gpu, epochs)

        # Report the difference in time
        print(f"GPU training was {cpu_time / gpu_time:.2f}x faster than CPU training.")

# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


Training time on cpu: 50.42 seconds
Training time on cuda: 1.72 seconds
GPU training was 29.35x faster than CPU training.
