In [1]:
import torch

def get_device():
    """Pick the best available PyTorch device: CUDA first, then MPS, then CPU.

    A one-line message naming the chosen backend is printed as a side effect.

    Returns:
        torch.device: A ``cuda``, ``mps`` or ``cpu`` device.
    """
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using CUDA device:", torch.cuda.get_device_name(0))
        return device

    # Older PyTorch builds do not ship `torch.backends.mps` at all. Look the
    # module up defensively so those builds fall through to the CPU branch
    # instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        device = torch.device("mps")
        print("Using MPS (Apple Silicon) device")
        return device

    device = torch.device("cpu")
    print("Using CPU")
    return device

# Resolve the compute device once; the following cells reuse this module-level `device`
device = get_device()

Using MPS (Apple Silicon) device


In [2]:
# Bare expression: the notebook renders the selected device as this cell's output
device

device(type='mps')

In [3]:
# Minimal smoke test: place a tensor on the detected device and run one op on it
print("Hello World from PyTorch!")

# Build the tensor on the host, then transfer it to the selected device
x = torch.tensor([1.0, 2.0, 3.0]).to(device)
print(f"Tensor on {device}: {x}")

# One element-wise multiply exercises the device end to end
y = torch.mul(x, 2)
print(f"Result of operation: {y}")


Hello World from PyTorch!
Tensor on mps: tensor([1., 2., 3.], device='mps:0')
Result of operation: tensor([2., 4., 6.], device='mps:0')


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import time
import platform

# --- Basic Setup ---
# Record the software environment so the benchmark numbers can be interpreted later
print("--- System Information ---")
print(f"PyTorch Version: {torch.__version__}")
print(f"Python Version: {platform.python_version()}")
print(f"macOS Version: {platform.mac_ver()[0]}")

# --- Device Setup (MPS) ---
# This benchmark is MPS-only. Raise instead of calling exit(): inside a Jupyter
# kernel exit() shuts the kernel down (discarding all notebook state), and it is
# an interactive-session helper that should not be relied on in program code.
# Raising stops this cell (and a "Run All") while keeping the kernel alive.
if not torch.backends.mps.is_available():
    raise RuntimeError("MPS backend not available.")

device = torch.device("mps")
print(f"Using device: {device}\n")

# --- Benchmark Parameters ---
# Tunable knobs for the synthetic training benchmark in this cell.
batch_size = 256        # Samples per batch; larger batches give the device more parallel work
num_batches = 100       # Number of training iterations
image_size = 128        # Side length of the square synthetic images (e.g., 128x128); SimpleCNN also reads this to size its first Linear layer
num_classes = 100       # Number of output classes for the model

# --- Define a Simple CNN Model ---
# A slightly more complex model to increase computational load
class SimpleCNN(nn.Module):
    """Three conv/ReLU/max-pool stages followed by a two-layer classifier head.

    Every pooling stage halves the spatial resolution, so a square input of
    side ``S`` reaches the classifier as a ``128 x (S // 8) x (S // 8)``
    feature map (for S=128: 128 -> 64 -> 32 -> 16).

    Args:
        num_classes: Number of output logits.
        input_size: Side length of the square, 3-channel input images. When
            omitted, the module-level ``image_size`` is used, which is what
            this class always did before the parameter existed.

    Raises:
        ValueError: If the resolved input size is below 8, because the three
            2x poolings would leave no spatial extent for the classifier.
    """

    def __init__(self, num_classes, input_size=None):
        super().__init__()

        # Backward compatibility: fall back to the notebook-level setting so
        # existing `SimpleCNN(num_classes)` calls keep working unchanged.
        if input_size is None:
            input_size = image_size
        if input_size < 8:
            raise ValueError(
                f"input_size must be at least 8 to survive three 2x poolings, got {input_size}"
            )

        self.conv_layers = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1), # 3 input channels (RGB)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        # The padded 3x3 convolutions keep the spatial size; only the three
        # poolings shrink it, each by a floor-divide by 2 (overall: // 8).
        pooled_side = input_size // 8
        flattened_size = 128 * pooled_side * pooled_side
        self.fc_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flattened_size, 512),
            nn.ReLU(),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        """Run the convolutional stack, then the classifier, and return the logits."""
        x = self.conv_layers(x)
        x = self.fc_layers(x)
        return x

print("--- Creating Model and Optimizer ---")
model = SimpleCNN(num_classes).to(device)
# Use AdamW which can sometimes be slightly more demanding than standard Adam
optimizer = optim.AdamW(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss() # Standard loss function
print("Model and optimizer created and moved to MPS.\n")

# --- Training Loop Benchmark ---
print("--- Starting Training Benchmark ---")
print(f"Batch Size: {batch_size}")
print(f"Image Size: {image_size}x{image_size}")
print(f"Number of Batches: {num_batches}")

model.train() # Set model to training mode
total_loss = 0.0

# Drain any work still queued on the device (e.g. the model transfer above) so
# it is not billed to the benchmark, then start a monotonic high-resolution
# timer. NOTE: start_time/end_time are perf_counter readings, not epoch
# timestamps; only their difference is meaningful.
torch.mps.synchronize()
start_time = time.perf_counter()

for i in range(num_batches):
    # 1. Generate synthetic data and labels directly on the MPS device
    inputs = torch.randn(batch_size, 3, image_size, image_size, device=device)
    labels = torch.randint(0, num_classes, (batch_size,), device=device)

    # 2. Forward pass
    outputs = model(inputs)

    # 3. Calculate loss
    loss = criterion(outputs, labels)

    # 4. Backward pass (gradient computation - often the most intensive part)
    optimizer.zero_grad() # Reset gradients
    loss.backward()

    # 5. Optimizer step (update model weights)
    optimizer.step()

    # .item() copies the scalar back to the host, which waits for the device;
    # read it once per iteration and reuse the value for both the running
    # total and the progress line.
    loss_value = loss.item()
    total_loss += loss_value
    print(f"Batch {i+1}/{num_batches} completed. Loss: {loss_value:.4f}", end='\r')

# --- Synchronization and Timing ---
# Crucial: Ensure all GPU operations are finished before stopping the timer
torch.mps.synchronize()
end_time = time.perf_counter()

print("\n\n--- Benchmark Finished ---")
print(f"Total time for {num_batches} batches: {end_time - start_time:.4f} seconds")
print(f"Average Loss: {total_loss / num_batches:.4f}")

--- System Information ---
PyTorch Version: 2.5.1
Python Version: 3.12.9
macOS Version: 15.4
Using device: mps

--- Creating Model and Optimizer ---
Model and optimizer created and moved to MPS.

--- Starting Training Benchmark ---
Batch Size: 256
Image Size: 128x128
Number of Batches: 100
Batch 100/100 completed. Loss: 4.6074

--- Benchmark Finished ---
Total time for 100 batches: 14.1793 seconds
Average Loss: 4.6413
