In [12]:
import torch

import sys
sys.path.append("../")
from src.utils import get_device, plot_loss_curves

# Resolve the compute device via the project helper imported from src.utils
# and echo it so the captured output records which backend this run used.
device = get_device()
print(f"Using device: {device}")

Using device: mps


In [13]:
# Record the installed PyTorch version next to the results in this notebook.
print(torch.__version__)

2.0.0


##  Create model and transforms

In [14]:
import torchvision
from torchvision import models

# Build ResNet-50 from the DEFAULT weights entry, then take the preprocessing
# pipeline from that same weights object.
model_weights = models.ResNet50_Weights.DEFAULT
model = models.resnet50(weights=model_weights)
transforms = model_weights.transforms()

In [15]:
# Sum the element count of every tensor returned by model.parameters().
total_params = sum(weight.numel() for weight in model.parameters())
print(f"[INFO] Total number of parameters: {total_params}")

[INFO] Total number of parameters: 25557032


PyTorch 2.0 speedups are most noticeable when as much of the GPU as possible is being used. This means that a larger model may take longer to train overall, but it will train much faster than it would without torch 2.0.

In [16]:
# Swap the classification head for a fresh 10-way linear layer (the notebook
# trains on CIFAR10 further down). The input width is read from the existing
# head instead of being hard-coded, so the cell stays correct if the backbone
# is changed to one with a different feature size; for ResNet-50 it is 2048.
model.fc = torch.nn.Linear(in_features=model.fc.in_features,
                           out_features=10)

We can increase GPU utilization by:
* Using larger models
* Increasing the batch size
* Increasing the data size (e.g. using 224 x 224 images)
* Increasing the embedding size for the data
* Decreasing data transfer: transferring data across devices slows the GPU down.

Check device memory

In [17]:
# Probe accelerator memory. `total_memory` (in bytes) is read by the next cell
# to choose BATCH_SIZE / IMAGE_SIZE, so every branch must define it.
if get_device() == 'cuda':
    total_free_memory, total_memory = torch.cuda.mem_get_info()
    print('Device is cuda')
    print(f"Total free GPU memory: {round(total_free_memory * 1e-9, 3)} GB")
    print(f"Total GPU memory: {round(total_memory * 1e-9, 3)} GB")
elif get_device() == 'mps':
    print('Device is mps')
    from torch import mps
    # NOTE(review): driver_allocated_memory() reports what the Metal driver has
    # allocated so far, not the device's capacity, so the ">= 16" GB check in
    # the next cell will normally pick the small settings on mps -- confirm
    # that this is intended.
    total_memory = mps.driver_allocated_memory()
    print(f"mps memory info: {round(total_memory * 1e-6, 3)} MB")
else:
    # Without this branch `total_memory` is undefined on other devices and the
    # next cell fails with NameError; 0 selects the conservative settings there.
    total_memory = 0
    print("[INFO] Not using cuda or mps device. No accelerator memory to report")

Device is mps
mps memory info: 262.242 MB


In [18]:
# Pick batch and image size from `total_memory` (bytes, scaled to GB here):
# 16 GB or more gets the large settings, anything below gets the small ones.
# NOTE(review): on mps the previous cell stores driver-allocated memory in
# `total_memory`, not capacity -- confirm the threshold behaves as intended.
BATCH_SIZE, IMAGE_SIZE = (128, 224) if round(total_memory * 1e-9, 3) >= 16 else (32, 128)

In [19]:
# Point both the crop and the resize step of the preset at IMAGE_SIZE
# (crop_size is assigned first, then resize_size), then display the result.
transforms.crop_size = transforms.resize_size = IMAGE_SIZE
transforms

ImageClassification(
    crop_size=128
    resize_size=128
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)

In [20]:
# TensorFloat32 matmuls are only switched on for cuda devices whose compute
# capability is at least 8.0; every other case just reports what was decided.
if get_device() != 'cuda':
    print("[INFO] Not using cuda device. Skip setting Tensorfloat32")
else:
    major, minor = torch.cuda.get_device_capability(get_device())
    # Fold (major, minor) into a single float, e.g. (8, 6) -> 8.6.
    GPU_SCORE = major + float("0." + str(minor))
    if GPU_SCORE < 8.0:
        print(f"[INFO] Using GPU with {GPU_SCORE} compute capability, not enabling TensorFloat32")
    else:
        torch.backends.cuda.matmul.allow_tf32 = True
        print(f"[INFO] Using GPU with {GPU_SCORE} compute capability, enabling TensorFloat32")

[INFO] Not using cuda device. Skip setting Tensorfloat32


Getting data

In [21]:
# Load both CIFAR10 splits into the current directory (downloading if needed),
# applying the resized preprocessing pipeline to every image.
train_dataset = torchvision.datasets.CIFAR10(
    root='.', train=True, download=True, transform=transforms
)
test_dataset = torchvision.datasets.CIFAR10(
    root='.', train=False, download=True, transform=transforms  # train=False selects the test split
)

# Report how many samples each split holds.
train_len, test_len = len(train_dataset), len(test_dataset)

print(f"[INFO] Train dataset length: {train_len}")
print(f"[INFO] Test dataset length: {test_len}")

Files already downloaded and verified
Files already downloaded and verified
[INFO] Train dataset length: 50000
[INFO] Test dataset length: 10000


Creating dataloaders

In [22]:
import os
from torch.utils.data import DataLoader

# os.cpu_count() returns None when the CPU count cannot be determined, but
# DataLoader needs an integer num_workers; fall back to 0 (load in the main
# process) in that case.
NUM_WORKERS = os.cpu_count() or 0

# Shuffle only the training split; evaluation keeps the dataset order.
train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              num_workers=NUM_WORKERS)
test_dataloader = DataLoader(dataset=test_dataset,
                             batch_size=BATCH_SIZE,
                             shuffle=False,
                             num_workers=NUM_WORKERS)

In [25]:
import time
from tqdm.auto import tqdm
from typing import Dict, List, Tuple

def train_step(epoch: int,
               model: torch.nn.Module,
               dataloader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module,
               optimizer: torch.optim.Optimizer,
               device: torch.device,
               disable_progress_bar: bool = False) -> Tuple[float, float]:
    """Trains a PyTorch model for a single epoch

    Args:
        epoch (int): epoch number
        loss_fn (torch.nn.Module): PyTorch loss function to minimize
        optimizer (torch.optim.Optimizer): PyTorch optimizer object
        device (torch.device): PyTorch device
        model (_type_, optional): _description_. Defaults to torch.nn.Module.
        dataloader (_type_, optional): _description_. Defaults to torch.utils.data.DataLoader.
        disable_progress_bar (bool, optional): _description_. Defaults to False.

    Returns:
        Tuple[float, float]: _description_
    """
    model.train()
    train_loss, train_acc = 0,0
    progress_bar = tqdm(
        enumerate(dataloader),
        desc = f"Training Epoch {epoch}",
        total = len(dataloader),
        disable = disable_progress_bar
    )
    