# 07 PyTorch Experiment Tracking

In [3]:
import torch
import torchvision

print(torch.__version__)
print(torchvision.__version__)

2.3.0+cu121
0.18.0+cu121


In [4]:
# Try to get torchinfo, install it if it doesn't work
try:
    from torchinfo import summary
except:
    print("[INFO] Couldn't find torchinfo... installing it.")
    !pip install -q torchinfo
    from torchinfo import summary

# Try to import the going_modular directory, download it from GitHub if it doesn't work
try:
    from going_modular.going_modular import data_setup, engine
except:
    # Get the going_modular scripts
    print("[INFO] Couldn't find going_modular scripts... downloading them from GitHub.")
    !git clone https://github.com/mrdbourke/pytorch-deep-learning
    !mv pytorch-deep-learning/going_modular .
    !rm -rf pytorch-deep-learning
    from going_modular.going_modular import data_setup, engine

[INFO] Couldn't find torchinfo... installing it.
[INFO] Couldn't find going_modular scripts... downloading them from GitHub.
Cloning into 'pytorch-deep-learning'...
remote: Enumerating objects: 4056, done.[K
remote: Total 4056 (delta 0), reused 0 (delta 0), pack-reused 4056[K
Receiving objects: 100% (4056/4056), 646.90 MiB | 34.20 MiB/s, done.
Resolving deltas: 100% (2372/2372), done.
Updating files: 100% (248/248), done.


In [5]:
# Pick the compute device: "cuda" when torch reports CUDA as available, otherwise "cpu".
# The bare `device` on the last line displays the chosen value as the cell output.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [6]:
# Set seeds
def set_seeds(seed: int=42):
    """Seeds PyTorch's random number generators.

    Args:
        seed (int, optional): Value handed to both seeding calls. Defaults to 42.
    """
    # torch.manual_seed covers general torch operations;
    # torch.cuda.manual_seed covers CUDA operations (ones that happen on the GPU).
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

In [None]:
set_seeds()

## 1. Get data

In [7]:
import os
import zipfile
from pathlib import Path
import requests

def download_data(source: str,
                  destination: str,
                  remove_source: bool=True) -> Path:
  """Downloads a zipped dataset from source and unzips it to data/<destination>.

  If data/<destination> already exists nothing is downloaded.

  Args:
    source: URL of the zip archive to fetch.
    destination: Directory name (created under "data/") to extract into.
    remove_source: Whether to delete the downloaded archive afterwards.

  Returns:
    Path of the directory holding the extracted data (data/<destination>).

  Raises:
    requests.HTTPError: If the server answers with an error status.
    zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
  """
  import shutil  # local import: only needed to clean up after a failed extraction

  data_path = Path("data/")
  image_path = data_path / destination
  if image_path.is_dir():
    print(f"[INFO] {image_path} directory already exists, skipping download.")
    return image_path

  print(f"[INFO] Did not find {image_path} directory, downloading...")
  target_file = Path(source).name
  archive_path = data_path / target_file

  # Fetch before touching the filesystem: a failed request must not leave an
  # empty target directory behind, or the next call would skip the download.
  print(f"[INFO] Downloading {target_file} from {source}...")
  response = requests.get(source)
  response.raise_for_status()  # don't write an HTML error page to disk as a "zip"

  image_path.mkdir(parents=True, exist_ok=True)
  try:
    with open(archive_path, "wb") as f:
      f.write(response.content)

    with zipfile.ZipFile(archive_path, "r") as zip_ref:
      print(f"[INFO] Unzipping {target_file} data...")
      zip_ref.extractall(image_path)
  except Exception:
    # image_path did not exist before this call, so removing it is safe and
    # lets a later call retry instead of seeing a half-populated directory.
    shutil.rmtree(image_path, ignore_errors=True)
    raise
  finally:
    if remove_source and archive_path.exists():
      os.remove(archive_path)

  return image_path

In [None]:
# Fetch the pizza/steak/sushi archive; download_data returns the path of the
# extracted directory, which later cells use to locate the train/test folders.
image_path = download_data(source="https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip",
                           destination="pizza_steak_sushi")
image_path

## 2. Create Datasets and DataLoaders

### 2.1 Create DataLoaders with manual transforms

In [8]:
from torchvision import transforms
from going_modular.going_modular import data_setup

# NOTE(review): `image_path` is defined by the download cell above; running this
# cell on a fresh kernel without it raises NameError.
train_dir = image_path / "train"
test_dir = image_path / "test"
BATCH_SIZE = 32
# os.cpu_count() decides how many DataLoader workers are requested
NUM_WORKERS = os.cpu_count()

# Manual pipeline: resize to 224x224, convert to a tensor, then normalize each
# channel with the mean/std below (presumably the ImageNet statistics — confirm).
manual_transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225])
])

# create_dataloaders hands back the train DataLoader, test DataLoader and class names
train_dataloader, test_dataloader, class_names = data_setup.create_dataloaders(train_dir=train_dir,
                                                                               test_dir=test_dir,
                                                                               transform=manual_transform,
                                                                               batch_size=BATCH_SIZE,
                                                                               num_workers=NUM_WORKERS)
train_dataloader, test_dataloader, class_names

NameError: name 'image_path' is not defined

### 2.2 Create DataLoaders with auto transforms

In [None]:
# Take the preprocessing transform that ships with the default EfficientNet-B0 weights
weights = torchvision.models.EfficientNet_B0_Weights.DEFAULT
auto_transform = weights.transforms()

# Rebuilds (and overwrites) the DataLoaders from the previous section, this time
# using the transform bundled with the weights instead of the manual one.
train_dataloader, test_dataloader, class_names = data_setup.create_dataloaders(train_dir=train_dir,
                                                                               test_dir=test_dir,
                                                                               transform=auto_transform,
                                                                               batch_size=BATCH_SIZE,
                                                                               num_workers=NUM_WORKERS)
train_dataloader, test_dataloader, class_names

## 3. Get a pretrained model, freeze the base layers and change the classifier head

In [None]:
from torch import nn
# Load EfficientNet-B0 with its default pretrained weights and move it to `device`
weights = torchvision.models.EfficientNet_B0_Weights.DEFAULT
model_0 = torchvision.models.efficientnet_b0(weights=weights).to(device)

# torchinfo summary for a single 3x224x224 input
summary(model=model_0,
        input_size=(1, 3, 224, 224))

In [None]:
# Freeze the feature extractor: none of the parameters under `model_0.features`
# will receive gradient updates.
for param in model_0.features.parameters():
  param.requires_grad = False

# Seed before creating the new layers so their initial weights are repeatable
set_seeds()

# Replace the classifier head: dropout followed by a linear layer with one
# output per entry in `class_names` (1280 is the input width used for this head).
model_0.classifier = nn.Sequential(
    nn.Dropout(p=0.3, inplace=True),
    nn.Linear(in_features=1280, out_features=len(class_names))

).to(device)

# Summary with a "trainable" column to confirm which layers are frozen
summary(model=model_0,
        input_size=(1, 3, 224, 224),
        verbose=0,
        col_names=["input_size", "output_size", "num_params", "trainable"],
        row_settings=['var_names'])

## 4. Train a single model and track results

In [None]:
# Cross-entropy loss for the classification task
loss_fn = nn.CrossEntropyLoss()
# Adam over all of model_0's parameters with a learning rate of 0.001
optimizer = torch.optim.Adam(model_0.parameters(),
                              lr=0.001)

In [None]:
# Set up a SummaryWriter with its default settings (no log_dir given).
# This module-level `writer` is the one the first `train()` definition below logs to.
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()
writer

In [None]:
from typing import Dict, List, Tuple
from going_modular.going_modular.engine import train_step, test_step
from tqdm.auto import tqdm

In [None]:

def train(model: torch.nn.Module,
          train_dataloader: torch.utils.data.DataLoader,
          test_dataloader: torch.utils.data.DataLoader,
          optimizer: torch.optim.Optimizer,
          loss_fn: torch.nn.Module,
          epochs: int,
          device: torch.device) -> Dict[str, List]:
    """Trains and tests a model, logging per-epoch metrics to TensorBoard.

    Logging goes to the module-level `writer` (a SummaryWriter created in an
    earlier cell); that name must exist before this function is called.

    Args:
        model: Model to train and evaluate.
        train_dataloader: DataLoader passed to `train_step` each epoch.
        test_dataloader: DataLoader passed to `test_step` each epoch.
        optimizer: Optimizer passed to `train_step`.
        loss_fn: Loss function passed to both step functions.
        epochs: Number of train/test passes to run.
        device: Target device handed to `model.to` and to both step functions.

    Returns:
        Dictionary with the keys "train_loss", "train_acc", "test_loss" and
        "test_acc", each holding one value per epoch.
    """
    # Create empty results dictionary
    results = {"train_loss": [],
               "train_acc": [],
               "test_loss": [],
               "test_acc": []
    }

    # Make sure model on target device
    model.to(device)

    # Loop through training and testing steps for a number of epochs
    for epoch in tqdm(range(epochs)):
        train_loss, train_acc = train_step(model=model,
                                           dataloader=train_dataloader,
                                           loss_fn=loss_fn,
                                           optimizer=optimizer,
                                           device=device)
        test_loss, test_acc = test_step(model=model,
                                        dataloader=test_dataloader,
                                        loss_fn=loss_fn,
                                        device=device)

        # Print out what's happening
        print(
          f"Epoch: {epoch+1} | "
          f"train_loss: {train_loss:.4f} | "
          f"train_acc: {train_acc:.4f} | "
          f"test_loss: {test_loss:.4f} | "
          f"test_acc: {test_acc:.4f}"
        )

        # Update results dictionary
        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)
        results["test_loss"].append(test_loss)
        results["test_acc"].append(test_acc)

        ### New: Experiment tracking ###
        writer.add_scalars(main_tag="Loss",
                           tag_scalar_dict={"train_loss": train_loss,
                                            "test_loss": test_loss},
                           global_step=epoch)

        writer.add_scalars(main_tag="Accuracy",
                           tag_scalar_dict={"train_acc": train_acc,
                                            "test_acc": test_acc},
                           global_step=epoch)

    # The architecture does not change between epochs, so trace and log the
    # graph once after training instead of re-tracing a 32-image batch every
    # epoch. Skipped when no epoch ran, matching the previous behaviour.
    if epochs > 0:
        writer.add_graph(model=model,
                         input_to_model=torch.randn(32, 3, 224, 224).to(device))
    writer.close()

    # Return the filled results at the end of the epochs
    return results


In [None]:
# Train model_0 for 5 epochs; the returned dictionary holds the per-epoch
# train/test loss and accuracy lists.
model_0_results = train(model=model_0,
                        train_dataloader=train_dataloader,
                        test_dataloader=test_dataloader,
                        optimizer=optimizer,
                        loss_fn=loss_fn,
                        epochs=5,
                        device=device)

  0%|          | 0/5 [00:00<?, ?it/s]

  self.pid = os.fork()
  self.pid = os.fork()


Epoch: 1 | train_loss: 0.5930 | train_acc: 0.7852 | test_loss: 0.6071 | test_acc: 0.8258
Epoch: 2 | train_loss: 0.5383 | train_acc: 0.7969 | test_loss: 0.5276 | test_acc: 0.8655
Epoch: 3 | train_loss: 0.5549 | train_acc: 0.7930 | test_loss: 0.5066 | test_acc: 0.8561
Epoch: 4 | train_loss: 0.4978 | train_acc: 0.8125 | test_loss: 0.5273 | test_acc: 0.8352
Epoch: 5 | train_loss: 0.4473 | train_acc: 0.9297 | test_loss: 0.4706 | test_acc: 0.8456


## 5. View our model's results with TensorBoard

In [None]:
%load_ext tensorboard
%tensorboard --logdir runs

## 6. Create a function to prepare a `SummaryWriter()` instance

In [2]:
from datetime import datetime
import os
from torch.utils.tensorboard import SummaryWriter

def create_writer(experiment_name: str,
                  model_name: str,
                  extra: str = None):
  """Creates a SummaryWriter logging to runs/<YYYY-MM-DD>/<experiment_name>/<model_name>[/<extra>].

  Args:
    experiment_name: Name of the experiment (third path component).
    model_name: Name of the model (fourth path component).
    extra: Optional final path component; omitted when None or empty.

  Returns:
    A SummaryWriter whose log_dir is the path described above.
  """
  # The date (not the time) is used, so runs from the same day share a parent folder
  path_parts = ["runs", datetime.now().strftime("%Y-%m-%d"), experiment_name, model_name]

  # A missing or empty `extra` contributes no path component
  if extra:
    path_parts.append(extra)

  return SummaryWriter(log_dir=os.path.join(*path_parts))

In [None]:
# Example: a writer whose log directory ends in data_10_percent/effnetb0/5_epochs
example_writer = create_writer(experiment_name="data_10_percent",
                               model_name="effnetb0",
                               extra="5_epochs")
example_writer

datetime.datetime(2024, 6, 1, 14, 18, 25, 285483)

### 6.1 Update the `train()` function to include a `writer` parameter

In [9]:
from typing import Dict, List, Tuple
from going_modular.going_modular.engine import train_step, test_step
from tqdm.auto import tqdm

def train(model: torch.nn.Module,
          train_dataloader: torch.utils.data.DataLoader,
          test_dataloader: torch.utils.data.DataLoader,
          optimizer: torch.optim.Optimizer,
          loss_fn: torch.nn.Module,
          epochs: int,
          device: torch.device,
          writer: torch.utils.tensorboard.writer.SummaryWriter = None) -> Dict[str, List]:
    """Trains and tests a model, optionally logging to a TensorBoard writer.

    Args:
        model: Model to train and evaluate.
        train_dataloader: DataLoader passed to `train_step` each epoch.
        test_dataloader: DataLoader passed to `test_step` each epoch.
        optimizer: Optimizer passed to `train_step`.
        loss_fn: Loss function passed to both step functions.
        epochs: Number of train/test passes to run.
        device: Target device handed to `model.to` and to both step functions.
        writer: SummaryWriter to log losses, accuracies and the model graph
            to. When None (the default) nothing is logged.

    Returns:
        Dictionary with the keys "train_loss", "train_acc", "test_loss" and
        "test_acc", each holding one value per epoch.
    """
    # Create empty results dictionary
    results = {"train_loss": [],
               "train_acc": [],
               "test_loss": [],
               "test_acc": []
    }

    # Make sure model on target device
    model.to(device)

    # Loop through training and testing steps for a number of epochs
    for epoch in tqdm(range(epochs)):
        train_loss, train_acc = train_step(model=model,
                                           dataloader=train_dataloader,
                                           loss_fn=loss_fn,
                                           optimizer=optimizer,
                                           device=device)
        test_loss, test_acc = test_step(model=model,
                                        dataloader=test_dataloader,
                                        loss_fn=loss_fn,
                                        device=device)

        # Print out what's happening
        print(
          f"Epoch: {epoch+1} | "
          f"train_loss: {train_loss:.4f} | "
          f"train_acc: {train_acc:.4f} | "
          f"test_loss: {test_loss:.4f} | "
          f"test_acc: {test_acc:.4f}"
        )

        # Update results dictionary
        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)
        results["test_loss"].append(test_loss)
        results["test_acc"].append(test_acc)

        ### New: Experiment tracking ###
        if writer is not None:
            writer.add_scalars(main_tag="Loss",
                               tag_scalar_dict={"train_loss": train_loss,
                                                "test_loss": test_loss},
                               global_step=epoch)

            writer.add_scalars(main_tag="Accuracy",
                               tag_scalar_dict={"train_acc": train_acc,
                                                "test_acc": test_acc},
                               global_step=epoch)

    if writer is not None:
        # The architecture does not change between epochs, so trace and log
        # the graph once instead of re-tracing a 32-image batch every epoch.
        # Skipped when no epoch ran, matching the previous behaviour.
        if epochs > 0:
            writer.add_graph(model=model,
                             input_to_model=torch.randn(32, 3, 224, 224).to(device))
        # Close once, after all epochs: closing inside the loop shut the
        # writer down after every single epoch.
        writer.close()

    # Return the filled results at the end of the epochs
    return results


## 7. Setting up a series of modelling experiments

### 7.1 What kind of experiments should you run?

### 7.3 Download different datasets

In [10]:
# Download the 10 percent and 20 percent datasets
# (the 10 percent data is the plain pizza_steak_sushi.zip archive)
data_10_percent_path = download_data(source="https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip",
                                     destination="pizza_steak_sushi")

data_20_percent_path = download_data(source="https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi_20_percent.zip",
                                     destination="pizza_steak_sushi_20_percent")

[INFO] Did not find data/pizza_steak_sushi directory, downloading...
[INFO] Downloading pizza_steak_sushi.zip from https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip...
[INFO] Unzipping pizza_steak_sushi.zip data...
[INFO] Did not find data/pizza_steak_sushi_20_percent directory, downloading...
[INFO] Downloading pizza_steak_sushi_20_percent.zip from https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi_20_percent.zip...
[INFO] Unzipping pizza_steak_sushi_20_percent.zip data...


### 7.4 Transform Datasets and Create DataLoaders

In [19]:
# Two training directories (10% and 20% data) but a single test directory:
# the test set from the 10 percent download is used for every experiment.
train_dir_10_percent = data_10_percent_path / "train"
test_dir = data_10_percent_path / "test"
train_dir_20_percent = data_20_percent_path / "train"


In [21]:
import torchvision
from going_modular.going_modular import data_setup
import os

BATCH_SIZE = 32
# os.cpu_count() decides how many DataLoader workers are requested
NUM_WORKERS = os.cpu_count()
# Use the preprocessing transform bundled with the default EfficientNet-B0 weights
weights = torchvision.models.EfficientNet_B0_Weights.DEFAULT
auto_transform = weights.transforms()

# DataLoaders for the 10 percent training data
train_dataloader_10_percent, test_dataloader, class_names = data_setup.create_dataloaders(train_dir=train_dir_10_percent,
                                                                                                     test_dir=test_dir,
                                                                                                     transform=auto_transform,
                                                                                                     batch_size=BATCH_SIZE,
                                                                                                     num_workers=NUM_WORKERS)

# DataLoaders for the 20 percent training data. `test_dataloader` and `class_names`
# are reassigned here; both calls receive the same `test_dir`.
train_dataloader_20_percent, test_dataloader, class_names = data_setup.create_dataloaders(train_dir=train_dir_20_percent,
                                                                                                     test_dir=test_dir,
                                                                                                     transform=auto_transform,
                                                                                                     batch_size=BATCH_SIZE,
                                                                                                     num_workers=NUM_WORKERS)
# Show the number of batches in each DataLoader alongside the class names
len(train_dataloader_10_percent), len(test_dataloader), len(train_dataloader_20_percent), class_names

(8, 3, 15, ['pizza', 'steak', 'sushi'])

### 7.5 Create feature extractor models

In [26]:
from torch import nn
import torchvision

def _build_effnet_feature_extractor(builder, weights, in_features, name, device, out_features):
  """Shared recipe: pretrained model, frozen `features`, new dropout + linear head."""
  model = builder(weights=weights)

  # Freeze the base layers so only the new classifier head is trained
  for param in model.features.parameters():
    param.requires_grad = False

  model.classifier = nn.Sequential(
      nn.Dropout(p=0.3, inplace=True),
      nn.Linear(in_features=in_features, out_features=out_features)
  )

  # Name attribute set on the model instance for later identification
  model.name = name

  return model.to(device)

def effnetb0_feature_extractor(device = device, out_features: int = 3):
  """Creates an EfficientNet-B0 feature extractor with a fresh classifier head.

  Args:
    device: Device the model is moved to. Defaults to the notebook-level
      `device` as it was when this function was defined.
    out_features: Number of output classes for the new head. Defaults to 3.

  Returns:
    The model, with `features` frozen, `name` set to "effnetb0", on `device`.
  """
  return _build_effnet_feature_extractor(
      builder=torchvision.models.efficientnet_b0,
      weights=torchvision.models.EfficientNet_B0_Weights.DEFAULT,
      in_features=1280,
      name="effnetb0",
      device=device,
      out_features=out_features)

def effnetb2_feature_extractor(device=device, out_features: int = 3):
  """Creates an EfficientNet-B2 feature extractor with a fresh classifier head.

  Args:
    device: Device the model is moved to. Defaults to the notebook-level
      `device` as it was when this function was defined.
    out_features: Number of output classes for the new head. Defaults to 3.

  Returns:
    The model, with `features` frozen, `name` set to "effnetb2", on `device`.
  """
  return _build_effnet_feature_extractor(
      builder=torchvision.models.efficientnet_b2,
      weights=torchvision.models.EfficientNet_B2_Weights.DEFAULT,
      in_features=1408,
      name="effnetb2",
      device=device,
      out_features=out_features)

In [30]:
# Sanity check: build the EffNetB2 feature extractor and summarise it for a
# batch of 32 inputs of shape 3x224x224, including which layers are trainable.
created_model_test = effnetb2_feature_extractor()
summary(model=created_model_test,
        input_size=(32, 3, 224, 224),
        verbose=0,
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"])

Downloading: "https://download.pytorch.org/models/efficientnet_b2_rwightman-c35c1473.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b2_rwightman-c35c1473.pth
100%|██████████| 35.2M/35.2M [00:00<00:00, 138MB/s]


Layer (type (var_name))                                      Input Shape          Output Shape         Param #              Trainable
EfficientNet (EfficientNet)                                  [32, 3, 224, 224]    [32, 3]              --                   Partial
├─Sequential (features)                                      [32, 3, 224, 224]    [32, 1408, 7, 7]     --                   False
│    └─Conv2dNormActivation (0)                              [32, 3, 224, 224]    [32, 32, 112, 112]   --                   False
│    │    └─Conv2d (0)                                       [32, 3, 224, 224]    [32, 32, 112, 112]   (864)                False
│    │    └─BatchNorm2d (1)                                  [32, 32, 112, 112]   [32, 32, 112, 112]   (64)                 False
│    │    └─SiLU (2)                                         [32, 32, 112, 112]   [32, 32, 112, 112]   --                   --
│    └─Sequential (1)                                        [32, 32, 112, 112]   [32, 

### 7.6 Create experiments and set up training code

In [31]:
# Create epoch list
num_epochs = [5, 10]

# Create models list
models = ["effnetb0", "effnetb2"]

# Create a DataLoaders dictionary
train_dataloaders = {"data_10_percent": train_dataloader_10_percent,
                     "data_20_percent": train_dataloader_20_percent}

In [None]:
%%time
from going_modular.going_modular.utils import save_model

# Seed once, before the whole sweep (not before each individual experiment)
set_seeds(seed=42)

# Run every (dataset, epoch count, model) combination.
# NOTE: the loop variable rebinds the notebook-level name `train_dataloader`.
experiment_number = 0
for dataloader_name, train_dataloader in train_dataloaders.items():
  for epochs in num_epochs:
    for model_name in models:
      experiment_number += 1
      print(f"[INFO] Experiment number: {experiment_number}")
      print(f"[INFO] Model: {model_name}")
      print(f"[INFO] DataLoader: {dataloader_name}")
      print(f"[INFO] Number of epochs: {epochs}")

      # A fresh model per experiment; any name other than "effnetb0" falls
      # through to the EffNetB2 builder.
      if model_name == "effnetb0":
        model = effnetb0_feature_extractor()
      else:
        model = effnetb2_feature_extractor()

      # New loss function and optimizer for each new model
      loss_fn = nn.CrossEntropyLoss()
      optimizer = torch.optim.Adam(model.parameters(),
                                   lr=0.001)

      # The same `test_dataloader` is used for every experiment; train()'s
      # returned results dictionary is not kept here. Each run gets its own
      # writer, so its logs land in a separate directory.
      train(model=model,
            train_dataloader=train_dataloader,
            test_dataloader=test_dataloader,
            optimizer=optimizer,
            loss_fn=loss_fn,
            epochs=epochs,
            device=device,
            writer=create_writer(experiment_name=dataloader_name,
                                 model_name=model_name,
                                 extra=f"{epochs}_epochs"))

      # Save the trained model under models/ with a name encoding the experiment
      save_filepath = f"07_{model_name}_{dataloader_name}_{epochs}_epochs.pth"
      save_model(model=model,
                 target_dir="models",
                 model_name=save_filepath)

      print("-"*50 + "\n")



[INFO] Experiment number: 1
[INFO] Model: effnetb0
[INFO] DataLoader: data_10_percent
[INFO] Number of epochs: 5


  0%|          | 0/5 [00:00<?, ?it/s]

  self.pid = os.fork()
  self.pid = os.fork()


Epoch: 1 | train_loss: 1.0534 | train_acc: 0.4180 | test_loss: 0.8519 | test_acc: 0.7538
Epoch: 2 | train_loss: 0.8659 | train_acc: 0.6680 | test_loss: 0.6617 | test_acc: 0.8447
Epoch: 3 | train_loss: 0.7535 | train_acc: 0.7070 | test_loss: 0.6801 | test_acc: 0.8551
