# Introduction of Weights&Biases

Use W&B for ML experiment tracking, dataset versioning, project collaboration.

In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from tqdm import tqdm

torch.backends.cudnn.deterministic = True

def random_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

random_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Step 0 : install Weights&biases

In [2]:
%%capture
!pip install wandb --upgrade

## Step 1 : Login

In [3]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Step 2 : Track metadata and hyperparameters with wandb.init

In [4]:
config = dict(
    epochs = 5,
    classes = 10,
    kernels = [16, 32],
    batch_size = 128,
    learning_rate = 0.005,
    dataset = "MNIST",
    architecture = "CNN"
)

### define the overall pipline
1. make a model
2. train
3. test

In [29]:
def model_pipeline(hyp):
  with wandb.init(project="pytorch-demo", config = hyp):
    #wandb start
    config = wandb.config
    model, train_loader, test_loader, criterion, optimizer = make(config)
    print(model)

    train(model, train_loader, criterion, optimizer, config)
    test(model, test_loader)

  return model

In [30]:
def make(config):
  #make the data
  train, test = get_data(train = True), get_data(train=False)
  train_loader = make_loader(train, batch_size = config.batch_size)
  test_loader = make_loader(test, batch_size = config.batch_size)

  #make the model
  model = ConvNet(config.kernels, config.classes).to(device)
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

  return model, train_loader, test_loader, criterion, optimizer

### define the loader and model

In [31]:
def get_data(slice = 5, train = True):
  full_dataset = torchvision.datasets.MNIST(root=".",train=train, transform=transforms.ToTensor(), download=True)

  sub_dataset = torch.utils.data.Subset(full_dataset, indices = range(0, len(full_dataset), slice))
  return sub_dataset

def make_loader(dataset, batch_size):
  loader = torch.utils.data.DataLoader(dataset = dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=8)
  return loader

In [36]:
class ConvNet(nn.Module):
  def __init__(self, kernels, classes=10):
      super(ConvNet, self).__init__()
      
      self.layer1 = nn.Sequential(
          nn.Conv2d(1, kernels[0], kernel_size=5, stride=1, padding=2),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=2, stride=2))
      self.layer2 = nn.Sequential(
          nn.Conv2d(16, kernels[1], kernel_size=5, stride=1, padding=2),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=2, stride=2))
      self.fc = nn.Linear(7 * 7 * kernels[-1], classes)
      
  def forward(self, x):
      out = self.layer1(x)
      out = self.layer2(out)
      out = out.reshape(out.size(0), -1)
      out = self.fc(out)
      return out

### Define Training Logic

# Step 3. Track gradients with wandb.watch

In [43]:
def train(model, loader, criterion, optimizer, config):
  wandb.watch(model,criterion, log="all", log_freq = 10)

  total_batches = len(loader) * config.epochs
  example_ct = 0 #num of examples seen
  batch_ct = 0

  for epoch in tqdm(range(config.epochs)):
    for _, (images, labels) in enumerate(loader):

      loss = train_batch(images, labels, model, optimizer, criterion)
      example_ct += len(images)
      batch_ct += 1

      if ((batch_ct + 1) % 25) == 0:
        train_log(loss, example_ct, epoch)

def train_batch(images, labels, model, optimizer, criterion):
  images, labels = images.to(device), labels.to(device)

  outputs = model(images)
  loss = criterion(outputs, labels)

  #backward
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  return loss

def train_log(loss, example_ct, epoch):
  loss = float(loss)
  wandb.log({"epoch" : epoch, "loss" : loss}, step = example_ct)
  print(f"Loss after " + str(example_ct).zfill(5) + f"examples:{loss:.3f}")


### Define Testing Logic

In [44]:
def test(model, test_loader):
  model.eval()

  with torch.no_grad():
    correct, total = 0, 0
    for images, labels in test_loader:
      images, labels = images.to(device), labels.to(device)
      outputs = model(images)
      _, predicted = torch.max(outputs.data, 1)
      total += labels.size(0)
      correct += (predicted == labels).sum().item()

    print(f"Accuracy of the model on the {total}" + f"test images : {100 * correct / total}%")
    wandb.log({"test accuracy" : correct / total})

  torch.onnx.export(model, images, "model.onnx")
  wandb.save("model.onnx")

### run training and watch metrics

In [45]:
model = model_pipeline(config)

  cpuset_checked))


ConvNet(
  (layer1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Linear(in_features=1568, out_features=10, bias=True)
)


  0%|          | 0/5 [00:00<?, ?it/s]

Loss after 03072examples:0.404
Loss after 06272examples:0.236
Loss after 09472examples:0.101


 20%|██        | 1/5 [00:02<00:11,  2.80s/it]

Loss after 12640examples:0.142
Loss after 15840examples:0.136
Loss after 19040examples:0.074
Loss after 22240examples:0.077


 40%|████      | 2/5 [00:05<00:08,  2.80s/it]

Loss after 25408examples:0.053
Loss after 28608examples:0.092
Loss after 31808examples:0.055
Loss after 35008examples:0.050


 60%|██████    | 3/5 [00:08<00:05,  2.81s/it]

Loss after 38176examples:0.020
Loss after 41376examples:0.017
Loss after 44576examples:0.028


 80%|████████  | 4/5 [00:11<00:02,  2.80s/it]

Loss after 47776examples:0.028
Loss after 50944examples:0.067
Loss after 54144examples:0.019
Loss after 57344examples:0.008


100%|██████████| 5/5 [00:14<00:00,  2.81s/it]


Accuracy of the model on the 2000test images : 97.9%


VBox(children=(Label(value=' 0.11MB of 0.11MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▃▃▃▃▅▅▅▅▆▆▆▆███
loss,█▅▃▃▃▂▂▂▂▂▂▁▁▁▁▂▁▁
test accuracy,▁

0,1
epoch,4.0
loss,0.0076
test accuracy,0.979
