# Using [Weights & Biases](https://wandb.ai/site) to perform a hyperparameter sweep



1.   Set the configuration for the sweep
2.   Modify the training loop to give wandb control over the hyperparameters
3.   Assign an id to the sweep for tracking and logging



In [None]:
# Detect whether the notebook is running on Google Colab by inspecting the
# string form of the active IPython shell.
IN_COLAB = 'google.colab' in str(get_ipython())

if IN_COLAB:
    # On Colab, (re)mount Google Drive and change into the project folder so
    # the relative paths used later in the notebook ('data', 'models/...')
    # resolve.
    # NOTE(review): the Drive path below is tied to one user's folder layout;
    # adjust it before running from another account.
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    %cd /content/drive/MyDrive/Documents/HLML/Mitchell_Object_Detection/CellDetection/

Mounted at /content/drive
/content/drive/MyDrive/Documents/HLML/Mitchell_Object_Detection/CellDetection


In [None]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms.functional as TF
import numpy as np
import matplotlib.pyplot as plt
from unet import UNet

%matplotlib inline

In [None]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
!pip install wandb -Uq

[K     |████████████████████████████████| 1.8 MB 5.2 MB/s 
[K     |████████████████████████████████| 181 kB 73.5 MB/s 
[K     |████████████████████████████████| 162 kB 70.6 MB/s 
[K     |████████████████████████████████| 63 kB 1.8 MB/s 
[K     |████████████████████████████████| 158 kB 73.7 MB/s 
[K     |████████████████████████████████| 157 kB 73.7 MB/s 
[K     |████████████████████████████████| 157 kB 78.5 MB/s 
[K     |████████████████████████████████| 157 kB 75.6 MB/s 
[K     |████████████████████████████████| 157 kB 59.1 MB/s 
[K     |████████████████████████████████| 157 kB 73.2 MB/s 
[K     |████████████████████████████████| 157 kB 78.1 MB/s 
[K     |████████████████████████████████| 157 kB 79.9 MB/s 
[K     |████████████████████████████████| 156 kB 61.4 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [None]:
import wandb

# Authenticate this session with Weights & Biases. As the last expression of
# the cell, the return value of login() is what the notebook displays.
wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Search space handed to wandb.sweep(): candidates are drawn at random from
# the 'values' lists and runs are ranked by the logged 'val_loss'.
sweep_config = {
  'method': 'random',
  'metric': {
      'name': 'val_loss',
      'goal': 'minimize'
  },
  'parameters': {
      'batch_size': {
          'values': [1]
      },
      'learning_rate':{
          'values': [0.0005]
      },
      'weight_decay':{
          'values': [1e-7, 1e-8, 1e-9]
      },
      # Momentum must stay in [0, 1): a factor above 1 amplifies the velocity
      # buffer on every step instead of damping it. The previous candidate 5
      # was out of range (presumably a typo for 0.5).
      'momentum':{
          'values': [0.1, 0.5, 0.9]
      },
      'epochs': {
        'values': [50]
      }
  }
}


In [None]:
import pprint

# Show the sweep configuration in a readable, key-sorted layout.
print(pprint.pformat(sweep_config))

{'method': 'random',
 'metric': {'goal': 'minimize', 'name': 'val_loss'},
 'parameters': {'batch_size': {'values': [1]},
                'epochs': {'values': [50]},
                'learning_rate': {'values': [0.0005]},
                'momentum': {'values': [0.1, 0.9, 5]},
                'weight_decay': {'values': [1e-07, 1e-08, 1e-09]}}}


In [None]:
def train_wandb(config=None):
  """Train a UNet for one wandb run, logging losses and saving checkpoints.

  Designed to be handed to ``wandb.agent``: when launched by a sweep, the
  hyperparameters are supplied by the sweep controller through
  ``wandb.config``.

  Args:
    config: Optional hyperparameter defaults forwarded to ``wandb.init``.
      The resulting ``wandb.config`` must provide ``batch_size``,
      ``learning_rate``, ``weight_decay``, ``momentum`` and ``epochs``.

  Globals read:
    dataset: Notebook-level dataset; must expose ``.samples`` and yield
      batches with ``'image'`` and ``'mask'`` entries.
    device: ``torch.device`` the model and the batches are moved to.

  Side effects:
    Logs ``loss`` / ``val_loss`` (once per batch) together with ``epoch`` to
    wandb, prints the mean training and validation loss of every epoch, and
    writes model weights under ``models/low_contrast_expanded_dataset_sweep/``
    after epoch 30 and once training has finished.
  """
  # Create the checkpoint directory up front so torch.save cannot fail after
  # many epochs of training.
  os.makedirs("models/low_contrast_expanded_dataset_sweep", exist_ok=True)

  # Initialize a new wandb run
  with wandb.init(config=config):
    # If called by wandb.agent, as below,
    # this config will be set by Sweep Controller
    config = wandb.config

    no_training_samples = int(0.8*len(dataset.samples))
    no_val_samples = len(dataset.samples) - no_training_samples

    # Fixed seed: every run of the sweep is trained and validated on the same
    # split, so the val_loss values of different runs are comparable.
    trainset, valset = random_split(
        dataset,
        [no_training_samples, no_val_samples],
        generator=torch.Generator().manual_seed(0))

    train_loader = DataLoader(trainset, batch_size=config.batch_size, shuffle=True)
    val_loader = DataLoader(valset, batch_size=config.batch_size, shuffle=True)

    model = UNet(1, 4)
    model.to(device)

    # Keyword arguments are required here: RMSprop's positional order is
    # (params, lr, alpha, eps, weight_decay, momentum), so passing the values
    # positionally would set alpha=weight_decay and eps=momentum.
    optimizer = optim.RMSprop(
        model.parameters(),
        lr=config.learning_rate,
        weight_decay=config.weight_decay,
        momentum=config.momentum)

    criterion = nn.CrossEntropyLoss()

    for e in range(1, config.epochs+1):

      # Training mode, in case the model contains layers such as
      # BatchNorm/Dropout that behave differently during evaluation.
      model.train()
      total_train_loss = 0.0

      for batch in train_loader:

        optimizer.zero_grad()

        output = model(batch['image'].to(device))
        loss = criterion(output, batch['mask'].to(device))
        batch_loss = loss.item()

        wandb.log({"loss": batch_loss, "epoch": e})

        loss.backward()
        optimizer.step()

        total_train_loss += batch_loss

      # Mean loss per batch; normalised exactly once. max(..., 1) guards an
      # empty loader.
      print("Total training loss: ", total_train_loss/max(len(train_loader), 1))

      # NOTE(review): checkpoint names encode only learning rate and momentum,
      # so runs that differ only in weight_decay overwrite each other's files.
      if e == 30:
        torch.save(model.state_dict(), "models/low_contrast_expanded_dataset_sweep/30_epochs_lr_{0}_m_{1}.pt".format(config.learning_rate, config.momentum))

      model.eval()
      total_val_loss = 0.0

      with torch.no_grad():

        for batch in val_loader:

          output = model(batch['image'].to(device))
          val_loss = criterion(output, batch['mask'].to(device))
          batch_val_loss = val_loss.item()

          wandb.log({"val_loss": batch_val_loss, "epoch": e})

          total_val_loss += batch_val_loss

      print("Total validation loss: ", total_val_loss/max(len(val_loader), 1))

    torch.save(model.state_dict(), "models/low_contrast_expanded_dataset_sweep/50_epochs_lr_{0}_m_{1}.pt".format(config.learning_rate, config.momentum))


In [None]:
# Register the sweep defined by sweep_config under the given wandb project
# and keep the returned id so the agent can be attached to it later.
sweep_id = wandb.sweep(sweep_config, project="UNet-cell-detection")

Create sweep with ID: 5fgfl1u1
Sweep URL: https://wandb.ai/tz545/UNet-cell-detection/sweeps/5fgfl1u1


Import dataset and run wandb sweep agent:

In [None]:
from train_unet import CellsDataset

In [None]:
# Notebook-level dataset built from the relative 'data' path; train_wandb
# reads this global (it is not passed in as an argument).
dataset = CellsDataset('data')

In [None]:
# Start a sweep agent that calls train_wandb once per sampled configuration;
# count=5 caps this agent at five runs.
wandb.agent(sweep_id, train_wandb, count=5) 

[34m[1mwandb[0m: Agent Starting Run: 5pi2pcke with config:
[34m[1mwandb[0m: 	batch_size: 1
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	weight_decay: 1e-07
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Total training loss:  0.008648531838844065
Total validation loss:  0.029304596362635493
Total training loss:  0.006533223695441848
Total validation loss:  0.02341029403032735
Total training loss:  0.005339343821106013
Total validation loss:  0.01955563883529976
Total training loss:  0.004518102057772921
Total validation loss:  0.01680403936188668
Total training loss:  0.003919451035471866
Total validation loss:  0.014774132578168064
Total training loss:  0.003471610467386199
Total validation loss:  0.013236821570899338
Total training loss:  0.0031287348374462454
Total validation loss:  0.01204864151077345
Total training loss:  0.0028601277463167207
Total validation loss:  0.01110481534851715
Total training loss:  0.002645267464686185
Total validation loss:  0.010346524970373139
Total training loss:  0.0024707392185518984
Total validation loss:  0.009717854438349605
Total training loss:  0.0023257180546352174
Total validation loss:  0.00919241298106499
Total training loss:  0.0022032603

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▆▅▄▃▃▃▃▂▂▂▂▂▂▂▂▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▆▅▅▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▂▁▁▁▂▁▁▁▁▁▁▁▁

0,1
epoch,50.0
loss,0.13212
val_loss,0.11259


[34m[1mwandb[0m: Agent Starting Run: zl2u12xs with config:
[34m[1mwandb[0m: 	batch_size: 1
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	momentum: 0.1
[34m[1mwandb[0m: 	weight_decay: 1e-09
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Total training loss:  0.00647873483467265
Total validation loss:  0.016482922772411257
Total training loss:  0.003105862031588913
Total validation loss:  0.009401741379406303
Total training loss:  0.0019613265749285347
Total validation loss:  0.00672149594174698
Total training loss:  0.001531919061562803
Total validation loss:  0.005561369544011541
Total training loss:  0.0013196196860008058
Total validation loss:  0.004963909581420012
Total training loss:  0.0011904508419320337
Total validation loss:  0.004450162945431657
Total training loss:  0.0011022291091649095
Total validation loss:  0.004169783525867388
Total training loss:  0.0010394196501692932
Total validation loss:  0.00395146702794591
Total training loss:  0.0009877427423816698
Total validation loss:  0.0038636543540633284
Total training loss:  0.0009466216406508465
Total validation loss:  0.0036321166189736687
Total training loss:  0.0009140617498815118
Total validation loss:  0.0035317746151122265
Total training loss:  0.

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▄▃▂▂▂▂▁▂▂▁▂▁▁▁▁▁▂▁▁▁▁▁▂▁▁▁▁▁▁▂▁▂▁▁▂▂▂▁▁
val_loss,█▅▃▃▂▂▂▂▁▂▁▂▂▂▂▂▁▂▁▁▁▂▁▂▁▂▁▁▂▁▁▂▂▂▂▂▂▁▂▂

0,1
epoch,50.0
loss,0.07532
val_loss,0.10497


[34m[1mwandb[0m: Agent Starting Run: kniv3y2h with config:
[34m[1mwandb[0m: 	batch_size: 1
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	momentum: 5
[34m[1mwandb[0m: 	weight_decay: 1e-08
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Total training loss:  0.010654656711267307
Total validation loss:  0.040370094240643084
Total training loss:  0.009621016288292594
Total validation loss:  0.036765035474672914
Total training loss:  0.00882192426797701
Total validation loss:  0.03392432176042348
Total training loss:  0.008196097529435065
Total validation loss:  0.0317085231654346
Total training loss:  0.007698535315284971
Total validation loss:  0.029894016159232706
Total training loss:  0.007279488749190932
Total validation loss:  0.02832855290034786
Total training loss:  0.006911856966326013
Total validation loss:  0.026935970003250986
Total training loss:  0.0065816890419228
Total validation loss:  0.02567568578524515
Total training loss:  0.006281367346673505
Total validation loss:  0.024524601234588772
Total training loss:  0.0060064283170504496
Total validation loss:  0.023468760948162526
Total training loss:  0.005753901819844032
Total validation loss:  0.022498023114167154
Total training loss:  0.005521464812773

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▇▆▆▅▅▅▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁
val_loss,█▇▇▆▆▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁

0,1
epoch,50.0
loss,0.2798
val_loss,0.28425


[34m[1mwandb[0m: Agent Starting Run: qerm4u4w with config:
[34m[1mwandb[0m: 	batch_size: 1
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	momentum: 5
[34m[1mwandb[0m: 	weight_decay: 1e-08
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Total training loss:  0.011563098451006226
Total validation loss:  0.04372176015749574
Total training loss:  0.01047326725529274
Total validation loss:  0.04035496967844665
Total training loss:  0.009752946825756226
Total validation loss:  0.03781073621939868
Total training loss:  0.00916439037246164
Total validation loss:  0.03560480277519673
Total training loss:  0.008637764614832122
Total validation loss:  0.03357680235058069
Total training loss:  0.00814807783535798
Total validation loss:  0.03169226221507415
Total training loss:  0.007700259087869199
Total validation loss:  0.029996068216860294
Total training loss:  0.007299649339984171
Total validation loss:  0.02848159510176629
Total training loss:  0.006940989562281175
Total validation loss:  0.02712091145804152
Total training loss:  0.006617278497287771
Total validation loss:  0.025887606316246092
Total training loss:  0.006322685814666329
Total validation loss:  0.024761142034549266
Total training loss:  0.006052779055607971


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▇▆▆▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
val_loss,██▇▇▆▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▂▂▁▁▁▁▁▁▁▁▁▁

0,1
epoch,50.0
loss,0.30938
val_loss,0.32406


[34m[1mwandb[0m: Agent Starting Run: 1j86z8on with config:
[34m[1mwandb[0m: 	batch_size: 1
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	momentum: 5
[34m[1mwandb[0m: 	weight_decay: 1e-07
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Total training loss:  0.01229207052529091
Total validation loss:  0.04643997107632458
Total training loss:  0.011073205154389143
Total validation loss:  0.04229080432560295
Total training loss:  0.010148520181246568
Total validation loss:  0.03900981473270804
Total training loss:  0.009391316991241183
Total validation loss:  0.036241548135876656
Total training loss:  0.008749479355174117
Total validation loss:  0.033883515861816704
Total training loss:  0.008202126475225668
Total validation loss:  0.03186442703008652
Total training loss:  0.007730570025159977
Total validation loss:  0.03010909107979387
Total training loss:  0.007317580304516014
Total validation loss:  0.028558032063301653
Total training loss:  0.006950251161470078
Total validation loss:  0.027167739463038743
Total training loss:  0.006619466341362568
Total validation loss:  0.02590874955058098
Total training loss:  0.00631900275402586
Total validation loss:  0.024760925269220024
Total training loss:  0.0060445625458669

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▇▆▆▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▂▁▂▁▁▁▁▁▁▁▁▁▁
val_loss,█▇▇▆▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁

0,1
epoch,50.0
loss,0.33356
val_loss,0.30039
