<a href="https://colab.research.google.com/github/Raviteja654321/CVIT_Workshop/blob/main/Day_6/Day_6_wandb_demo_HandsOn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using wandb to track experiments.

Demo task: multi-class image classification using CIFAR10 dataset.

In [40]:
from sklearn.metrics import average_precision_score
from torch.utils.data import DataLoader
from torchvision import datasets, models
from torchvision import transforms as T
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

!pip install wandb

!wandb.login()


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
/bin/bash: -c: line 1: syntax error: unexpected end of file


# The next cell includes-
- Collecting the CIFAR10 dataset and defining data loaders.
- Methods to load model, criterion, optimizer and schedulers.
- Definition of AverageMeter

In [41]:
# Downloading CIFAR10 dataset
inp_transforms = T.Compose([T.ToTensor(),
                            T.Normalize(mean=[0.485, 0.456, 0.406],
                                        std=[0.229, 0.224, 0.225])])
tgt_transforms = T.Lambda(lambda y: torch.zeros(10, dtype=torch.long).scatter_(0, torch.tensor(y), value=1))
cifar10 = datasets.CIFAR10(root = "/.",
                           transform = inp_transforms,
                           target_transform = tgt_transforms,
                           download = True)

# Defining dataset split (80-20)
train_dataset, val_dataset = torch.utils.data.random_split(cifar10,
                                                           [int(len(cifar10)*0.80), int(len(cifar10)*0.20)])

# Defining the dataloaders
train_dataloader = DataLoader(train_dataset,
                              batch_size=200,
                              shuffle=True)
val_dataloader = DataLoader(val_dataset,
                            batch_size=200,
                            shuffle=False)


# Method to get model based on config param model_type
def get_model(model_type):
    model = None
    if model_type == "pretrained": # Loading pretrained ResNet18 and with updated to final fc layer. 
        model = models.resnet18(pretrained=True)
        model.fc = nn.Linear(512, 10)
        model = model.to(device)
    elif model_type == "scratch": # Loading a blank ResNet18 which generated 10 outputs.
        model = models.resnet18(num_classes=10)
        model = model.to(device)
    else:
        raise NotImplemented
    return model


# Method to get criterion, optimizer and scheduler based on config params.
def get_criterion_optimizer_scheduler(config, model):
    optim_dct = {
        "adam": optim.Adam,
        "SGD": optim.SGD,
        "RMSprop": optim.RMSprop
    }
    optimizer = optim_dct[config["optimizer"]](model.parameters(), lr=config["lr"])
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.1,
                                                           patience=config["scheduler_patience"],
                                                           threshold=config["scheduler_thresh"])
    criterion = nn.CrossEntropyLoss()
    return criterion, optimizer, scheduler



# Remainder of this cell includes definition of AverageMeter (can be ignored)
"""
Code taken from Pytorch ImageNet examples
https://github.com/pytorch/examples/blob/main/imagenet/main.py#L375
"""
class Summary():
    NONE = 0
    AVERAGE = 1
    SUM = 2
    COUNT = 3

class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self, name, fmt=':f', summary_type=Summary.AVERAGE):
        self.name = name
        self.fmt = fmt
        self.summary_type = summary_type
        self.val_history = list()
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
        self.val_history = list()

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
        self.val_history.append(val)

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)
    
    def summary(self):
        fmtstr = ''
        if self.summary_type is Summary.NONE:
            fmtstr = ''
        elif self.summary_type is Summary.AVERAGE:
            fmtstr = '{name} {avg:.3f}'
        elif self.summary_type is Summary.SUM:
            fmtstr = '{name} {sum:.3f}'
        elif self.summary_type is Summary.COUNT:
            fmtstr = '{name} {count:.3f}'
        else:
            raise ValueError('invalid summary type %r' % self.summary_type)        
        return fmtstr.format(**self.__dict__)


Files already downloaded and verified


# Following cell includes-
- Defining the train and eval loops.
- Method to trigger training loops based on config parameters.

In [42]:
import wandb
wandb.login()


[34m[1mwandb[0m: Currently logged in as: [33mraviteja_6[0m ([33mcvit_work[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [48]:
# The train function without wandb logging

def train(model, criterion, optimizer, scheduler, epochs, train_dataloader, val_dataloader, device):
    for epoch in range(epochs):
        model.train()
        loss_meter = AverageMeter("train_loss", ":.5f")
        epoch_outs, epoch_tgt = list(), list()
        for data, tgt_vec in tqdm(train_dataloader):
            data, tgt_vec = data.to(device), tgt_vec.to(device)
            targets = torch.argmax(tgt_vec, axis=1)
            optimizer.zero_grad()
            out = model(data)
            loss = criterion(out, targets)
            loss_meter.update(loss.item(), data.shape[0])
            loss.backward()
            optimizer.step()
            epoch_outs.append(out)
            epoch_tgt.append(tgt_vec)
        predictions = torch.vstack([torch.softmax(out, axis=1) for out in epoch_outs]).detach().cpu().numpy()
        targets = torch.cat([tgt for tgt in epoch_tgt], dim=0).detach().cpu().numpy()
        ap_score = average_precision_score(targets, predictions)
        eval_loss_meter, eval_ap_score = evaluate(model, criterion, val_dataloader, device)
        data_to_log = {
            "epoch": epoch+1,
            "train_loss": loss_meter.avg,
            "eval_loss": eval_loss_meter.avg,
            "train_ap_score": ap_score,
            "eval_ap_score": eval_ap_score,
            "lr": optimizer.state_dict()["param_groups"][0]["lr"],
        }
        scheduler.step(eval_loss_meter.avg)
        print(data_to_log)
        # wandb.log(data_to_log)


@torch.no_grad()
def evaluate(model, criterion, val_dataloader, device):
    model.eval()
    loss_meter = AverageMeter("eval_loss", ":.5f")
    epoch_outs, epoch_tgt = list(), list()
    for data, tgt_vec in val_dataloader:
        data, tgt_vec = data.to(device), tgt_vec.to(device)
        targets = torch.argmax(tgt_vec, axis=1)
        out = model(data)
        loss = criterion(out, targets)
        loss_meter.update(loss.item(), data.shape[0])
        epoch_outs.append(out)
        epoch_tgt.append(tgt_vec)
    predictions = torch.vstack([torch.softmax(out, axis=1) for out in epoch_outs]).detach().cpu().numpy()
    targets = torch.cat([tgt for tgt in epoch_tgt], dim=0).detach().cpu().numpy()
    ap_score = average_precision_score(targets, predictions)
    return loss_meter, ap_score


def trigger_training(config):
    model = get_model(config["model_type"])
    criterion, optimizer, scheduler = get_criterion_optimizer_scheduler(config, model)
    epochs = config["num_epochs"]

    train(model, criterion, optimizer, scheduler, epochs, train_dataloader, val_dataloader, device)


# Complete the config file, edit the cells in this notebook to log data to wandb and trigger training loops!

In [54]:
# Fill the Config file below and log the experiment at wandb
config = {
    "lr": 0.0, 
    "model_type": "scratch", # pretrained/scratch
    "optimizer": "adam", # adam/SGD/RMSprop
    "criterion": "ce",
    "scheduler_patience": 3,
    "scheduler_thresh": 0.001,
    "num_epochs": 40, # CHANGE
    "gpu_id": 0,
    "wandb_run_name": "ravi's_run" ### FILL YOUR NAME HERE
}


# WandB Steps

In [55]:
### Step 1: Import WandB in your code

import wandb

### Step 1 ends

In [56]:
### Step 2:
# Initiate wandb in your script. The moment we trigger wandb.init(), an active
# socket connection is established between your machine and wandb server.
# We specify the entity (wandb username) and project (which wandb project to use for logging)

wandb.init(entity = "dhruv_sri",   # wandb username. (NOT REQUIRED ARG. ANYMORE, it fetches from initial login)
           project = "wandb_demo", # wandb project name. New project will be created if given project is missing.
           config = config         # Config dict
          )
wandb.run.name = config["wandb_run_name"]

### Step 2 ends.


0,1
epoch,▁▂▃▃▄▅▆▆▇█
eval_ap_score,▅▄▃█▁▅▇▆▅▅
eval_loss,▅▇▆▂▆▅█▂▂▁
lr,▁▁▁▁▁▁▁▁▁▁
train_ap_score,▆▂▅▅▁▁█▁█▂
train_loss,▄▆█▃▆▅▁▅▄▆

0,1
epoch,10.0
eval_ap_score,0.10031
eval_loss,2.53483
lr,0.0
train_ap_score,0.10054
train_loss,2.53055


In [57]:
### Step 3: Trigger wandb log
# The train function without wandb logging

def train(model, criterion, optimizer, scheduler, epochs, train_dataloader, val_dataloader, device):
    for epoch in range(epochs):
        model.train()
        loss_meter = AverageMeter("train_loss", ":.5f")
        epoch_outs, epoch_tgt = list(), list()
        for data, tgt_vec in tqdm(train_dataloader):
            data, tgt_vec = data.to(device), tgt_vec.to(device)
            targets = torch.argmax(tgt_vec, axis=1)
            optimizer.zero_grad()
            out = model(data)
            loss = criterion(out, targets)
            loss_meter.update(loss.item(), data.shape[0])
            loss.backward()
            optimizer.step()
            epoch_outs.append(out)
            epoch_tgt.append(tgt_vec)
        predictions = torch.vstack([torch.softmax(out, axis=1) for out in epoch_outs]).detach().cpu().numpy()
        targets = torch.cat([tgt for tgt in epoch_tgt], dim=0).detach().cpu().numpy()
        ap_score = average_precision_score(targets, predictions)
        eval_loss_meter, eval_ap_score = evaluate(model, criterion, val_dataloader, device)
        data_to_log = {
            "epoch": epoch+1,
            "train_loss": loss_meter.avg,
            "eval_loss": eval_loss_meter.avg,
            "train_ap_score": ap_score,
            "eval_ap_score": eval_ap_score,
            "lr": optimizer.state_dict()["param_groups"][0]["lr"],
        }
        scheduler.step(eval_loss_meter.avg)
        print(data_to_log)
        wandb.log(data_to_log)


@torch.no_grad()
def evaluate(model, criterion, val_dataloader, device):
    model.eval()
    loss_meter = AverageMeter("eval_loss", ":.5f")
    epoch_outs, epoch_tgt = list(), list()
    for data, tgt_vec in val_dataloader:
        data, tgt_vec = data.to(device), tgt_vec.to(device)
        targets = torch.argmax(tgt_vec, axis=1)
        out = model(data)
        loss = criterion(out, targets)
        loss_meter.update(loss.item(), data.shape[0])
        epoch_outs.append(out)
        epoch_tgt.append(tgt_vec)
    predictions = torch.vstack([torch.softmax(out, axis=1) for out in epoch_outs]).detach().cpu().numpy()
    targets = torch.cat([tgt for tgt in epoch_tgt], dim=0).detach().cpu().numpy()
    ap_score = average_precision_score(targets, predictions)
    return loss_meter, ap_score


def trigger_training(config):
    model = get_model(config["model_type"])
    criterion, optimizer, scheduler = get_criterion_optimizer_scheduler(config, model)
    epochs = config["num_epochs"]

    train(model, criterion, optimizer, scheduler, epochs, train_dataloader, val_dataloader, device)

# This step is responsible for sending the logs to wandb
# data_to_log = {"epoch": epoch, "loss": loss_value, "accuracy": accuracy_value}

# wandb.log(data_to_log)
trigger_training(config)
### Step 3 ends.


100%|██████████| 200/200 [00:17<00:00, 11.75it/s]


{'epoch': 1, 'train_loss': 2.4630929005146025, 'eval_loss': 2.447629451751709, 'train_ap_score': 0.1050948485159302, 'eval_ap_score': 0.10802666556809715, 'lr': 0.0}


100%|██████████| 200/200 [00:17<00:00, 11.22it/s]


{'epoch': 2, 'train_loss': 2.463619065284729, 'eval_loss': 2.449119834899902, 'train_ap_score': 0.10491947021708939, 'eval_ap_score': 0.10795547850672346, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 11.88it/s]


{'epoch': 3, 'train_loss': 2.463335483074188, 'eval_loss': 2.4474487257003785, 'train_ap_score': 0.10493462033789558, 'eval_ap_score': 0.10805936382399439, 'lr': 0.0}


100%|██████████| 200/200 [00:17<00:00, 11.74it/s]


{'epoch': 4, 'train_loss': 2.4627153205871584, 'eval_loss': 2.4498410749435426, 'train_ap_score': 0.1049927181699011, 'eval_ap_score': 0.10815283059633651, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 11.99it/s]


{'epoch': 5, 'train_loss': 2.4636375868320464, 'eval_loss': 2.4488751983642576, 'train_ap_score': 0.10503393871200131, 'eval_ap_score': 0.10831088732936076, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 12.29it/s]


{'epoch': 6, 'train_loss': 2.463441170454025, 'eval_loss': 2.449700345993042, 'train_ap_score': 0.1050087542019599, 'eval_ap_score': 0.10798663858883546, 'lr': 0.0}


100%|██████████| 200/200 [00:17<00:00, 11.53it/s]


{'epoch': 7, 'train_loss': 2.4620381915569305, 'eval_loss': 2.4553409910202024, 'train_ap_score': 0.10533666951780987, 'eval_ap_score': 0.10786828225030898, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 12.22it/s]


{'epoch': 8, 'train_loss': 2.462830823659897, 'eval_loss': 2.451388368606567, 'train_ap_score': 0.10508876797302014, 'eval_ap_score': 0.10790572372535978, 'lr': 0.0}


100%|██████████| 200/200 [00:18<00:00, 10.86it/s]


{'epoch': 9, 'train_loss': 2.4636482226848604, 'eval_loss': 2.4514273834228515, 'train_ap_score': 0.10493987934186003, 'eval_ap_score': 0.10815494860855088, 'lr': 0.0}


100%|██████████| 200/200 [00:18<00:00, 10.98it/s]


{'epoch': 10, 'train_loss': 2.4647344267368316, 'eval_loss': 2.452953782081604, 'train_ap_score': 0.10479909518838357, 'eval_ap_score': 0.10789911600375512, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 12.12it/s]


{'epoch': 11, 'train_loss': 2.46432128071785, 'eval_loss': 2.4497513914108278, 'train_ap_score': 0.10486832716277764, 'eval_ap_score': 0.10795454747893518, 'lr': 0.0}


100%|██████████| 200/200 [00:18<00:00, 11.08it/s]


{'epoch': 12, 'train_loss': 2.462405964136124, 'eval_loss': 2.452073826789856, 'train_ap_score': 0.1051147288388371, 'eval_ap_score': 0.10800537293664533, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 12.12it/s]


{'epoch': 13, 'train_loss': 2.4645602822303774, 'eval_loss': 2.4501070308685304, 'train_ap_score': 0.10471026093555065, 'eval_ap_score': 0.10826815619846078, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 11.95it/s]


{'epoch': 14, 'train_loss': 2.463180202245712, 'eval_loss': 2.45191113948822, 'train_ap_score': 0.10494433816609097, 'eval_ap_score': 0.10811079863930668, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 12.17it/s]


{'epoch': 15, 'train_loss': 2.4633521246910095, 'eval_loss': 2.4526393032073974, 'train_ap_score': 0.10515763870063574, 'eval_ap_score': 0.10819773051665971, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 12.16it/s]


{'epoch': 16, 'train_loss': 2.4629261040687562, 'eval_loss': 2.4524674129486086, 'train_ap_score': 0.10504058690209975, 'eval_ap_score': 0.10753269858919527, 'lr': 0.0}


100%|██████████| 200/200 [00:17<00:00, 11.58it/s]


{'epoch': 17, 'train_loss': 2.4634186923503876, 'eval_loss': 2.453853702545166, 'train_ap_score': 0.1050677407101103, 'eval_ap_score': 0.10793572190669012, 'lr': 0.0}


100%|██████████| 200/200 [00:17<00:00, 11.13it/s]


{'epoch': 18, 'train_loss': 2.463993785381317, 'eval_loss': 2.450676245689392, 'train_ap_score': 0.10489269395757475, 'eval_ap_score': 0.10815301369839792, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 12.05it/s]


{'epoch': 19, 'train_loss': 2.4635503947734834, 'eval_loss': 2.452676281929016, 'train_ap_score': 0.10515155550027591, 'eval_ap_score': 0.10787076755730811, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 11.77it/s]


{'epoch': 20, 'train_loss': 2.4652561962604524, 'eval_loss': 2.4477973556518555, 'train_ap_score': 0.10461153758522541, 'eval_ap_score': 0.10811619056894617, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 12.04it/s]


{'epoch': 21, 'train_loss': 2.4647561168670653, 'eval_loss': 2.4475809717178345, 'train_ap_score': 0.10479760529889917, 'eval_ap_score': 0.10809772002567722, 'lr': 0.0}


100%|██████████| 200/200 [00:17<00:00, 11.59it/s]


{'epoch': 22, 'train_loss': 2.463616271018982, 'eval_loss': 2.452803988456726, 'train_ap_score': 0.10503731001662076, 'eval_ap_score': 0.1080260729665766, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 12.21it/s]


{'epoch': 23, 'train_loss': 2.4633716487884523, 'eval_loss': 2.4496699810028075, 'train_ap_score': 0.10517804854281279, 'eval_ap_score': 0.10797488201864999, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 11.86it/s]


{'epoch': 24, 'train_loss': 2.463193029165268, 'eval_loss': 2.449277300834656, 'train_ap_score': 0.10496978431590769, 'eval_ap_score': 0.10806247692695423, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 12.39it/s]


{'epoch': 25, 'train_loss': 2.463753961324692, 'eval_loss': 2.448135027885437, 'train_ap_score': 0.10493573643761833, 'eval_ap_score': 0.10836595068647807, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 12.22it/s]


{'epoch': 26, 'train_loss': 2.4642835903167724, 'eval_loss': 2.4499573564529418, 'train_ap_score': 0.10500237903098411, 'eval_ap_score': 0.10797503657151156, 'lr': 0.0}


100%|██████████| 200/200 [00:17<00:00, 11.17it/s]


{'epoch': 27, 'train_loss': 2.463860375881195, 'eval_loss': 2.4490883636474607, 'train_ap_score': 0.10496424067867187, 'eval_ap_score': 0.10797468113006174, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 12.15it/s]


{'epoch': 28, 'train_loss': 2.464039207696915, 'eval_loss': 2.4504351902008055, 'train_ap_score': 0.10499475985180391, 'eval_ap_score': 0.1078615982408659, 'lr': 0.0}


100%|██████████| 200/200 [00:17<00:00, 11.47it/s]


{'epoch': 29, 'train_loss': 2.463043735027313, 'eval_loss': 2.452168650627136, 'train_ap_score': 0.10509584111227717, 'eval_ap_score': 0.1079631392462077, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 12.14it/s]


{'epoch': 30, 'train_loss': 2.4638670921325683, 'eval_loss': 2.450228977203369, 'train_ap_score': 0.10480118999275782, 'eval_ap_score': 0.1082231438316688, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 11.91it/s]


{'epoch': 31, 'train_loss': 2.464011716842651, 'eval_loss': 2.449205894470215, 'train_ap_score': 0.10468689850572525, 'eval_ap_score': 0.1083958330070596, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 12.18it/s]


{'epoch': 32, 'train_loss': 2.4636079597473146, 'eval_loss': 2.448198928833008, 'train_ap_score': 0.10514433398056995, 'eval_ap_score': 0.10792045021959809, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 12.25it/s]


{'epoch': 33, 'train_loss': 2.463630796670914, 'eval_loss': 2.4512928915023804, 'train_ap_score': 0.10518019735810898, 'eval_ap_score': 0.10819562238167543, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 12.21it/s]


{'epoch': 34, 'train_loss': 2.464930305480957, 'eval_loss': 2.4523369121551513, 'train_ap_score': 0.10493750126819616, 'eval_ap_score': 0.10799055900140413, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 12.39it/s]


{'epoch': 35, 'train_loss': 2.463846756219864, 'eval_loss': 2.4499032068252564, 'train_ap_score': 0.10510503956599324, 'eval_ap_score': 0.10799107306500069, 'lr': 0.0}


100%|██████████| 200/200 [00:17<00:00, 11.23it/s]


{'epoch': 36, 'train_loss': 2.463838069438934, 'eval_loss': 2.448280987739563, 'train_ap_score': 0.10486057325770415, 'eval_ap_score': 0.10828568753721805, 'lr': 0.0}


100%|██████████| 200/200 [00:15<00:00, 12.74it/s]


{'epoch': 37, 'train_loss': 2.4631851994991303, 'eval_loss': 2.4506756353378294, 'train_ap_score': 0.10485564040901037, 'eval_ap_score': 0.10824245352216129, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 11.87it/s]


{'epoch': 38, 'train_loss': 2.4621990168094636, 'eval_loss': 2.452499279975891, 'train_ap_score': 0.10513007216410827, 'eval_ap_score': 0.10788896528331801, 'lr': 0.0}


100%|██████████| 200/200 [00:15<00:00, 12.69it/s]


{'epoch': 39, 'train_loss': 2.4636429631710053, 'eval_loss': 2.4503595542907717, 'train_ap_score': 0.10485716900722386, 'eval_ap_score': 0.10813438989000154, 'lr': 0.0}


100%|██████████| 200/200 [00:16<00:00, 12.06it/s]


{'epoch': 40, 'train_loss': 2.463783495426178, 'eval_loss': 2.451315178871155, 'train_ap_score': 0.10477871113172336, 'eval_ap_score': 0.10843871482792089, 'lr': 0.0}


In [58]:
### Step 4 (Optional)
# This closes the active socket connection to wandb server. Optional since wandb destructor does the same.

wandb.finish()

### Step 4 ends.


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval_ap_score,▅▄▅▆▇▅▄▄▆▄▄▅▇▅▆▁▄▆▄▆▅▅▄▅▇▄▄▄▄▆█▄▆▅▅▇▆▄▆█
eval_loss,▁▂▁▃▂▃█▄▅▆▃▅▃▅▆▅▇▄▆▁▁▆▃▃▂▃▂▄▅▃▃▂▄▅▃▂▄▅▄▄
lr,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_ap_score,▆▄▄▅▅▅█▆▄▃▃▆▂▄▆▅▅▄▆▁▃▅▆▄▄▅▄▅▆▃▂▆▆▄▆▃▃▆▃▃
train_loss,▃▄▄▂▄▄▁▃▅▇▆▂▆▃▄▃▄▅▄█▇▄▄▄▅▆▅▅▃▅▅▄▄▇▅▅▃▁▄▅

0,1
epoch,40.0
eval_ap_score,0.10844
eval_loss,2.45132
lr,0.0
train_ap_score,0.10478
train_loss,2.46378


# WandB sweeps related steps

In [63]:
### Step 1:
# Create a WandB sweep config file.
# This config file will be used at the WandB website to initialize a sweep server
program: "demo.py"
method: "grid"
metric:
  name: "eval_ap_score"
  goal: "maximize"
parameters:
    criterion:
      value: "ce"
    gpu_id:
      value: 0
    lr:
      values: [0.1, 0.001, 0.0001]
    model_type:
      values: ["scratch", "pretrained"]
    num_epochs:
      value: 25
    optimizer:
      values: ["adam", "SGD", "RMSprop"]
    scheduler_patience:
      value: 3
    scheduler_thresh:
      value: 0.01

        
### A sample sweep config file if bayes method is used-
# program: wandb_demo.py
# method: bayes
# metric:
#   name: "eval_ap_score"
#   goal: maximize
# parameters:
#   lr:
#     distribution: uniform
#     min: 0.00001
#     max: 0.1
#   criterion:
#     distribution: categorical
#     value:
#       - ce
#   optimizer:
#     distribution: categorical
#     values:
#       - adam
#       - SGD
#       - RMSprop
#   model_type:
#     distribution: categorical
#     values:
#       - pretrained
#       - scratch
#   num_epochs:
#     value:
#       - 30
#   scheduler_thresh:
#     distribution: uniform
#     min: 0.001
#     max: 0.01
#   scheduler_patience:
#     distribution: int_uniform
#     min: 2
#     max: 10


In [62]:
## Step 2
# After using the above config on wandb website, you will get a sweep id in return.
# E.g. sweep id- dhruv_sri/wandb_demo/hbyp0tl8
#
# Add the following agent line in your code-
# Use the generated sweep id in the below code

wandb.agent(sweep_id="### FILL SWEEP ID HERE ###", function=sweep_agent_manager, count=100)


In [60]:
### Step 3
# Notice in above command we mentioned an argument named "function"
# Wandb agents must trigger a function where they can initiate a socket to wandb and get a config.
# So, we will use the following sweep_agent_manager function here-

def sweep_agent_manager():
    wandb.init()
    config = dict(wandb.config)
    run_name = f"{config['model_type']}_{config['optimizer']}_{config['lr']}"
    wandb.run.name = run_name
    trigger_training(config)


In [61]:
### Done.
# Now execute your training script on multiple machines.
# Each run will request the config file from wandb and related experiments will be logged.
# 
# NOTE!! wandb.log(data_to_log) must be present inside the code!! Else there is no meaning to sweep.


# ------------------------------ Ends ------------------------------