# Recitation 0H: Weights and Biases

In this recitation, you will learn about the importance of performance visualization and model tracking using [WandB](https://wandb.ai/), a tool for performance visualization, model and data version controlling and hyperparameter tuning. 

## Installation and Libraries

In [1]:
## Installing WandB
!pip install wandb -qqq

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets
from torchvision.transforms import ToTensor 

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

from tqdm import tqdm

In [None]:
import wandb, os
os.environ['WANDB_API_KEY'] = '0fc513cd516b97b84b6678a30331771336830356'
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 

··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Helper functions and Model

In [None]:
data_train = datasets.CIFAR10(
    root = 'data',
    train = True,                         
    transform = ToTensor(), 
    download = True,            
)
data_test = datasets.CIFAR10(
    root = 'data', 
    train = False, 
    download = True, 
    transform = ToTensor()
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting data/cifar-10-python.tar.gz to data
Files already downloaded and verified


In [None]:
def build_data(batch_size, data_train, data_test):
    train_loader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, shuffle=False)
    return train_loader, test_loader

In [None]:
class Network(nn.Module):
  
  def __init__(self):

    super(Network, self).__init__()

    self.CNN = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            
            # nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=2),
            # nn.BatchNorm2d(128),
            # nn.ReLU(),
            
            # nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=2),
            # nn.BatchNorm2d(256),
            # nn.ReLU(),
            
            nn.AvgPool2d(kernel_size=9),
            nn.Flatten()
    )

    self.classification = nn.Linear(576, 10)
  def forward(self, x):
    
    x_cnn = self.CNN(x)
    res = self.classification(x_cnn)
    
    return res

model = Network().to(device)
print(model)

Network(
  (CNN): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): AvgPool2d(kernel_size=9, stride=9, padding=0)
    (4): Flatten(start_dim=1, end_dim=-1)
  )
  (classification): Linear(in_features=576, out_features=10, bias=True)
)


In [None]:
train_loader, test_loader = build_data(64, data_train, data_test)

for x, y in train_loader:
  break
model(x.to(device)).shape

torch.Size([64, 10])

In [None]:
def get_optim(optimizer, learning_rate, model):
  if optimizer=='sgd':
    return optim.SGD(model.parameters(), lr=learning_rate)
  else:
    return optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
def train_epoch(model, loader, optimizer, criterion, scaler):
    num_correct = 0
    total_loss = 0

    for i, (x, y) in enumerate(loader):
          optimizer.zero_grad()

          x = x.cuda()
          y = y.cuda()

          with torch.cuda.amp.autocast():     
              outputs = model(x)
              loss = criterion(outputs, y)

          total_loss += float(loss)

          scaler.scale(loss).backward() 
          scaler.step(optimizer) 
          scaler.update() 
    ep_loss = float(total_loss / len(loader))

    return model, ep_loss

In [None]:
def train_sweep(config = None):
    with wandb.init(config=config):

        config = wandb.config

        train_loader, test_loader = build_data(config.batch_size, data_train, data_test)
        
        model = Network().to(device)

        optimizer = get_optim(config.optimizer, config.learning_rate, model)
        
        criterion = nn.CrossEntropyLoss()

        scaler = torch.cuda.amp.GradScaler()

        for epoch in range(config.epochs):
           
            model, loss = train_epoch(model, train_loader, optimizer, criterion, scaler)
           
            wandb.log({'loss': loss})

In [None]:
def train(model):

  # Dont worry about all this, you'll be very familiar with it after HW1

  best_acc = 0

  for epoch in range(run_config['epochs']):
      batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train') 

      num_correct = 0
      total_loss = 0

      for i, (x, y) in enumerate(train_loader):
          optimizer.zero_grad()

          x = x.cuda()
          y = y.cuda()

          with torch.cuda.amp.autocast():     
              outputs = model(x)
              loss = criterion(outputs, y)

          num_correct += int((torch.argmax(outputs, axis=1) == y).sum())
          total_loss += float(loss)

          batch_bar.set_postfix(
              acc="{:.04f}%".format(100 * num_correct / ((i + 1) * run_config['batch_size'])),
              loss="{:.04f}".format(float(total_loss / (i + 1))),
              num_correct=num_correct,
              lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])))
           
          scaler.scale(loss).backward()
          scaler.step(optimizer)
          scaler.update()

          
          batch_bar.update()
      batch_bar.close() 

      train_loss = float(total_loss / len(train_loader))
      train_acc = 100 * num_correct / (len(train_loader) * run_config['batch_size'])
      lr = float(optimizer.param_groups[0]['lr'])
      
      print("Epoch {}/{}: Train Acc {:.04f}%, Train Loss {:.04f}, Learning Rate {:.04f}".format(
          epoch + 1,
          run_config['epochs'],
          train_acc ,
          train_loss,
          lr
          )
      )
      
      # What to log 
      
      metrics = {
          "train_loss":train_loss,
          "train_acc": train_acc,
          'lr': lr
      }

      # Log to run
      wandb.log(metrics)

      # Updating the model version
      
      if train_acc > best_acc:
        best_acc = train_acc
        
        # Saving the model and optimizer states
        
        torch.save({
              'model_state_dict': model.state_dict(),
              'optimizer_state_dict': optimizer.state_dict()
              }, "Model")
        
        # Creating Artifact
        
        model_artifact = wandb.Artifact(run_config['model'], type='model')
        
        # Adding model file to Artifact
        
        model_artifact.add_file("Model")

        # Saving Artifact to WandB

        run.log_artifact(model_artifact)

  wandb.finish()

## Simple Usage

You can run the training function and log the performance metrics of your choice into the WandB GUI. This simple method will allow you to monitor trends in a specefic run configuration as well as comparing different runs

In [None]:
run_config = {
    'model': '1-2dcnn',
    'optimizer':'sgd',
    'lr': 2e-3,
    'batch_size':64,
    'epochs': 100
}

train_loader, test_loader = build_data(run_config['batch_size'], data_train, data_test)

optimizer = get_optim(run_config['optimizer'], run_config['lr'], model)

criterion = nn.CrossEntropyLoss()

scaler = torch.cuda.amp.GradScaler()

In [None]:
run = wandb.init(
    project="wandb-quickstart", 
    job_type="model-training", 
    name=run_config['model'], 
    config=run_config
    )

[34m[1mwandb[0m: Currently logged in as: [33mmelamin[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
train(model)



Epoch 1/100: Train Acc 24.4945%, Train Loss 2.0993, Learning Rate 0.0020




Epoch 2/100: Train Acc 31.6696%, Train Loss 1.9526, Learning Rate 0.0020




Epoch 3/100: Train Acc 34.0893%, Train Loss 1.8903, Learning Rate 0.0020




Epoch 4/100: Train Acc 35.9355%, Train Loss 1.8480, Learning Rate 0.0020




Epoch 5/100: Train Acc 36.9865%, Train Loss 1.8156, Learning Rate 0.0020




Epoch 6/100: Train Acc 38.2253%, Train Loss 1.7888, Learning Rate 0.0020




Epoch 7/100: Train Acc 38.9706%, Train Loss 1.7668, Learning Rate 0.0020




Epoch 8/100: Train Acc 39.7778%, Train Loss 1.7459, Learning Rate 0.0020




Epoch 9/100: Train Acc 40.2873%, Train Loss 1.7272, Learning Rate 0.0020




Epoch 10/100: Train Acc 40.8728%, Train Loss 1.7102, Learning Rate 0.0020




Epoch 11/100: Train Acc 41.4462%, Train Loss 1.6952, Learning Rate 0.0020




Epoch 12/100: Train Acc 42.0556%, Train Loss 1.6812, Learning Rate 0.0020




Epoch 13/100: Train Acc 42.4932%, Train Loss 1.6674, Learning Rate 0.0020




Epoch 14/100: Train Acc 42.8549%, Train Loss 1.6544, Learning Rate 0.0020




Epoch 15/100: Train Acc 43.4383%, Train Loss 1.6431, Learning Rate 0.0020




Epoch 16/100: Train Acc 43.7980%, Train Loss 1.6314, Learning Rate 0.0020




Epoch 17/100: Train Acc 44.1896%, Train Loss 1.6189, Learning Rate 0.0020


Train:  87%|████████▋ | 682/782 [00:10<00:01, 62.21it/s, acc=44.5919%, loss=1.6076, lr=0.0020, num_correct=19492]

KeyboardInterrupt: ignored

## HyperParameter Sweeps


[Sweeps](https://docs.wandb.ai/guides/sweeps) are a way of automating hyperparameter tuning in Deep Learning Models. You set up the values that you want your sweep to try and then check the affect of changing each parameter on each value on the model.

In [None]:
# Initialize the sweep and set the method (grid, random or bayes"ian")

sweep_config = {
    'method': 'random'
    }

In [None]:
# What is the objective of the sweep (minimize loss, maximize accuracy)

metric = {
    'name':'loss',
    'goal':'minimize'
}
sweep_config['metric'] = metric

In [None]:
# Hyperparameters to work with

parameters_dict = {
    'optimizer':{
        'values': ['sgd', 'adam']
    },
    'learning_rate':{
        'distribution':'uniform',
        'min':2e-4,
        'max':1e-1
    },
    'batch_size': {
        'distribution': 'q_log_uniform_values',
        'q':4,
        'min': 16,
        'max': 128
    },
    'epochs':{
        'value': 5
    }
}
sweep_config['parameters'] = parameters_dict

In [None]:
# Initalizing the sweep

sweep_id = wandb.sweep(sweep_config, project="CIFAR-Sweep")

Create sweep with ID: ayb504kc
Sweep URL: https://wandb.ai/melamin/CIFAR-Sweep/sweeps/ayb504kc


In [None]:
# Running the sweep

wandb.agent(sweep_id, train_sweep, count=5)

[34m[1mwandb[0m: Agent Starting Run: xf6f6xuq with config:
[34m[1mwandb[0m: 	batch_size: 60
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.04159255916064954
[34m[1mwandb[0m: 	optimizer: sgd


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
loss,█▄▃▂▁

0,1
loss,1.33522


[34m[1mwandb[0m: Agent Starting Run: 0yn818cu with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.09121955125437652
[34m[1mwandb[0m: 	optimizer: adam


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
loss,█▁▁▁▁

0,1
loss,2.02501


[34m[1mwandb[0m: Agent Starting Run: 8dpe4y5g with config:
[34m[1mwandb[0m: 	batch_size: 28
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.07476097755698989
[34m[1mwandb[0m: 	optimizer: adam


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
loss,█▂▁▂▁

0,1
loss,2.04314


[34m[1mwandb[0m: Agent Starting Run: 83ykzv67 with config:
[34m[1mwandb[0m: 	batch_size: 28
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.0897292982869954
[34m[1mwandb[0m: 	optimizer: sgd


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


## Artifact and Model Versioning

Artifacts are a method of managing versions for data and models. You can use the artifacts to keep and compare versions of your model while training making it easier to share data and models between team members.

In [None]:
run_config = {
    'model': '1-2dcnn',
    'optimizer':'adam',
    'lr': 5e-3,
    'batch_size':20,
    'epochs': 100
}

train_loader, test_loader = build_data(run_config['batch_size'], data_train, data_test)
optimizer = get_optim(run_config['optimizer'], run_config['lr'], model)
criterion = nn.CrossEntropyLoss()
scaler = torch.cuda.amp.GradScaler()

In [None]:
run = wandb.init(
    project="wandb-quickstart", 
    job_type="model-training", 
    name=run_config['model'], 
    config=run_config
    )

In [None]:
train(model)

In [None]:
## Retreiving the model

# Getting the latest version of the artifact
artifact = run.use_artifact('{}:latest'.format(run_config['model']))
# Downloading the artifact
artifact_dir = artifact.download()
# Loading the model
model_dict = torch.load(os.path.join(artifact_dir, 'Model'))



# Loading weights
model.load_state_dict(model_dict['model_state_dict'])
# Loading optimizer state
optimizer.load_state_dict(model_dict['optimizer_state_dict'])

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f617e130fd0>> (for pre_run_cell):


Exception: ignored

[34m[1mwandb[0m: [32m[41mERROR[0m Error while calling W&B API: failed to find run CIFAR-Sweep/392gihdc (<Response [404]>)


CommError: ignored

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f617e130fd0>> (for post_run_cell):


Exception: ignored

In [None]:
# Finishing runs
wandb.finish()

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f617e130fd0>> (for pre_run_cell):


Exception: ignored

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f617e130fd0>> (for post_run_cell):


Exception: ignored