In [1]:
# imports

import torch
from torch import nn,optim
import torch.nn.functional as F
from torchvision import datasets, transforms
#from torch.utils.tensorboard import SummaryWriter
import wandb

import matplotlib.pyplot as plt
import numpy as np
import time

In [2]:
transform = transforms.Compose([
    transforms.RandomRotation(degrees=(-90,90)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261))
    #transforms.RandomPerspective(distortion_scale=0.20, p=0.5), # random perspective transformation with a given probability.
])

trainset = datasets.CIFAR10("CIFAR10", train=True, transform=transform, download= True)
testset = datasets.CIFAR10("CIFAR10", train=False, transform=transform, download= True)

0.0%

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to CIFAR10\cifar-10-python.tar.gz


2.9%IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

5.3%IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

7.7%IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

10.1%IOPub me

Extracting CIFAR10\cifar-10-python.tar.gz to CIFAR10
Files already downloaded and verified


In [3]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mdeeplearning_project[0m (use `wandb login --relogin` to force relogin)


True

In [21]:
sweep_config = {
    'method': 'random', 
    'metrics':{'name':'loss','goal':'minimize'}
    }

parameters_dict = {
    'epochs':{'value':25},
    'optimizer': {'values': ['adam', 'sgd','adamax']},  # 3 combinations
    'learning_rate': {'values': [0.001, 0.005, 0.01, 0.05]}, # 4 combinations
    'batch_size': {'values': [32, 64, 128]},                 # 4 combinations
    'num_blocks': {'values': [(2,2,2)]}, # 3 combinations,(3,3,3),(4,4,4)
    'in_planes':{'values': [43]},     # 7 combinations 21, 26, 29, 34, 38, 
    'k':{'values':[2]}, 
    }

sweep_config['parameters'] = parameters_dict

print(sweep_config)

{'method': 'random', 'metrics': {'name': 'loss', 'goal': 'minimize'}, 'parameters': {'epochs': {'value': 25}, 'optimizer': {'values': ['adam', 'sgd', 'adamax']}, 'learning_rate': {'values': [0.001, 0.005, 0.01, 0.05]}, 'batch_size': {'values': [32, 64, 128]}, 'num_blocks': {'values': [(2, 2, 2)]}, 'in_planes': {'values': [43]}, 'k': {'values': [2]}}}


In [22]:
sweep_id = wandb.sweep(sweep_config, project="ResNet-sweeps-local-sudu")



Create sweep with ID: ynexar88
Sweep URL: https://wandb.ai/deeplearning_project/ResNet-sweeps-local-sudu/sweeps/ynexar88


In [6]:
def sweep(config=None):
    
    # Initialize a new wandb run
    with wandb.init(config=config):
        # this config will be set by sweep controller, randomly assigned each time
        config = wandb.config
        
        trainloader, testloader = loader(config.batch_size)
        network = build_network(config.num_blocks, config.in_planes, config.k)
        optimizer = build_optimizer(network, config.optimizer, config.learning_rate)
        run_epochs = check_run(network, config.epochs, 1)
        loss = nn.CrossEntropyLoss()
        history = train(network, run_epochs, optimizer, loss, trainloader, testloader,  device='cuda')

In [7]:
def check_run(network, epochs, terminate_epochs):
    total_params = sum(p.numel() for p in network.parameters())
    if total_params<5000000:
        run_epochs = epochs
    else:
        run_epochs = terminate_epochs
    return run_epochs

In [8]:
def build_optimizer(network, optimizer, learning_rate):
    if optimizer == "sgd":
        optimizer = optim.SGD(network.parameters(), lr=learning_rate, momentum=0.9)
    elif optimizer == "adam":
        optimizer = optim.Adam(network.parameters(), lr=learning_rate)
    elif optimizer == "adamax":
        optimizer = optim.Adamax(network.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
    elif optimizer == "rmsprop":
        optimizer = optim.RMSprop(network.parameters(), lr=learning_rate, alpha=0.99, eps=1e-08, weight_decay=0,
                                  momentum=0, centered=False)
    return optimizer

In [9]:
# num_blocks-no of blocks in each layers(list,.ie(2,2,2)); in_planes--> first conv plane depth; k--> widening factor 
def build_network(num_blocks, in_planes, k):
    if len(num_blocks) == 3:
        network = ResNet3(BasicBlock, num_blocks, in_planes, k)
    elif len(num_blocks) == 4:
        network = ResNet4(BasicBlock, num_blocks, in_planes, k)
    return network

In [1]:

class BasicBlock(nn.Module):

    def __init__(self, in_planes, planes, stride=1, kernel_size=3):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=kernel_size, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=kernel_size, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class ResNet4(nn.Module):
    def __init__(self, block, num_blocks, in_planes, k=2, num_classes=10, kernel_size=3):
        super(ResNet4, self).__init__()
        self.in_planes = in_planes
        self.avg_pool_kernal_size = 4
        self.kernel_size = kernel_size
        
        self.conv1 = nn.Conv2d(3, self.in_planes, kernel_size=kernel_size, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_planes)
        self.layer1 = self._make_layer(block, k*self.in_planes, num_blocks[0], stride=1) # 32
        self.layer2 = self._make_layer(block, k*self.in_planes, num_blocks[1], stride=2) # 64
        self.layer3 = self._make_layer(block, k*self.in_planes, num_blocks[2], stride=2) #128
        self.layer4 = self._make_layer(block, k*self.in_planes, num_blocks[3], stride=2) #256
        self.linear = nn.Linear(self.in_planes, num_classes) #512 dense layers

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride, self.kernel_size))
            self.in_planes = planes
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, self.avg_pool_kernal_size)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

class ResNet3(nn.Module):
    def __init__(self, block, num_blocks, in_planes, k=2, num_classes=10, kernel_size=3):
        super(ResNet3, self).__init__()
        self.in_planes = in_planes
        self.avg_pool_kernal_size = 4
        self.kernel_size = kernel_size
        
        self.conv1 = nn.Conv2d(3, self.in_planes, kernel_size=kernel_size, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_planes)
        self.layer1 = self._make_layer(block, k*self.in_planes, num_blocks[0], stride=1) # 32
        self.layer2 = self._make_layer(block, k*self.in_planes, num_blocks[1], stride=2) # 64
        self.layer3 = self._make_layer(block, k*self.in_planes, num_blocks[2], stride=2) #128
        #self.layer4 = self._make_layer(block, k*self.in_planes, num_blocks[3], stride=2) #256
        self.linear = nn.Linear(4*self.in_planes, num_classes)  #512 dense layers

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride, self.kernel_size))
            self.in_planes = planes
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        #out = self.layer4(out)
        out = F.avg_pool2d(out, self.avg_pool_kernal_size)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

NameError: name 'nn' is not defined

In [12]:
def train(model, epochs, optimizer, loss_fn, train_dl, val_dl, device='cuda'):
    
    #wandb.watch(model, loss_fn, log='all', log_freq=1000, log_graph=True)
    print('train() called: model=%s, opt=%s(lr=%f), epochs=%d, device=%s\n' % \
          (type(model).__name__, type(optimizer).__name__,
           optimizer.param_groups[0]['lr'], epochs, device))
    
    model.to(device)
    total_params = sum(p.numel() for p in model.parameters())
    wandb.log({'total_params':total_params})
#     val_loss_min = np.Inf  # set valid loss to be infinity, will change when trainig loop starts
#     history             = {}
#     history['loss']     = []
#     history['val_loss'] = []
#     history['acc']      = []
#     history['val_acc']  = []

    start_time_sec = time.time()

    for epoch in range(0, epochs):

        # --- TRAIN AND EVALUATE ON TRAINING SET ----
        model.train()
        train_loss         = 0.0
        num_train_correct  = 0
        num_train_examples = 0

        for batch in train_dl:

            optimizer.zero_grad()

            x = batch[0].to(device)
            y = batch[1].to(device)
            yhat = model(x)
            loss = loss_fn(yhat,y)

            loss.backward()
            optimizer.step()

            train_loss          += loss.data.item() *x.size(0) # 
            num_train_correct   += (torch.max(yhat, 1)[1] == y).sum().item() # 
            num_train_examples  += x.shape[0]     

        train_acc = num_train_correct / num_train_examples 
        train_loss = train_loss/ len(train_dl.dataset)  # len(train_dl.dataset) --> number of data integer

        # ---EVALUATE ON VALIDATION SET ---
        model.eval()
        val_loss         = 0.0
        num_val_correct  = 0
        num_val_examples = 0

        for batch in val_dl:
            x = batch[0].to(device)
            y = batch[1].to(device)
            yhat = model(x)
            loss = loss_fn(yhat, y)

            val_loss           += loss.data.item() * x.size(0)
            num_val_correct    += (torch.max(yhat,1)[1] == y).sum().item()
            num_val_examples   += y.shape[0]

        val_acc = num_val_correct / num_val_examples
        val_loss = val_loss/len(val_dl.dataset)

        print('Epoch %3d/%3d, train loss: %5.2f, train acc: %5.2f, val loss: %5.2f, val acc: %5.2f' % \
                        (epoch, epochs, train_loss, train_acc, val_loss, val_acc))
        #history['loss'].append(train_loss)
        #history['val_loss'].append(val_loss)
        #history['acc'].append(train_acc)
        #history['val_acc'].append(val_acc)
        
        # wandb collects all the info and provides interface
        wandb.log({'epoch':epoch, 'train_loss':train_loss, 'val_loss':val_loss, 'train_acc':train_acc,'val_acc':val_acc})  
        
        #early stopping
        #if val_loss <= val_loss_min:
            #print("Validation loss Decreased {} --> {}. Saving model...".format(val_loss_min, val_loss))
            #torch.save(model.state_dict(),'models/ResNet1'+str(sweep_num)+'.pt')
            #val_loss_min = val_loss
            #torch.onnx.export(model, x, 'onnxmodels/ResNet1'+str(sweep_num)+".onnx")
            #wandb.save('models/ResNet1'+str(sweep_num)+".onnx")
    
    # END OF TRAINING LOOP
    end_time_sec       = time.time()
    total_time_sec     = end_time_sec - start_time_sec
    time_per_epoch_sec = total_time_sec / epochs
    print()
    print('Time total:     %5.2f sec' % (total_time_sec))
    print('Time per epoch: %5.2f sec' % (time_per_epoch_sec))
    
    #wandb.finish()    
    
    #return history

In [13]:
def loader(batch_size):

    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=True)
    
    return trainloader, testloader

In [23]:
wandb.agent(sweep_id, sweep, count=200)

[34m[1mwandb[0m: Agent Starting Run: y4p4sjiu with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 25
[34m[1mwandb[0m: 	in_planes: 43
[34m[1mwandb[0m: 	k: 2
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	num_blocks: [2, 2, 2]
[34m[1mwandb[0m: 	optimizer: adamax


train() called: model=ResNet3, opt=Adamax(lr=0.010000), epochs=25, device=cuda




0,1
total_params,▁

0,1
total_params,4991149


Run y4p4sjiu errored: RuntimeError('mat1 dim 1 must match mat2 dim 0')
[34m[1mwandb[0m: [32m[41mERROR[0m Run y4p4sjiu errored: RuntimeError('mat1 dim 1 must match mat2 dim 0')
[34m[1mwandb[0m: Agent Starting Run: 3nu7wscz with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 25
[34m[1mwandb[0m: 	in_planes: 43
[34m[1mwandb[0m: 	k: 2
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	num_blocks: [2, 2, 2]
[34m[1mwandb[0m: 	optimizer: adamax


train() called: model=ResNet3, opt=Adamax(lr=0.050000), epochs=25, device=cuda




0,1
total_params,▁

0,1
total_params,4991149


Run 3nu7wscz errored: RuntimeError('mat1 dim 1 must match mat2 dim 0')
[34m[1mwandb[0m: [32m[41mERROR[0m Run 3nu7wscz errored: RuntimeError('mat1 dim 1 must match mat2 dim 0')
[34m[1mwandb[0m: Agent Starting Run: ju2t15nd with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 25
[34m[1mwandb[0m: 	in_planes: 43
[34m[1mwandb[0m: 	k: 2
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	num_blocks: [2, 2, 2]
[34m[1mwandb[0m: 	optimizer: adamax


train() called: model=ResNet3, opt=Adamax(lr=0.010000), epochs=25, device=cuda




0,1
total_params,▁

0,1
total_params,4991149


Run ju2t15nd errored: RuntimeError('mat1 dim 1 must match mat2 dim 0')
[34m[1mwandb[0m: [32m[41mERROR[0m Run ju2t15nd errored: RuntimeError('mat1 dim 1 must match mat2 dim 0')
Detected 3 failed runs in the first 60 seconds, killing sweep.
[34m[1mwandb[0m: [32m[41mERROR[0m Detected 3 failed runs in the first 60 seconds, killing sweep.
[34m[1mwandb[0m: To disable this check set WANDB_AGENT_DISABLE_FLAPPING=true
