In [None]:
# execute this in command line before initiating the notebook: 
#    pip install -U pip
#    pip install -U ipywidgets==7.5.1
#    jupyter nbextension enable --py widgetsnbextension

# pip install with locked versions
! pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 -f https://download.pytorch.org/whl/torch_stable.html
! pip install -U torchvision==0.8.2
! pip install -U numpy
! pip install -U clearml
! pip install -U tensorboard
! pip install "boto3>=1.9"

In [None]:
%env CLEARML_WEB_HOST=http://103.176.146.129:8080
%env CLEARML_API_HOST=http://103.176.146.129:8008
%env CLEARML_FILES_HOST=http://103.176.146.129:8081
%env CLEARML_API_ACCESS_KEY=ICMS37GDBT8D2SI8SNI4
%env CLEARML_API_SECRET_KEY=L4EVUPDS7oPvV44cwYgeqFKRudOxNs1odCSWY31HAdYnJRz1Jh

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

import torchvision.datasets as datasets
import torchvision.transforms as transforms

from clearml import Task

In [None]:
# Register this run as a ClearML task. `output_uri=True` is passed so that
# ClearML uploads model snapshots to its default output destination.
task = Task.init(
    project_name='Haidx7 - Examples/Cifar10-VGG Classification',
    task_name='Cifar10-VGG Image Classification',
    tags=['Examples', 'Classification', 'Haidx7', 'Cifar10', 'VGG'],
    output_uri=True,
)

# Container definition used when the task is executed remotely.
# `apt-get update` must run before `apt-get install`: container images usually
# ship without apt package lists, in which case the install step cannot locate
# the packages and the setup script fails.
task.set_base_docker(
    docker_image='nvidia/cuda:11.6.2-runtime-ubuntu20.04',
    docker_arguments='--shm-size 16G',
    docker_setup_bash_script=[
        'apt-get update',
        'apt-get install -y curl ffmpeg',
    ],
)

# Hyper-parameters. Connecting the dict to the task lets ClearML override the
# values when the task is re-run remotely, so always read from the returned dict.
configuration_dict = {
    'number_of_epochs': 2,
    'batch_size': 256,
    'dropout': 0.25,
    'base_lr': 0.001,
}
configuration_dict = task.connect(configuration_dict)  # enabling configuration override by clearml
print(configuration_dict)  # printing actual configuration (after override in remote mode)

In [None]:

from clearml import Dataset

dataset_name = "cifar_dataset"
dataset_project = "Common_public_datasets"

# Resolve the ClearML-managed dataset to a local directory.
dataset_path = Dataset.get(
    dataset_name=dataset_name,
    dataset_project=dataset_project,
    alias="Cifar10 dataset",
).get_local_copy()

# Images are only converted to tensors; no normalization or augmentation is applied.
transform = transforms.Compose([transforms.ToTensor()])

# download=False: the CIFAR-10 files are expected to already be under dataset_path.
trainset = datasets.CIFAR10(
    root=dataset_path,
    train=True,
    download=False,
    transform=transform,
)
testset = datasets.CIFAR10(
    root=dataset_path,
    train=False,
    download=False,
    transform=transform,
)

# Training data is reshuffled every epoch; test data keeps a fixed order.
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=configuration_dict.get('batch_size', 4),
    shuffle=True,
    num_workers=2,
)
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=configuration_dict.get('batch_size', 4),
    shuffle=False,
    num_workers=2,
)

classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# Always expose `device` as a torch.device (previously it was a bare int index on
# GPU and a torch.device on CPU). The current CUDA device index is kept so the
# same GPU is selected as before.
if torch.cuda.is_available():
    device = torch.device('cuda', torch.cuda.current_device())
else:
    device = torch.device('cpu')

In [None]:
import torchvision

# torchvision's VGG builds a 1000-way classifier head by default. Size the head to
# the CIFAR-10 label set so every predicted index maps to an entry of `classes`.
# NOTE(review): configuration_dict['dropout'] is not read by any visible cell;
# the model uses torchvision's built-in dropout setting.
net = torchvision.models.vgg19_bn(num_classes=len(classes)).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    net.parameters(),
    lr=configuration_dict.get('base_lr', 0.001),
    momentum=0.9,
)

In [None]:
# Shared TensorBoard writer used by the evaluation and training cells;
# event files go under ./tensorboard_logs.
tensorboard_writer = SummaryWriter(log_dir='./tensorboard_logs')

In [None]:
def test_model(test_dataloader, iteration):
    """Evaluate the global ``net`` on a dataloader and report accuracy.

    Per-class and overall accuracy are printed and written to the global
    ``tensorboard_writer``. Every 500th mini-batch is additionally logged as
    debug images tagged with ground-truth and predicted class names.

    Args:
        test_dataloader: iterable yielding ``(images, labels)`` mini-batches.
        iteration: global step attached to every logged scalar and image.

    The network is switched to evaluation mode for the duration of the call
    (so Dropout is disabled and BatchNorm uses its running statistics) and is
    returned to whatever mode it was in afterwards.
    """
    num_classes = len(classes)
    class_correct = [0.0] * num_classes
    class_total = [0.0] * num_classes

    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            for j, (images, labels) in enumerate(test_dataloader, 1):
                images = images.to(device)
                labels = labels.to(device)

                outputs = net(images)
                _, predicted = torch.max(outputs, 1)

                # One host transfer per batch; no squeeze(), so a final batch
                # holding a single sample is still handled as a 1-element list.
                hits = (predicted == labels).tolist()
                for label, hit in zip(labels.tolist(), hits):
                    class_correct[label] += hit
                    class_total[label] += 1

                if j % 500 == 0:    # report debug image every 500 mini-batches
                    for n, (img, pred, label) in enumerate(zip(images, predicted, labels)):
                        tensorboard_writer.add_image("testing/{}-{}_GT_{}_pred_{}"
                                                     .format(j, n, classes[label], classes[pred]), img, iteration)
    finally:
        # Restore the previous mode so a surrounding training loop is unaffected.
        net.train(was_training)

    for i in range(num_classes):
        # A class with no samples reports 0% instead of raising ZeroDivisionError.
        class_accuracy = 100 * class_correct[i] / class_total[i] if class_total[i] else 0.0
        print('[Iteration {}] Accuracy of {} : {}%'.format(iteration, classes[i], class_accuracy))
        tensorboard_writer.add_scalar('accuracy per class/{}'.format(classes[i]), class_accuracy, iteration)

    samples_seen = sum(class_total)
    total_accuracy = 100 * sum(class_correct) / samples_seen if samples_seen else 0.0
    print('[Iteration {}] Accuracy on the {} test images: {}%\n'.format(iteration, samples_seen, total_accuracy))
    tensorboard_writer.add_scalar('accuracy/total', total_accuracy, iteration)

In [None]:
# Running loss is reported every `report_every` mini-batches. The interval is
# capped at the number of batches per epoch: with a fixed 2000 and a large
# batch size an epoch has fewer than 2000 batches and nothing would be logged.
batches_per_epoch = len(trainloader)
report_every = max(1, min(2000, batches_per_epoch))

for epoch in range(configuration_dict.get('number_of_epochs', 3)):  # loop over the dataset multiple times

    net.train()  # make sure Dropout/BatchNorm are in training mode for this epoch
    running_loss = 0.0
    # Defined up front so the evaluation call below has a step even if the loader is empty.
    iteration = epoch * batches_per_epoch

    for i, (inputs, labels) in enumerate(trainloader, 1):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # accumulate statistics
        running_loss += loss.item()

        iteration = epoch * batches_per_epoch + i
        if i % report_every == 0:
            mean_loss = running_loss / report_every
            # `i` is already 1-based (enumerate starts at 1), so print it as is.
            print('[Epoch %d, Iteration %5d] loss: %.3f' % (epoch + 1, i, mean_loss))
            tensorboard_writer.add_scalar('training loss', mean_loss, iteration)
            running_loss = 0.0

    # Evaluate on the test set once per epoch, at the last training step reached.
    test_model(testloader, iteration)

print('Finished Training')

In [None]:
# Store only the network's state_dict at PATH, then close the TensorBoard writer.
PATH = './cifar_net.pth'
torch.save(obj=net.state_dict(), f=PATH)
tensorboard_writer.close()

In [None]:
# Show the ClearML task id so the run can be looked up later.
print(f'Task ID number is: {task.id}')