In [1]:
import time

import torch
import torch.autograd.profiler as profiler

from torch import nn, optim
from torch.utils.data import DataLoader

from torchvision import models, datasets, transforms

from test.models import *
from utils.losses import OnlineDistillationLoss

In [2]:

def train(model,
          loss_function,
          train_data_loader,
          valid_data_loader,
          target_valid_accuracy,
          epochs,
          learning_rate):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'device {device}')

    cnt_model = model.count
    model = model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    train_losses = 0.0
    cnt_train_data = len(train_data_loader)
    cnt_valid_data = len(valid_data_loader)

    chances = 2
    remain_chances = chances

    prev_valid_loss = float("inf")

    start = time.time_ns()
    for epoch in range(epochs):
        print(f'epoch: {epoch}')
        for batch_idx, (input_data, target) in enumerate(train_data_loader, 0):
            optimizer.zero_grad()

            input_data = input_data.to(device)
            target = target.to(device)

            outputs = model(input_data)
            loss = loss_function(outputs, target)
            train_losses += (loss.item() / cnt_model)

            loss.backward()
            optimizer.step()

            if batch_idx % 10000 == 0:
                print(f'avg loss among models: {loss / cnt_model}')

        train_loss = train_losses / cnt_train_data

        corrects = [0.0] * cnt_model
        valid_losses = 0.0
        for batch_idx, (input_data, target) in enumerate(valid_data_loader, 0):
            input_data = input_data.to(device)
            target = target.to(device)

            output = model(input_data)

            loss = loss_function(output, target)
            valid_losses += (loss.item() / cnt_model)

            for i in range(cnt_model):
                _, predicted = torch.max(output[i], 1) if cnt_model > 1 else torch.max(output, 1)

                if predicted == target:
                    corrects[i] += 1.0

        valid_loss = valid_losses / cnt_valid_data

        should_finish = False

        valid_accuracies = (np.array(corrects) / cnt_valid_data)

        max_valid_index = np.argmax(valid_accuracies)
        max_valid_acc = np.max(valid_accuracies)

        print(f'max valid accuracy from model #{max_valid_index}: {max_valid_acc * 100.0}')

        if max_valid_acc > target_valid_accuracy:
            should_finish = True

        print(f'train loss: {train_loss}, valid loss: {valid_loss}')
        if valid_loss > prev_valid_loss:
            if remain_chances == 0:
                should_finish = True
            else:
                remain_chances -= 1
        else:
            remain_chances = chances

        prev_valid_loss = valid_loss

        if should_finish:
            end = time.time_ns()
            print(f'stop training')
            best_accuracy = max_valid_acc * 100.0
            print(f'achieved best valid accuracy: {best_accuracy}%')
            print(f'executed epochs: {epoch}')
            elapsed_time = (end - start) / 1000_000_000.0
            print(f'elapsed training time {elapsed_time} seconds')

            with open(f'{type(model).__name__}.txt', 'w') as f:
                f.write(f'{epoch + 1} {elapsed_time} {target_valid_accuracy} {best_accuracy}')
            break

In [3]:
data_root = '../data/cifar-10'
batch_size = 1

data_transforms = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465),
                         std=(0.2023, 0.1994, 0.2010))
])

train_data = datasets.CIFAR10(root=data_root,
                              train=True,
                              transform=data_transforms,
                              download=True)
train_data_loader = DataLoader(train_data,
                               batch_size=batch_size,
                               shuffle=False)


test_data = datasets.CIFAR10(root=data_root,
                             train=False,
                             transform=data_transforms,
                             download=True)
test_data_loader = DataLoader(test_data,
                               batch_size=batch_size,
                               shuffle=False)

Files already downloaded and verified
Files already downloaded and verified


In [4]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

def imshow(img):
    npimg = img.numpy()

    plt.imshow(np.transpose(npimg.reshape(3,32,32), (1,2,0)).copy())
    plt.show()

In [5]:
# training configuration
lr = 0.01
epochs = 1000
target_valid_accuracy = 0.4

In [6]:
train(model=MultiModel_2(),
      loss_function=OnlineDistillationLoss(),
      train_data_loader=train_data_loader,
      valid_data_loader=test_data_loader,
      target_valid_accuracy=target_valid_accuracy,
      epochs=epochs,
      learning_rate=lr)

device cuda
epoch: 0
avg loss among models: 2.049748659133911
avg loss among models: 1.8854020833969116
avg loss among models: 2.191504955291748
avg loss among models: 2.334974765777588
avg loss among models: 2.2918152809143066
max valid accuracy from model #0: 61.839999999999996
train loss: 1.8344030031967162, valid loss: 1.7514836047172546
stop training
achieved best valid accuracy: 61.839999999999996%
executed epochs: 0
elapsed training time 178.910313789 seconds




In [8]:
train(model=MultiModel_4(),
      loss_function=OnlineDistillationLoss(),
      train_data_loader=train_data_loader,
      valid_data_loader=test_data_loader,
      target_valid_accuracy=target_valid_accuracy,
      epochs=epochs,
      learning_rate=lr)

device cuda
epoch: 0
avg loss among models: 2.075756549835205
avg loss among models: 1.89504075050354
avg loss among models: 2.2569732666015625
avg loss among models: 2.3031485080718994
avg loss among models: 1.9140231609344482
accuracy of model #0: 59.95%
accuracy of model #1: 58.96%
accuracy of model #2: 60.01%
accuracy of model #3: 60.61%
epoch: 1
avg loss among models: 1.6761380434036255
avg loss among models: 1.8591288328170776
avg loss among models: 2.3160436153411865
avg loss among models: 2.3319597244262695
avg loss among models: 1.6956121921539307
accuracy of model #0: 65.62%
accuracy of model #1: 66.25999999999999%
accuracy of model #2: 65.8%
accuracy of model #3: 65.29%
epoch: 2
avg loss among models: 1.5011800527572632
avg loss among models: 1.647318959236145
avg loss among models: 2.3073654174804688
avg loss among models: 2.3466482162475586
avg loss among models: 1.6553175449371338
accuracy of model #0: 68.39%
accuracy of model #1: 67.99%
accuracy of model #2: 68.67%
accur

In [9]:
train(model=MultiModel_8(),
      loss_function=OnlineDistillationLoss(),
      train_data_loader=train_data_loader,
      valid_data_loader=test_data_loader,
      target_valid_accuracy=target_valid_accuracy,
      epochs=epochs,
      learning_rate=lr)

device cuda
epoch: 0
avg loss among models: 2.081385374069214
avg loss among models: 1.7985424995422363
avg loss among models: 2.2642011642456055
avg loss among models: 2.3326854705810547
avg loss among models: 2.1625137329101562
accuracy of model #0: 59.12%
accuracy of model #1: 61.14000000000001%
accuracy of model #2: 61.29%
accuracy of model #3: 61.08%
accuracy of model #4: 60.650000000000006%
accuracy of model #5: 59.760000000000005%
accuracy of model #6: 60.34%
accuracy of model #7: 60.660000000000004%
epoch: 1
avg loss among models: 1.696842908859253
avg loss among models: 1.652596354484558
avg loss among models: 2.3210926055908203
avg loss among models: 2.3656201362609863
avg loss among models: 1.499781608581543
accuracy of model #0: 66.11%
accuracy of model #1: 66.96%
accuracy of model #2: 66.99000000000001%
accuracy of model #3: 65.63%
accuracy of model #4: 65.47%
accuracy of model #5: 65.5%
accuracy of model #6: 65.3%
accuracy of model #7: 66.17%
epoch: 2
avg loss among model

In [7]:
train(model=SingleModel(),
      loss_function=nn.CrossEntropyLoss(),
      train_data_loader=train_data_loader,
      valid_data_loader=test_data_loader,
      target_valid_accuracy=target_valid_accuracy,
      epochs=epochs,
      learning_rate=lr)

device cuda
epoch: 0
avg loss among models: 2.3024001121520996
avg loss among models: 2.4309301376342773
avg loss among models: 2.461027145385742
avg loss among models: 2.460101842880249
avg loss among models: 2.460869789123535


RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 5.80 GiB total capacity; 3.05 GiB already allocated; 259.62 MiB free; 3.06 GiB reserved in total by PyTorch)

In [30]:
import torch.onnx as onnx

model = MultiModel()
model.eval()

x = torch.rand(1, 3, 32,  32, requires_grad=True)
out = model(x)

onnx.export(model,
            x,
            "multi_custom_model.onnx",
            export_params=True,
            input_names=['input'],
            output_names=['output_0', 'output_1'])

In [28]:
def train_multi_model():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    use_cuda = True if torch.cuda.is_available() else False
    print(f'device {device}')

    model = MultiModel().to(device)

    ce_loss = nn.CrossEntropyLoss()
    kld_loss = nn.KLDivLoss(reduction='mean')

    optimizer = optim.SGD(model.parameters(), lr=lr)

    for epoch in range(epochs):
        print(f'epoch: {epoch}')
        for batch_idx, (input_data, target) in enumerate(train_data_loader, 0):
            if batch_idx >= 1:
                break

            input_data = input_data.to(device)
            target = target.to(device)

            with profiler.profile(record_shapes=True,
                                  profile_memory=True,
                                  use_cuda=use_cuda) as prof:
                with profiler.record_function("train_multi_custom_model"):
                    output_0, output_1 = model(input_data)

                    loss_0 = ce_loss(output_0, target) + kld_loss(output_1.detach().clone(), output_0)
                    loss_1 = ce_loss(output_1, target) + kld_loss(output_0.detach().clone(), output_1)

                    loss = loss_0 + loss_1

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            if batch_idx % 10000 == 0:
                print(f'{loss_0.item()}, {loss_1.item()}')

            print(prof.key_averages())
            prof.export_chrome_trace("trace_multi_custom_model.json")

    correct_0 = 0
    correct_1 = 0
    total = len(test_data_loader)
    for batch_idx, (input_data, target) in enumerate(test_data_loader, 0):
        input_data = input_data.to(device)
        target = target.to(device)

        output_0, output_1 = model(input_data)
        _, predicted = torch.max(output_0, 1)
        if predicted == target:
            correct_0 += 1

        _, predicted = torch.max(output_1, 1)
        if predicted == target:
            correct_1 += 1

    print(correct_0 / total * 100.0)
    print(correct_1 / total * 100.0)

In [29]:
#train_multi_model()

device cuda
epoch: 0
2.0938072204589844, 2.027334690093994
-------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                   [memory]         0.00%       0.000us         0.00%       0.000us       0.000us       0.000us         0.00%       0.000us       0.000us           0 b           0 b     -16.08 Mb  



In [None]:
def train_resnet18():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'device {device}')

    model = models.resnet18()
    model.fc = nn.Linear(512, 10)
    model = model.to(device)
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    for epoch in range(epochs):
        print(f'epoch: {epoch}')
        for batch_idx, (input_data, target) in enumerate(train_data_loader, 0):
            optimizer.zero_grad()

            input_data = input_data.to(device)
            target = target.to(device)

            output = model(input_data)

            loss = loss_fn(output, target)
            loss.backward()
            optimizer.step()

            if batch_idx % 10000 == 0:
                print(loss)

    correct = 0
    total = len(test_data_loader)
    for batch_idx, (input_data, target) in enumerate(test_data_loader, 0):
        input_data = input_data.to(device)
        target = target.to(device)

        output = model(input_data)

        _, predicted = torch.max(output, 1)

        if predicted == target:
            correct += 1
    print(correct / total * 100.0)

In [None]:
#train_resnet18()