In [1]:
import time
import json

import torch
import torch.autograd.profiler as profiler

import numpy as np

from torch import nn, optim
from torch.utils.data import DataLoader

from torchvision import models, datasets, transforms

from test.models import *
from utils.losses import OnlineDistillationLoss

In [5]:
def test_train(config):
    #device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = config['device']
    print(f'device {device}')

    model = config['model']
    cnt_model = model.count
    print(f'model count: {cnt_model}')
    #model = model.to(device)
    #optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    # check Nimble availability and optimize a model
    if config['use_nimble']:
        dummy_input = torch.zeros(config['input_shape']).to(device)
        model = torch.cuda.Nimble(model)
        model.prepare(dummy_input, training=True)

    train_data_loader = config['train_data_loader']

    # instances for backpropagation & updating weights
    optimizer = config['optimizer']
    loss_function = config['loss_function']

    print(f'test one iteration...')

    million = 1000_000.0

    dict = {
        'elapsed_time': {
            'unit': 'millisecond',
            'forward': 0.0,
            'loss_calculation': 0.0,
            'backward': 0.0,
            'set_gradients_zero': 0.0,
            'update_weights': 0.0
        }
    }

    elapsed_time = dict['elapsed_time']

    for batch_idx, (input_data, target) in enumerate(train_data_loader, 0):

        input_data = input_data.to(device)
        target = target.to(device)

        start = time.time_ns()
        outputs = model(input_data)
        end = time.time_ns()
        elapsed_time['forward'] = (end - start) / million

        start = time.time_ns()
        loss = loss_function(outputs, target)
        end = time.time_ns()
        elapsed_time['loss_calculation'] = (end - start) / million

        start = time.time_ns()
        optimizer.zero_grad()
        end = time.time_ns()
        elapsed_time['set_gradients_zero'] = (end - start) / million

        start = time.time_ns()
        loss.backward()
        end = time.time_ns()
        elapsed_time['backward'] = (end - start) / million

        start = time.time_ns()
        optimizer.step()
        end = time.time_ns()
        elapsed_time['update_weights'] = (end - start) / million

        break

    print(f'stop training')

    with open(f'{config["output_file_path"]}', 'w', encoding='utf-8') as f:
        json.dump(dict, f, ensure_ascii=False)

def usual_train(config):
    #device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = config['device']
    print(f'device {device}')

    model = config['model']
    cnt_model = model.count
    print(f'model count: {cnt_model}')
    #model = model.to(device)
    #optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    # check Nimble availability and optimize a model
    if config['use_nimble']:
        dummy_input = torch.zeros(config['input_shape']).to(device)
        model = torch.cuda.Nimble(model)
        model.prepare(dummy_input, training=True)

    train_losses = 0.0

    train_data_loader = config['train_data_loader']
    cnt_train_data = len(train_data_loader)

    valid_data_loader = config['valid_data_loader']
    cnt_valid_data = len(valid_data_loader)

    chances = 2
    remain_chances = chances

    prev_valid_loss = float("inf")

    # instances for backpropagation & updating weights
    optimizer = config['optimizer']
    loss_function = config['loss_function']

    start = time.time_ns()
    for epoch in range(config['epochs']):
        print(f'epoch: {epoch}')
        for batch_idx, (input_data, target) in enumerate(train_data_loader, 0):
            optimizer.zero_grad()

            input_data = input_data.to(device)
            target = target.to(device)

            outputs = model(input_data)
            loss = loss_function(outputs, target)
            train_losses += loss.item()

            loss.backward()
            optimizer.step()

            if batch_idx % 10000 == 0:
                print(f'avg loss among models: {loss}')

        train_loss = train_losses / cnt_train_data

        corrects = [0.0] * cnt_model
        valid_losses = 0.0

        for batch_idx, (input_data, target) in enumerate(valid_data_loader, 0):
            input_data = input_data.to(device)
            target = target.to(device)

            output = model(input_data)

            loss = loss_function(output, target)
            valid_losses += (loss.item() / cnt_model)

            for i in range(cnt_model):
                _, predicted = torch.max(output[i], 1) if cnt_model > 1 else torch.max(output, 1)

                if predicted == target:
                    corrects[i] += 1.0

        valid_loss = valid_losses / cnt_valid_data

        should_finish = False

        valid_accuracies = (np.array(corrects) / cnt_valid_data)

        max_valid_index = np.argmax(valid_accuracies)
        max_valid_acc = np.max(valid_accuracies)

        print(f'max valid accuracy from model #{max_valid_index}: {max_valid_acc * 100.0}')

        if max_valid_acc > target_valid_accuracy:
            should_finish = True

        print(f'train loss: {train_loss}, valid loss: {valid_loss}')
        if valid_loss > prev_valid_loss:
            if remain_chances == 0:
                should_finish = True
            else:
                remain_chances -= 1
        else:
            remain_chances = chances

        prev_valid_loss = valid_loss

        if should_finish:
            end = time.time_ns()
            print(f'stop training')
            best_accuracy = max_valid_acc * 100.0
            print(f'achieved best valid accuracy: {best_accuracy}%')
            print(f'executed epochs: {epoch}')
            elapsed_time = (end - start) / 1000_000_000.0
            print(f'ett (elapsed training time)')
            print(f'total ett: {elapsed_time} seconds')
            print(f'avg ett: {elapsed_time / float(epochs + 1)} seconds')

            with open(f'{config["output_file_path"]}', 'w') as f:
                f.write(f'{epoch + 1} {elapsed_time} {elapsed_time / float(epoch + 1)} {target_valid_accuracy} {best_accuracy}')
            break

def train(config,
          test_one_iter=False):
    if test_one_iter:
        test_train(config)
    else:
        usual_train(config)

In [6]:
data_root = '../data/cifar-10'
batch_size = 1
input_width = 32
input_height = input_width

data_transforms = transforms.Compose([
    transforms.Resize((input_width, input_height)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465),
                         std=(0.2023, 0.1994, 0.2010))
])

train_data = datasets.CIFAR10(root=data_root,
                              train=True,
                              transform=data_transforms,
                              download=True)
train_data_loader = DataLoader(train_data,
                               batch_size=batch_size,
                               shuffle=False)


test_data = datasets.CIFAR10(root=data_root,
                             train=False,
                             transform=data_transforms,
                             download=True)
test_data_loader = DataLoader(test_data,
                               batch_size=batch_size,
                               shuffle=False)


# training configuration
lr = 0.01
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = MultiModel_2().to(device)
optimizer = optim.SGD(model.parameters(), lr=lr)
loss_function = OnlineDistillationLoss()

train_config = {
    'device': device,
    'model': model,
    'optimizer': optimizer,
    'loss_function': loss_function,
    'epochs': 100,
    'output_file_path': 'test_multi_model.json',
    'use_nimble': True,
    'input_shape': (batch_size, 3, input_height, input_width),
    'train_data_loader': train_data_loader
}

train(train_config, True)

Files already downloaded and verified
Files already downloaded and verified
device cuda
model count: 2
test one iteration...
stop training




In [None]:
def train(device,
          model,
          loss_function,
          optimizer,
          train_data_loader,
          valid_data_loader,
          target_valid_accuracy,
          epochs,
          #learning_rate,
          input_shape,
          use_nimble,
          output_file_path,
          iters_per_epoch=None):
    #device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'device {device}')

    cnt_model = model.count
    print(f'model count: {cnt_model}')
    #model = model.to(device)
    #optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    #
    if use_nimble:
        dummy_input = torch.zeros(input_shape).to(device)
        model = torch.cuda.Nimble(model)
        model.prepare(dummy_input, training=True)
    #

    train_losses = 0.0
    cnt_train_data = len(train_data_loader)
    cnt_valid_data = len(valid_data_loader)

    chances = 2
    remain_chances = chances

    prev_valid_loss = float("inf")

    start = time.time_ns()
    for epoch in range(epochs):
        print(f'epoch: {epoch}')
        for batch_idx, (input_data, target) in enumerate(train_data_loader, 0):
            optimizer.zero_grad()

            input_data = input_data.to(device)
            target = target.to(device)

            outputs = model(input_data)
            loss = loss_function(outputs, target)
            train_losses += loss.item()#(loss.item() / cnt_model)

            loss.backward()
            optimizer.step()

            if batch_idx % 10000 == 0:
                print(f'avg loss among models: {loss}')

        train_loss = train_losses / cnt_train_data

        corrects = [0.0] * cnt_model
        valid_losses = 0.0

        for batch_idx, (input_data, target) in enumerate(valid_data_loader, 0):
            input_data = input_data.to(device)
            target = target.to(device)

            output = model(input_data)

            loss = loss_function(output, target)
            valid_losses += (loss.item() / cnt_model)

            for i in range(cnt_model):
                _, predicted = torch.max(output[i], 1) if cnt_model > 1 else torch.max(output, 1)

                if predicted == target:
                    corrects[i] += 1.0

        valid_loss = valid_losses / cnt_valid_data

        should_finish = False

        valid_accuracies = (np.array(corrects) / cnt_valid_data)

        max_valid_index = np.argmax(valid_accuracies)
        max_valid_acc = np.max(valid_accuracies)

        print(f'max valid accuracy from model #{max_valid_index}: {max_valid_acc * 100.0}')

        if max_valid_acc > target_valid_accuracy:
            should_finish = True

        print(f'train loss: {train_loss}, valid loss: {valid_loss}')
        if valid_loss > prev_valid_loss:
            if remain_chances == 0:
                should_finish = True
            else:
                remain_chances -= 1
        else:
            remain_chances = chances

        prev_valid_loss = valid_loss

        if should_finish:
            end = time.time_ns()
            print(f'stop training')
            best_accuracy = max_valid_acc * 100.0
            print(f'achieved best valid accuracy: {best_accuracy}%')
            print(f'executed epochs: {epoch}')
            elapsed_time = (end - start) / 1000_000_000.0
            print(f'ett (elapsed training time)')
            print(f'total ett: {elapsed_time} seconds')
            print(f'avg ett: {elapsed_time / float(epochs + 1)} seconds')

            with open(f'{output_file_path}', 'w') as f:
                f.write(f'{epoch + 1} {elapsed_time} {elapsed_time / float(epoch + 1)} {target_valid_accuracy} {best_accuracy}')
            break

In [2]:
def train(model,
          loss_function,
          train_data_loader,
          valid_data_loader,
          target_valid_accuracy,
          epochs,
          learning_rate,
          input_shape,
          use_nimble,
          output_file_path):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'device {device}')

    cnt_model = model.count
    print(f'model count: {cnt_model}')
    model = model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    #
    if use_nimble:
        dummy_input = torch.randn(input_shape).cuda()
        model = torch.cuda.Nimble(model)
        model.prepare(dummy_input, training=True)
    #

    train_losses = 0.0
    cnt_train_data = len(train_data_loader)
    cnt_valid_data = len(valid_data_loader)

    chances = 2
    remain_chances = chances

    prev_valid_loss = float("inf")

    start = time.time_ns()
    for epoch in range(epochs):
        print(f'epoch: {epoch}')
        for batch_idx, (input_data, target) in enumerate(train_data_loader, 0):
            optimizer.zero_grad()

            input_data = input_data.to(device)
            target = target.to(device)

            outputs = model(input_data)
            loss = loss_function(outputs, target)
            train_losses += (loss.item() / cnt_model)

            loss.backward()
            optimizer.step()

            if batch_idx % 10000 == 0:
                print(f'avg loss among models: {loss / cnt_model}')

        train_loss = train_losses / cnt_train_data

        corrects = [0.0] * cnt_model
        valid_losses = 0.0
        for batch_idx, (input_data, target) in enumerate(valid_data_loader, 0):
            input_data = input_data.to(device)
            target = target.to(device)

            output = model(input_data)

            loss = loss_function(output, target)
            valid_losses += (loss.item() / cnt_model)

            for i in range(cnt_model):
                _, predicted = torch.max(output[i], 1) if cnt_model > 1 else torch.max(output, 1)

                if predicted == target:
                    corrects[i] += 1.0

        valid_loss = valid_losses / cnt_valid_data

        should_finish = False

        valid_accuracies = (np.array(corrects) / cnt_valid_data)

        max_valid_index = np.argmax(valid_accuracies)
        max_valid_acc = np.max(valid_accuracies)

        print(f'max valid accuracy from model #{max_valid_index}: {max_valid_acc * 100.0}')

        if max_valid_acc > target_valid_accuracy:
            should_finish = True

        print(f'train loss: {train_loss}, valid loss: {valid_loss}')
        if valid_loss > prev_valid_loss:
            if remain_chances == 0:
                should_finish = True
            else:
                remain_chances -= 1
        else:
            remain_chances = chances

        prev_valid_loss = valid_loss

        if should_finish:
            end = time.time_ns()
            print(f'stop training')
            best_accuracy = max_valid_acc * 100.0
            print(f'achieved best valid accuracy: {best_accuracy}%')
            print(f'executed epochs: {epoch}')
            elapsed_time = (end - start) / 1000_000_000.0
            print(f'ett (elapsed training time)')
            print(f'total ett: {elapsed_time} seconds')
            print(f'avg ett: {elapsed_time / float(epochs + 1)} seconds')

            with open(f'{output_file_path}', 'w') as f:
                f.write(f'{epoch + 1} {elapsed_time} {elapsed_time / float(epoch + 1)} {target_valid_accuracy} {best_accuracy}')
            break

In [19]:
data_root = '../data/cifar-10'
batch_size = 1
input_width = 32
input_height = input_width

data_transforms = transforms.Compose([
    transforms.Resize((input_width, input_height)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465),
                         std=(0.2023, 0.1994, 0.2010))
])

train_data = datasets.CIFAR10(root=data_root,
                              train=True,
                              transform=data_transforms,
                              download=True)
train_data_loader = DataLoader(train_data,
                               batch_size=batch_size,
                               shuffle=False)


test_data = datasets.CIFAR10(root=data_root,
                             train=False,
                             transform=data_transforms,
                             download=True)
test_data_loader = DataLoader(test_data,
                               batch_size=batch_size,
                               shuffle=False)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ../data/cifar-10/cifar-10-python.tar.gz


15.3%IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

77.9%IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

95.3%IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100.0%

Extracting ../data/cifar-10/cifar-10-python.tar.gz to ../data/cifar-10
Files already downloaded and verified


In [5]:
train(model=MultiModel_2(),
      loss_function=OnlineDistillationLoss(),
      train_data_loader=train_data_loader,
      valid_data_loader=test_data_loader,
      target_valid_accuracy=target_valid_accuracy,
      epochs=epochs,
      learning_rate=lr,
      input_shape=[batch_size, 3, input_width, input_height],
      use_nimble=True,
      output_file_path='multi_model_2.txt')

device cuda
model count: 2
epoch: 0
avg loss among models: 2.0767369270324707
avg loss among models: 1.6453828811645508
avg loss among models: 2.262073040008545
avg loss among models: 2.341078758239746
avg loss among models: 1.745619535446167
max valid accuracy from model #1: 59.809999999999995
train loss: 1.8377732287168502, valid loss: 1.7612307681918145
epoch: 1
avg loss among models: 1.9813506603240967
avg loss among models: 1.4968984127044678
avg loss among models: 2.294503688812256
avg loss among models: 2.352879524230957
avg loss among models: 1.361149787902832
max valid accuracy from model #0: 66.44
train loss: 3.5534216199564934, valid loss: 1.7067400129675865
epoch: 2
avg loss among models: 1.9655776023864746
avg loss among models: 1.4884743690490723
avg loss among models: 2.287611961364746
avg loss among models: 2.210484027862549
avg loss among models: 1.3611485958099365
max valid accuracy from model #0: 68.51
train loss: 5.199470404629707, valid loss: 1.6874809146523475
epo



In [5]:
train(model=MultiModel_4(),
      loss_function=OnlineDistillationLoss(),
      train_data_loader=train_data_loader,
      valid_data_loader=test_data_loader,
      target_valid_accuracy=target_valid_accuracy,
      epochs=epochs,
      learning_rate=lr,
      input_shape=[batch_size, 3, input_width, input_height],
      use_nimble=True,
      output_file_path='multi_model_4.txt')

device cuda
model count: 4
epoch: 0
avg loss among models: 2.0780484676361084
avg loss among models: 2.124520778656006
avg loss among models: 2.2745704650878906
avg loss among models: 2.297234296798706
avg loss among models: 1.9206916093826294
max valid accuracy from model #3: 60.809999999999995
train loss: 1.8365279232549667, valid loss: 1.756417323410511
epoch: 1
avg loss among models: 1.5281035900115967
avg loss among models: 1.3734983205795288
avg loss among models: 2.3082122802734375
avg loss among models: 2.347811222076416
avg loss among models: 1.662687063217163
max valid accuracy from model #0: 66.84
train loss: 3.5486904705667497, valid loss: 1.7100672584414482
epoch: 2
avg loss among models: 1.7621984481811523
avg loss among models: 1.73323655128479
avg loss among models: 2.228783130645752
avg loss among models: 2.4068679809570312
avg loss among models: 1.6890041828155518
max valid accuracy from model #2: 68.73
train loss: 5.194493986582756, valid loss: 1.6863631024122239
epo



In [None]:
train(model=MultiModel_8(),
      loss_function=OnlineDistillationLoss(),
      train_data_loader=train_data_loader,
      valid_data_loader=test_data_loader,
      target_valid_accuracy=target_valid_accuracy,
      epochs=epochs,
      learning_rate=lr,
      input_shape=[batch_size, 3, input_width, input_height],
      use_nimble=True,
      output_file_path='multi_model_8.txt')

In [6]:
target_valid_accuracy = 0.5

train(model=SingleModel(),
      loss_function=nn.CrossEntropyLoss(),
      train_data_loader=train_data_loader,
      valid_data_loader=test_data_loader,
      target_valid_accuracy=target_valid_accuracy,
      epochs=epochs,
      learning_rate=lr,
      input_shape=[batch_size, 3, input_width, input_height],
      use_nimble=False,
      output_file_path='single_model.txt')

device cuda
epoch: 0
avg loss among models: 2.293501138687134
avg loss among models: 2.4611458778381348
avg loss among models: 2.4429893493652344
avg loss among models: 2.4611501693725586
avg loss among models: 2.461118698120117
max valid accuracy from model #0: 43.97
train loss: 2.062392676577568, valid loss: 2.0177929972052575
stop training
achieved best valid accuracy: 43.97%
executed epochs: 0
elapsed training time 110.249846087 seconds


In [30]:
import torch.onnx as onnx

model = MultiModel()
model.eval()

x = torch.rand(1, 3, 32,  32, requires_grad=True)
out = model(x)

onnx.export(model,
            x,
            "multi_custom_model.onnx",
            export_params=True,
            input_names=['input'],
            output_names=['output_0', 'output_1'])

In [28]:
def train_multi_model():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    use_cuda = True if torch.cuda.is_available() else False
    print(f'device {device}')

    model = MultiModel().to(device)

    ce_loss = nn.CrossEntropyLoss()
    kld_loss = nn.KLDivLoss(reduction='mean')

    optimizer = optim.SGD(model.parameters(), lr=lr)

    for epoch in range(epochs):
        print(f'epoch: {epoch}')
        for batch_idx, (input_data, target) in enumerate(train_data_loader, 0):
            if batch_idx >= 1:
                break

            input_data = input_data.to(device)
            target = target.to(device)

            with profiler.profile(record_shapes=True,
                                  profile_memory=True,
                                  use_cuda=use_cuda) as prof:
                with profiler.record_function("train_multi_custom_model"):
                    output_0, output_1 = model(input_data)

                    loss_0 = ce_loss(output_0, target) + kld_loss(output_1.detach().clone(), output_0)
                    loss_1 = ce_loss(output_1, target) + kld_loss(output_0.detach().clone(), output_1)

                    loss = loss_0 + loss_1

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            if batch_idx % 10000 == 0:
                print(f'{loss_0.item()}, {loss_1.item()}')

            print(prof.key_averages())
            prof.export_chrome_trace("trace_multi_custom_model.json")

    correct_0 = 0
    correct_1 = 0
    total = len(test_data_loader)
    for batch_idx, (input_data, target) in enumerate(test_data_loader, 0):
        input_data = input_data.to(device)
        target = target.to(device)

        output_0, output_1 = model(input_data)
        _, predicted = torch.max(output_0, 1)
        if predicted == target:
            correct_0 += 1

        _, predicted = torch.max(output_1, 1)
        if predicted == target:
            correct_1 += 1

    print(correct_0 / total * 100.0)
    print(correct_1 / total * 100.0)

In [29]:
#train_multi_model()

device cuda
epoch: 0
2.0938072204589844, 2.027334690093994
-------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                   [memory]         0.00%       0.000us         0.00%       0.000us       0.000us       0.000us         0.00%       0.000us       0.000us           0 b           0 b     -16.08 Mb  



In [None]:
def train_resnet18():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'device {device}')

    model = models.resnet18()
    model.fc = nn.Linear(512, 10)
    model = model.to(device)
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    for epoch in range(epochs):
        print(f'epoch: {epoch}')
        for batch_idx, (input_data, target) in enumerate(train_data_loader, 0):
            optimizer.zero_grad()

            input_data = input_data.to(device)
            target = target.to(device)

            output = model(input_data)

            loss = loss_fn(output, target)
            loss.backward()
            optimizer.step()

            if batch_idx % 10000 == 0:
                print(loss)

    correct = 0
    total = len(test_data_loader)
    for batch_idx, (input_data, target) in enumerate(test_data_loader, 0):
        input_data = input_data.to(device)
        target = target.to(device)

        output = model(input_data)

        _, predicted = torch.max(output, 1)

        if predicted == target:
            correct += 1
    print(correct / total * 100.0)

In [None]:
#train_resnet18()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

def imshow(img):
    npimg = img.numpy()

    plt.imshow(np.transpose(npimg.reshape(3,32,32), (1,2,0)).copy())
    plt.show()