In [1]:
import torch
import torch.autograd.profiler as profiler

from torch import nn, optim
from torch.utils.data import DataLoader

from torchvision import models, datasets, transforms

ModuleNotFoundError: No module named 'torchvision'

In [2]:
# prepare dataset: CIFAR-10
# size: ?? x ??

data_root = '../data/cifar-10'
batch_size = 1

data_transforms = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465),
                         std=(0.2023, 0.1994, 0.2010))
])

train_data = datasets.CIFAR10(root=data_root,
                              train=True,
                              transform=data_transforms,
                              download=True)
train_data_loader = DataLoader(train_data,
                               batch_size=batch_size,
                               shuffle=False)


test_data = datasets.CIFAR10(root=data_root,
                             train=False,
                             transform=data_transforms,
                             download=True)
test_data_loader = DataLoader(test_data,
                               batch_size=batch_size,
                               shuffle=False)

Files already downloaded and verified
Files already downloaded and verified


In [3]:
# prepare models

class SingleModel(nn.Module):
    def __init__(self):
        super(SingleModel, self).__init__()

        # conv 0
        self.conv2d_0 = nn.Conv2d(in_channels=3,
                                  out_channels=32,
                                  kernel_size=5,
                                  stride=1,
                                  padding=2)
        self.relu_0 = nn.ReLU()
        self.max_pool2d_0 = nn.MaxPool2d(kernel_size=2,
                                         stride=2)
        self.batch_norm_0 = nn.BatchNorm2d(32)

        # conv 1
        self.conv2d_1 = nn.Conv2d(in_channels=32,
                                  out_channels=64,
                                  kernel_size=5,
                                  stride=1,
                                  padding=2)
        self.relu_1 = nn.ReLU()
        self.max_pool2d_1 = nn.MaxPool2d(kernel_size=2,
                                         stride=2)
        self.batch_norm_1 = nn.BatchNorm2d(64)

        self.flatten = nn.Flatten()

        self.fc_0 = nn.Linear(8 * 8 * 64, 500)
        self.relu_2 = nn.ReLU()

        self.fc_1 = nn.Linear(500, 10)

        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        output = self.conv2d_0(x)
        output = self.relu_0(output)
        output = self.max_pool2d_0(output)
        output = self.batch_norm_0(output)

        output = self.conv2d_1(output)
        output = self.relu_1(output)
        output = self.max_pool2d_1(output)
        output = self.batch_norm_1(output)

        output = self.flatten(output)

        output = self.fc_0(output)
        output = self.relu_2(output)
        output = self.fc_1(output)

        output = self.softmax(output)

        return output

In [4]:
# training configuration
lr = 0.01
epochs = 1

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

def imshow(img):
    npimg = img.numpy()

    plt.imshow(np.transpose(npimg.reshape(3,32,32), (1,2,0)).copy())
    plt.show()

In [23]:
def train_single_model():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    use_cuda = True if torch.cuda.is_available() else False
    print(f'device {device}')


    model = SingleModel().to(device)
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    iters = 1
    for epoch in range(epochs):
        print(f'epoch: {epoch}')
        for batch_idx, (input_data, target) in enumerate(train_data_loader, 0):
            input_data = input_data.to(device)
            target = target.to(device)

            with profiler.profile(record_shapes=True,
                                  profile_memory=True,
                                  use_cuda=use_cuda) as prof:
                with profiler.record_function("train_single_custom_model"):

                    output = model(input_data)
                    loss = loss_fn(output, target)

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                if batch_idx % 10000 == 0:
                    print(loss)

                if batch_idx >= iters:
                    break

            print(prof.key_averages())
            prof.export_chrome_trace("trace_single_custom_model.json")
    '''
    correct = 0
    total = len(test_data_loader)
    for batch_idx, (input_data, target) in enumerate(test_data_loader, 0):
        input_data = input_data.to(device)
        target = target.to(device)

        output = model(input_data)

        _, predicted = torch.max(output, 1)

        if predicted == target:
            correct += 1
    print(correct / total * 100.0)
    '''

In [24]:
train_single_model()

device cuda
epoch: 0
tensor(2.2977, device='cuda:0', grad_fn=<NllLossBackward>)
-------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                   [memory]         0.00%       0.000us         0.00%       0.000us       0.000us       0.000us         0.00%       0.000us       0.000us           0 b         

In [27]:
class MultiModel(nn.Module):
    def __init__(self):
        super(MultiModel, self).__init__()

        self.single_model_0 = SingleModel()
        self.single_model_1 = SingleModel()

    def forward(self, x):
        output_0 = self.single_model_0(x)
        output_1 = self.single_model_1(x)

        return output_0, output_1

In [30]:
import torch.onnx as onnx

model = MultiModel()
model.eval()

x = torch.rand(1, 3, 32,  32, requires_grad=True)
out = model(x)

onnx.export(model,
            x,
            "multi_custom_model.onnx",
            export_params=True,
            input_names=['input'],
            output_names=['output_0', 'output_1'])

In [28]:
def train_multi_model():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    use_cuda = True if torch.cuda.is_available() else False
    print(f'device {device}')

    model = MultiModel().to(device)

    ce_loss = nn.CrossEntropyLoss()
    kld_loss = nn.KLDivLoss(reduction='mean')

    optimizer = optim.SGD(model.parameters(), lr=lr)

    for epoch in range(epochs):
        print(f'epoch: {epoch}')
        for batch_idx, (input_data, target) in enumerate(train_data_loader, 0):
            if batch_idx >= 1:
                break

            input_data = input_data.to(device)
            target = target.to(device)

            with profiler.profile(record_shapes=True,
                                  profile_memory=True,
                                  use_cuda=use_cuda) as prof:
                with profiler.record_function("train_multi_custom_model"):
                    output_0, output_1 = model(input_data)

                    loss_0 = ce_loss(output_0, target) + kld_loss(output_1.detach().clone(), output_0)
                    loss_1 = ce_loss(output_1, target) + kld_loss(output_0.detach().clone(), output_1)

                    loss = loss_0 + loss_1

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            if batch_idx % 10000 == 0:
                print(f'{loss_0.item()}, {loss_1.item()}')

            print(prof.key_averages())
            prof.export_chrome_trace("trace_multi_custom_model.json")

    correct_0 = 0
    correct_1 = 0
    total = len(test_data_loader)
    for batch_idx, (input_data, target) in enumerate(test_data_loader, 0):
        input_data = input_data.to(device)
        target = target.to(device)

        output_0, output_1 = model(input_data)
        _, predicted = torch.max(output_0, 1)
        if predicted == target:
            correct_0 += 1

        _, predicted = torch.max(output_1, 1)
        if predicted == target:
            correct_1 += 1

    print(correct_0 / total * 100.0)
    print(correct_1 / total * 100.0)

In [29]:
train_multi_model()

device cuda
epoch: 0
2.0938072204589844, 2.027334690093994
-------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                   [memory]         0.00%       0.000us         0.00%       0.000us       0.000us       0.000us         0.00%       0.000us       0.000us           0 b           0 b     -16.08 Mb  



In [None]:
def train_resnet18():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'device {device}')

    model = models.resnet18()
    model.fc = nn.Linear(512, 10)
    model = model.to(device)
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    for epoch in range(epochs):
        print(f'epoch: {epoch}')
        for batch_idx, (input_data, target) in enumerate(train_data_loader, 0):
            optimizer.zero_grad()

            input_data = input_data.to(device)
            target = target.to(device)

            output = model(input_data)

            loss = loss_fn(output, target)
            loss.backward()
            optimizer.step()

            if batch_idx % 10000 == 0:
                print(loss)

    correct = 0
    total = len(test_data_loader)
    for batch_idx, (input_data, target) in enumerate(test_data_loader, 0):
        input_data = input_data.to(device)
        target = target.to(device)

        output = model(input_data)

        _, predicted = torch.max(output, 1)

        if predicted == target:
            correct += 1
    print(correct / total * 100.0)

In [None]:
#train_resnet18()