### Importing Libraries

In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import argparse
import numpy as np
import time

### Data Preparation

In [2]:
# Convert each image to a tensor, then normalize every RGB channel with mean 0.5 / std 0.5.
transform = transforms.Compose(
            [transforms.ToTensor(),
             transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10('./data', train=True,
                                        download=True, transform=transform)
# Split the 50,000 training images into 40,000 train / 10,000 validation samples.
# An explicitly seeded generator keeps the split identical across kernel restarts;
# without it the split is drawn from the global RNG before any seed has been set,
# so every run would validate on a different subset.
split_generator = torch.Generator().manual_seed(123)
trainset, valset = torch.utils.data.random_split(trainset, [40000, 10000],
                                                 generator=split_generator)
testset =  torchvision.datasets.CIFAR10('./data', train=False,
                                        download=True, transform=transform)
# Single lookup table used by the train / validate / test helpers.
partition = {'train': trainset, 'val': valset, 'test': testset}

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


### Model Architecture

In [3]:
class MLP(nn.Module):
    """Multi-layer perceptron: ``fc1`` -> ``n_layer - 1`` hidden blocks -> ``fc2``.

    Each hidden block applies Linear -> activation -> (optional) BatchNorm1d -> Dropout.

    Args:
        in_dim: number of features of each (already flattened) input sample.
        out_dim: number of values produced by the output layer ``fc2``.
        hid_dim: width of ``fc1`` and of every hidden block.
        n_layer: ``n_layer - 1`` hidden blocks are created after ``fc1``.
        act: activation name, one of ``'relu'``, ``'tanh'`` or ``'sigmoid'``.
        dropout: probability handed to ``nn.Dropout``.
        use_bn: if true, a ``BatchNorm1d`` follows the activation of each hidden block.
        use_xavier: if true, ``xavier_init`` is run at the end of construction.

    Raises:
        ValueError: if ``act`` is not one of the supported names.
    """

    def __init__(self, in_dim, out_dim, hid_dim, n_layer, act, dropout, use_bn, use_xavier):
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.hid_dim = hid_dim
        self.n_layer = n_layer
        self.act = act
        self.dropout = dropout
        self.use_bn = use_bn
        self.use_xavier = use_xavier

        # ===== Create Linear Layers ===== #
        self.fc1 = nn.Linear(self.in_dim, self.hid_dim)
        self.linears = nn.ModuleList()
        self.bns = nn.ModuleList()  # stays empty when use_bn is false
        for _ in range(self.n_layer - 1):
            self.linears.append(nn.Linear(self.hid_dim, self.hid_dim))
            if self.use_bn:
                self.bns.append(nn.BatchNorm1d(self.hid_dim))
        self.fc2 = nn.Linear(self.hid_dim, self.out_dim)

        # ===== Create Activation Function ===== #
        # self.act is replaced in place: it enters as a name and ends up as a module.
        if self.act == 'relu':
            self.act = nn.ReLU()
        elif self.act == 'tanh':
            self.act = nn.Tanh()
        elif self.act == 'sigmoid':
            self.act = nn.Sigmoid()
        else:
            raise ValueError('no valid activation function selected.')

        # ===== Create Regularization Layer and Initialize weights ===== #
        # self.dropout likewise goes from a probability to the Dropout module.
        self.dropout = nn.Dropout(self.dropout)
        if self.use_xavier:
            self.xavier_init()

    def xavier_init(self):
        """Apply Xavier-normal weights and a constant 0.01 bias to the hidden Linear layers.

        Only the layers in ``self.linears`` are touched; ``fc1`` and ``fc2`` keep
        whatever initialization ``nn.Linear`` gave them.
        """
        for linear in self.linears:
            nn.init.xavier_normal_(linear.weight)
            linear.bias.data.fill_(0.01)

    def forward(self, x):
        """Return the output of ``fc2`` for a batch ``x`` of flattened samples."""
        x = self.act(self.fc1(x))
        for i, linear in enumerate(self.linears):
            x = self.act(linear(x))
            # self.bns is empty when batch norm is disabled, so indexing it
            # unconditionally would raise IndexError.
            if self.use_bn:
                x = self.bns[i](x)
            x = self.dropout(x)
        return self.fc2(x)

### Train, Validate, Test and Experiment Functions

In [10]:
def train(net, partition, optimizer, criterion, args):
    """Run one training epoch over ``partition['train']``.

    Args:
        net: model to optimize; it receives every batch flattened to
            ``(batch, features)``.
        partition: mapping whose ``'train'`` entry is a dataset yielding
            ``(inputs, labels)`` pairs.
        optimizer: optimizer that is zeroed and stepped once per mini-batch.
        criterion: loss callable invoked as ``criterion(outputs, labels)``.
        args: namespace providing ``train_batch_size`` and ``device``; an
            optional ``num_workers`` attribute overrides the default of 2
            DataLoader workers.

    Returns:
        ``(net, train_loss, train_acc)``: the same model object, the mean of
        the per-batch losses, and the accuracy in percent.
    """
    trainloader = torch.utils.data.DataLoader(partition['train'],
                                              batch_size=args.train_batch_size,
                                              shuffle=True,
                                              num_workers=getattr(args, 'num_workers', 2))
    net.train()
    correct = 0
    total = 0
    train_loss = 0.0
    for inputs, labels in trainloader:
        optimizer.zero_grad()

        # Flatten each sample individually instead of assuming 3*32*32 features,
        # so the loop works for whatever in_dim the model was built with.
        inputs = inputs.view(inputs.size(0), -1)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)
        outputs = net(inputs)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        predicted = outputs.argmax(dim=1)  # index of the largest output per sample
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    train_loss = train_loss / len(trainloader)
    train_acc = 100 * correct / total
    return net, train_loss, train_acc

In [11]:
def validate(net, partition, criterion, args):
    """Evaluate ``net`` on ``partition['val']`` without tracking gradients.

    Args:
        net: model to evaluate; it is switched to eval mode and receives every
            batch flattened to ``(batch, features)``.
        partition: mapping whose ``'val'`` entry is a dataset yielding
            ``(images, labels)`` pairs.
        criterion: loss callable invoked as ``criterion(outputs, labels)``.
        args: namespace providing ``test_batch_size`` and ``device``; an
            optional ``num_workers`` attribute overrides the default of 2
            DataLoader workers.

    Returns:
        ``(val_loss, val_acc)``: the mean of the per-batch losses and the
        accuracy in percent.
    """
    valloader = torch.utils.data.DataLoader(partition['val'],
                                            batch_size=args.test_batch_size,
                                            shuffle=False,
                                            num_workers=getattr(args, 'num_workers', 2))
    net.eval()
    correct = 0
    total = 0
    val_loss = 0.0
    with torch.no_grad():
        for images, labels in valloader:
            # Flatten each sample individually instead of assuming 3*32*32 features,
            # so the loop works for whatever in_dim the model was built with.
            images = images.view(images.size(0), -1)
            images = images.to(args.device)
            labels = labels.to(args.device)
            outputs = net(images)

            val_loss += criterion(outputs, labels).item()
            predicted = outputs.argmax(dim=1)  # index of the largest output per sample
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_loss = val_loss / len(valloader)
    val_acc = 100 * correct / total
    return val_loss, val_acc

In [12]:
def test(net, partition, args):
    testloader = torch.utils.data.DataLoader(partition['test'],
                                              batch_size=args.test_batch_size,
                                              shuffle=False,
                                              num_workers=2)
    net.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images = images.view(-1, 3*32*32)
            images = images.to(args.device)
            labels = labels.to(args.device)
            outputs = net(images)

            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        test_acc = 100 * correct / total
    return test_acc   

In [13]:
def experiment(partition, args):
    """Build an MLP from ``args``, train it for ``args.epoch`` epochs and test it.

    After every epoch the train/validation accuracy, loss and elapsed time are
    printed. The test accuracy is measured once, after the last epoch.

    Args:
        partition: mapping with ``'train'``, ``'val'`` and ``'test'`` datasets,
            passed through to ``train``, ``validate`` and ``test``.
        args: namespace providing the model settings (``in_dim``, ``out_dim``,
            ``hid_dim``, ``n_layer``, ``act``, ``dropout``, ``use_bn``,
            ``use_xavier``), the optimizer settings (``optim``, ``lr``, ``l2``),
            ``epoch`` and ``device``.

    Returns:
        ``(train_loss, val_loss, train_acc, val_acc, test_acc)`` from the final
        epoch. The four train/validation values are NaN when ``args.epoch`` is 0.

    Raises:
        ValueError: if ``args.optim`` is not ``'SGD'``, ``'RMSprop'`` or ``'Adam'``.
    """
    net = MLP(args.in_dim, args.out_dim, args.hid_dim, args.n_layer, args.act, args.dropout, args.use_bn, args.use_xavier)
    net = net.to(args.device)

    criterion = nn.CrossEntropyLoss()
    optimizer_classes = {'SGD': optim.SGD, 'RMSprop': optim.RMSprop, 'Adam': optim.Adam}
    if args.optim not in optimizer_classes:
        raise ValueError('Invalid optimizer choice')
    # args.l2 is applied as the optimizer's weight decay.
    optimizer = optimizer_classes[args.optim](net.parameters(), lr=args.lr, weight_decay=args.l2)

    # Pre-set the metrics so a zero-epoch run returns NaN instead of failing on
    # unbound names at the return statement.
    train_loss = val_loss = train_acc = val_acc = float('nan')
    for epoch in range(args.epoch):
        ts = time.time()
        net, train_loss, train_acc = train(net, partition, optimizer, criterion, args)
        val_loss, val_acc          = validate(net, partition, criterion, args)
        te = time.time()
        print('Epoch {:3d}  Accuracy(train/val): {:2.2f}%/{:2.2f}%  Loss(train/val): {:2.2f}/{:2.2f}  Took {:2.2f} sec'.format(epoch+1, train_acc, val_acc, train_loss, val_loss, te-ts))

    test_acc = test(net, partition, args)
    return train_loss, val_loss, train_acc, val_acc, test_acc

### Experiment

In [None]:
# ===== Random Seed ===== #
# Seed NumPy and PyTorch once, before the first model is built.
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)

# Parsing an empty argument string yields an empty namespace that serves as a
# plain settings container for the experiment helpers.
parser = argparse.ArgumentParser()
args = parser.parse_args("")

# Settings shared by every run. The insertion order matches the order in which
# the attributes are attached to ``args``.
fixed_settings = {
    # ===== Device Allocation ===== #
    'device': 'cuda:0' if torch.cuda.is_available() else 'cpu',

    # ===== Model Capacity ===== #
    'in_dim': 3*32*32,
    'out_dim': 10,
    'hid_dim': 100,  # placeholder; replaced by the sweep below
    'act': 'relu',

    # ===== Regularization ===== #
    'dropout': 0.2,
    'use_bn': True,
    'l2': 0.00001,
    'use_xavier': True,

    # ===== Optimizer & Training ===== #
    'optim': 'Adam',  # 'SGD', 'RMSprop', 'Adam'
    'lr': 0.001,
    'epoch': 10,

    'train_batch_size': 256,
    'test_batch_size': 1024,
}
for key, value in fixed_settings.items():
    setattr(args, key, value)

# ===== Experiment Variable ===== #
# Grid over network depth (outer loop) and hidden width (inner loop).
list_n_layer = [3, 4, 5]
list_hid_dim = [300, 500, 700]

for n_layer in list_n_layer:
    for hid_dim in list_hid_dim:
        args.n_layer = n_layer
        args.hid_dim = hid_dim
        print(args)
        train_loss, val_loss, _, _, test_acc = experiment(partition, args)
        print('---'*20)
        print('Test Accuracy: {:2.3f}%   Train Loss: {:2.5f}  Val Loss: {:2.5f}'.format(test_acc, train_loss, val_loss))
        print('---'*20)


Namespace(act='relu', device='cpu', dropout=0.2, epoch=10, hid_dim=300, in_dim=3072, l2=1e-05, lr=0.001, n_layer=3, optim='Adam', out_dim=10, test_batch_size=1024, train_batch_size=256, use_bn=True, use_xavier=True)
Epoch   1  Accuracy(train/val): 37.90%/45.61%  Loss(train/val): 1.74/1.54  Took 16.67 sec
Epoch   2  Accuracy(train/val): 46.25%/48.97%  Loss(train/val): 1.50/1.44  Took 16.16 sec
Epoch   3  Accuracy(train/val): 50.76%/49.12%  Loss(train/val): 1.38/1.41  Took 16.87 sec
Epoch   4  Accuracy(train/val): 53.22%/51.91%  Loss(train/val): 1.31/1.36  Took 16.33 sec
Epoch   5  Accuracy(train/val): 55.67%/52.27%  Loss(train/val): 1.23/1.35  Took 17.95 sec
Epoch   6  Accuracy(train/val): 58.03%/52.80%  Loss(train/val): 1.17/1.35  Took 16.34 sec
Epoch   7  Accuracy(train/val): 59.84%/52.90%  Loss(train/val): 1.12/1.35  Took 16.31 sec
Epoch   8  Accuracy(train/val): 61.70%/53.97%  Loss(train/val): 1.08/1.33  Took 16.43 sec
Epoch   9  Accuracy(train/val): 63.57%/54.06%  Loss(train/val): 

### Experiment