## Objectives
* Instead of building a simple model, try creating a more advanced CNN model that can be easily tuned
* Create models with varying depth (number of convolutional layers) to observe performance vs time consumption
* Apply various hyperparameter tuning techniques to CNN models

In [1]:
import torch
import torchvision
from torchvision import transforms, datasets
from torchvision.transforms import Normalize, ToTensor
import torch.nn as nn  # neural network
import torch.optim as optim  # optimization layer
import torch.nn.functional as F  # activation functions
import matplotlib.pyplot as plt
import argparse
import time
from collections import OrderedDict

In [2]:
# prefer the GPU when CUDA is usable, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# EMNIST "balanced" split stored under ./data (download=True asks torchvision
# to fetch it); the only preprocessing is the conversion to tensors.
# The generator is unpacked in order: train split first, then test split.
train_set, test_set = (
    datasets.EMNIST(
        root="data",
        split="balanced",
        train=is_train,
        download=True,
        transform=transforms.Compose([ToTensor()]),
    )
    for is_train in (True, False)
)

In [3]:
# Layer recipes, read left to right when the convolutional stack is built:
#   integer -> a convolution with that many output channels
#   'M'     -> max pooling
#   'D'     -> dropout
# The three recipes share the same 64/128/256/512 stages and differ only in
# how many convolutions (1, 2 or 3) are stacked inside each stage.
model_codes = dict(
    model_1=[64, 'M', 128, 'M', 'D', 256, 'M', 512, 'M', 'D'],
    model_2=[64, 64, 'M', 128, 128, 'M', 'D', 256, 256, 'M', 512, 512, 'M', 'D'],
    model_3=[64, 64, 64, 'M', 128, 128, 128, 'M', 'D', 256, 256, 256, 'M', 512, 512, 512, 'M', 'D'],
)

For a simple CNN model like below, you have to add layers manually, which is a problem when creating a larger neural network.

In [4]:
class Simple_CNN(nn.Module):
    '''
    Hand-wired baseline CNN: three 5x5 convolutions, one dropout layer and
    three linear layers ending in 47 outputs.

    Every layer is declared by hand, so changing the depth means editing
    both __init__ and forward.

    fc1 expects 128*3*3 features, which is what a 1x28x28 input produces:
    28 -> 24 (cv1) -> 20 (cv2) -> 10 (pool) -> 6 (cv3) -> 3 (pool).
    '''
    def __init__(self):
        # the original called super(Net, self), but no class named Net exists
        super().__init__()
        # 3 convolutional layers
        # input channels: 1 if grayscale, 3 if RGB (keyword is in_channels, not in_channel)
        self.cv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5, stride=1)
        self.cv2 = nn.Conv2d(16, 64, 5)
        self.cv3 = nn.Conv2d(64, 128, 5)
        self.dropout1 = nn.Dropout(0.2)

        # Dense layers - (fully connected)
        self.fc1 = nn.Linear(in_features=128*3*3, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=128)
        self.out = nn.Linear(in_features=128, out_features=47)

    def forward(self, x):
        '''
        Map a batch of images to per-class probabilities.

            Parameters:
                x (Tensor): image batch; assumed to be (N, 1, 28, 28) so that
                            the flattened features match fc1

            Returns:
                x (Tensor): (N, 47) tensor whose rows are softmax probabilities
        '''
        # hidden convolutional layers
        x = F.relu(self.cv1(x))
        x = F.relu(self.cv2(x))
        x = F.max_pool2d(x, kernel_size=2, stride=2)
        x = F.relu(self.cv3(x))
        x = F.max_pool2d(x, kernel_size=2, stride=2)

        x = self.dropout1(x)

        # hidden linear layers
        x = torch.flatten(x, 1)  # keep the batch dimension, flatten the rest
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))

        # output layer
        x = self.out(x)
        # NOTE(review): nn.CrossEntropyLoss applies log-softmax itself, so feed
        # it the pre-softmax values if this model is ever trained with that loss
        x = F.softmax(x, dim=1)

        return x

I added a method to the CNN class that automatically builds the convolutional layers from a list of layer specifications (output-channel counts, plus 'M' for max pooling and 'D' for dropout).

In [5]:
# easily tunable model
# easily tunable model
class CNN(nn.Module):
    '''
    Configurable CNN whose convolutional stack is generated from a layer recipe.

        Parameters:
            model_code (str or sequence): a key of the global model_codes dict,
                or the recipe itself (ints = conv output channels,
                'M' = max pooling, 'D' = dropout)
            in_channels (int): channels of the input images
            out_dim (int): number of output classes
            act (str): 'relu' or 'leakyrelu'
            use_bn (bool): add BatchNorm2d after every convolution
            dropout (float): probability used by every 'D' entry

        Raises:
            ValueError: unknown activation name, or a recipe without any convolution

    The classifier assumes the feature map is 1x1 when it leaves the
    convolutional stack (e.g. 28x28 inputs after four 'M' steps), so its
    input width equals the channel count of the last convolution.
    '''
    def __init__(self, model_code, in_channels, out_dim, act, use_bn, dropout):
        super(CNN, self).__init__()

        if act == 'relu':
            self.act = nn.ReLU()
        elif act == 'leakyrelu':
            self.act = nn.LeakyReLU()
        else:
            raise ValueError("Not a valid activation function")

        # width of the last convolution = number of features fed to the classifier
        conv_widths = [x for x in self._resolve_recipe(model_code) if x not in ('M', 'D')]
        if not conv_widths:
            raise ValueError("model_code must contain at least one convolution")

        self.layers = self.make_layers(model_code, in_channels, use_bn, dropout)
        self.classifier = nn.Sequential(nn.Linear(conv_widths[-1], 256),
                                        self.act,
                                        nn.Linear(256, out_dim)
                                       )

    def forward(self, x):
        '''
        Return raw class scores (logits) for a batch of images.
        '''
        x = self.layers(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        # skipped softmax since cross-entropy loss is used
        return x

    @staticmethod
    def _resolve_recipe(model_code):
        '''
        Turn a model_codes key into its recipe; pass explicit recipes through.
        '''
        if isinstance(model_code, str):
            return model_codes[model_code]
        return list(model_code)

    def make_layers(self, model_code, in_channels, use_bn, dropout):
        '''
        Build the convolutional stack described by model_code.

            Parameters:
                model_code (str or sequence): recipe key or explicit recipe
                in_channels (int): channels of the input images
                use_bn (bool): add BatchNorm2d after every convolution
                dropout (float): probability used by every 'D' entry

            Returns:
                (nn.Sequential): the assembled layers
        '''
        layers = []
        for x in self._resolve_recipe(model_code):
            if x == "M":
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            elif x == 'D':
                layers += [nn.Dropout(dropout)]
            else:
                # 3x3 convolution with padding 1 keeps the spatial size unchanged
                layers += [nn.Conv2d(in_channels=in_channels,
                                    out_channels=x,
                                    kernel_size=3,
                                    stride=1,
                                    padding=1)]
                if use_bn:
                    layers += [nn.BatchNorm2d(x)]
                layers += [self.act]
                in_channels = x  # the next convolution consumes this one's output
        return nn.Sequential(*layers)

# Train, Validate, Test Functions

In [6]:
def train(net, optimizer, criterion, args):
    '''
    Train the model for one epoch on the global train_set and return the
    trained model together with the training loss and accuracy

        Parameters:
            net (CNN): a convolutional neural network to train
            optimizer: optimizer
            criterion (loss function): a loss function to evaluate the model on
            args (ArgumentParser): hyperparameters (train_batch is read here)

        Returns:
            net (CNN): a trained model
            train_loss (float): train loss, averaged over the batches
            train_acc (float): train accuracy in percent
    '''
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=args.train_batch, shuffle=True)

    # send each batch to wherever the model lives instead of assuming CUDA,
    # so the same code runs on CPU-only machines
    first_param = next(net.parameters(), None)
    net_device = first_param.device if first_param is not None else torch.device("cpu")

    net.train()

    correct = 0
    total = 0
    train_loss = 0

    for inputs, labels in train_loader:
        inputs = inputs.to(net_device)
        labels = labels.to(net_device)

        optimizer.zero_grad()

        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        # the class with the highest value is the prediction
        _, prediction = torch.max(outputs.detach(), 1)  # grab prediction as one-dimensional tensor
        total += labels.size(0)
        correct += (prediction == labels).sum().item()

    train_loss = train_loss / len(train_loader)
    train_acc = 100 * correct / total

    return net, train_loss, train_acc  # net is returned to be fed to the test function later

In [7]:
def validate(net, criterion, args):
    '''
    Evaluate the model on the global val_set and return validation loss and accuracy

        Parameters:
            net (CNN): a convolutional neural network to validate
            criterion (loss function): a loss function to evaluate the model on
            args (ArgumentParser): hyperparameters (test_batch is read here)

        Returns:
            val_loss (float): validation loss, averaged over the batches
            val_acc (float): validation accuracy in percent
    '''
    # no shuffling: the loss is a mean of per-batch means, so a fixed batch
    # composition makes the reported value repeatable
    val_loader = torch.utils.data.DataLoader(val_set, batch_size=args.test_batch, shuffle=False)

    # send each batch to wherever the model lives instead of assuming CUDA
    first_param = next(net.parameters(), None)
    net_device = first_param.device if first_param is not None else torch.device("cpu")

    net.eval()

    correct = 0
    total = 0
    val_loss = 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(net_device)
            labels = labels.to(net_device)
            outputs = net(inputs)

            loss = criterion(outputs, labels)

            val_loss += loss.item()
            # the class with the highest value is the prediction
            _, prediction = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (prediction == labels).sum().item()

    val_loss = val_loss / len(val_loader)
    val_acc = 100 * correct / total

    return val_loss, val_acc

In [8]:
def test(net, args):
    '''
    Evaluate a trained model on the global test_set and return test accuracy

        Parameters:
            net (CNN): a trained model
            args (ArgumentParser): hyperparameters (test_batch is read here)

        Returns:
            test_acc (float): test accuracy of a trained model, in percent
    '''
    # accuracy does not depend on sample order, so shuffling is unnecessary
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=args.test_batch, shuffle=False)

    # send each batch to wherever the model lives instead of assuming CUDA
    first_param = next(net.parameters(), None)
    net_device = first_param.device if first_param is not None else torch.device("cpu")

    net.eval()

    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(net_device)
            labels = labels.to(net_device)
            outputs = net(inputs)

            # the class with the highest value is the prediction
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    test_acc = 100 * correct / total

    return test_acc

In [9]:
def experiment(args):
    '''
    Execute train and validate functions epoch-times to train a CNN model.
    Each time, store train & validation loss and accuracy.
    Then, test the model and return the result.

        Parameter:
            args (ArgumentParser): hyperparameters

        Returns:
            settings (Dictionary): snapshot of the hyperparameters used for this run
            results (OrderedDict): stored stats of each epoch + test accuracy;
                                   train_acc / val_acc hold the last epoch's values
                                   (None when args.epoch is 0)

        Raises:
            ValueError: args.optim is neither 'adam' nor 'sgd'
    '''
    # use the GPU if available, otherwise run on the CPU
    run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    net = CNN(model_code = args.model_code,
              in_channels = args.in_channels,
              out_dim = args.out_dim,
              act = args.act,
              use_bn = args.use_bn,
              dropout = args.dropout
             )
    net = net.to(run_device)
    criterion = nn.CrossEntropyLoss()

    # select an optimizer
    if args.optim == 'adam':
        optimizer = optim.Adam(net.parameters(), lr=args.lr)  # learning rate
    elif args.optim == 'sgd':
        optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9)
    else:
        raise ValueError('Invalid optimizer')

    # containers to keep track of statistics
    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []
    time_total = 0

    for epoch in range(args.epoch):  # number of training to be completed
        # the timed span covers one training pass plus one validation pass
        time_start = time.time()
        net, train_loss, train_acc = train(net, optimizer, criterion, args)
        val_loss, val_acc = validate(net, criterion, args)
        time_end = time.time()

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accs.append(train_acc)
        val_accs.append(val_acc)

        time_duration = round(time_end - time_start, 2)
        time_total += time_duration

        # print results of each iteration
        print(f'Epoch {epoch+1}, Accuracy(train, validation):{round(train_acc, 2), round(val_acc, 2)}, '
              f'Loss(train, validation):{round(train_loss, 4), round(val_loss, 4)}, Time: {time_duration}s')

    test_acc = test(net, args)

    results = OrderedDict()
    results['train_losses'] = [round(x, 4) for x in train_losses]
    results['val_losses'] = [round(x, 4) for x in val_losses]
    results['train_accs'] = [round(x, 2) for x in train_accs]
    results['val_accs'] = [round(x, 2) for x in val_accs]
    # read the final values from the lists so a zero-epoch run does not hit
    # an unbound loop variable
    results['train_acc'] = round(train_accs[-1], 2) if train_accs else None
    results['val_acc'] = round(val_accs[-1], 2) if val_accs else None
    results['test_acc'] = round(test_acc, 2)
    results['time_total'] = round(time_total, 2)

    # copy the settings: vars(args) is a live view that would change when the
    # caller mutates args for the next experiment
    return dict(vars(args)), results

In [10]:
# A notebook has no command line, so parse an empty argument list and use the
# resulting Namespace as a plain container for hyperparameters.
parser = argparse.ArgumentParser()
args = parser.parse_args([])

vars(args).update(
    #### Model Capacity ####
    model_code='model_2',
    in_channels=1,
    out_dim=47,
    act='relu',

    #### Regularization ####
    dropout=0.3,
    use_bn=True,

    #### Optimization ####
    optim='sgd',
    lr=0.001,  # learning rate
    epoch=10,
    train_batch=256,
    test_batch=256,
)

#### Experimental Variables ####
models = ['model_1', 'model_2', 'model_3']
optims = ['adam', 'sgd']
split_sizes = [0.7, 0.8, 0.85]

# The loader uses the default batch size of 1, so its length (a batch count)
# equals the number of samples in train_set.
entire_trainset = torch.utils.data.DataLoader(train_set, shuffle=True)
split_train_size = int(0.8 * len(entire_trainset))  # 80% of the samples for training
split_valid_size = len(entire_trainset) - split_train_size  # remaining 20% for validation

# train_set is rebound to the training part of the random split
train_set, val_set = torch.utils.data.random_split(
    train_set,
    [split_train_size, split_valid_size],
)
print(f'train set size: {split_train_size}, validation set size: {split_valid_size}')

print(args)

train set size: 90240, validation set size: 22560
Namespace(act='relu', dropout=0.3, epoch=10, in_channels=1, lr=0.001, model_code='model_2', optim='sgd', out_dim=47, test_batch=256, train_batch=256, use_bn=True)


I will first focus on the impact of model structure and optimizer choice, as I believe they are the hyperparameters with the
biggest influence.

In [11]:
# Run one experiment for every (architecture, optimizer) pairing, architectures
# in the outer position. The shared args object is updated in place before each run.
for model, opt in [(m, o) for m in models for o in optims]:
    print(f'model code: {model} with {opt} optimizer')
    args.model_code, args.optim = model, opt

    setting, results = experiment(args)

    print(setting)
    print('Test Accuracy: {}'.format(results['test_acc']))
    print('Total time duration: {}'.format(results['time_total']))
    print()

model code: model_1 with adam optimizer
Epoch 1, Accuracy(train, validation):(77.89, 86.23), Loss(train, validation):(0.7144, 0.3755), Time: 16.93s
Epoch 2, Accuracy(train, validation):(86.17, 87.09), Loss(train, validation):(0.3871, 0.3483), Time: 16.24s
Epoch 3, Accuracy(train, validation):(87.61, 88.16), Loss(train, validation):(0.3419, 0.3223), Time: 16.39s
Epoch 4, Accuracy(train, validation):(88.41, 88.15), Loss(train, validation):(0.313, 0.3167), Time: 16.37s
Epoch 5, Accuracy(train, validation):(89.08, 88.67), Loss(train, validation):(0.291, 0.3097), Time: 16.01s
Epoch 6, Accuracy(train, validation):(89.53, 89.0), Loss(train, validation):(0.2758, 0.3017), Time: 15.88s
Epoch 7, Accuracy(train, validation):(89.91, 89.18), Loss(train, validation):(0.2617, 0.2938), Time: 15.91s
Epoch 8, Accuracy(train, validation):(90.2, 89.14), Loss(train, validation):(0.2511, 0.3038), Time: 16.49s
Epoch 9, Accuracy(train, validation):(90.71, 88.82), Loss(train, validation):(0.239, 0.3028), Time: 

## Observation
As neural nets became more complex with more nodes:
   * execution time got extended significantly
   * train loss and validation loss did not change with the adam optimizer
       * surprisingly, model_1 with the adam optimizer yielded the best result among all models under the same condition.
   * train loss and validation loss improved gradually with the SGD optimizer
   
Side Notes
   * the Adam optimizer tended to overfit the data in all models
   
   
## Conclusion
* After a certain number of convolutional layers, the difference in performance between models is quite insignificant; however, the execution time always tends to increase with depth, causing unwanted inefficiency. Therefore, making a convolutional neural network more complex does not by itself make it more efficient: the extra depth costs time without a matching gain in accuracy.

Later: Model of choice to test hyperparameter tuning: model_2 with SGD