In [1]:
from torchvision import datasets

from functools import partial

# preprocessing
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torch.utils.data import random_split

# model
import torch
from torch import nn
from torch import optim

# hyperparameter tuning
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

ModuleNotFoundError: No module named 'ray'

# 1. Load Data

In [24]:
def load_data(data_dir):
    """ Create train and test pytorch dataset objects from CIFAR10.
    
    The following tranformations are applied on CIFAR10:
        * rgb to grayscale,
        * standardization
    
    Args:
        data_dir:
            directory where data will be saved, as a string.
    
    Returns:
        train and test dataset, as pytorch dataset objects.
    """
    transform = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.ToTensor(),
        transforms.Normalize(0.5, 0.5)
    ])

    trainset = datasets.CIFAR10(root=data_dir,
                                train=True, 
                                download=True, 
                                transform=transform)

    testset = datasets.CIFAR10(root=data_dir, 
                               train=False, 
                               download=True, 
                               transform=transform)

    return trainset, testset

In [None]:
training_data, test_data = load_data(data_dir='cifar10')

In [None]:
print(f'Training data: {training_data}')
print(f'Test data: {test_data}')

# 2. Neural Network

* [ ] He initialization
* [x] ReLU activation function **(hidden layer)**
* [x] Softmax activation function **(output layer)**
* [x] Cross Entropy Loss
* [x] Adam optimizer
* [x] Dropout 

In [12]:
class NeuralNetwork(nn.Module):
    """ Represents a neural network for multiclass classifiacation.
    
    This neural network uses a ReLU activation function in hidden layers and a Softmax 
    activation function in output layer. Also, dropout layers have been defined. <br>
    The parameters of the neural network are:
        * number of hidden layers,
        * sizes of input, hidden and output layers (number of neurons),
        * probability of dropout layer.
    
    Arguments:
        num_layers:
            number of hidden layers, as an integer.
        input_size:
            size of input (size of embeddings), as an integer.
        hidden_size:
            number of neurons in hidden layers, as an integer.
        output_size:
            size of output (number of labels), as an integer.
        p:
            dropout probability, as a float.
    """
    
    def __init__(self, num_layers, input_size, hidden_size, output_size, p):
        """ Initialize neural network.
        
        Args:
            num_layers:
                number of hidden layers, as an integer.
            input_size:
                size of input (size of embeddings), as an integer.
            hidden_size:
                number of neurons in hidden layers, as an integer.
            output_size:
                size of output (number of labels), as an integer.
            p:
                dropout probability, as a float.
        """
        super(NeuralNetwork, self).__init__()
        
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.p = p
        
        # flatten
        self.flatten = nn.Flatten()
        # input
        self.input_linear = nn.Linear(input_size, hidden_size)
        # hidden
        self.hidden_linears = nn.ModuleList([nn.Linear(hidden_size, hidden_size) for i in range(num_layers)])
        # output
        self.output_linear = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        logits = self.flatten(x)
        
        logits = nn.ReLU()(self.input_linear(logits))
        logits = nn.Dropout(self.p)(logits)

        for i, layer in enumerate(self.hidden_linears):
            logits = nn.ReLU()(layer(logits))
            logits = nn.Dropout(self.p)(logits)

        logits = self.output_linear(logits)
        
        return logits

# 3. Hyperparameter Tuning

* [x] early stopping

In [None]:
def tune_fit(config, data_dir, input_size, num_labels, batch_size, num_workers, epochs):
    """ Fit neural network and hyperparameter tune in validation set.
    
    The validation set is 20% hold out data from the training set. <br>
    The metric used is accuracy score.
    
    Args:
        config:
            hyperparameters of neural network, as a dictionary.
        data_dir:
            directory where data will be saved, as a string.
        input_size:
            size of input, as an integer.
        num_labels:
            number of labels, as an integer.
        batch_size:
            size of batches to be processed, as an integer.
        num_workers:
            how many subprocesses to use for data loading (0 means that the data 
            will be loaded in the main process), as an integer.
        epochs:
            number of epochs (times the neural network will see the data), as an integer.
    """
    # configurable hidden layer dimensions
    model = NeuralNetwork(num_layers=config['num_layers'], 
                          input_size=input_size,
                          hidden_size=config['hidden_size'],
                          output_size=num_labels,
                          p=config['p'])
    
    # support data parallel training on multiple GPUs
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            model = nn.DataParallel(model)
    model.to(device)
    
    loss_fn = nn.CrossEntropyLoss()
    
    # configurable learning rate in optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])

    trainset, testset = load_data(data_dir)
    
    # split data: training (80%) - validation (20%)
    test_abs = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(trainset, [test_abs, len(trainset) - test_abs])

    train_dataloader = DataLoader(train_subset,
                                  batch_size=batch_size,
                                  shuffle=True, 
                                  num_workers=num_workers)
    
    val_dataloader = DataLoader(val_subset,
                                batch_size=batch_size,
                                shuffle=True,
                                num_workers=num_workers)
    
    for epoch in range(epochs):
        running_loss = 0.0
        epoch_steps = 0
        model.train()  # put on train mode
        for batch, (X, Y) in enumerate(train_dataloader, 0):
            # send the data to the GPU memory explicitly
            X, Y = X.to(device), Y.to(device)

            # reset the gradients
            optimizer.zero_grad()
            
            # compute prediction
            pred = model(X)

            # compute loss
            loss = loss_fn(pred, Y)

            # backpropagate
            loss.backward()

            # update parameters
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            # if i % 2000 == 1999:  # print every 2000 mini-batches
                # print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / epoch_steps))
            running_loss = 0.0

        # validation loss
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        model.eval()  # put on evaluation mode
        for batch, (X, Y) in enumerate(val_dataloader, 0):
            with torch.no_grad():
                X, Y = X.to(device), Y.to(device)

                pred = model(X)
                
                _, predicted = torch.max(pred.data, 1)
                total += Y.size(0)
                correct += (predicted == Y).sum().item()

                loss = loss_fn(pred, Y)
                val_loss += loss.cpu().numpy()
                val_steps += 1

        tune.report(loss=(val_loss / val_steps), accuracy=correct / total)

In [None]:
def hyperparameter_tune(batch_size,
                        epochs,
                        config,
                        grace_period,
                        num_samples, 
                        cpus_per_trial,
                        gpus_per_trial):
    """ Run hyperparameter tuning and report best hyperparameters.
    
    Args:
        batch_size:
            size of batches to be processed, as an integer.
        epochs:
            number of epochs (times the neural network will see the data), as an integer.
        config:
            hyperparametes, as a dictionary.
        grace_period:
            stop trials at least this old in time, as an integer.
        num_samples:
            number of times to sample from the hyperparameter space (if grid_search is provided as an argument,
            the grid will be repeated num_samples of times), as an integer.
        cpus_per_trial:
            CPUs to allocate per trial, as integer.
        gpus_per_trial:
            GPUs to allocate per trial, as integer.
    """
    input_size = 32 * 32
    num_labels = 10
    data_dir = 'cifar10'
    
    # hyperparameter search space
    cofig = config
    
    # used for early stopping
    scheduler = ASHAScheduler(metric="loss", 
                              mode="min",
                              max_t=epochs,
                              grace_period=grace_period,
                              reduction_factor=2)
    
    reporter = CLIReporter(metric_columns=["loss", "accuracy", "training_iteration"])
    
    # hyperparamet tuning
    # data_dir, input_size, num_labels, batch_size, num_workers, epochs
    result = tune.run(partial(tune_fit, 
                              data_dir=data_dir
                              input_size=input_size, 
                              num_labels=num_labels,
                              batch_size=batch_size,
                              num_workers=3, 
                              epochs=epochs),
                      resources_per_trial={"cpu": cpus_per_trial, "gpu": gpus_per_trial},
                      config=config,
                      num_samples=num_samples,
                      scheduler=scheduler,
                      progress_reporter=reporter)
    
    # report best results
    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(best_trial.last_result["loss"]))
    print("Best trial final validation acurracy: {}".format(best_trial.last_result["f1"]))

In [None]:
# hyperparameter search space
config = {
        "num_layers": tune.grid_search([3, 5]),
        "lr": tune.grid_search([1e-3, 1e-4]),
        "hidden_size": tune.grid_search([100]),
        "p": tune.grid_search([0.5])
    }

In [None]:
hyperparameter_tune(batch_size=64,
                    epochs=90,
                    config=config,
                    grace_period=10,
                    num_samples=10, 
                    cpus_per_trial=2,
                    gpus_per_trial=0)

# 4. Fit and Predict

In [13]:
def fit(dataloader, model, loss_fn, optimizer, print_loss=False):
    """ Fit neural network.
    
    Args:
        dataloader:
            pytorch DataLoader object.
        model:
            neural network, as pytorch object.
        loss_fn:
            loss function, as pytorch object.
        optimizer:
            optimizer function, as pytorch object.
        print_loss:
            print loss on every batch, as boolean (default False)
    """
    size = len(dataloader.dataset)
    model.train()  # put on train mode
    for batch, (X, Y) in enumerate(dataloader):
        # compute prediction
        pred = model(X)
        
        # compute loss
        loss = loss_fn(pred, Y)

        # reset the gradients
        optimizer.zero_grad()
        
        # backpropagate
        loss.backward()
        
        # update parameters
        optimizer.step()

        if print_loss and batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")

In [14]:
def predict(dataloader, model, loss_fn):
    """ Predict with neural network.
    
    Args:
        dataloader:
            pytorch DataLoader object.
        model:
            neural network, as pytorch object.
        loss_fn:
            loss function, as pytorch object.
    """
    size = len(dataloader.dataset)
    
    test_loss = 0
    correct = 0
    
    model.eval()  # put on evaluation mode
    with torch.no_grad():
        for X, Y in dataloader:
            pred = model(X)
            
            test_loss += loss_fn(pred, Y).item()
            
            correct += (pred.argmax(1) == Y).type(torch.float).sum().item()

    test_loss /= size
    correct /= size
    print(f"average loss: {test_loss:>8f} \naccuracy: {(100*correct):>0.1f}%\n")

# 5. Final Model

In [25]:
batch_size = 64

training_data, test_data = load_data(data_dir='cifar10')

train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

Files already downloaded and verified
Files already downloaded and verified


In [26]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

Using cpu device


In [27]:
num_layers = 5
input_size = 32 * 32
hidden_size = 100
num_labels = 10
p = 0.0

model = NeuralNetwork(num_layers=num_layers, 
                      input_size=input_size,
                      hidden_size=hidden_size,
                      output_size=num_labels,
                      p=p).to(device)
print(model)

learning_rate = 0.001

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

NeuralNetwork(
  (flatten): Flatten()
  (input_linear): Linear(in_features=1024, out_features=100, bias=True)
  (hidden_linears): ModuleList(
    (0): Linear(in_features=100, out_features=100, bias=True)
    (1): Linear(in_features=100, out_features=100, bias=True)
    (2): Linear(in_features=100, out_features=100, bias=True)
    (3): Linear(in_features=100, out_features=100, bias=True)
    (4): Linear(in_features=100, out_features=100, bias=True)
  )
  (output_linear): Linear(in_features=100, out_features=10, bias=True)
)


In [28]:
epochs = 90

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    
    fit(train_dataloader, model, loss_fn, optimizer, print_loss=False)
    
    print('\nTrain:\n-------')
    predict(train_dataloader, model, loss_fn)
    
    print('\nTest:\n-------')
    predict(test_dataloader, model, loss_fn)
    
    print(f"-------------------------------")

Epoch 1
-------------------------------

Train:
-------
average loss: 0.028085 
accuracy: 35.1%


Test:
-------
average loss: 0.028556 
accuracy: 34.4%

-------------------------------
Epoch 2
-------------------------------

Train:
-------
average loss: 0.026034 
accuracy: 39.7%


Test:
-------
average loss: 0.027090 
accuracy: 37.5%

-------------------------------
Epoch 3
-------------------------------

Train:
-------
average loss: 0.024617 
accuracy: 44.3%


Test:
-------
average loss: 0.026023 
accuracy: 40.8%

-------------------------------
Epoch 4
-------------------------------

Train:
-------
average loss: 0.023583 
accuracy: 46.7%


Test:
-------
average loss: 0.025769 
accuracy: 41.7%

-------------------------------
Epoch 5
-------------------------------

Train:
-------
average loss: 0.022567 
accuracy: 48.5%


Test:
-------
average loss: 0.025418 
accuracy: 42.8%

-------------------------------
Epoch 6
-------------------------------

Train:
-------
average loss: 0.021


Train:
-------
average loss: 0.010250 
accuracy: 77.2%


Test:
-------
average loss: 0.039635 
accuracy: 41.5%

-------------------------------
Epoch 46
-------------------------------

Train:
-------
average loss: 0.009492 
accuracy: 78.7%


Test:
-------
average loss: 0.040201 
accuracy: 42.2%

-------------------------------
Epoch 47
-------------------------------

Train:
-------
average loss: 0.009686 
accuracy: 78.1%


Test:
-------
average loss: 0.041213 
accuracy: 41.6%

-------------------------------
Epoch 48
-------------------------------

Train:
-------
average loss: 0.009308 
accuracy: 79.2%


Test:
-------
average loss: 0.042171 
accuracy: 41.2%

-------------------------------
Epoch 49
-------------------------------

Train:
-------
average loss: 0.009311 
accuracy: 79.0%


Test:
-------
average loss: 0.040421 
accuracy: 41.0%

-------------------------------
Epoch 50
-------------------------------

Train:
-------
average loss: 0.008958 
accuracy: 80.0%


Test:
------

average loss: 0.006408 
accuracy: 85.6%


Test:
-------
average loss: 0.058339 
accuracy: 40.6%

-------------------------------
Epoch 90
-------------------------------

Train:
-------
average loss: 0.006264 
accuracy: 85.9%


Test:
-------
average loss: 0.058146 
accuracy: 40.2%

-------------------------------
