This notebook documents the steps taken to optimize the hyperparameters of the sparse-invariant convolutional neural network.

To start, all dependencies are installed and imported, and the model defined in the main notebook is adapted so that its hyperparameters are easier to change.

In [None]:
!pip install scikit-optimize
!pip install bayesian-optimization

In [None]:
import multiprocessing as mp
import random
import time
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from concurrent.futures import ProcessPoolExecutor

from skopt import gp_minimize
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args
from bayes_opt import BayesianOptimization
from bayes_opt.util import Colours

In [None]:
# The functions below are reused from the original sparse-invariant model

def same_padding(kernel_size, stride=1):
    """Return the two-element padding tuple used for the model's convolutions.

    The first entry is ``(kernel_size - stride) // 2``. The second entry equals
    the first for odd kernel sizes and is one smaller for even kernel sizes.
    """
    base = (kernel_size - stride) // 2
    # kernel_size % 2 - 1 evaluates to 0 for odd kernels and -1 for even ones.
    even_correction = kernel_size % 2 - 1
    return base, base + even_correction

def create_sparse_representations(batch, sparsity=0.5):
    """Randomly zero out entries of ``batch`` and return the result with its mask.

    Each entry is kept independently with probability ``1 - sparsity``.

    Returns:
        ``(sparse_batch, masks)`` where ``masks`` holds 1 for kept entries and
        0 for dropped ones, and ``sparse_batch`` is ``batch * masks``.
    """
    keep_probability = 1 - sparsity
    keep_probabilities = torch.ones_like(batch) * keep_probability
    masks = torch.bernoulli(keep_probabilities)
    return batch * masks, masks

def get_data(batch_size=64):
    """Build the MNIST training and test data loaders.

    The datasets are stored under ``mnist_data``; the training split is
    requested with ``download=True``.

    Args:
        batch_size: Number of samples per batch for both loaders.

    Returns:
        ``(train_loader, test_loader)``. The training loader shuffles its
        samples, the test loader does not.
    """
    to_tensor = transforms.ToTensor()
    train_dataset = datasets.MNIST(root='mnist_data', train=True, transform=to_tensor, download=True)
    test_dataset = datasets.MNIST(root='mnist_data', train=False, transform=to_tensor)
    # Use the requested batch size instead of a fixed 64, so that batch-size
    # hyperparameter searches actually change the loaders.
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, test_loader

def evaluate_accuracy(data_loader, sparcity, net):
    """Return the fraction of samples in ``data_loader`` that ``net`` classifies correctly.

    Every batch is sparsified with ``create_sparse_representations`` before it
    is passed to the network. The network is left in evaluation mode.

    Args:
        data_loader: Iterable of batches whose first two items are the images
            and the labels.
        sparcity: Sparsity level forwarded to ``create_sparse_representations``.
        net: Model called as ``net(sparse_images, masks)``.

    Returns:
        Accuracy as a float in ``[0, 1]``; ``0.0`` when the loader yields no samples.
    """
    # Evaluate on the device the model itself lives on, rather than depending
    # on a module-level ``device`` variable having been defined beforehand.
    first_param = next(net.parameters(), None)
    eval_device = first_param.device if first_param is not None else torch.device('cpu')

    net.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in data_loader:
            images, labels = data[0].to(eval_device), data[1].to(eval_device)

            # The sparse images and masks are derived from ``images`` and are
            # therefore already on ``eval_device``.
            sparse_images, masks = create_sparse_representations(images, sparsity=sparcity)
            outputs = net(sparse_images, masks)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    # Guard against an empty loader instead of dividing by zero.
    return correct / total if total else 0.0

In [None]:
# The classes below have been modified from the sparse-invariant model so that hyperparameters such as the number of layers can be changed efficiently

class SparseConv2d(nn.Module):
    """Mask-aware 2-D convolution.

    The convolution is applied to ``input * mask`` and each output value is
    rescaled by ``kernel_height * kernel_width / (valid_count + 1e-5)``, where
    ``valid_count`` is the sum of mask values under the kernel window. A
    per-output-channel bias is added afterwards.

    Args:
        in_channels, out_channels, kernel_size, stride, padding: Forwarded to
            the wrapped ``nn.Conv2d``.
        bias: Whether the wrapped ``nn.Conv2d`` gets its own bias (default off);
            the separate ``self.bias`` below is always added.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, bias=False):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=bias)
        # Registered as a parameter so that it follows the module across
        # devices (no hard-coded 'cuda'), is updated by the optimizer and is
        # included in the state dict.
        self.bias = nn.Parameter(torch.rand(out_channels))

    def forward(self, input, mask):
        """Return ``(output, new_mask)`` for the given input and validity mask."""
        # Summing the mask with an all-ones kernel counts the valid entries
        # that each output position sees.
        window_count = F.conv2d(mask, torch.ones_like(self.conv.weight), None,
                                self.conv.stride, self.conv.padding)
        valid_pixel_amount = window_count.detach()

        kernel_area = self.conv.kernel_size[0] * self.conv.kernel_size[1]
        output = self.conv(input * mask) * kernel_area
        # The small epsilon avoids division by zero where no input was valid.
        output = output.div(torch.add(valid_pixel_amount, 1e-5))
        output = output + self.bias.view(1, -1, 1, 1)

        # An output position is valid as soon as at least one input under the
        # window was valid.
        new_mask = torch.ceil(window_count.clamp(0, 1))
        return output, new_mask


class SparseInvariantCNN(nn.Module):
    """Stack of sparse convolutions whose depth is controlled by ``num_layers``.

    The network starts with an 11x11 convolution, adds ``num_layers - 3``
    intermediate convolutions whose kernel shrinks by two each time, and ends
    with a 3x3 and a 1x1 convolution producing 10 channels. Every convolution
    is followed by ``activation_fn()``.

    Args:
        num_layers: Total number of convolution layers; values below 3 yield
            the 3-layer minimum.
        activation_fn: Zero-argument callable returning an activation module.
    """

    def __init__(self, num_layers, activation_fn=nn.ReLU):
        super().__init__()
        kernel_size = 11
        # SparseConv2d (not nn.Conv2d) is required here: otherwise the mask
        # handed to forward() would never be used.
        layers = [SparseConv2d(1, 16, kernel_size=kernel_size, padding=same_padding(kernel_size)),
                  activation_fn()]

        for _ in range(num_layers - 3):
            # Shrink by two per layer, but never below 1x1 so that deeper
            # settings do not produce an invalid kernel size.
            kernel_size = max(kernel_size - 2, 1)
            layers += [SparseConv2d(16, 16, kernel_size=kernel_size, padding=same_padding(kernel_size)),
                       activation_fn()]

        layers += [SparseConv2d(16, 16, kernel_size=3, padding=same_padding(3)), activation_fn(),
                   SparseConv2d(16, 10, kernel_size=1, padding=same_padding(1)), activation_fn()]

        self.features = nn.Sequential(*layers)

    def forward(self, x, mask=None):
        """Run the network and return the output flattened to ``(batch, -1)``.

        When ``mask`` is omitted every input entry is treated as valid.
        """
        if mask is None:
            mask = torch.ones_like(x)
        for layer in self.features:
            if isinstance(layer, SparseConv2d):
                x, mask = layer(x, mask)
            else:
                x = layer(x)
        x = x.view(x.size(0), -1)
        return x

In [None]:
# Baseline run: train the original 8-layer configuration at a sparsity of 0.9
# and record the loss and the train/test accuracy after every epoch.

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_loader, test_loader = get_data(batch_size=64)

net = SparseInvariantCNN(num_layers=8, activation_fn=nn.ReLU).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

epochs = 25
train_losses, train_accs, test_accs = [], [], []

for epoch in range(epochs):
    net.train()  # evaluate_accuracy() switches to eval mode, so re-enable training each epoch
    sparcity = 0.9
    running_loss = 0.0
    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)
        # Derived from `images`, so both tensors are already on `device`.
        sparse_images, masks = create_sparse_representations(images, sparsity=sparcity)

        optimizer.zero_grad()
        loss = criterion(net(sparse_images, masks), labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Mean batch loss of this epoch.
    avg_train_loss = running_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Accuracies are stored as percentages.
    train_acc = 100 * evaluate_accuracy(train_loader, sparcity, net)
    test_acc = 100 * evaluate_accuracy(test_loader, sparcity, net)
    train_accs.append(train_acc)
    test_accs.append(test_acc)

    print(f'Epoch {epoch+1}/{epochs}')
    print(f'Loss: {avg_train_loss:.4f}, Training Accuracy: {train_acc:.2f}%, Test Accuracy: {test_acc:.2f}%')

print('Finished Training')


In [None]:
# The training procedure is wrapped in its own class so that it can be reused efficiently during hyperparameter optimization

class TrainModel:
    """Train a ``SparseInvariantCNN`` on sparsified MNIST for one hyperparameter setting.

    Args:
        num_epochs: Number of passes over the training set.
        num_layers: Depth forwarded to ``SparseInvariantCNN``.
        activation_fn: Activation class/factory forwarded to ``SparseInvariantCNN``.
        batch_size: Batch size requested from ``get_data``.
        learning_rate: Learning rate of the Adam optimizer.
        sparsity: Sparsity level used for both training and evaluation
            (defaults to 0.9).

    Attributes:
        train_accs: Per-epoch training accuracy (percent) of the latest ``train()`` call.
        test_accs: Per-epoch test accuracy (percent) of the latest ``train()`` call.
    """

    def __init__(self, num_epochs, num_layers, activation_fn, batch_size, learning_rate, sparsity=0.9):
        self.num_epochs = num_epochs
        self.num_layers = num_layers
        self.activation_fn = activation_fn
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.sparsity = sparsity
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Filled by train(); kept on the instance so the accuracy curves are
        # not lost once training has finished.
        self.train_accs = []
        self.test_accs = []

    def get_data(self):
        """Return ``(train_loader, test_loader)`` for this instance's batch size."""
        train_loader, test_loader = get_data(batch_size=self.batch_size)
        return train_loader, test_loader

    def train(self):
        """Train a fresh model and return the list of mean training losses per epoch.

        After every epoch the training and test accuracy are evaluated, printed
        and appended to ``self.train_accs`` / ``self.test_accs``.
        """
        train_loader, test_loader = self.get_data()
        net = SparseInvariantCNN(num_layers=self.num_layers, activation_fn=self.activation_fn).to(self.device)
        optimizer = optim.Adam(net.parameters(), lr=self.learning_rate)
        criterion = nn.CrossEntropyLoss()

        train_losses, train_accs, test_accs = [], [], []
        sparsity = self.sparsity

        for epoch in range(self.num_epochs):
            # evaluate_accuracy() leaves the network in eval mode.
            net.train()
            running_loss = 0.0
            for images, labels in train_loader:
                images, labels = images.to(self.device), labels.to(self.device)
                # Derived from `images`, so already on the right device.
                sparse_images, masks = create_sparse_representations(images, sparsity=sparsity)

                optimizer.zero_grad()
                outputs = net(sparse_images, masks)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()

            avg_train_loss = running_loss / len(train_loader)
            train_losses.append(avg_train_loss)

            train_acc = evaluate_accuracy(train_loader, sparsity, net) * 100
            train_accs.append(train_acc)
            test_acc = evaluate_accuracy(test_loader, sparsity, net) * 100
            test_accs.append(test_acc)

            print(f'Epoch {epoch+1}/{self.num_epochs}')
            print(f'Loss: {avg_train_loss:.4f}, Training Accuracy: {train_acc:.2f}%, Test Accuracy: {test_acc:.2f}%')

        print('Finished Training')
        self.train_accs, self.test_accs = train_accs, test_accs
        return train_losses

In [None]:
# Example usage: train the baseline configuration through the TrainModel wrapper
model_trainer = TrainModel(
    num_epochs=25,
    num_layers=8,
    activation_fn=nn.ReLU,
    batch_size=64,
    learning_rate=0.001,
)
train_losses = model_trainer.train()

In [None]:
# Attempting the use of multiprocessing to speed up the training process

def train_model_in_parallel(model_trainer):
    """Run ``model_trainer.train()`` and hand back its result.

    Exists as a module-level function so it can be passed to ``pool.map``.
    """
    training_result = model_trainer.train()
    return training_result

if __name__ == "__main__":
    # Worker processes are created with the 'forkserver' start method.
    ctx = mp.get_context('forkserver')

    # One single-epoch trainer per depth (8, 7 and 6 layers); all other
    # hyperparameters are shared.
    model_trainers = [
        TrainModel(num_epochs=1, num_layers=depth, activation_fn=nn.ReLU, batch_size=64, learning_rate=0.001)
        for depth in (8, 7, 6)
    ]

    # Two workers share the three training jobs; map() blocks until all are done.
    with ctx.Pool(processes=2) as pool:
        results = pool.map(train_model_in_parallel, model_trainers)

    for result in results:
        print(result)

In [None]:
# Random search: each iteration samples every hyperparameter at random from its list of options

def search(n_iter, num_epochs, num_layers_options, activation_fn_options, batch_size_options, learning_rate_options):
    """Random search over the given hyperparameter options.

    For each of the ``n_iter`` iterations one value per hyperparameter is drawn
    with ``random.choice``, a ``TrainModel`` is trained for ``num_epochs``
    epochs and the training loss of the last epoch is compared with the best
    one so far. Iterations that raise are reported and skipped.

    Returns:
        Dict describing the best configuration (including ``"final_loss"``),
        or an empty dict when no iteration finished successfully.
    """
    best_params = {}
    lowest_loss = float('inf')

    for i in range(n_iter):
        # Draw order is kept fixed so seeded runs sample the same configurations.
        num_layers = random.choice(num_layers_options)
        activation_fn = random.choice(activation_fn_options)
        batch_size = random.choice(batch_size_options)
        learning_rate = random.choice(learning_rate_options)

        print(f"Iteration {i+1}: num_layers={num_layers}, activation_fn={activation_fn.__name__}, "
              f"batch_size={batch_size}, learning_rate={learning_rate}")

        try:
            trainer = TrainModel(num_epochs=num_epochs,
                                 num_layers=num_layers,
                                 activation_fn=activation_fn,
                                 batch_size=batch_size,
                                 learning_rate=learning_rate)
            final_loss = trainer.train()[-1]
        except Exception as e:
            # Best effort: a failing configuration must not abort the search.
            print(f"An error occurred: {e}")
            continue

        # Written as a positive "<" test so a NaN loss never replaces the best.
        if final_loss < lowest_loss:
            lowest_loss = final_loss
            best_params = dict(
                num_layers=num_layers,
                activation_fn=activation_fn.__name__,
                batch_size=batch_size,
                learning_rate=learning_rate,
                final_loss=final_loss,
            )

    print(f"Best Parameters: {best_params}, Lowest Final Training Loss: {lowest_loss}")
    return best_params

In [None]:
# Step 1: broad random search (15 trials, 15 epochs each) over depth,
# activation, batch size and learning rate.
# NOTE(review): nothing in this cell writes log1.txt; presumably the printed
# output was saved there manually.

opt1 = search(
    n_iter=15,
    num_epochs=15,
    num_layers_options=[5, 6, 7, 8],
    activation_fn_options=[nn.ReLU, nn.LeakyReLU, nn.ELU],
    batch_size_options=[64, 128, 256],
    learning_rate_options=[0.001, 0.005, 0.01, 0.05, 0.1],
)

In [None]:
# Step 2: random search (15 trials, 25 epochs each) narrowed to 7-8 layers,
# ReLU/LeakyReLU, larger batches and smaller learning rates.
# NOTE(review): nothing in this cell writes log2.txt; presumably the printed
# output was saved there manually.

opt2 = search(
    n_iter=15,
    num_epochs=25,
    num_layers_options=[7, 8],
    activation_fn_options=[nn.ReLU, nn.LeakyReLU],
    batch_size_options=[256, 512, 1024],
    learning_rate_options=[0.0005, 0.001, 0.004, 0.007],
)

In [None]:
# Step 3: random search (15 trials, 25 epochs each) with very large batches
# (1024-4096) and learning rates between 0.00025 and 0.002.
# NOTE(review): nothing in this cell writes log3.txt; presumably the printed
# output was saved there manually.

opt3 = search(
    n_iter=15,
    num_epochs=25,
    num_layers_options=[7, 8],
    activation_fn_options=[nn.ReLU, nn.LeakyReLU],
    batch_size_options=[1024, 2048, 4096],
    learning_rate_options=[0.00025, 0.0005, 0.00075, 0.001, 0.002],
)

In [None]:
# Step 4: random search (25 trials, 25 epochs each) with small batches
# (32-256) and learning rates between 0.0005 and 0.003.
# NOTE(review): nothing in this cell writes log4.txt; presumably the printed
# output was saved there manually.

# Stored as opt4 so that the result of step 1 (opt1) is not overwritten.
opt4 = search(
    n_iter=25,
    num_epochs=25,
    num_layers_options=[7, 8],
    activation_fn_options=[nn.ReLU, nn.LeakyReLU],
    batch_size_options=[32, 64, 128, 256],
    learning_rate_options=[0.0005, 0.00075, 0.001, 0.0015, 0.003],
)


In [None]:
# Create model suitable for Bayesian optimization

# Number of objective evaluations so far; only used for progress output.
current_iteration = 0


def train_model_wrapper(num_layers, activation_fn_index, batch_size, learning_rate):
    """Objective for Bayesian optimization: train once and return the negated final loss.

    The optimizer proposes continuous values, so they are decoded first:
    ``num_layers``, ``batch_size`` and ``activation_fn_index`` are truncated
    with ``int()`` and the learning rate is rounded to five decimals. Because
    of the truncation, an integer choice ``k`` is only selected for proposals
    in ``[k, k + 1)``.

    Args:
        num_layers: Proposed network depth.
        activation_fn_index: Proposed index into ``[nn.ReLU, nn.LeakyReLU]``.
        batch_size: Proposed batch size.
        learning_rate: Proposed learning rate.

    Returns:
        The negative training loss of the last epoch (the optimizer
        maximizes). Failed or non-finite runs yield a finite penalty instead.
    """
    global current_iteration
    current_iteration += 1

    num_layers = int(num_layers)
    batch_size = int(batch_size)
    learning_rate = round(float(learning_rate), 5)
    activation_fn_index = int(activation_fn_index)
    activation_functions = [nn.ReLU, nn.LeakyReLU]
    activation_fn = activation_functions[activation_fn_index]

    print_statement = f"\nIteration {current_iteration}, Parameters: num_layers={num_layers}, activation_fn={activation_fn.__name__}, batch_size={batch_size}, learning_rate={learning_rate}"
    print(print_statement)

    # Finite stand-in loss for failed or diverged runs. The optimizer fits a
    # regression model to the returned values, which cannot handle inf/NaN.
    # NOTE(review): 10.0 is meant to be clearly worse than any useful
    # cross-entropy loss; adjust if typical losses are in that range.
    failed_run_loss = 10.0

    model = TrainModel(
        num_epochs=25,
        num_layers=num_layers,
        activation_fn=activation_fn,
        batch_size=batch_size,
        learning_rate=learning_rate
    )
    try:
        train_losses = model.train()
        final_loss = train_losses[-1]
    except Exception as e:
        print(f"An error occurred during training: {e}")
        final_loss = failed_run_loss
    else:
        if not np.isfinite(final_loss):
            print(f"Non-finite final loss ({final_loss}); using penalty value instead")
            final_loss = failed_run_loss
    return -final_loss  # Return negative loss for maximization


In [None]:
# Function for Bayesian optimization

def optimize_hyperparameters(pbounds, init_points=3, n_iter=25):
    """Maximize ``train_model_wrapper`` with Bayesian optimization.

    Args:
        pbounds: Mapping of parameter name to ``(lower, upper)`` bound.
        init_points: Passed to ``BayesianOptimization.maximize`` as ``init_points``.
        n_iter: Passed to ``BayesianOptimization.maximize`` as ``n_iter``.

    Returns:
        The raw (undecoded) parameter dict of the best evaluation.
    """
    optimizer = BayesianOptimization(f=train_model_wrapper, pbounds=pbounds, random_state=0)
    optimizer.maximize(init_points=init_points, n_iter=n_iter)

    best_params = optimizer.max['params']

    print("Best Parameters:")
    for param, value in best_params.items():
        # Show the values the way they are decoded for training: the index as
        # an activation name, integer-like parameters truncated to int.
        if param == 'activation_fn_index':
            shown = ['ReLU', 'LeakyReLU'][int(value)]
        elif param in ('batch_size', 'num_layers'):
            shown = int(value)
        else:
            shown = value
        print(f"{param}: {shown}")

    return best_params

In [None]:
# Step 5: Bayesian optimization over depth (7 or 8), activation function,
# batch size and learning rate.
# NOTE(review): nothing in this cell writes log5.txt; presumably the printed
# output was saved there manually.

pbounds5 = {
    # The proposed values are truncated with int() before use, so the upper
    # bounds of the integer-like parameters stop just below the next integer.
    # With (7, 8) and (0, 1) the top choice was only reachable at the exact bound.
    'num_layers': (7, 8.999),
    'activation_fn_index': (0, 1.999),
    'batch_size': (32, 128),
    'learning_rate': (0.0005, 0.00125),
}

opt5 = optimize_hyperparameters(pbounds5)

In [None]:
# Step 6: Bayesian optimization with the same search space as step 5.
# NOTE(review): nothing in this cell writes log6.txt; presumably the printed
# output was saved there manually.
# NOTE(review): optimize_hyperparameters always uses random_state=0, so with
# identical bounds this run likely starts from the same probe points as step 5.

pbounds6 = {
    # The proposed values are truncated with int() before use, so the upper
    # bounds of the integer-like parameters stop just below the next integer.
    # With (7, 8) and (0, 1) the top choice was only reachable at the exact bound.
    'num_layers': (7, 8.999),
    'activation_fn_index': (0, 1.999),
    'batch_size': (32, 128),
    'learning_rate': (0.0005, 0.00125),
}

opt6 = optimize_hyperparameters(pbounds6)

In [None]:
# Step 7: Bayesian optimization with the depth fixed to 8 layers.
# NOTE(review): nothing in this cell writes log7.txt; presumably the printed
# output was saved there manually.

pbounds7 = {
    'num_layers': (8, 8),
    # The proposed index is truncated with int() before use, so the upper
    # bound stops just below 2. With (0, 1) the second activation function
    # was only reachable at the exact bound.
    'activation_fn_index': (0, 1.999),
    'batch_size': (32, 128),
    'learning_rate': (0.0005, 0.00125),
}

opt7 = optimize_hyperparameters(pbounds7)