In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

import load_cifar10


In [3]:
# Build the three data loaders used by the rest of the notebook via the
# project-local `load_cifar10` helper.
# NOTE(review): presumably these are torch DataLoaders over disjoint
# train / validation / test splits of CIFAR-10 — confirm in load_cifar10.
train_loader, val_loader, test_loader = load_cifar10.DatasetandLoader()

Data loaded succesfully! as <class 'torch.Tensor'>
Training data shape: torch.Size([40000, 3, 32, 32])


In [4]:
class MLP(nn.Module):
    """Fully connected feed-forward network built from a list of hidden-layer widths.

    The network is ``Linear -> activation`` repeated once per entry of
    ``neuron_structure``, followed by a final ``Linear`` layer that produces
    ``num_classes`` raw scores (no softmax is applied).

    Args:
        input_size: Number of input features per sample.
        neuron_structure: Iterable of hidden-layer widths, one entry per hidden
            layer. An empty iterable yields a single linear layer.
        num_classes: Number of outputs of the final linear layer.
        activation: Either a zero-argument callable returning an ``nn.Module``
            (e.g. ``nn.ReLU``, ``nn.Tanh``) or an ``nn.Module`` instance
            (e.g. ``nn.Tanh()``). Defaults to ``nn.ReLU``, which reproduces the
            previous hard-coded behaviour.
    """

    def __init__(self, input_size, neuron_structure, num_classes, activation=nn.ReLU):
        super().__init__()
        import copy

        def make_activation():
            # Give every hidden layer its own activation module so that
            # stateful activations (e.g. nn.PReLU) do not share parameters.
            if isinstance(activation, nn.Module):
                return copy.deepcopy(activation)
            return activation()

        layers = []
        prev_size = input_size

        # Hidden layers: one Linear + activation pair per requested width.
        for neurons in neuron_structure:
            layers.append(nn.Linear(prev_size, neurons))
            layers.append(make_activation())
            prev_size = neurons

        # Output layer: maps the last hidden width (or the input) to num_classes.
        layers.append(nn.Linear(prev_size, num_classes))

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the final layer's output."""
        return self.network(x)
    


In [5]:
# Prefer the CUDA device when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


Using device: cuda


In [11]:
def train(model, train_loader, criterion, optimizer, device):
    """Run one pass over ``train_loader``, updating ``model`` after every batch.

    Args:
        model: Module to optimise; it is put into training mode.
        train_loader: Iterable yielding ``(images, labels)`` tensor batches.
            It does not need to support ``len()``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer whose ``zero_grad()`` / ``step()`` are called once
            per batch.
        device: Device the batches are moved to before the forward pass.

    Returns:
        float: Mean of the per-batch loss values (each batch counts equally,
        regardless of its size).

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    model.train()
    running_loss = 0.0
    num_batches = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        # Flatten every sample to a vector: (batch, ...) -> (batch, features).
        # reshape (not view) so non-contiguous batches, e.g. permuted tensors,
        # are handled instead of raising a RuntimeError.
        images = images.reshape(images.size(0), -1)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("train_loader yielded no batches")

    return running_loss / num_batches

def evaluate(model, test_loader, criterion, device):
    """Score ``model`` on ``test_loader`` without updating its parameters.

    Args:
        model: Module to evaluate; it is put into eval mode.
        test_loader: Iterable yielding ``(images, labels)`` tensor batches.
            It does not need to support ``len()``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        tuple[float, float]: ``(mean_batch_loss, accuracy)`` where
        ``mean_batch_loss`` is the mean of the per-batch loss values and
        ``accuracy`` is the percentage (0-100) of samples whose highest-scoring
        output index equals the label.

    Raises:
        ValueError: If ``test_loader`` yields no batches or no samples.
    """
    model.eval()
    correct = 0
    total = 0
    test_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)

            # Flatten every sample to a vector; reshape (not view) so
            # non-contiguous batches do not raise a RuntimeError.
            images = images.reshape(images.size(0), -1)

            outputs = model(images)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            num_batches += 1

            # Predicted class = index of the largest output per sample.
            predicted = outputs.argmax(dim=1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    if num_batches == 0 or total == 0:
        raise ValueError("test_loader yielded no samples")

    accuracy = 100 * correct / total
    return test_loss / num_batches, accuracy



# Search Space

In [10]:
from itertools import product

# Hyperparameter grid; every combination of the three axes is one candidate.
n_hidden_layers = [1, 2, 4]            # depth: number of hidden layers
n_neurons_x_layer = [50, 200, 1000]    # width: neurons in each hidden layer
learning_rate = [10**-exponent for exponent in (3, 4, 5)]  # optimizer step sizes

# Lookup table from activation name to a module instance.
activation_functions = dict(
    relu=nn.ReLU(),
    tanh=nn.Tanh(),
    sigmoid=nn.Sigmoid(),
)

# The activation is held fixed for this search; unknown names fall back to ReLU.
activation = 'relu'
if activation in activation_functions:
    activation_fn = activation_functions[activation]
else:
    activation_fn = nn.ReLU()

# Cartesian product in (depth, width, learning rate) order.
architectures = list(product(n_hidden_layers, n_neurons_x_layer, learning_rate))
print('Total architectures:', len(architectures))


Total architectures: 27


[(2, 200, 0.0001), (1, 1000, 0.001), (1, 1000, 1e-05), (1, 50, 0.0001), (2, 200, 0.001), (2, 1000, 0.001), (4, 1000, 1e-05), (2, 50, 0.0001), (4, 50, 0.0001), (2, 1000, 0.0001), (1, 200, 0.001), (4, 200, 1e-05), (4, 50, 1e-05), (2, 200, 1e-05), (1, 50, 1e-05), (4, 1000, 0.0001), (4, 50, 0.001), (1, 200, 0.0001), (1, 1000, 0.0001), (1, 200, 1e-05), (2, 50, 0.001), (4, 1000, 0.001), (2, 50, 1e-05), (2, 1000, 1e-05), (4, 200, 0.0001), (1, 50, 0.001), (4, 200, 0.001)]


In [21]:
import random

# Optimal-stopping search over the architecture grid:
#   1. Observation phase - train/evaluate the first third of the (shuffled)
#      candidates and only remember the best validation accuracy.
#   2. Selection phase   - accept the first later candidate that beats that
#      accuracy and stop; if none does, fall back to the best observed one.

SEARCH_SEED = 42  # Fixed so Restart & Run All reproduces the same search
num_epochs = 1    # Training epochs per candidate

# Shuffle a copy with a private RNG: `architectures` stays untouched, so
# re-running this cell always visits the candidates in the same order.
search_order = list(architectures)
random.Random(SEARCH_SEED).shuffle(search_order)
torch.manual_seed(SEARCH_SEED)  # Reproducible weight initialisation

results = []
best_score = 0
best_architecture = None
best_found_before_selection = None  # Best candidate seen during observation

search_configurations_len = len(search_order)
observation_phase = search_configurations_len // 3  # First 33% as observation phase

criterion = nn.CrossEntropyLoss()  # Stateless, so shared by all candidates

for i, (n_layers, neurons_per_layer, lr) in enumerate(search_order):
    print(f'Testing architecture {i+1}/{search_configurations_len}')
    print(f"Training MLP with {n_layers} layers, {neurons_per_layer} neurons per layer, LR={lr}")

    hidden_layers = [neurons_per_layer] * n_layers
    model = MLP(input_size=32*32*3, neuron_structure=hidden_layers, num_classes=10).to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr)

    for _ in range(num_epochs):
        train_loss = train(model, train_loader, criterion, optimizer, device)

    # Select on the validation loader so the test set never influences the
    # choice of architecture.
    # NOTE(review): assumes val_loader is a held-out validation split - confirm
    # in load_cifar10.
    val_loss, val_accuracy = evaluate(model, val_loader, criterion, device)
    # Test metrics are recorded for reporting only; they are not used below.
    test_loss, test_accuracy = evaluate(model, test_loader, criterion, device)

    results.append({
        'n_layers': n_layers,
        'neurons_per_layer': neurons_per_layer,
        'learning_rate': lr,
        'val_accuracy': val_accuracy,
        'test_accuracy': test_accuracy
    })

    if i < observation_phase:
        # Observation phase: track the best candidate seen so far.
        if val_accuracy > best_score:
            best_score = val_accuracy
            best_found_before_selection = (n_layers, neurons_per_layer, lr)
    elif val_accuracy > best_score:
        # Selection phase: take the first candidate that beats the benchmark.
        print(f"Optimal stopping triggered at {i+1}/{search_configurations_len}")
        best_score = val_accuracy
        best_architecture = (n_layers, neurons_per_layer, lr)
        break

# If no architecture was selected in phase 2, return to the best one from Phase 1
if best_architecture is None:
    best_architecture = best_found_before_selection
    print("No better architecture found in Phase 2, returning to best from Phase 1.")

print("Selected Architecture:", best_architecture)


Testing architecture 1/27
Training MLP with 4 layers, 50 neurons per layer, LR=1e-05
Testing architecture 2/27
Training MLP with 1 layers, 200 neurons per layer, LR=0.0001
Testing architecture 3/27
Training MLP with 1 layers, 1000 neurons per layer, LR=0.001
Testing architecture 4/27
Training MLP with 4 layers, 1000 neurons per layer, LR=0.0001
Testing architecture 5/27
Training MLP with 1 layers, 1000 neurons per layer, LR=1e-05
Testing architecture 6/27
Training MLP with 1 layers, 50 neurons per layer, LR=1e-05
Testing architecture 7/27
Training MLP with 4 layers, 1000 neurons per layer, LR=1e-05
Testing architecture 8/27
Training MLP with 2 layers, 200 neurons per layer, LR=1e-05
Testing architecture 9/27
Training MLP with 4 layers, 50 neurons per layer, LR=0.001
Testing architecture 10/27
Training MLP with 4 layers, 1000 neurons per layer, LR=0.001
Testing architecture 11/27
Training MLP with 4 layers, 200 neurons per layer, LR=1e-05
Testing architecture 12/27
Training MLP with 1 l