In [2]:
import torch

# Creating a multilayer perceptron with two hidden layers

class NeuralNetwork(torch.nn.Module):
  def __init__(self, num_inputs, num_outputs):
    super().__init__()
    self.layers = torch.nn.Sequential(
        # First hidden layer
        torch.nn.Linear(num_inputs, 30),
        torch.nn.ReLU(),

        # Second hidden layer
        torch.nn.Linear(30, 20),
        torch.nn.ReLU(),

        # Output layer
        torch.nn.Linear(20, num_outputs)
    )

  def forward(self, x):
    logits = self.layers(x)
    return logits


In [5]:
# Creating a small toy dataset
X_train = torch.tensor([
    [-1.2, 3.1],
    [-0.9, 2.9],
    [-0.5, 2.6],
    [2.3, -1.1],
    [2.7, -1.5]
])

y_train = torch.tensor([0, 0, 0, 1, 1])

x_test = torch.tensor([
    [-0.8, 2.8],
    [2.6, -1.6],])

y_test = torch.tensor([0, 1])

In [6]:
# Defining a custom Dataset class
from torch.utils.data import Dataset

class ToyDataset(Dataset):
  def __init__(self, X, y):
    self.features = X
    self.labels = y

  def __getitem__(self, index):
    one_x = self.features[index]
    one_y = self.labels[index]
    return one_x, one_y

  def __len__(self):
    return self.labels.shape[0]


train_ds = ToyDataset(X_train, y_train)
test_ds = ToyDataset(x_test, y_test)

In [7]:
# Instantiating data loaders
from torch.utils.data import DataLoader

torch.manual_seed(123)

train_loader = DataLoader(
    dataset=train_ds,
    batch_size=2,
    shuffle=True,
    num_workers=0)


test_loader = DataLoader(
    dataset=test_ds,
    batch_size=2,
    shuffle=False,
    num_workers=0
)

In [9]:
# Neural Network training in PyTorch
import torch.nn.functional as F

torch.manual_seed(123)

model = NeuralNetwork(num_inputs=2, num_outputs=2)
optimiser = torch.optim.SGD(
    model.parameters(), lr=0.5
)

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch_idx, (features, labels) in enumerate(train_loader):
        logits = model(features)

        loss = F.cross_entropy(logits, labels)

        optimiser.zero_grad()
        loss.backward()
        optimiser.step()   # The optimiser uses the gradients to update the model parameters making them slightly better for next time to minimise the loss. In the case of the SGD optimiser, this means multiplying the gradients with the learning rate and adding the scaled negative gradient to the parameters.

        # Logging
        print(f"Epoch: {epoch+1:03d}/{num_epochs:03d}"
              f" | Batch {batch_idx+1:03d}/{len(train_loader):03d}"
              f" | Train Loss: {loss:.2f}")

    # model.eval()

Epoch: 001/003 | Batch 001/003 | Train Loss: 0.75
Epoch: 001/003 | Batch 002/003 | Train Loss: 0.65
Epoch: 001/003 | Batch 003/003 | Train Loss: 0.42
Epoch: 002/003 | Batch 001/003 | Train Loss: 0.05
Epoch: 002/003 | Batch 002/003 | Train Loss: 0.13
Epoch: 002/003 | Batch 003/003 | Train Loss: 0.00
Epoch: 003/003 | Batch 001/003 | Train Loss: 0.01
Epoch: 003/003 | Batch 002/003 | Train Loss: 0.00
Epoch: 003/003 | Batch 003/003 | Train Loss: 0.02


After we have trained the model, we can use it to make predictions:

In [10]:
with torch.no_grad():
    outputs = model(X_train)
print(outputs)

tensor([[ 2.9320, -4.2563],
        [ 2.6045, -3.8389],
        [ 2.1484, -3.2514],
        [-2.1461,  2.1496],
        [-2.5004,  2.5210]])


To obtain the class membership probabilities, we can then use PyTorch’s `softmax` function:

In [11]:
torch.set_printoptions(sci_mode=False)
probas = torch.softmax(outputs, dim=1) # Computes softmax for each row independently.
print(probas)

tensor([[    0.9992,     0.0008],
        [    0.9984,     0.0016],
        [    0.9955,     0.0045],
        [    0.0134,     0.9866],
        [    0.0066,     0.9934]])


We can convert these values into class label predictions using PyTorch’s `argmax` function, which returns the index position of the highest value in each row if we set `dim=1` (setting dim=0 would return the highest value in each column instead):

In [12]:
predictions = torch.argmax(probas, dim=1)
print(predictions)

tensor([0, 0, 0, 1, 1])


We could also apply the argmax function to the logits (outputs) directly:

In [13]:
predictions = torch.argmax(outputs, dim=1)
print(predictions)

tensor([0, 0, 0, 1, 1])


Here, we computed the predicted labels for the training dataset. Since the training dataset is relatively small, we could compare it to the true training labels by eye and see that the model is 100% correct. We can double-check this using the `==` comparison operator:

In [14]:
predictions == y_train

tensor([True, True, True, True, True])

Using `torch.sum`, we can count the number of correct predictions:

In [15]:
torch.sum(predictions == y_train)

tensor(5)

In [16]:
type(predictions)

torch.Tensor

In [17]:
type(y_train)

torch.Tensor

Since the dataset consists of five training examples, we have five out of five predictions that are correct, which has 5/5 × 100% = 100% prediction accuracy. To generalise the computation of the prediction accuracy, let’s implement a compute_accuracy function, as shown in the following listing.

In [22]:
def compute_accuracy(model, dataloader): # `model` is the trained NN, `dataloader` provides batches of test/validation data
    model = model.eval() # Puts the model in evaluation mode (Disables dropout layers, Uses the running statistics in batch normalization)
    correct = 0
    total_examples = 0

    for idx, (features, labels) in enumerate(dataloader):
        with torch.no_grad():
            logits = model(features)

        predictions = torch.argmax(logits, dim=1)
        compare = labels == predictions # Creates a boolean tensor where: `True`--> prediction matches the label, `False`--> prediction was wrong
        print(f" compare is: {compare}")
        correct+= torch.sum(compare)
        total_examples += len(compare)

    return (correct / total_examples).item() # .item() converts from tensor to Python scalar

The code iterates over a data loader to compute the number and fraction of the correct predictions. When we work with large datasets, we typically can only call the model on a small part of the dataset due to memory limitations. The `compute_accuracy` function here is a general method that scales to datasets of arbitrary size since, in each iteration, the dataset chunk that the model receives is the same size as the batch size seen during training. The internals of the `compute_accuracy` function are similar to what we used before when we converted the logits to the class labels.

We can then apply the function to the training:

In [23]:
print(compute_accuracy(model, train_loader))

 compare is: tensor([True, True])
 compare is: tensor([True, True])
 compare is: tensor([True])
1.0


Similarly, we can apply the function to the test set:

In [24]:
print(compute_accuracy(model, test_loader))

 compare is: tensor([True, True])
1.0


### A.8 Saving and loading models

In [26]:
model

NeuralNetwork(
  (layers): Sequential(
    (0): Linear(in_features=2, out_features=30, bias=True)
    (1): ReLU()
    (2): Linear(in_features=30, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=2, bias=True)
  )
)

In [25]:
model.state_dict()

OrderedDict([('layers.0.weight',
              tensor([[-0.3094,  0.1056],
                      [-0.3556,  0.2842],
                      [-0.6042,  0.5255],
                      [-0.5140, -0.5622],
                      [-0.4625,  0.3818],
                      [-0.2798,  0.3371],
                      [-0.6001, -0.4290],
                      [-0.2596, -0.1390],
                      [-0.5326,  0.4366],
                      [-0.1789,  0.2748],
                      [ 0.5309,  0.0905],
                      [ 0.1360,  0.7413],
                      [-0.3296,  0.2661],
                      [-0.3307,  0.5679],
                      [ 0.6438, -0.6922],
                      [ 0.5157,  0.3847],
                      [ 0.2495,  0.3144],
                      [ 0.6622, -0.1331],
                      [-0.5989, -0.2006],
                      [-0.5107,  0.0958],
                      [-0.0131, -0.4395],
                      [-0.0753,  0.7126],
                      [ 0.0250, -0.0593],
 

In [27]:
model.state_dict().keys()

odict_keys(['layers.0.weight', 'layers.0.bias', 'layers.2.weight', 'layers.2.bias', 'layers.4.weight', 'layers.4.bias'])

In [33]:
from pathlib import Path
import subprocess
from loguru import logger


def git_codebase_root():
    try:
        root = subprocess.check_output(
            ["git", "rev-parse", "--show-toplevel"], stderr=subprocess.DEVNULL
        )
        return Path(root.decode().strip())
    except subprocess.CalledProcessError:
        logger.warning("Not inside a Git repository.")
        return None

def get_working_directory_or_git_root():
    git_root = git_codebase_root()
    return git_root if git_root is not None else Path.cwd()

root_dir = get_working_directory_or_git_root()
#print(root_dir)

In [35]:
torch.save(model.state_dict(), root_dir / "models/NN2x2.pth")

The model's `state_dict` is a Python dictionary object that maps each layer in the model to its trainable parameters (weights and biases).

`model.pth` is an arbitrary filename for the model file saved to disk. We can give it any name and file ending we like; however, `.pth` and `.pt` are the most common conventions.

Once we saved the model, we can restore it from disk:

In [36]:
model_restored = NeuralNetwork(2,2) # This line is not strictly necessary if you execute this code in the same session where you saved a model. However, I included it here to illustrate that we need an instance of the model in memory to apply the saved parameters. Here, the NeuralNetwork(2, 2) architecture needs to match the original saved model exactly.

In [37]:
model_restored.load_state_dict(torch.load(root_dir / "models/NN2x2.pth"))
# The `torch.load(root_dir / "models/NN2x2.pth")` function reads the file "NN2x2.pth" and reconstructs the Python dictionary object containing the model’s parameters while model.load_state_dict() applies these parameters to the model, effectively restoring its learned state from when we saved it.

  model_restored.load_state_dict(torch.load(root_dir / "models/NN2x2.pth"))


<All keys matched successfully>

In [38]:
model_restored

NeuralNetwork(
  (layers): Sequential(
    (0): Linear(in_features=2, out_features=30, bias=True)
    (1): ReLU()
    (2): Linear(in_features=30, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=2, bias=True)
  )
)