Konvolusjonelle lag bruker en "sliding window"-mekanisme hvor de samme vektene multipliseres med forskjellige deler av et bilde. Siden vektene deles på tvers av hele bildet, trengs det vanligvis langt færre parametre enn i et tilsvarende fullt koblet lag.

https://ezyang.github.io/convolution-visualizer/

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# Choose the compute device once: "cuda" when a CUDA GPU is usable, otherwise "cpu".
cuda_available = torch.cuda.is_available()
device = "cuda" if cuda_available else "cpu"

In [2]:
from torchvision.datasets import MNIST
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

# Preprocessing steps applied, in order, to every image that is loaded
image_processing = transforms.Compose(
    [
        transforms.ToTensor(),  # Convert the image into a torch.Tensor
        transforms.Normalize(mean=(0.5,), std=(0.5,)),  # Rescale so pixel values range in [-1, 1]
    ]
)

# MNIST is fetched into ../data when it is not already there
train_dataset = MNIST(
    root='../data',
    train=True,
    download=True,
    transform=image_processing,
)
# Held-out split, only used for the evaluation further down
test_dataset = MNIST(
    root='../data',
    train=False,
    download=True,
    transform=image_processing,
)

# Iterable that yields shuffled mini-batches of 32 samples each iteration
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Pull a single batch so the tensors can be inspected
batch_iterator = iter(train_loader)
data, labels = next(batch_iterator)

In [4]:
torch.manual_seed(1)  # Fix the random seed so the run is reproducible


class CNNModel(nn.Module):
    """Small convolutional classifier for 1x28x28 images with 10 classes.

    Three identical stages (3x3 convolution -> ReLU -> 2x2 max-pool) are
    followed by a flatten and one linear layer. Shapes through the network:

        1x28x28 -> 16x28x28 -> 16x14x14
                -> 32x14x14 -> 32x7x7
                -> 64x7x7   -> 64x3x3
                -> 576      -> 10
    """

    def __init__(self):
        super().__init__()
        stages = []
        in_channels = 1
        # Layers are created in network order so that the indices inside the
        # Sequential container stay 0..10.
        for out_channels in (16, 32, 64):
            stages.append(
                # padding=1 with a 3x3 kernel keeps the spatial size unchanged
                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
            )
            stages.append(nn.ReLU())
            # Halves height and width, rounding down (7 -> 3 in the last stage)
            stages.append(nn.MaxPool2d(kernel_size=2, stride=2))
            in_channels = out_channels
        stages.append(nn.Flatten())  # 64x3x3 -> 576
        stages.append(nn.Linear(576, 10))  # 576 -> 10
        self.model = nn.Sequential(*stages)

    def logits(self, data):
        """Return the unnormalised class scores for a batch of images."""
        scores = self.model(data)
        return scores

    def forward(self, data):
        """Return class probabilities: softmax over the class dimension of the logits."""
        return F.softmax(self.logits(data), dim=1)
    
model = CNNModel() # Build the network
model.to(device) # Move the model to the selected device ("cuda" if available, else "cpu"); as the last expression in the cell it also displays the module structure

CNNModel(
  (model): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (9): Flatten(start_dim=1, end_dim=-1)
    (10): Linear(in_features=576, out_features=10, bias=True)
  )
)

In [5]:
# Add up the element counts of every parameter tensor registered on the model
total_parameters = sum(param.numel() for param in model.parameters())
print(f"Number of parameters: {total_parameters}")

Number of parameters: 29066


In [57]:
# Adam updates every parameter of the model with a learning rate of 1e-3
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
# Cross-entropy between the model's logits and the integer class labels
loss_fn = nn.CrossEntropyLoss()

In [58]:
from tqdm import tqdm # Progress bar

num_epochs = 3 # Number of full passes over the training set

model.train() # Make sure the model is in training mode before optimising
for epoch in range(num_epochs):
    with tqdm(train_loader, unit="batch") as pbar:
        pbar.set_description(f"Epoch {epoch}")
        for data, labels in pbar:
            data = data.to(device)
            labels = labels.to(device)

            # Clear gradients BEFORE the backward pass, so gradients left over
            # from an earlier (e.g. interrupted) run can never leak into this update
            optimizer.zero_grad()

            # The loss is computed on raw logits, not on the softmax output of forward()
            pred = model.logits(data) # Query model for predictions
            loss = loss_fn(pred, labels)

            loss.backward() # Propagate the computational graph and calculate gradients
            optimizer.step() # Uses the calculated gradients on the registered parameters to perform an update

            # Hint: to show the running loss in the progress bar, call
            # pbar.set_postfix(loss=loss.item()) every few batches.
    

Epoch 0: 100%|██████████| 1875/1875 [00:08<00:00, 234.14batch/s]
Epoch 1: 100%|██████████| 1875/1875 [00:07<00:00, 236.47batch/s]
Epoch 2: 100%|██████████| 1875/1875 [00:07<00:00, 242.38batch/s]


In [59]:
# Sample order does not matter when only counting correct predictions, so no shuffling
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)

model.eval() # Switch to evaluation mode
correct = 0
# No gradients are needed for evaluation; disabling autograd avoids building
# a computational graph for every batch
with torch.no_grad():
    for data, labels in test_loader:
        data = data.to(device)
        labels = labels.to(device)

        pred = model(data) # Call the module itself rather than .forward() directly
        # The predicted class is the one with the highest probability
        correct += torch.sum(pred.argmax(dim=1) == labels)
accuracy = correct/len(test_dataset) # Fraction of correctly classified test samples

print(f"The accuracy of the model on the test set is {accuracy}")

The accuracy of the model on the test set is 0.9902999997138977


Med en arkitektur som tar i bruk konvolusjonelle lag, klarer modellen å strekke seg over 99% nøyaktighet på testsettet.