In [1]:
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR


In [2]:
class CNN_MNIST(nn.Module):
    """
    Three-convolution CNN for classifying 28x28 single-channel MNIST digits.

    Layout: three un-padded 3x3 convolutions (1 -> 32 -> 64 -> 128 channels),
    each followed by ReLU, then a 2x2 max pool, dropout, and a three-layer
    fully-connected head that emits log-probabilities over 10 classes.
    """

    def __init__(self):
        super().__init__()

        # Feature extractor. Every un-padded 3x3 convolution trims two pixels
        # per spatial dimension: 28 -> 26 -> 24 -> 22.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1)

        # Regularisation: one dropout after pooling, one after each hidden
        # fully-connected layer.
        self.dropout1 = nn.Dropout(p=0.25)
        self.dropout2 = nn.Dropout(p=0.5)
        self.dropout3 = nn.Dropout(p=0.3)

        # Classifier head. The 2x2 pool halves 22x22 to 11x11, so the
        # flattened feature vector has 128 * 11 * 11 = 15488 entries.
        self.fc1 = nn.Linear(in_features=15488, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=128)
        self.fc3 = nn.Linear(in_features=128, out_features=10)  # one logit per digit

    def forward(self, x):
        """
        Map a batch of images to per-class log-probabilities.
        """
        # Convolutional stack: ReLU directly after every convolution.
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.relu(conv(x))

        # Downsample, regularise, then collapse everything but the batch axis.
        x = torch.flatten(self.dropout1(F.max_pool2d(x, 2)), 1)

        # Two hidden fully-connected layers, each ReLU -> dropout.
        x = self.dropout2(F.relu(self.fc1(x)))
        x = self.dropout3(F.relu(self.fc2(x)))

        # Output layer, normalised to log-probabilities across the class axis.
        return F.log_softmax(self.fc3(x), dim=1)

In [3]:
# Cell 3: Training and Testing Functions

def train(model, device, train_loader, optimizer, epoch, dry_run=False, log_interval=10):
    """
    Train the model for one epoch.

    Args:
        model: Module whose output is passed to ``F.nll_loss``, so it is
            expected to return log-probabilities.
        device: Device each batch is moved to before the forward pass.
        train_loader: Iterable of ``(data, target)`` batches. It must support
            ``len()`` and expose ``.dataset`` for the progress message.
        optimizer: Optimizer that updates the model's parameters.
        epoch: Epoch number; used only in the progress message.
        dry_run: If True, stop right after the first progress message.
        log_interval: Print a progress line every ``log_interval`` batches.

    Raises:
        ValueError: If ``log_interval`` is smaller than 1.
    """
    if log_interval < 1:
        raise ValueError("log_interval must be a positive integer")

    model.train()  # set model to training mode (enables dropout)

    # Number of samples consumed before the current batch. Tracked explicitly
    # because `batch_idx * len(data)` is wrong when the current batch is a
    # short final batch.
    samples_seen = 0

    for batch_idx, (data, target) in enumerate(train_loader):
        # Move data and target to the specified device (CPU or GPU)
        data, target = data.to(device), target.to(device)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass and negative log-likelihood loss
        output = model(data)
        loss = F.nll_loss(output, target)

        # Backpropagation and parameter update
        loss.backward()
        optimizer.step()

        # Periodic progress report
        if batch_idx % log_interval == 0:
            print(
                f"Train Epoch: {epoch} "
                f"[{samples_seen}/{len(train_loader.dataset)} "
                f"({100.0 * batch_idx / len(train_loader):.0f}%)]\t"
                f"Loss: {loss.item():.6f}"
            )

            # If dry_run is True, break after the first logging
            if dry_run:
                break

        samples_seen += len(data)

def test(model, device, test_loader):
    """
    Evaluate the model on the test dataset.
    """
    model.eval()   # set model to evaluation mode
    test_loss = 0
    correct = 0
    
    with torch.no_grad():  # no need to track gradients during evaluation
        for data, target in test_loader:
            # Move data and target to the specified device
            data, target = data.to(device), target.to(device)
            
            # Forward pass
            output = model(data)
            
            # Sum up batch loss
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            
            # Get the index of the max log-probability
            pred = output.argmax(dim=1, keepdim=True)
            
            # Count how many predictions match the target
            correct += pred.eq(target.view_as(pred)).sum().item()
    
    test_loss /= len(test_loader.dataset)
    accuracy = 100.0 * correct / len(test_loader.dataset)
    
    print(
        f"\nTest set: Average loss: {test_loss:.4f}, "
        f"Accuracy: {correct}/{len(test_loader.dataset)} "
        f"({accuracy:.0f}%)\n"
    )

In [4]:
# Cell 4: Main Execution (Data Loading, Model Training, and Testing)

def main(num_epochs=14, dry_run=False, seed=1):
    """
    Download MNIST, train ``CNN_MNIST`` and evaluate it after every epoch.

    Args:
        num_epochs: Number of train/test/scheduler-step rounds to run.
        dry_run: Forwarded to ``train``; if True each epoch stops after its
            first progress message (quick smoke test).
        seed: Seed passed to ``torch.manual_seed`` so weight initialisation,
            dropout masks and shuffling order repeat on a fresh kernel.
    """
    # Seed before the model and the loaders are created so a re-run
    # reproduces the same numbers.
    torch.manual_seed(seed)

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Define transformations for MNIST dataset
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))  # mean and std for MNIST
    ])

    # Configure the loaders. Training batches are reshuffled every epoch so
    # the optimizer does not see the samples in the same fixed order each
    # time; evaluation order does not matter, so the test set is not shuffled.
    train_kwargs = {'batch_size': 64, 'shuffle': True}
    test_kwargs = {'batch_size': 1000}

    # Download and create MNIST datasets
    dataset1 = datasets.MNIST('../data', train=True, download=True, transform=transform)
    dataset2 = datasets.MNIST('../data', train=False, transform=transform)

    # Create data loaders
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    # Initialize the model and move it to the selected device
    model = CNN_MNIST().to(device)

    # Adadelta optimizer; StepLR multiplies the learning rate by 0.7 after
    # every epoch.
    optimizer = optim.Adadelta(model.parameters(), lr=1.0)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

    # Train and test the model for the specified number of epochs
    for epoch in range(1, num_epochs + 1):
        train(model, device, train_loader, optimizer, epoch, dry_run)
        test(model, device, test_loader)
        scheduler.step()

# Notebook entry point: run the pipeline defined in main().
# Progress and per-epoch test metrics are printed as a side effect.
main()


Test set: Average loss: 0.0493, Accuracy: 9860/10000 (99%)


Test set: Average loss: 0.0415, Accuracy: 9873/10000 (99%)


Test set: Average loss: 0.0261, Accuracy: 9918/10000 (99%)


Test set: Average loss: 0.0254, Accuracy: 9920/10000 (99%)


Test set: Average loss: 0.0240, Accuracy: 9930/10000 (99%)


Test set: Average loss: 0.0240, Accuracy: 9930/10000 (99%)


Test set: Average loss: 0.0232, Accuracy: 9936/10000 (99%)


Test set: Average loss: 0.0219, Accuracy: 9936/10000 (99%)


Test set: Average loss: 0.0230, Accuracy: 9937/10000 (99%)


Test set: Average loss: 0.0222, Accuracy: 9936/10000 (99%)


Test set: Average loss: 0.0226, Accuracy: 9939/10000 (99%)


Test set: Average loss: 0.0230, Accuracy: 9937/10000 (99%)


Test set: Average loss: 0.0231, Accuracy: 9938/10000 (99%)


Test set: Average loss: 0.0232, Accuracy: 9940/10000 (99%)

