In [3]:
# Library import
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision

In [4]:
# Constants definition
batch_size = 256
epochs = 200
num_classes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [25]:
# Creating dataloaders
data_transforms = torchvision.transforms.Compose([
    torchvision.transforms.RandomCrop(28, padding=4),
    torchvision.transforms.RandomAffine(degrees=5, translate=(0.10, 0.15), scale=(0.9, 1.1)),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(0.5, 0.5)
])

test_transforms = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.5), (0.5))
])

train_set = torchvision.datasets.MNIST('.data/', train=True, download=True, transform=data_transforms)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)

test_set = torchvision.datasets.MNIST('.data/', train=False, download=True, transform=test_transforms)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=True)

In [26]:
# Statistics printing
x_batch, y_batch = iter(train_loader).next()
print("Training set: {} samples - Max value: {} - Min value: {}".format(len(train_loader.dataset),
                                                                        x_batch.max(), x_batch.min()))
x_batch, y_batch = iter(test_loader).next()
print("Test set: {} samples - Max value: {} - Min value: {}".format(len(test_loader.dataset),
                                                                    x_batch.max(), x_batch.min()))
print("Example batch shape: {}".format(x_batch.shape))

Training set: 60000 samples - Max value: 1.0 - Min value: -1.0
Test set: 10000 samples - Max value: 1.0 - Min value: -1.0
Example batch shape: torch.Size([256, 1, 28, 28])


In [32]:
# There are no GaussianNoise Layer in Pytorch
# https://discuss.pytorch.org/t/writing-a-simple-gaussian-noise-layer-in-pytorch/4694/4
class GaussianNoise(nn.Module):
    """Gaussian noise regularizer.
    Args:
        sigma (float, optional): relative standard deviation used to generate the
            noise. Relative means that it will be multiplied by the magnitude of
            the value your are adding the noise to. This means that sigma can be
            the same regardless of the scale of the vector.
        is_relative_detach (bool, optional): whether to detach the variable before
            computing the scale of the noise. If `False` then the scale of the noise
            won't be seen as a constant but something to optimize: this will bias the
            network to generate vectors with smaller values.
    """

    def __init__(self, sigma=0.1, is_relative_detach=True):
        super().__init__()
        self.sigma = sigma
        self.is_relative_detach = is_relative_detach
        self.noise = torch.tensor(0).to(device).float()

    def forward(self, x):
        if self.training and self.sigma != 0:
            scale = self.sigma * x.detach() if self.is_relative_detach else self.sigma * x
            sampled_noise = self.noise.repeat(*x.size()).normal_() * scale
            x = x + sampled_noise
        return x

# Creating our Neural Network - Fully Connected
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.gn0 = GaussianNoise(0.1)
        
        self.linear1 = nn.Linear(784, 1024)
        self.norm1 = nn.BatchNorm1d(1024)
        self.gn1 = GaussianNoise(0.1)
        self.relu1 = nn.ReLU()
        
        self.linear2 = nn.Linear(1024, 1024)
        self.norm2 = nn.BatchNorm1d(1024)
        self.gn2 = GaussianNoise(0.1)
        self.relu2 = nn.ReLU()
        
        '''self.linear3 = nn.Linear(1024, 1024)
        self.norm3 = nn.BatchNorm1d(1024)
        self.gn3 = GaussianNoise(0.1)
        self.relu3 = nn.ReLU()'''
        
        self.classifier = nn.Linear(1024, num_classes)
        #self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        out = self.gn0(x)
        out = self.relu1(self.gn1(self.norm1(self.linear1(out))))
        out = self.relu2(self.gn2(self.norm2(self.linear2(out))))
        #out = self.relu3(self.gn3(self.norm3(self.linear3(out))))
        out = self.classifier(out)
        
        return out

In [33]:
# Instantiating the network and printing its architecture
net = Net().to(device)
print(net)

Net(
  (gn0): GaussianNoise()
  (linear1): Linear(in_features=784, out_features=1024, bias=True)
  (norm1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (gn1): GaussianNoise()
  (relu1): ReLU()
  (linear2): Linear(in_features=1024, out_features=1024, bias=True)
  (norm2): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (gn2): GaussianNoise()
  (relu2): ReLU()
  (classifier): Linear(in_features=1024, out_features=10, bias=True)
)


In [34]:
# Training hyperparameters
criterion = nn.CrossEntropyLoss()
#optimizer = optim.Adam(net.parameters(), lr=0.1, weight_decay=1e-6)
optimizer = optim.SGD(net.parameters(), lr=0.1, weight_decay=1e-6)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5)

In [35]:
epoch = 0

In [36]:
# Start training
print("\n---- Start Training ----")
best_accuracy = -1
for i in range(epochs):

    # TRAIN THE NETWORK
    train_loss, train_correct = 0, 0
    net.train()
    for inputs, targets in train_loader:
        # data is a list of [inputs, labels]
        inputs, targets = inputs.to(device), targets.to(device)
        # care! net expect a 784 size vector and our dataset provide 1x28x28 (channels, height, width) -> Reshape!
        inputs = inputs.view(inputs.size(0), -1)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        _, pred = outputs.max(1)  # get the index of the max log-probability
        train_correct += pred.eq(targets).sum().item()

        # print statistics
        train_loss += loss.item()

    train_loss /= len(train_loader.dataset)

    # TEST NETWORK
    net.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            # care! net expect a 784 size vector and our dataset provide 1x28x28 (channels, height, width) -> Reshape!
            inputs = inputs.view(inputs.size(0), -1)
            outputs = net(inputs)
            test_loss += criterion(outputs, targets)
            _, pred = outputs.max(1)  # get the index of the max log-probability
            correct += pred.eq(targets).sum().item()

    test_loss /= len(test_loader.dataset)
    test_accuracy = 100. * correct / len(test_loader.dataset)
    
    # Get current learning rate via the optimizer
    for param_group in optimizer.param_groups:
        current_lr = param_group['lr']
    
    print("[Epoch {}] LR: {:.4f} - Train Loss: {:.6f} - Test Loss: {:.6f} - Train Accuracy: {:.2f}% - Test Accuracy: {:.2f}%".format(
            epoch + 1, current_lr, train_loss, test_loss, 100. * train_correct / len(train_loader.dataset), test_accuracy
        ))

    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        
    scheduler.step(test_loss)
    
    epoch += 1

print("Finished Training")
print("Best Test accuracy: {:.2f}%".format(best_accuracy))


---- Start Training ----
[Epoch 1] LR: 0.1000 - Train Loss: 0.003335 - Test Loss: 0.001002 - Train Accuracy: 73.15% - Test Accuracy: 92.74%
[Epoch 2] LR: 0.1000 - Train Loss: 0.001596 - Test Loss: 0.000999 - Train Accuracy: 87.53% - Test Accuracy: 92.00%
[Epoch 3] LR: 0.1000 - Train Loss: 0.001279 - Test Loss: 0.000903 - Train Accuracy: 89.94% - Test Accuracy: 92.81%
[Epoch 4] LR: 0.1000 - Train Loss: 0.001087 - Test Loss: 0.000492 - Train Accuracy: 91.39% - Test Accuracy: 96.18%
[Epoch 5] LR: 0.1000 - Train Loss: 0.000986 - Test Loss: 0.000405 - Train Accuracy: 92.21% - Test Accuracy: 96.63%
[Epoch 6] LR: 0.1000 - Train Loss: 0.000907 - Test Loss: 0.000505 - Train Accuracy: 92.79% - Test Accuracy: 95.82%
[Epoch 7] LR: 0.1000 - Train Loss: 0.000840 - Test Loss: 0.000441 - Train Accuracy: 93.46% - Test Accuracy: 96.59%
[Epoch 8] LR: 0.1000 - Train Loss: 0.000799 - Test Loss: 0.000382 - Train Accuracy: 93.64% - Test Accuracy: 96.91%
[Epoch 9] LR: 0.1000 - Train Loss: 0.000760 - Test Los

[Epoch 72] LR: 0.0010 - Train Loss: 0.000264 - Test Loss: 0.000098 - Train Accuracy: 97.89% - Test Accuracy: 99.17%
[Epoch 73] LR: 0.0010 - Train Loss: 0.000263 - Test Loss: 0.000099 - Train Accuracy: 97.90% - Test Accuracy: 99.15%
[Epoch 74] LR: 0.0010 - Train Loss: 0.000260 - Test Loss: 0.000098 - Train Accuracy: 97.89% - Test Accuracy: 99.10%
[Epoch 75] LR: 0.0010 - Train Loss: 0.000263 - Test Loss: 0.000099 - Train Accuracy: 97.93% - Test Accuracy: 99.20%
[Epoch 76] LR: 0.0010 - Train Loss: 0.000262 - Test Loss: 0.000098 - Train Accuracy: 97.89% - Test Accuracy: 99.19%
[Epoch 77] LR: 0.0010 - Train Loss: 0.000261 - Test Loss: 0.000098 - Train Accuracy: 97.91% - Test Accuracy: 99.15%
[Epoch 78] LR: 0.0010 - Train Loss: 0.000252 - Test Loss: 0.000100 - Train Accuracy: 98.00% - Test Accuracy: 99.13%
[Epoch 79] LR: 0.0010 - Train Loss: 0.000264 - Test Loss: 0.000098 - Train Accuracy: 97.92% - Test Accuracy: 99.20%
[Epoch 80] LR: 0.0010 - Train Loss: 0.000254 - Test Loss: 0.000099 - Tra

KeyboardInterrupt: 

In [37]:
torch.save(net.state_dict(), "./models/mnist_mlp_param_state.pt")
torch.save(optimizer.state_dict(), "./models/mnist_mlp_optim_state.pt")