# About
This is the step-by-step version of `./src/mybaseline-all-in-one.py`

### Models
Model `CNNMnistWyBn` adds batch normalization (`nn.BatchNorm2d`) after each convolutional layer (`nn.Conv2d`), which leads to better performance in baseline training on MNIST. However, in a test run under the non-IID case {E=5, B=10, C=0.0}, its validation performance was lower than that of the original model `CNNMnistWy`.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data

import torchvision
from torchvision import transforms
from torchvision import datasets
import time

##### Use tensorboard writer to visualize the training results

In [2]:
from torch.utils.tensorboard import SummaryWriter

#### Option classes and helper function

In [3]:
class TaskMnist():
    """Option holder describing the MNIST training task.

    Args:
        nn: name of the network to train on MNIST (stored as ``self.nn``).
    """
    def __init__(self, nn='cnn'):
        # Raw string: the backslashes are literal (Windows-style) path
        # separators. Without the r-prefix, '\d' and '\m' are invalid escape
        # sequences, which recent Python versions warn about.
        self.path = r'..\data\mnist'  # root directory of the MNIST data
        self.name = 'mnist'           # dataset identifier
        self.nn = nn                  # network identifier
        
class TaskCifar():
    """Option holder describing the CIFAR10 training task.

    Args:
        nn: name of the network to train on CIFAR10 (stored as ``self.nn``).
    """
    def __init__(self, nn='torch'):
        # Raw string: the backslashes are literal (Windows-style) path
        # separators. Without the r-prefix, '\d' and '\c' are invalid escape
        # sequences, which recent Python versions warn about.
        self.path = r'..\data\cifar'  # root directory of the CIFAR10 data
        self.name = 'cifar10'         # dataset identifier
        self.nn = nn                  # network identifier

class HyperParam():
    """Container for the training hyper-parameters.

    Args:
        path: root directory of the dataset (stored as ``datapath``).
        learning_rate: optimizer learning rate (stored as ``lr``).
        batch_size: training mini-batch size (stored as ``bs``).
        epoch: number of training epochs.
        momentum: momentum value kept for the optimizer.
        nesterov: flag kept for the optimizer (Nesterov momentum on/off).
    """
    def __init__(self, path, learning_rate=0.1, batch_size=100, epoch=10, momentum=0.9, nesterov=False):
        # pick the GPU when one is available, otherwise stay on the CPU
        if torch.cuda.is_available():
            self.device = 'cuda'
        else:
            self.device = 'cpu'

        self.datapath = path
        self.lr = learning_rate
        self.bs = batch_size
        self.epoch = epoch
        self.momentum = momentum
        self.nesterov = nesterov
    
# the function used to count the number of trainable parameters
def get_count_params(model):
    """Return the total number of elements in the trainable parameters of `model`.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

#### Define models

In [4]:
# the 2NN model described in the vanilla FL paper for experiments with MNIST
class TwoNN(nn.Module):
    """Fully connected network with two hidden layers of 100 units each.

    The input is flattened to 28*28 = 784 features and mapped to 10
    log-probabilities.
    """
    def __init__(self):
        super().__init__()
        hidden = 100
        layers = [
            nn.Linear(28 * 28, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 10),
        ]
        self.nn_layer = nn.Sequential(*layers)

    def forward(self, x):
        # flatten every sample into a 784-dimensional vector
        flat = x.view(-1, 28 * 28)
        scores = self.nn_layer(flat)
        return F.log_softmax(scores, dim=1)
                 
# the 2NN model in AshwinRJ's repository
class MLP(nn.Module):
    """Single-hidden-layer perceptron: 784 -> 64 -> 10 with dropout.

    Dropout is applied to the hidden pre-activations, i.e. before the ReLU.
    """
    def __init__(self):
        super().__init__()
        self.layer_input = nn.Linear(28 * 28, 64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout()
        self.layer_hidden = nn.Linear(64, 10)

    def forward(self, x):
        # flatten (channels, height, width) of every sample into one vector
        num_features = x.shape[1] * x.shape[-2] * x.shape[-1]
        flat = x.view(-1, num_features)
        hidden = self.relu(self.dropout(self.layer_input(flat)))
        scores = self.layer_hidden(hidden)
        return F.log_softmax(scores, dim=1)

# the CNN model described in the vanilla FL paper for experiments with MNIST
class CNNMnistWy(nn.Module):
    """CNN with two 5x5 convolutions (32 and 64 channels) and a 512-unit dense layer.

    Each convolution is followed by ReLU and 2x2 max pooling. The forward
    pass reshapes the convolutional output to 1024 features per sample
    (64 * 4 * 4 for a 1x28x28 input) and returns 10 log-probabilities.
    """
    def __init__(self):
        super().__init__()
        feature_extractor = [
            nn.Conv2d(1, 32, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        classifier = [
            nn.Linear(64 * 4 * 4, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        ]
        self.conv_layer = nn.Sequential(*feature_extractor)
        self.fc_layer = nn.Sequential(*classifier)

    def forward(self, x):
        features = self.conv_layer(x)
        # 64 * 4 * 4 = 1024 features per sample
        flat = features.view(-1, 64 * 4 * 4)
        scores = self.fc_layer(flat)
        return F.log_softmax(scores, dim=1)

# Same as CNNMnistWy, with batch normalization after each convolution
class CNNMnistWyBn(nn.Module):
    """Variant of the two-convolution MNIST CNN with batch normalization.

    An `nn.BatchNorm2d` layer sits between each convolution and its ReLU.
    The forward pass reshapes the convolutional output to 1024 features per
    sample (64 * 4 * 4 for a 1x28x28 input) and returns 10 log-probabilities.
    """
    def __init__(self):
        super().__init__()
        feature_extractor = [
            nn.Conv2d(1, 32, kernel_size=5),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=5),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        classifier = [
            nn.Linear(64 * 4 * 4, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        ]
        self.conv_layer = nn.Sequential(*feature_extractor)
        self.fc_layer = nn.Sequential(*classifier)

    def forward(self, x):
        features = self.conv_layer(x)
        # 64 * 4 * 4 = 1024 features per sample
        flat = features.view(-1, 64 * 4 * 4)
        scores = self.fc_layer(flat)
        return F.log_softmax(scores, dim=1)
    
# Same as CNNMnistWy, with an additional dropout layer after the second convolution
class CNNMnistWy2(nn.Module):
    """Variant of the two-convolution MNIST CNN with dropout.

    An `nn.Dropout` layer sits between the second convolution and its ReLU.
    The forward pass reshapes the convolutional output to 1024 features per
    sample (64 * 4 * 4 for a 1x28x28 input) and returns 10 log-probabilities.
    """
    def __init__(self):
        super().__init__()
        feature_extractor = [
            nn.Conv2d(1, 32, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=5),
            nn.Dropout(),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        classifier = [
            nn.Linear(64 * 4 * 4, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        ]
        self.conv_layer = nn.Sequential(*feature_extractor)
        self.fc_layer = nn.Sequential(*classifier)

    def forward(self, x):
        features = self.conv_layer(x)
        # 64 * 4 * 4 = 1024 features per sample
        flat = features.view(-1, 64 * 4 * 4)
        scores = self.fc_layer(flat)
        return F.log_softmax(scores, dim=1)

# the CNN model in AshwinRJ's repository
class CNNMnist(nn.Module):
    """Small CNN: two 5x5 convolutions (10 and 20 channels) and a 50-unit dense layer.

    Channel-wise dropout follows the second convolution, and element-wise
    dropout follows the first dense layer (active in training mode only).
    The output is 10 log-probabilities.
    """
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        # first stage: conv -> 2x2 max pool -> ReLU
        stage1 = self.conv1(x)
        stage1 = F.relu(F.max_pool2d(stage1, 2))

        # second stage: conv -> channel dropout -> 2x2 max pool -> ReLU
        stage2 = self.conv2_drop(self.conv2(stage1))
        stage2 = F.relu(F.max_pool2d(stage2, 2))

        # flatten (channels, height, width) into one feature vector
        num_features = stage2.shape[1] * stage2.shape[2] * stage2.shape[3]
        flat = stage2.view(-1, num_features)

        hidden = F.relu(self.fc1(flat))
        hidden = F.dropout(hidden, training=self.training)
        scores = self.fc2(hidden)
        return F.log_softmax(scores, dim=1)

# the example model used in the official CNN training tutorial of PyTorch using CIFAR10
# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
class CNNCifarTorch(nn.Module):
    """CNN with two 5x5 convolutions (6 and 16 channels) and dense layers 120 -> 84 -> 10.

    Each convolution is followed by ReLU and 2x2 max pooling. The forward
    pass reshapes the convolutional output to 16*5*5 = 400 features per
    sample and returns 10 log-probabilities.
    """
    def __init__(self):
        super().__init__()
        feature_extractor = [
            nn.Conv2d(3, 6, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(6, 16, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        classifier = [
            nn.Linear(16 * 5 * 5, 120),
            nn.ReLU(),
            nn.Linear(120, 84),
            nn.ReLU(),
            nn.Linear(84, 10),
        ]
        self.conv_layer = nn.Sequential(*feature_extractor)
        self.fc_layer = nn.Sequential(*classifier)

    def forward(self, x):
        features = self.conv_layer(x)
        flat = features.view(-1, 16 * 5 * 5)
        scores = self.fc_layer(flat)
        return F.log_softmax(scores, dim=1)

# the example model used in the official CNN tutorial of TensorFlow using CIFAR10
# https://www.tensorflow.org/tutorials/images/cnn
class CNNCifarTf(nn.Module):
    """CNN with three 3x3 convolutions (32, 64, 64 channels) and a 64-unit dense layer.

    The first two convolutions are followed by ReLU and 2x2 max pooling, the
    third by ReLU only. The forward pass reshapes the convolutional output
    to 1024 features per sample (64 * 4 * 4 for a 3x32x32 input) and returns
    10 log-probabilities.
    """
    def __init__(self):
        super().__init__()
        # the shapes in the comments are (channels, height, width) for a 3x32x32 input
        feature_extractor = [
            nn.Conv2d(3, 32, kernel_size=3),         # -> (32, 30, 30)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),   # -> (32, 15, 15)
            nn.Conv2d(32, 64, kernel_size=3),        # -> (64, 13, 13)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),   # -> (64, 6, 6)
            nn.Conv2d(64, 64, kernel_size=3),        # -> (64, 4, 4)
            nn.ReLU(),
        ]
        classifier = [
            nn.Linear(64 * 4 * 4, 64),
            nn.ReLU(),
            nn.Linear(64, 10),
        ]
        self.conv_layer = nn.Sequential(*feature_extractor)
        self.fc_layer = nn.Sequential(*classifier)

    def forward(self, x):
        features = self.conv_layer(x)
        # 64 * 4 * 4 = 1024 features per sample
        flat = features.view(-1, 64 * 4 * 4)
        scores = self.fc_layer(flat)
        return F.log_softmax(scores, dim=1)

#### Define data loader functions
**CAUTION:** the transform applied to the test data loader must be **the same as the one applied to the training data loader**.

In [5]:
def data_cifar(path, batch_size=100):
    """
    Build the CIFAR10 training and test data loaders.

    Args:
        path: root directory of the CIFAR10 data. The datasets are opened
            with ``download=False``, so the files must already be there.
        batch_size: mini-batch size of the training loader. The test loader
            always uses batches of 100.

    Returns:
        (loader_train, loader_test): the training loader is shuffled, the
        test loader is not.
    """
    # no brainer normalization used in the pytorch tutorial
    mean_0 = (0.5, 0.5, 0.5)
    std_0 = (0.5, 0.5, 0.5)

    # configure transform for training data
    # standard transform used in the pytorch tutorial
    transform_train_0 = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean_0, std_0),
    ])

    # configure transform for test data (same normalization as training)
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean_0, std_0),
    ])

    # Alternative (unused) setup, kept for reference:
    #   normalization: mean_1 = (0.4914, 0.4822, 0.4465)
    #                  std_1  = (0.2023, 0.1994, 0.2010)
    #   transform_train_1: RandomCrop(32, padding=4), RandomHorizontalFlip(),
    #                      ToTensor(), Normalize(mean_1, std_1)
    #   transform_train_2: RandomCrop(32, padding=4), ToTensor(),
    #                      Normalize(mean_1, std_1)
    #   transform_test_1:  ToTensor(), Normalize(mean_1, std_1)

    # setup the CIFAR10 training dataset
    data_train = datasets.CIFAR10(root=path, train=True, download=False, transform=transform_train_0)
    loader_train = data.DataLoader(data_train, batch_size=batch_size, shuffle=True)

    # setup the CIFAR10 test dataset
    # (the original referenced an undefined name `transform_test_0` here,
    # which raised NameError on every call)
    data_test = datasets.CIFAR10(root=path, train=False, download=False, transform=transform_test)
    loader_test = data.DataLoader(data_test, batch_size=100, shuffle=False)

    return loader_train, loader_test

def data_mnist(path,batch_size=64):
    """
    Build the MNIST training and test data loaders.

    Both splits are read from `path` (``download=False``) and share the same
    preprocessing. The training loader is shuffled and uses `batch_size`;
    the test loader is not shuffled and always uses batches of 100.
    """
    # shared preprocessing: tensor conversion, then normalization
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])

    def _load_split(is_train):
        # open one MNIST split with the shared preprocessing
        return datasets.MNIST(root=path, train=is_train, download=False, transform=preprocess)

    # training split
    loader_train = data.DataLoader(_load_split(True), batch_size=batch_size, shuffle=True)

    # test split
    loader_test = data.DataLoader(_load_split(False), batch_size=100, shuffle=False)

    return loader_train, loader_test

#### Training function

In [6]:
def _resolve_global(name, value):
    """Return `value`, or the module-level global called `name` when `value` is None."""
    if value is not None:
        return value
    try:
        return globals()[name]
    except KeyError:
        # same exception type as the original implicit global lookup
        raise NameError(f"name '{name}' is not defined") from None


def train_model(loader_train, loader_test, epochs, loss_fn, optimizer, device, model=None, writer=None):
    """Train `model` for `epochs` epochs, evaluating on the test loader after each one.

    Per epoch, the sample-weighted mean training loss, mean test loss and
    test accuracy (in percent) are printed and written to `writer` under the
    tags "Training loss", "Test/Loss" and "Test/Acc".

    Args:
        loader_train: data loader yielding (images, labels) training batches.
        loader_test: data loader yielding (images, labels) test batches.
        epochs: number of epochs to run.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar loss.
        optimizer: optimizer stepping the parameters of `model`.
        device: device the batches are moved to before the forward pass.
        model: network to train. When None (the default), the module-level
            global ``model`` is used, as in the original implementation.
        writer: object with an ``add_scalar(tag, value, step)`` method. When
            None (the default), the module-level global ``writer`` is used.

    Raises:
        NameError: if `model` / `writer` is None and no global of that name
            exists. This is raised before training starts.
    """
    model = _resolve_global('model', model)
    writer = _resolve_global('writer', writer)

    for epoch in range(1, epochs+1):
        train_loss = 0.0
        test_loss = 0.0

        # training of each epoch
        model.train()
        for images, labels in loader_train:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            # weight the batch loss by the batch size so that the epoch
            # average is per sample
            train_loss += loss.item() * images.size(0)
        train_loss /= len(loader_train.dataset)

        # test after each epoch
        model.eval()
        num_correct = 0
        with torch.no_grad():
            for images, labels in loader_test:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                loss = loss_fn(outputs, labels)
                test_loss += loss.item() * images.size(0)
                pred = outputs.argmax(dim=1)
                num_correct += pred.eq(labels.view_as(pred)).sum().item()
        test_loss /= len(loader_test.dataset)
        test_acc = 100*num_correct/len(loader_test.dataset)
        print('Epoch: {} | Training Loss: {:.2f} | Test Loss: {:.2f} | Test accuracy = {:.2f}%'.format(epoch, train_loss, test_loss, test_acc))

        # use the tensorboard writer
        writer.add_scalar("Training loss", train_loss, epoch)
        writer.add_scalar("Test/Loss", test_loss, epoch)
        writer.add_scalar("Test/Acc", test_acc, epoch)

#### Setup the training task

In [7]:
# configure the task
task = TaskMnist(nn='cnn_wy2')

# configure the training parameters
settings = HyperParam(path=task.path, learning_rate=0.1, nesterov=False, batch_size=32, epoch=100)
print('Train', task.nn, 'with', task.name)

# start a tensorboard writer, writes to runs/
# the run name is built from the task name (not a hard-coded 'mnist'), so
# runs on another dataset are not mislabelled
writer_path = f'runs/{task.name}_{task.nn}_Lr{settings.lr}_E{settings.epoch}_Bs{settings.bs}'
writer = SummaryWriter(writer_path)

Train cnn_wy2 with mnist


#### Set up the model and the data loader

In [8]:
# MNIST networks selectable through task.nn; any other name falls back to MLP
mnist_models = {
    'cnn_wy': CNNMnistWy,
    'cnn_wy_bn': CNNMnistWyBn,
    'cnn_wy2': CNNMnistWy2,
    'cnn': CNNMnist,
    '2nn_wy': TwoNN,
}

if task.name == 'mnist':
    model = mnist_models.get(task.nn, MLP)().to(settings.device)
    loader_train, loader_test = data_mnist(path=settings.datapath,batch_size=settings.bs)
elif task.name in ('cifar', 'cifar10'):
    # TaskCifar sets name to 'cifar10' and stores the network name in `nn`
    # (the original tested for 'cifar' and read a non-existent `task.cnn`)
    if task.nn == 'torch':
        model = CNNCifarTorch().to(settings.device)
    else:
        model = CNNCifarTf().to(settings.device)
    loader_train, loader_test = data_cifar(path=settings.datapath,batch_size=settings.bs)
else:
    # fail here instead of with a NameError on `model` in a later cell
    raise ValueError(f'Unsupported task name: {task.name!r}')

#### Setup the loss function and optimizer

In [9]:
# loss function: cross-entropy, moved to the training device
# NOTE: the models above already return log-probabilities; the log-softmax
# inside CrossEntropyLoss leaves log-probabilities unchanged
loss_fn = nn.CrossEntropyLoss().to(settings.device)

# optimizer: plain SGD unless Nesterov momentum was requested
if not settings.nesterov:
    optimizer = torch.optim.SGD(model.parameters(), lr=settings.lr)
else:
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=settings.lr,
        momentum=settings.momentum,
        nesterov=settings.nesterov,
    )

#### Show confirmation messages

In [10]:
# print a summary of the training configuration
print('\nModel training has setup...\n')
print(f'Dataset:\t{task.name}')
print(f'Loss function:\t{loss_fn}')
if settings.nesterov:
    # report the momentum actually configured instead of a hard-coded 0.9
    print(f'Optimizer:\tSGD with Nesterov momentum={settings.momentum}')
else:
    print('Optimizer:\tvanilla SGD')
print(f'learning rate:\t{settings.lr}')
print(f'Batch size:\t{settings.bs}')
print(f'Num of epochs:\t{settings.epoch}')
print('Model to train:\n', model)
print(f'Trainable model parameters:\t{get_count_params(model)}')


Model training has setup...

Dataset:	mnist
Loss function:	CrossEntropyLoss()
Optimizer:	vanilla SGD
learning rate:	0.1
Batch size:	32
Num of epochs:	100
Model to train:
 CNNMnistWy2(
  (conv_layer): Sequential(
    (0): Conv2d(1, 32, kernel_size=(5, 5), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
    (4): Dropout(p=0.5, inplace=False)
    (5): ReLU()
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc_layer): Sequential(
    (0): Linear(in_features=1024, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=10, bias=True)
  )
)
Trainable model parameters:	582026


#### Start the training process

In [11]:
# start training
start = time.time()

try:
    # start the training
    train_model(loader_train=loader_train,
                loader_test=loader_test,
                loss_fn=loss_fn,
                optimizer=optimizer,
                epochs=settings.epoch,
                device=settings.device)

    # print the wall-clock-time used
    end = time.time()
    print('\nTraining completed, time elapsed on this device: {:.2f}s'.format(end-start))
finally:
    # flush and close the tensorboard writer, even when training fails or is
    # interrupted, so the scalars logged so far are not lost
    writer.flush()
    writer.close()

Epoch: 1 | Training Loss: 0.14 | Test Loss: 0.04 | Test accuracy = 98.90%
Epoch: 2 | Training Loss: 0.05 | Test Loss: 0.04 | Test accuracy = 99.11%
Epoch: 3 | Training Loss: 0.04 | Test Loss: 0.02 | Test accuracy = 99.37%
Epoch: 4 | Training Loss: 0.03 | Test Loss: 0.03 | Test accuracy = 99.20%
Epoch: 5 | Training Loss: 0.03 | Test Loss: 0.02 | Test accuracy = 99.25%
Epoch: 6 | Training Loss: 0.02 | Test Loss: 0.02 | Test accuracy = 99.32%
Epoch: 7 | Training Loss: 0.02 | Test Loss: 0.02 | Test accuracy = 99.30%
Epoch: 8 | Training Loss: 0.02 | Test Loss: 0.02 | Test accuracy = 99.27%
Epoch: 9 | Training Loss: 0.02 | Test Loss: 0.02 | Test accuracy = 99.38%
Epoch: 10 | Training Loss: 0.01 | Test Loss: 0.02 | Test accuracy = 99.31%
Epoch: 11 | Training Loss: 0.01 | Test Loss: 0.02 | Test accuracy = 99.41%
Epoch: 12 | Training Loss: 0.01 | Test Loss: 0.02 | Test accuracy = 99.26%
Epoch: 13 | Training Loss: 0.01 | Test Loss: 0.02 | Test accuracy = 99.45%
Epoch: 14 | Training Loss: 0.01 | 