In [45]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim

In [46]:
# Device configuration
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Hyper parameters
num_epochs = 5
num_classes = 10
batch_size = 100
learning_rate = 0.001

In [47]:
# load the data
train_dataset = torchvision.datasets.MNIST(root='../../data', 
                                           train=True, 
                                           transform=transforms.ToTensor(),
                                           download=True)

test_dataset = torchvision.datasets.MNIST(root='../../data',
                                          train = False,
                                          transform=transforms.ToTensor(),
                                          download=True)

In [48]:
image1, label1 = train_dataset[0]
print(image1.size(),image1.size(0),list(image1.size())[0])

torch.Size([1, 28, 28]) 1 1


In [49]:
# data loader
train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                          batch_size = batch_size,
                                          shuffle = True)

test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                          batch_size = batch_size,
                                          shuffle = False)

# Make a CNN

#first some explanation of batch normalization:

how batch normalization works:
once we have z(i) = a(i-1) * w(i) + b(i),
we calculated the mean and standard deviation of z(i).
then znorm(i) = (z(i) - mean) / sqrt(standard_deviation + C)

then Z(i) = gamma * znorm(i) + beta, gamma and beta are learnable parameters

then do the activation of a(i) = g(Z(i)).


#why do we need batch norm?
- first, it's like why we need normalization in the training features. we can speed up the training by normalize the ranges of different features.

- Second it's due to covariate shift. The idea is that if we have a model trained on black cat, this model should not be expected as good to tell colorful cat even the underlying math might be the same. It shows that the shift in the data may cause the training to be failed. In the neural network, the features in each layer (activation in each layer) is always changed (and shifted) by what's going on before this layer. Batch norm can efficiently reduce this shift.

#Batch normalization may cause noise:
the mean and variance are calculated based on the current mini-batch so it may introduce the noise to the activation.



the architecture of CNN
--------

feature channel 1 -> CNN (kernel = 5, stride = 1, pad = 2) -> feature channel 16

-> BatchNorm2d

-> ReLu

-> Maxpool2d (kernel = 2, stride = 2)

-> CNN (Same setting) -> feature channel 32 -> BatchNorm2d -> ReLu -> Maxpool2d

-> Flat (either use torch.flatten() or out.reshape())

-> linear -> number_classes

In [50]:
import torch.nn.functional as F

class ConvNet(nn.Module):
    def __init__(self, num_class = 10):
        super(ConvNet,self).__init__() # the thing in this parenthesis should be the argument for parent class, which is none here
        
        self.num_class = num_class
        
        self.layer1 = nn.Sequential(  # put layers together
                    nn.Conv2d(1,16, 5, stride = 1, padding = 2),
                    nn.BatchNorm2d(16),
                    nn.ReLU(),   # introduce the non-linearity to NN
                    nn.MaxPool2d(kernel_size = 2, stride = 2))
        
        self.layer2 = nn.Sequential(
                    nn.Conv2d(16,32,5,stride = 1, padding = 2),
                    nn.BatchNorm2d(32),
                    nn.ReLU(),
                    nn.MaxPool2d(kernel_size = 2, stride = 2))
        
        self.linear = nn.Linear(7*7* 32 , num_class) # the image dimension reduces from 28 to 7 after to maxpooling layer
        
    def forward(self, x): # you have to have a forward function with arguments (self,x) in torch neural network
        a = self.layer1(x)
        a = self.layer2(a)
        a = torch.flatten(a,1) # or a = a.reshape(a.size(0),-1)
        out = self.linear(a)
        return out

net = ConvNet(num_classes)
print(net)

ConvNet(
  (layer1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (linear): Linear(in_features=1568, out_features=10, bias=True)
)


In [51]:
# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(),lr = learning_rate)

In [52]:
# train the model
total_step = len(train_loader)

for ep in range(0,num_epochs):
    
    total_loss = 0
    
    for index, data in enumerate(train_loader,0):
        images,labels = data
        
        outputs = net(images)
        
        
        loss = criterion(outputs,labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss
        
        if (index) % 100 == 99:
            print('now is epoch %d, batch %d, total_loss is %.3f' % (ep, index, total_loss.item() / 100))
            total_loss = 0
        

now is epoch 0, batch 99, total_loss is 0.411
now is epoch 0, batch 199, total_loss is 0.117
now is epoch 0, batch 299, total_loss is 0.095
now is epoch 0, batch 399, total_loss is 0.080
now is epoch 0, batch 499, total_loss is 0.066
now is epoch 0, batch 599, total_loss is 0.060
now is epoch 1, batch 99, total_loss is 0.056
now is epoch 1, batch 199, total_loss is 0.051
now is epoch 1, batch 299, total_loss is 0.052
now is epoch 1, batch 399, total_loss is 0.051
now is epoch 1, batch 499, total_loss is 0.048
now is epoch 1, batch 599, total_loss is 0.044
now is epoch 2, batch 99, total_loss is 0.042
now is epoch 2, batch 199, total_loss is 0.036
now is epoch 2, batch 299, total_loss is 0.036
now is epoch 2, batch 399, total_loss is 0.032
now is epoch 2, batch 499, total_loss is 0.038
now is epoch 2, batch 599, total_loss is 0.042
now is epoch 3, batch 99, total_loss is 0.025
now is epoch 3, batch 199, total_loss is 0.032
now is epoch 3, batch 299, total_loss is 0.027
now is epoch 3, b

# Test the model

Important thing is that we need to add net.eval() so batchnorm uses moving mean/variance instead of mini-batch mean/variance
--------

In [57]:
net.eval()
with torch.no_grad():
    correct = 0
    for index, data in enumerate(test_loader):
        images, labels = data
        
        outputs = net(images)
        
        outputs = torch.argmax(outputs.data,1)
        
        correct += sum(outputs == labels).item()
        
accuracy = correct / test_dataset.__len__()
print(accuracy)
        
        

0.9898


# SAVE THE MODEL
here we use two methods, save the ckpt and pth

In [58]:
print("Model's state_dict:")
for param_tensor in net.state_dict():
    print(param_tensor, "\t", net.state_dict()[param_tensor].size())
    

# Print optimizer's state_dict
print("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    print(var_name, "\t", optimizer.state_dict()[var_name])

Model's state_dict:
layer1.0.weight 	 torch.Size([16, 1, 5, 5])
layer1.0.bias 	 torch.Size([16])
layer1.1.weight 	 torch.Size([16])
layer1.1.bias 	 torch.Size([16])
layer1.1.running_mean 	 torch.Size([16])
layer1.1.running_var 	 torch.Size([16])
layer1.1.num_batches_tracked 	 torch.Size([])
layer2.0.weight 	 torch.Size([32, 16, 5, 5])
layer2.0.bias 	 torch.Size([32])
layer2.1.weight 	 torch.Size([32])
layer2.1.bias 	 torch.Size([32])
layer2.1.running_mean 	 torch.Size([32])
layer2.1.running_var 	 torch.Size([32])
layer2.1.num_batches_tracked 	 torch.Size([])
linear.weight 	 torch.Size([10, 1568])
linear.bias 	 torch.Size([10])
Optimizer's state_dict:
state 	 {0: {'step': 3000, 'exp_avg': tensor([[[[ 9.1137e-03,  5.2510e-03,  4.7147e-03,  5.4166e-03,  8.6822e-03],
          [ 2.0114e-03,  2.0290e-03,  8.1744e-04,  3.5166e-03,  8.1336e-03],
          [ 1.0925e-03,  1.6095e-03,  1.6401e-04,  3.2716e-03,  6.8287e-03],
          [-4.3295e-04,  6.9347e-04,  2.7794e-03,  4.6885e-03,  3.6465e-

In [60]:
# method 1
PATH = 'cnn_example.ckpt'
torch.save({
            'epoch': ep,
            'model_state_dict': net.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            }, PATH)

In [61]:
# method 2
PATH = 'cnn_examplt.pth'
torch.save(net.state_dict(),PATH)