# **Train on EMNIST (for letters) and Transfer for MNIST**
* **Basic concepts learnt from: A Deep Understanding of Deep Learning (with Python intro) - Mike X Cohen (Udemy) - https://www.udemy.com/course/deeplearning_x**
* **Extended learning and understanding by VigyannVeshi**
* **EMNIST Dataset**
    * Dataset containing (A-Z, a-z, 0-9) handwritten character images of size (28,28) grayscale
    * Total 814255 characters

* **MNIST:**
    * MNIST - Modified National Institute of Standards and Technology
    * Image Size: 28x28 pixels
    * Total Images: 70,000 (60,000 training, 10,000 testing)
    * Channels: Grayscale, 1 channel

In [None]:
# basic deep learning libraries
import numpy as np
import torch as tr
import torch.nn as nn
import torch.nn.functional as F
import torchvision as tv

# import dataset/loader libraries
from torch.utils.data import TensorDataset,DataLoader
from sklearn.model_selection import train_test_split

# import summary libraries for model information
from torchsummary import summary

# import plotting libraries
import matplotlib.pyplot as plt
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# extra imports
import copy

_________________________
**Accessing Dataset**

In [1]:
cd ..

/home/rudraksha14/Desktop/RAY_RISE_ABOVE_YOURSELF/Programming/advanced_deep_learning


In [2]:
cd datasets

/home/rudraksha14/Desktop/RAY_RISE_ABOVE_YOURSELF/Programming/advanced_deep_learning/datasets


______________

In [2]:
# Import the data: load the EMNIST 'letters' split from the 'emnist' folder.
# With download=True torchvision fetches the files into `root` first if they
# are not already present; the variant without it expects them to be there.
# cdata=tv.datasets.EMNIST(root='emnist',split='letters')
cdata=tv.datasets.EMNIST(root='emnist',split='letters',download=True)


0.1%

In [None]:
# Inspect the data

# the categories (but how many letters?)
print(cdata.classes)
print(f'{len(cdata.classes)} classes')

print("\nData Size:")
print(cdata.data.shape)

# Add a singleton channel axis so the conv layers receive a 4D
# (N, 1, 28, 28) tensor, and convert the integer pixel values to float.
# `-1` lets PyTorch infer the number of images instead of hard-coding it.
images = cdata.data.view(-1, 1, 28, 28).float()
print('\nTensor Data: ')
print(images.shape)


In [None]:
# Side note: how many samples carry label 0 (the 'N/A' class)?
print((cdata.targets == 0).sum())

# The label values that actually occur -- the unused class 0 would still
# cause problems in one-hot encoding.
cdata.targets.unique()

In [None]:
cdata.class_to_idx

In [None]:
# Drop the unused 'N/A' class and shift the labels down by one so that
# they start at 0.

# category names without the leading 'N/A' entry
letterCategories = cdata.classes[1:]

# Out-of-place subtraction already allocates a new tensor, so cdata.targets
# is left untouched without needing an explicit deep copy.
labels = cdata.targets - 1
print(labels.shape)

# sanity checks: number of samples in the new class 0, and the label set
print(tr.sum(labels == 0))
print(tr.unique(labels))


In [None]:
# Next issue: do we need to normalize the images?
# Histogram of the raw pixel values of the first 10 images. The data is
# flattened to 1-D because matplotlib interprets a 2-D array as one dataset
# per column, so a (1, N) input is not reliably drawn as a single histogram.
plt.hist(images[:10].detach().flatten().numpy(), 40)
plt.title('Raw values')
plt.show()

# Yes, we do: rescale in place so that the largest pixel value becomes 1.
images /= tr.max(images)
plt.hist(images[:10].detach().flatten().numpy(), 40)
plt.title('After Normalization!!')
plt.show()

In [None]:
# Show a 3x7 grid of randomly chosen images, each titled with its letter
fig, axs = plt.subplots(3, 7, figsize=(13, 6))

for ax in axs.flatten():

    # draw a random sample index
    whichpic = np.random.randint(images.shape[0])

    # drop the channel axis to get the 2D image, and look up its letter
    I = images[whichpic].squeeze()
    letter = letterCategories[labels[whichpic]]

    # the image is transposed before display
    ax.imshow(I.T, cmap='gray')
    ax.set_title(f'The letter {letter}')
    ax.set_xticks([])
    ax.set_yticks([])

plt.show()


In [None]:
# Build the train/test DataLoaders

# hold out 10% of the samples for testing (random split via scikit-learn)
train_data, test_data, train_labels, test_labels = train_test_split(
    images, labels, test_size=0.1
)

# wrap each partition as a PyTorch dataset of (image, label) pairs
train_data, test_data = (
    TensorDataset(train_data, train_labels),
    TensorDataset(test_data, test_labels),
)

# shuffled mini-batches for training (incomplete last batch dropped);
# the whole test set is served as one single batch
batchsize = 32
train_loader = DataLoader(
    train_data, batch_size=batchsize, shuffle=True, drop_last=True
)
test_loader = DataLoader(test_data, batch_size=len(test_data))

In [None]:
# check size (should be images x channels x width x height)
print(train_loader.dataset.tensors[0].shape)
print(train_loader.dataset.tensors[1].shape)

In [None]:
# creating a class for the model
def createNet(printtoggle=False):
    """Build the letter-classification CNN together with its loss and optimizer.

    Args:
        printtoggle: when True, the network prints the tensor shape after
            every stage of each forward pass.

    Returns:
        Tuple ``(net, lossfun, optimizer)``: a freshly initialised network,
        an ``nn.CrossEntropyLoss`` instance, and an Adam optimizer
        (lr=0.001) bound to the network's parameters.
    """
    class mnistNet(nn.Module):
        """Two conv stages followed by two linear layers, 26 output units."""

        def __init__(self,printtoggle):
            super().__init__()

            ### convolution layers

            # first convolution layer
            self.conv1=nn.Conv2d(1,6,kernel_size=3,stride=1,padding=1)
            self.bnorm1=nn.BatchNorm2d(6) # argument = number of channels
            ### output size: (28+2*1-3)/1 + 1 = 28 --> //2 = 14 (after 2x2 maxpool)

            # second convolution layer
            self.conv2=nn.Conv2d(6,6,kernel_size=3,stride=1,padding=1)
            self.bnorm2=nn.BatchNorm2d(6) # argument = number of channels
            ### output size: (14+2*1-3)/1 + 1 = 14 --> //2 = 7 (after 2x2 maxpool)

            ### linear decision layers

            # fully-connected layer: 6 channels of 7x7 feature maps -> 50 units
            self.fc1=nn.Linear(7*7*6,50)

            # output layer: one unit per letter
            self.out=nn.Linear(50,26)

            # toggle for printing tensor sizes during forward propagation
            self.print=printtoggle

        def _report(self,stage,x):
            """Print the shape of ``x`` for ``stage`` when printing is enabled."""
            if self.print:
                print(f'{stage}: {x.shape}')

        def forward(self,x):
            """Return the raw class scores (logits) for a batch of images."""
            self._report('Input',x)

            # stage 1: convolution -> maxpool -> batchnorm -> leaky relu
            x = F.leaky_relu(self.bnorm1(F.max_pool2d(self.conv1(x),2)))
            self._report('Layer conv1/pool1',x)

            # stage 2: convolution -> maxpool -> batchnorm -> leaky relu
            x = F.leaky_relu(self.bnorm2(F.max_pool2d(self.conv2(x),2)))
            self._report('Layer conv2/pool2',x)

            # vectorize: keep the batch axis, flatten everything else
            x = x.view(x.shape[0],-1)
            self._report('Vectorize',x)

            # linear layers
            x = F.leaky_relu(self.fc1(x))
            self._report('Layer fc1',x)
            x = self.out(x)
            self._report('Layer out',x)

            return x

    # create the model instance
    net=mnistNet(printtoggle)

    # loss function
    lossfun=nn.CrossEntropyLoss()

    # optimizer
    optimizer=tr.optim.Adam(net.parameters(),lr=0.001)

    return net,lossfun,optimizer

In [None]:
# Sanity check: push one training batch through a freshly created model
# (printing enabled, so the shape after every stage is shown)
net, lossfun, optimizer = createNet(True)

X, y = next(iter(train_loader))
yHat = net(X)

# the labels of the batch
print('\nLabel size:', y.shape, sep='\n')

# the model output for the batch
print('\nOutput size:', yHat.shape, sep='\n')

# and the loss for this batch
loss = lossfun(yHat, y.squeeze())
print(' ', 'Loss:', loss, sep='\n')

In [None]:
# Pick the compute device: the first CUDA GPU when one is available,
# otherwise fall back to the CPU
device = tr.device('cuda:0') if tr.cuda.is_available() else tr.device('cpu')
device

In [None]:
# count the total number of parameters in the model:
# move the model to the selected device first, then let torchsummary
# report on it for a single-channel 28x28 input
net.to(device)
summary(net,(1,28,28))

In [None]:
# Create a function that trains the model

def trainModel(n_epochs=10):
    """Train a fresh network and evaluate it on the test data after each epoch.

    A new model is created with ``createNet()``, moved to ``device`` and
    trained on the batches of ``train_loader``. After every epoch it is
    evaluated on the first batch of ``test_loader``.

    The model is switched to training mode for the parameter updates and to
    evaluation mode for the test pass, so that its batch-normalisation
    layers use their running statistics on the test data and do not update
    those statistics from it.

    Args:
        n_epochs: number of passes over the training data.

    Returns:
        Tuple ``(trainLoss, testLoss, trainErr, testErr, net)``. The first
        four are 1-D tensors of length ``n_epochs``: per-epoch loss values
        and error rates in percent. ``net`` is the trained model, left in
        evaluation mode after the final test pass.
    """
    # create a new model and send it to the GPU (if available)
    net,lossfun,optimizer=createNet()
    net.to(device)

    # per-epoch results
    trainLoss = tr.zeros(n_epochs)
    testLoss = tr.zeros(n_epochs)
    trainErr = tr.zeros(n_epochs)
    testErr = tr.zeros(n_epochs)

    # loop over epochs
    for epochi in range(n_epochs):

        # training mode: batchnorm normalises with batch statistics
        net.train()

        batchLoss=[]
        batchErr=[]

        # loop over training data batches
        for X,y in train_loader:

            # push data to GPU
            X = X.to(device)
            y = y.to(device)

            # forward pass and loss
            yHat=net(X)
            loss=lossfun(yHat,y)

            # backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # loss and error rate (fraction of wrong predictions) of this batch
            batchLoss.append(loss.item())
            batchErr.append(tr.mean((tr.argmax(yHat,dim=1)!=y).float()).item())

        # average across the batches
        trainLoss[epochi]=np.mean(batchLoss)
        trainErr[epochi]=100*np.mean(batchErr)

        # evaluation mode: batchnorm uses (and no longer updates) its
        # running statistics
        net.eval()

        # test performance
        X,y=next(iter(test_loader))
        X = X.to(device)
        y = y.to(device)
        with tr.no_grad(): # deactivates autograd
            yHat=net(X)
            loss=lossfun(yHat,y)

        testLoss[epochi]=loss.item()
        testErr[epochi]=100*tr.mean((tr.argmax(yHat,dim=1)!=y).float()).item()

        print(f"{epochi+1}/{n_epochs} complete!")
    # end of epochs

    # function output
    return trainLoss,testLoss,trainErr,testErr,net

In [None]:
# Run the model and show the results!
# GPU ~ 1min; CPU~2.47 mins (Ryzen 7 5700u)
trainLoss,testLoss,trainErr,testErr,net=trainModel()

In [None]:
# Plot the per-epoch loss (left) and error rate (right) for train and test
fig,ax = plt.subplots(1,2,figsize=(16,5))

ax[0].plot(trainLoss,'s-',label='Train')
ax[0].plot(testLoss,'o-',label='Test')
ax[0].set_xlabel('Epochs')
# the model is trained with the multi-class nn.CrossEntropyLoss
ax[0].set_ylabel('Loss (Cross Entropy)')
ax[0].set_title('Model loss')
ax[0].legend()

ax[1].plot(trainErr,'s-',label='Train')
ax[1].plot(testErr,'o-',label='Test')
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Error rates (%)')
ax[1].set_title(f'Final model test error rate: {testErr[-1]:.2f}%')
ax[1].legend()

plt.show()

In [None]:
### visualize some test images together with the model's predictions

# extract X,y from the test dataloader and push them to the GPU
X,y = next(iter(test_loader))
X = X.to(device)
y = y.to(device)

# Inference only: evaluation mode makes batchnorm use its running statistics,
# and no_grad avoids building an autograd graph for the whole test set.
net.eval()
with tr.no_grad():
    yHat = net(X)

# pick some examples at random to show
randex = np.random.choice(len(y),size=21,replace=False)

fig,axs = plt.subplots(3,7,figsize=(15,6))

for idx,ax in zip(randex,axs.flatten()):

    # extract the image and its true/predicted letters
    I = X[idx,0].cpu() # .cpu() to transfer back from GPU!
    trueLetter = letterCategories[ y[idx].item() ]
    predLetter = letterCategories[ tr.argmax(yHat[idx]).item() ]

    # color-code the accuracy: gray = correct, hot = wrong
    col = 'gray' if trueLetter==predLetter else 'hot'

    # visualize (image is transposed before display)
    ax.imshow(I.T,cmap=col)
    ax.set_title('True %s, predicted %s' %(trueLetter,predLetter),fontsize=10)
    ax.set_xticks([])
    ax.set_yticks([])

plt.show()

In [None]:
import sklearn.metrics as skm

# Confusion matrix of true vs. predicted test labels; normalize='true'
# makes every row (true letter) sum to 1
C = skm.confusion_matrix(
    y.cpu(),
    tr.argmax(yHat.cpu(), dim=1),
    normalize='true',
)

# draw it as an image; the color scale is capped at 0.05
fig = plt.figure(figsize=(10, 10))
plt.imshow(C, cmap='Blues', vmax=.05)

# label both axes with the letters
plt.xticks(range(26), labels=letterCategories)
plt.yticks(range(26), labels=letterCategories)
plt.xlabel('Predicted letter')
plt.ylabel('True letter')
plt.title('TEST confusion matrix')

plt.show()

**Additional Explorations**

In [None]:
# 1) I added batch normalization to the convolution layers, but not to the linear (fc*) layers. But linear layers also benefit from batchnorm, just like convolution layers do. Add it!

# 2) In the next few videos, we will see whether we can improve the model's performance by experimenting with the number of layers, kernel size, and linear-layer units. Is there anything you could think of, other than these three features, that might help boost model performance?