In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
from torchsummary import summary

In [2]:
# Raw MNIST training CSV with no header row (785 columns: label + 784 pixels).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
mnist_csv_path = r"D:\repos\udemy\.dataset\mnist_train.csv"
data = pd.read_csv(mnist_csv_path, sep=',', header=None)

In [3]:
data.shape

(60000, 785)

In [4]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Column 0 carries the digit label; the remaining columns are pixel values.
# NOTE: `data` is rebound here, so this cell raises KeyError if run a second
# time without reloading the CSV.
labels = data.loc[:, 0]
data = data.drop(0, axis="columns")

In [6]:
# Move from pandas objects to plain NumPy arrays for the tensor conversion below.
data, labels = np.asarray(data), np.asarray(labels)

In [7]:
# Pixel values range from 0 to 255; dividing by the largest value
# rescales every pixel into the interval [0, 1].
data_norm = data / data.max()

## Tensor dataset and loader

In [8]:
# float32 features for the linear layers, int64 class indices for CrossEntropyLoss.
data_tensor = torch.tensor(data_norm, dtype=torch.float32)
label_tensor = torch.tensor(labels, dtype=torch.long)

In [9]:
data_tensor

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [10]:
label_tensor

tensor([5, 0, 4,  ..., 5, 6, 8])

In [11]:
# 70/30 train/test split. The fixed random_state makes the partition identical
# after every kernel restart, so accuracies reported further down are reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data_tensor, label_tensor, train_size=0.7, random_state=42
)

In [12]:
# Pair each feature tensor with its labels so the loaders yield (x, y) batches.
train_data, test_data = (
    TensorDataset(x_train, y_train),
    TensorDataset(x_test, y_test),
)

In [13]:
# Training: shuffled mini-batches of 512, with the final partial batch dropped.
train_loader = DataLoader(train_data, batch_size=512, shuffle=True, drop_last=True)
# Testing: one batch that holds the entire test set.
test_loader = DataLoader(test_data, batch_size=len(test_data))

In [14]:
# Peek at the first training batch only, to confirm feature/label shapes.
for batch_x, batch_y in train_loader:
    print(batch_x.shape, batch_y.shape)
    break

torch.Size([512, 784]) torch.Size([512])


In [15]:
# Print the shape of every test batch (the loader is built with one full-size batch).
for batch_x, batch_y in test_loader:
    print(batch_x.shape, batch_y.shape)

torch.Size([18000, 784]) torch.Size([18000])


# Model

In [16]:
class ann(nn.Module):
    """Fully connected classifier with equal-width hidden layers.

    Layout: ``input`` (in_features -> units), then ``layers`` hidden blocks named
    ``layer0 .. layer{layers-1}`` (units -> units), then ``output``
    (units -> out_features). ReLU follows every layer except ``output``,
    which returns raw logits.

    Defined at module level (rather than inside ``model``) so every network
    shares one class object and instances are not local-class objects.
    """

    def __init__(self, units, layers, in_features=784, out_features=10):
        super().__init__()
        self.nlayers = layers
        # ModuleDict registers each layer, so its parameters show up in
        # net.parameters() and therefore reach the optimizer.
        self.layers = nn.ModuleDict()
        self.layers['input'] = nn.Linear(in_features, units)
        for i in range(layers):
            self.layers[f'layer{i}'] = nn.Linear(units, units)
        self.layers['output'] = nn.Linear(units, out_features)

    def forward(self, x):
        """Return logits for a batch ``x`` whose last dimension is ``in_features``."""
        x = F.relu(self.layers['input'](x))
        for i in range(self.nlayers):
            x = F.relu(self.layers[f'layer{i}'](x))
        # No activation here: CrossEntropyLoss expects unnormalised logits.
        return self.layers['output'](x)


def model(nunits, nlayers, lr=0.01, in_features=784, out_features=10):
    """Build a fresh network together with its optimizer and loss function.

    Args:
        nunits: Width of the input layer and of every hidden layer.
        nlayers: Number of additional (units -> units) hidden layers placed
            after the input layer; 0 gives input -> output only.
        lr: Learning rate for the SGD optimizer (default 0.01).
        in_features: Size of each input vector (default 784).
        out_features: Number of output classes (default 10).

    Returns:
        Tuple ``(net, opt, lossfn)``: the ``ann`` instance, a
        ``torch.optim.SGD`` over its parameters, and ``nn.CrossEntropyLoss()``.
    """
    net = ann(nunits, nlayers, in_features, out_features)
    opt = torch.optim.SGD(net.parameters(), lr=lr)
    lossfn = nn.CrossEntropyLoss()
    return net, opt, lossfn

In [17]:
# Reference instance (64 units, 1 extra hidden layer) used for the inspection cells below.
net, opt, lossfn = model(64, 1)

In [18]:
net

ann(
  (layers): ModuleDict(
    (input): Linear(in_features=784, out_features=64, bias=True)
    (layer0): Linear(in_features=64, out_features=64, bias=True)
    (output): Linear(in_features=64, out_features=10, bias=True)
  )
)

In [19]:
# testing the model with random data
tempx = torch.randn(5, 784)
y = net(tempx)
print(y.shape)
print(y)

torch.Size([5, 10])
tensor([[ 0.2597,  0.1608,  0.1250,  0.0518,  0.0179,  0.0308,  0.0472, -0.1895,
         -0.0327,  0.1022],
        [ 0.1455,  0.0141,  0.1109, -0.0443, -0.0199, -0.0313,  0.0661, -0.0960,
         -0.0286,  0.1317],
        [ 0.1156,  0.1709,  0.1327, -0.0041,  0.0680, -0.0062,  0.0605, -0.1271,
         -0.0549,  0.0601],
        [ 0.1660,  0.0772,  0.2065,  0.0658, -0.0270,  0.0330, -0.0518, -0.1706,
         -0.1269,  0.1322],
        [ 0.1942,  0.0427,  0.2497,  0.0639, -0.0516,  0.1450, -0.0125, -0.1290,
         -0.0514,  0.1071]], grad_fn=<AddmmBackward0>)


In [20]:
epochs = 60


def train(nunits, nlayers, n_epochs=None):
    """Train a freshly built network and record per-epoch metrics.

    Relies on the notebook-level ``model`` factory and on the ``train_loader``
    and ``test_loader`` globals created in earlier cells.

    Args:
        nunits: Layer width forwarded to ``model``.
        nlayers: Number of hidden layers forwarded to ``model``.
        n_epochs: Number of passes over ``train_loader``; ``None`` (default)
            uses the module-level ``epochs`` value.

    Returns:
        Tuple ``(train_acc, test_acc, losses)`` of lists with one plain Python
        float per epoch: mean mini-batch training accuracy (%), accuracy (%)
        on the first batch of ``test_loader``, and mean mini-batch training loss.
    """
    if n_epochs is None:
        n_epochs = epochs

    net, opt, lossfn = model(nunits, nlayers)
    losses, train_acc, test_acc = [], [], []

    for _ in range(n_epochs):
        # ---- training pass ----
        net.train()
        batchacc, batchloss = [], []
        for trainx, trainy in train_loader:
            yhat = net(trainx)
            loss = lossfn(yhat, trainy)

            opt.zero_grad()
            loss.backward()
            opt.step()

            # .item() stores plain floats instead of 0-d tensors, so np.mean
            # below never has to convert tensors implicitly.
            batchloss.append(loss.item())
            hits = (torch.argmax(yhat, dim=1) == trainy).float()
            batchacc.append(100 * hits.mean().item())
        train_acc.append(float(np.mean(batchacc)))
        losses.append(float(np.mean(batchloss)))

        # ---- evaluation pass ----
        # eval() changes nothing for a Linear/ReLU-only network, but keeps this
        # loop correct if dropout or batch-norm layers are added later.
        net.eval()
        testx, testy = next(iter(test_loader))
        with torch.no_grad():
            yhat = net(testx)
        test_hits = (torch.argmax(yhat, dim=1) == testy).float()
        test_acc.append(100 * test_hits.mean().item())

    return train_acc, test_acc, losses

In [21]:
# Grid for the width/depth experiment:
# depths of 1, 2 and 3 hidden layers; widths of 50, 100, ..., 250 units.
num_layers = range(1, 4)
num_units = 50 * np.arange(1, 6)

In [22]:
num_units

array([ 50, 100, 150, 200, 250])

In [23]:
list(num_layers)

[1, 2, 3]

In [24]:
# One cell per (depth, width) pair: rows follow num_layers, columns follow num_units.
grid_shape = (len(num_layers), len(num_units))
train_acc, test_acc, losses = (np.zeros(grid_shape) for _ in range(3))

In [25]:
# Train one network per (depth, width) pair and keep the mean of the last
# five epochs of each metric as that configuration's score.
for i, depth in enumerate(num_layers):
    for j, width in enumerate(num_units):
        acc_train, acc_test, loss_hist = train(width, depth)
        train_acc[i, j] = np.mean(acc_train[-5:])
        test_acc[i, j] = np.mean(acc_test[-5:])
        losses[i, j] = np.mean(loss_hist[-5:])
        print(f'Finished - num_units: {width}, num_layers: {depth}')
        print(f'Train accuracy: {train_acc[i,j]}, Test accuracy: {test_acc[i,j]}, Loss: {losses[i,j]}')

Finished - num_units: 50, num_layers: 1
Train accuracy: 91.13758087158203, Test accuracy: 90.60333251953125, Loss: 0.31177703153069425
Finished - num_units: 100, num_layers: 1
Train accuracy: 91.6125259399414, Test accuracy: 91.15666198730469, Loss: 0.2923477446160666
Finished - num_units: 150, num_layers: 1
Train accuracy: 91.58060455322266, Test accuracy: 90.98999786376953, Loss: 0.2968666564764046
Finished - num_units: 200, num_layers: 1
Train accuracy: 91.87071228027344, Test accuracy: 91.40332794189453, Loss: 0.28483149372949834
Finished - num_units: 250, num_layers: 1
Train accuracy: 91.69874572753906, Test accuracy: 91.2844467163086, Loss: 0.2897292212015245
Finished - num_units: 50, num_layers: 2
Train accuracy: 90.67930603027344, Test accuracy: 90.03333282470703, Loss: 0.3227127610546786
Finished - num_units: 100, num_layers: 2
Train accuracy: 91.47865295410156, Test accuracy: 90.8722152709961, Loss: 0.29995831970761466
Finished - num_units: 150, num_layers: 2
Train accuracy: 

#### In general, wider networks perform better than deeper networks because they have more parameters to learn with. However, wider networks are more prone to overfitting. Deeper networks are better at generalizing, but they are more prone to the vanishing-gradient problem. So the best approach is to balance width and depth rather than maximizing either one.