In [28]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

## Load data

In [29]:
mnist_ds,mnist_info = tfds.load(name='mnist',with_info=True,as_supervised=True)
mnist_info

tfds.core.DatasetInfo(
    name='mnist',
    full_name='mnist/3.0.1',
    description="""
    The MNIST database of handwritten digits.
    """,
    homepage='http://yann.lecun.com/exdb/mnist/',
    data_path='C:\\Users\\manik\\tensorflow_datasets\\mnist\\3.0.1',
    file_format=tfrecord,
    download_size=11.06 MiB,
    dataset_size=21.00 MiB,
    features=FeaturesDict({
        'image': Image(shape=(28, 28, 1), dtype=uint8),
        'label': ClassLabel(shape=(), dtype=int64, num_classes=10),
    }),
    supervised_keys=('image', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=10000, num_shards=1>,
        'train': <SplitInfo num_examples=60000, num_shards=1>,
    },
    citation="""@article{lecun2010mnist,
      title={MNIST handwritten digit database},
      author={LeCun, Yann and Cortes, Corinna and Burges, CJ},
      journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},
      volume={2},
      year={2010}
    }""",
)

Note that mnist data consists of 60000 samples of train and 10000 samples of test but no validation samples

In [30]:
#Create validatation data from train data(usually 10%)
mnist_train,mnist_test = mnist_ds['train'],mnist_ds['test']
no_valid_samples = 0.1*mnist_info.splits['train'].num_examples
#typecase to int64
no_valid_samples = tf.cast(no_valid_samples,tf.int64)

no_test_samples = mnist_info.splits['test'].num_examples
no_test_samples = tf.cast(no_test_samples,tf.int64)

## Scale data

In [31]:
def scale(image,label):
    image = tf.cast(image,tf.float32)
    image /= 255. #dot is to make sure result to be in float
    return image, label

scale_train_valid_data = mnist_train.map(scale)
scale_test = mnist_test.map(scale)

It's possible that the targets are stored in ascending order, resulting in the first X batches having only zero targets and the other batches having only one as a target.Since we'll be batching, it would be better shuffle the data. It should be as randomly spread as possible so that batching works as intended.

In [32]:
BUFFER_SIZE = 10000 #10000 samples at a time. If BUFFER_SIZE=1, no shuffling. If BUFFER_SIZE>no.of samples, shuffling shuffling 
                        #takesplace uniformly once.

shuffled_train_valid = scale_train_valid_data.shuffle(BUFFER_SIZE)
validation_data = shuffled_train_valid.take(no_valid_samples)
train_data = shuffled_train_valid.skip(no_valid_samples)


## mini batch gradient descent

In [33]:
BATCH_SIZE = 100 #hyper parameter need to be tuned
train_data = train_data.batch(BATCH_SIZE) #No need to batch for validation data since we do not back propagate validation data
validation_data = validation_data.batch(no_valid_samples) #since model expects validation and test data also be in batches.
test_data = scale_test.batch(no_test_samples)

validation_inputs, validation_targets = next(iter(validation_data)) #next takes next batch for validation. Since there is only 
                                                                    #one batch, load only once
#test_inputs, test_targets = next(iter(test_data))

## Model
* Input layers has 28 x 28 = 784 neurons, 
* 2 hidden layers each with 50 neurons, and
* one output layer with 10 neurons each for each digit.

In [59]:
# Tune these input parameters for better results
input_size = 784
output_size = 10
hidden_layer_size = 100

In [60]:
#tf.keras.Dense(output_size) takes input to the model and finds the dot product of the inputs and weights, and adds the bias.
#This is where we apply an activation function to this expression.
model = tf.keras.Sequential([
                            tf.keras.layers.Flatten(input_shape=(28,28,1)),
                            tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
                            tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
                            tf.keras.layers.Dense(output_size,activation='softmax')
])

## Select optimizer and loss function

In [61]:
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])#If o/ps are one-hot encoded use categorical cross entropy,
                                                                        #if not sparse categorical cross entropy

                                                                        

In [62]:
EPOCHS = 5
model.fit(train_data,epochs=EPOCHS,validation_data=(validation_inputs,validation_targets),verbose=2)

Epoch 1/5
540/540 - 4s - loss: 0.3295 - accuracy: 0.9053 - val_loss: 0.1635 - val_accuracy: 0.9528 - 4s/epoch - 7ms/step
Epoch 2/5
540/540 - 3s - loss: 0.1350 - accuracy: 0.9597 - val_loss: 0.1187 - val_accuracy: 0.9640 - 3s/epoch - 6ms/step
Epoch 3/5
540/540 - 3s - loss: 0.0959 - accuracy: 0.9711 - val_loss: 0.0962 - val_accuracy: 0.9708 - 3s/epoch - 5ms/step
Epoch 4/5
540/540 - 3s - loss: 0.0758 - accuracy: 0.9772 - val_loss: 0.0711 - val_accuracy: 0.9775 - 3s/epoch - 5ms/step
Epoch 5/5
540/540 - 3s - loss: 0.0585 - accuracy: 0.9815 - val_loss: 0.0602 - val_accuracy: 0.9810 - 3s/epoch - 5ms/step


<keras.src.callbacks.History at 0x17e62273a90>

## Using the code from the lecture as the basis, fiddle with the hyperparameters of the algorithm.

1. The *width* (the hidden layer size) of the algorithm. Try a hidden layer size of 200. How does the validation accuracy of the model change? What about the time it took the algorithm to train? Can you find a hidden layer size that does better?

2. The *depth* of the algorithm. Add another hidden layer to the algorithm. This is an extremely important exercise! How does the validation accuracy change? What about the time it took the algorithm to train? Hint: Be careful with the shapes of the weights and the biases.

3. The *width and depth* of the algorithm. Add as many additional layers as you need to reach 5 hidden layers. Moreover, adjust the width of the algorithm as you find suitable. How does the validation accuracy change? What about the time it took the algorithm to train?

4. Fiddle with the activation functions. Try applying sigmoid transformation to both layers. The sigmoid activation is given by the string 'sigmoid'.

5. Fiddle with the activation functions. Try applying a ReLu to the first hidden layer and tanh to the second one. The tanh activation is given by the string 'tanh'.

6. Adjust the batch size. Try a batch size of 10000. How does the required time change? What about the accuracy?

7. Adjust the batch size. Try a batch size of 1. That's the SGD. How do the time and accuracy change? Is the result coherent with the theory?

8. Adjust the learning rate. Try a value of 0.0001. Does it make a difference?

9. Adjust the learning rate. Try a value of 0.02. Does it make a difference?

10. Combine all the methods above and try to reach a validation accuracy of 98.5+ percent.

## test model

In [63]:
test_loss, test_accuracy = model.evaluate(test_data)



In [64]:
print('Test loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))

Test loss: 0.08. Test accuracy: 97.55%
