# Imports

In [27]:
# Fix the random seeds first, before any other work happens in the notebook.
from numpy.random import seed
seed(888)  # seeds NumPy's global random number generator
import tensorflow
tensorflow.random.set_seed(404)  # sets TensorFlow's global random seed

In [28]:
import os
import numpy as np
import tensorflow as tf

# Constants

In [46]:
# Number of target classes and number of training rows held out for validation.
VALIDATION_SIZE = 10000
NR_CLASSES = 10

# CSV files with the features (x) and labels (y) of the train and test sets.
Y_TRAIN_PATH = 'MNIST/digit_ytrain.csv'
X_TRAIN_PATH = 'MNIST/digit_xtrain.csv'
Y_TEST_PATH = 'MNIST/digit_ytest.csv'
X_TEST_PATH = 'MNIST/digit_xtest.csv'

# Get the Data

In [30]:
%%time

# Training labels: parse the comma-separated file into an integer array.
y_train_all = np.loadtxt(
    Y_TRAIN_PATH,
    dtype=int,
    delimiter=',',
)

CPU times: user 178 ms, sys: 2.81 ms, total: 181 ms
Wall time: 181 ms


In [31]:
# Sanity check: the labels should come back as a flat, one-dimensional array.
y_train_all.shape

(60000,)

In [32]:
# Test labels: parse the comma-separated file into an integer array.
y_test = np.loadtxt(
    Y_TEST_PATH,
    dtype=int,
    delimiter=',',
)

In [33]:
%%time 

# Training features: parse the comma-separated file into an integer array.
# This is the slowest load in the notebook (see the timing printed below).
x_train_all = np.loadtxt(
    X_TRAIN_PATH,
    dtype=int,
    delimiter=',',
)

CPU times: user 31.5 s, sys: 828 ms, total: 32.3 s
Wall time: 32.3 s


In [34]:
%%time 

# Test features: parse the comma-separated file into an integer array.
x_test = np.loadtxt(
    X_TEST_PATH,
    dtype=int,
    delimiter=',',
)

CPU times: user 5.12 s, sys: 102 ms, total: 5.22 s
Wall time: 5.22 s


# Explore

In [35]:
# One row per training sample; each row holds 784 values
# (presumably a flattened 28x28 image -- TODO confirm).
x_train_all.shape

(60000, 784)

In [36]:
# Peek at the first training row to see the raw integer values before rescaling.
x_train_all[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   3,  18,  18,  18,
       126, 136, 175,  26, 166, 255, 247, 127,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,  30,  36,  94, 154, 17

In [37]:
# The number of labels should match the number of feature rows.
y_train_all.shape

(60000,)

In [38]:
# The test features should have the same number of columns as the training features.
x_test.shape

(10000, 784)

In [39]:
# Each label is the digit class (category) of a sample; show the first five.
y_train_all[:5]

array([5, 0, 4, 1, 9])

# Data Preprocessing 

In [40]:
# Re-scale the raw integer values by 255 so the features become floats.
# The arrays are loaded as integers and become floats once divided, so the
# dtype check keeps this cell idempotent: running it a second time is a no-op
# instead of silently dividing by 255 again.
if np.issubdtype(x_train_all.dtype, np.integer):
    x_train_all = x_train_all / 255.0
if np.issubdtype(x_test.dtype, np.integer):
    x_test = x_test / 255.0

### Convert target values to one-hot encoding

In [41]:
# Example: turning integer class labels into one-hot rows.
# np.eye(NR_CLASSES) is the identity matrix, so indexing it with an array of
# labels selects, for each label k, the row whose single 1 sits in column k.
# (This needs the labels in their original integer form.)
values = y_train_all[:5]
np.eye(NR_CLASSES)[values]

# Before -> array([5, 0, 4, 1, 9])

array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [42]:
# One-hot encode the training labels. Convert only while the labels are still a
# 1-D array of class indices, so re-running this cell is a no-op instead of
# indexing the identity matrix with the already-encoded 2-D float array
# (which raises IndexError).
if y_train_all.ndim == 1:
    y_train_all = np.eye(NR_CLASSES)[y_train_all]

In [43]:
# After encoding, the labels are 2-D: one row per sample, one column per class.
y_train_all.shape

(60000, 10)

In [45]:
# One-hot encode the test labels. The ndim guard makes the cell safe to re-run:
# labels that are already encoded (2-D) are left untouched rather than being
# used as float indices into the identity matrix, which raises IndexError.
if y_test.ndim == 1:
    y_test = np.eye(NR_CLASSES)[y_test]
y_test.shape

(10000, 10)

### Create validation dataset from training data

**Challenge:** Split the full training dataset into a smaller training dataset and a validation dataset, for both the features and the labels. Create four arrays: `x_val`, `y_val`, `x_train` and `y_train` from `x_train_all` and `y_train_all`. Use a validation size of 10,000 (the `VALIDATION_SIZE` constant).

In [47]:
# Hold out the first VALIDATION_SIZE rows of features and labels for validation.
x_val, y_val = x_train_all[:VALIDATION_SIZE], y_train_all[:VALIDATION_SIZE]

In [48]:
# Every row after the validation rows forms the smaller training set.
x_train, y_train = x_train_all[VALIDATION_SIZE:], y_train_all[VALIDATION_SIZE:]

In [49]:
# Validation features: expect VALIDATION_SIZE rows.
x_val.shape

(10000, 784)

In [50]:
# Training features left over once the validation rows are removed.
x_train.shape

(50000, 784)