# Imports

In [1]:
from numpy.random import seed
seed(888)
import tensorflow
tensorflow.random.set_seed(404)

In [2]:
import os
import numpy as np
import tensorflow as tf

import pandas as pd

# Constants

In [23]:
X_TRAIN_PATH = 'NIST/alphabets_xtrain.csv'
X_TEST_PATH = 'NIST/alphabets_xtest.csv'
Y_TRAIN_PATH = 'NIST/alphabets_ytrain.csv'
Y_TEST_PATH = 'NIST/alphabets_ytest.csv'

NR_CLASSES = 26
VALIDATION_SIZE = 111735 #Same as test data set

# Get the Data

In [4]:
%%time

y_train_all = np.loadtxt(Y_TRAIN_PATH, delimiter=',', dtype=int)

CPU times: user 729 ms, sys: 9.48 ms, total: 739 ms
Wall time: 743 ms


In [5]:
y_train_all.shape

(260715,)

In [6]:
%%time

y_test = np.loadtxt(Y_TEST_PATH, delimiter=',', dtype=int)

CPU times: user 355 ms, sys: 5.41 ms, total: 360 ms
Wall time: 363 ms


In [7]:
%%time

x_train_all = np.loadtxt(X_TRAIN_PATH, delimiter=',', dtype=int)

CPU times: user 2min 26s, sys: 3.76 s, total: 2min 30s
Wall time: 2min 30s


In [8]:
%%time

x_test = np.loadtxt(X_TEST_PATH, delimiter=',', dtype=int)

CPU times: user 1min 1s, sys: 1.56 s, total: 1min 3s
Wall time: 1min 3s


# Explore

In [9]:
x_train_all.shape

(260715, 784)

In [10]:
x_train_all[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  15,  2

### Training Data

In [11]:
y_train_all.shape

(260715,)

In [12]:
y_train_all[:5]

array([18, 18, 12, 17,  6])

In [13]:
x_train_all.shape

(260715, 784)

### Testing Data

In [14]:
x_test.shape

(111735, 784)

In [15]:
y_test.shape

(111735,)

# Data Preprocessing

In [16]:
# Re-scale our features
x_train_all, x_test = x_train_all/255.0, x_test/255.0

#### Convert Target values to one-hot encoding

In [17]:
# Convert Sparse Matrix to full Matrix
values = y_train_all[:5]
np.eye(NR_CLASSES)[values]  #0-25(26 types of labels i.e Alphabets)

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [18]:
# Pulling out 4th value of our training values
values[4]

6

In [19]:
y_train_all[0]

18

In [20]:
y_train_all = np.eye(NR_CLASSES)[y_train_all]

In [21]:
y_train_all.shape

(260715, 26)

In [22]:
y_test = np.eye(NR_CLASSES)[y_test]
y_test.shape

(111735, 26)

### Create validation dataset from training data

**Challenge:** Split the training dataset into a smaller training dataset and a validation dataset for the features and the labels. Create four arrays: `x_val`, `y_val`, `x_train` and `y_train` from `x_train_all` and `y_train_all`. Use the validation size of 111,735.

In [24]:
x_val = x_train_all[:VALIDATION_SIZE]
y_val = y_train_all[:VALIDATION_SIZE]

In [25]:
x_train = x_train_all[VALIDATION_SIZE:]
y_train = y_train_all[VALIDATION_SIZE:]

In [26]:
x_val.shape

(111735, 784)

In [27]:
x_train.shape

(148980, 784)