In [14]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
#import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline


In [9]:
pickle_file = 'data/notMNIST.pickle'

with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help gc free up memory
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


In [10]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
    dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
    # one-hot encoding, Map the label 0 to [1.0, 0.0, 0.0 ...], 1 to [0.0, 1.0, 0.0 ...]
    labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
    return dataset, labels

train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [11]:
data_size = 2000
train_dataset = train_dataset[0:data_size]
train_labels = train_labels[0:data_size]

In [12]:
num_examples = len(train_dataset) # training set size
nn_input_dim = 784 # input layer dimensionality
nn_output_dim = 10 # output layer dimensionality

# Gradient descent parameters (I picked these by hand)
epsilon = 0.01 # learning rate for gradient descent
reg_lambda = 0.01 # regularization strength

In [46]:
nn_hdims = [100, 50]
np.random.seed(0)
W1 = np.random.randn(nn_input_dim, nn_hdims[0]) / np.sqrt(nn_hdims[0]) # 784x1024
b1 = np.zeros((1, nn_hdims[0])) # 1x1024
W2 = np.random.randn(nn_hdims[0], nn_hdims[1]) / np.sqrt(nn_hdims[1]) # 1024x512
b2 = np.zeros((1, nn_hdims[1])) # 1x512
W3 = np.random.randn(nn_hdims[1], nn_output_dim) / np.sqrt(nn_output_dim) # 512x10
b3 = np.zeros((1, nn_output_dim)) # 1x10

In [35]:
def activate(x):
    return np.tanh(x)
def derivative(x):
    return 1.0 - np.tanh(x)*np.tanh(x)

In [47]:
import warnings
warnings.filterwarnings('ignore')
num_examples = train_dataset.shape[0]
iterations = 20000
# training
for l in range(iterations):
    # forward
    z1 = train_dataset.dot(W1) + b1 # Nx1024
    a1 = np.tanh(z1) # Nx1024
    z2 = a1.dot(W2) + b2 # Nx512
    a2 = np.tanh(z2) # Nx512
    z3 = a2.dot(W3) + b3 # Nx10
    exp_scores = np.exp(z3)
    probs = exp_scores / np.sum(exp_scores, axis = 1, keepdims=True) # Nx10

    # backward
    dz3 = (probs - train_labels)/num_examples # Nx10 => error for 1 example
    dW3 = (a2.T).dot(dz3) # 512x10
    db3 = np.sum(dz3, axis = 0, keepdims=True) # 1x10

    dz2 = (1-np.power(a2, 2)) * dz3.dot(W3.T) # Nx512
    dW2 = (a1.T).dot(dz2) # 1024x512
    db2 = np.sum(dz2, axis=0, keepdims=True) # 1x512

    dz1 = (1-np.power(a1, 2)) * dz2.dot(W2.T) # Nx1024
    dW1 = (train_dataset.T).dot(dz1) # 784x1024
    db1 = np.sum(dz1, axis=0, keepdims=True) # 1x1024

    # update
    W1 -= epsilon * (dW1 + reg_lambda*W1)
    W2 -= epsilon * (dW2 + reg_lambda*W2)
    W3 -= epsilon * (dW3 + reg_lambda*W3)
    b1 -= epsilon * db1
    b2 -= epsilon * db2
    b3 -= epsilon * db3
    
    # learning rate decay
    if (l + 1) % 5000 == 0:
        epsilon = epsilon * 0.5
    
    if l % 1000 == 0:
        # loss
        corect_logprobs = -np.log([probs[i,np.nonzero(train_labels)[(1)][i].astype('int64')] for i in range(num_examples)])
        data_loss = np.sum(corect_logprobs)
        # Add regulatization term to loss (optional)
        data_loss += reg_lambda/2 * (np.sum(np.square(W1)) + np.sum(np.square(W2)) + np.sum(np.square(W3)))
        loss = 1.0 / num_examples * data_loss # reduce to float
        print("%d, loss value: %.4f" % (l, loss))

# testing
z1 = test_dataset.dot(W1) + b1 # Nx1024
a1 = np.tanh(z1) # Nx1024
z2 = a1.dot(W2) + b2 # Nx512
a2 = np.tanh(z2) # Nx512
z3 = a2.dot(W3) + b3 # Nx10
exp_scores = np.exp(z3)
probs = exp_scores / np.sum(exp_scores, axis = 1, keepdims=True) # Nx10
predict = np.argmax(probs, axis=1) # N
true_labels = np.argmax(test_labels, axis=1) # N
accuracy = np.sum(np.equal(predict, true_labels)) / test_dataset.shape[0]
print("prediction accuracy: %.2f %%" % (accuracy*100))

0, loss value: 3.0610
1000, loss value: 0.7320
2000, loss value: 0.5782
3000, loss value: 0.4941
4000, loss value: 0.4364
5000, loss value: 0.3931
6000, loss value: 0.3751
7000, loss value: 0.3590
8000, loss value: 0.3445
9000, loss value: 0.3315
10000, loss value: 0.3197
11000, loss value: 0.3142
12000, loss value: 0.3090
13000, loss value: 0.3040
14000, loss value: 0.2992
15000, loss value: 0.2947
16000, loss value: 0.2925
17000, loss value: 0.2903
18000, loss value: 0.2882
19000, loss value: 0.2862
prediction accuracy: 87.15 %


In [49]:
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range
import os

os.environ['CUDA_VISIBLE_DEVICES']='-1'
print(tf.__version__)

1.12.0


In [95]:
x = tf.placeholder(tf.float32, [None, nn_input_dim])
y_true = tf.placeholder(tf.float32, [None, nn_output_dim])
keep_prob = tf.placeholder(tf.float32)

In [81]:
nn_hdims = [100, 50]
W1 = tf.Variable(tf.truncated_normal(shape=[nn_input_dim, nn_hdims[0]]))
b1 = tf.Variable(tf.zeros(shape=[nn_hdims[0]]))
W2 = tf.Variable(tf.truncated_normal(shape=[nn_hdims[0], nn_hdims[1]]))
b2 = tf.Variable(tf.zeros(shape=[nn_hdims[1]]))
W3 = tf.Variable(tf.truncated_normal(shape=[nn_hdims[1], nn_output_dim]))
b3 = tf.Variable(tf.zeros(shape=[nn_output_dim]))


In [142]:
learning_rate = 1.0
reg_lambda = 1e-3
regularizer = tf.contrib.layers.l2_regularizer(scale=reg_lambda)
#z1 = tf.matmul(x, W1) + b1
#a1 = tf.tanh(z1)
#z2 = tf.matmul(a1, W2) + b2
#a2 = tf.nn.tanh(z2)
z1 = tf.layers.dense(x, nn_hdims[0], activation=tf.nn.tanh, kernel_regularizer=regularizer)
z1 = tf.nn.dropout(z1, keep_prob)
z2 = tf.layers.dense(z1, nn_hdims[1], activation=tf.nn.tanh, kernel_regularizer=regularizer)
z2 = tf.nn.dropout(z2, keep_prob)
logits = tf.layers.dense(z2, nn_output_dim, kernel_regularizer=regularizer)
pred = tf.nn.softmax(logits)
pred_cls = tf.argmax(pred, axis=1)

cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits = logits, labels = y_true)
loss = tf.reduce_mean(cross_entropy)
#optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)
global_step = tf.Variable(0, trainable=False)
starter_learning_rate = 0.5
learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                           5000, 0.96, staircase=True)
# Passing global_step to minimize() will increment it at each step.
optimizer = (
    tf.train.GradientDescentOptimizer(learning_rate)
    .minimize(loss, global_step=global_step)
)

pred_accuracy = tf.equal(pred_cls, tf.argmax(y_true, axis=1))
accuracy = tf.reduce_mean(tf.cast(pred_accuracy, tf.float32))

In [54]:
session = tf.Session()

In [143]:
batch_size = 128
train_size = len(train_dataset)
iterations = 10000
session.run(tf.global_variables_initializer())
for i in range(iterations):
    offset = (i*batch_size) % (train_size - batch_size)
    x_batch = train_dataset[offset : offset + batch_size, :]
    y_batch = train_labels[offset : offset + batch_size, :]
    
    feed_dict_train = {x: x_batch, y_true: y_batch, keep_prob: 1.0}
    _, cost = session.run([optimizer, loss], feed_dict=feed_dict_train)
    
    if (i+1) % 1000 == 0:
        print("%d, loss value: %.4f" % (i+1, cost))

1000, loss value: 0.0050
2000, loss value: 0.0020
3000, loss value: 0.0144
4000, loss value: 0.0132
5000, loss value: 0.0303
6000, loss value: 0.0013
7000, loss value: 0.0061
8000, loss value: 0.0003
9000, loss value: 0.0012
10000, loss value: 0.0002


In [144]:
feed_dict_test = {x: test_dataset, y_true: test_labels, keep_prob: 1.0}
acc = session.run(accuracy, feed_dict=feed_dict_test)
print("prediction accuracy: %.2f %%" % (acc*100))

prediction accuracy: 86.34 %
