# MLP with 1 hidden layer, 1024 neurons, no regularization, could achieve 98% accuracy

In [1]:
import os
import sys
import timeit
import numpy as np
import theano
import theano.tensor as T
import lasagne
from theano.tensor.signal import pool
from theano.tensor.nnet import conv2d
from logistic_sgd import load_data
from lasagne.layers import get_output, InputLayer, DenseLayer, Upscale2DLayer, ReshapeLayer
from lasagne.regularization import regularize_network_params, l2, l1
import gzip
import pickle
import time

Using gpu device 0: TITAN X (Pascal) (CNMeM is disabled, cuDNN 5105)


In [2]:
# Mini-batch size shared by the cells that reference `batch_size`.
batch_size = 1000

# Load the pickled MNIST splits (train / validation / test).
# NOTE(security): unpickling can execute arbitrary code; only load a trusted
# copy of this file.
# The `with` block guarantees the file is closed even if unpickling fails.
with gzip.open('/home/rui/Downloads/mnist.pkl.gz', 'rb') as f:
    try:
        # Python 3 path; presumably the file was pickled under Python 2,
        # hence the explicit latin1 decoding of its byte strings.
        train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
    except TypeError:
        # Python 2's pickle.load() does not accept an `encoding` keyword.
        # Rewind first so the retry always starts at the beginning.
        f.seek(0)
        train_set, valid_set, test_set = pickle.load(f)

# Each split is an (inputs, labels) pair; labels are cast to int32 so they
# can be fed to the `T.ivector` target variable used by the training cells.
X_train, y_train = train_set
y_train = np.asarray(y_train, dtype=np.int32)
X_test, y_test = test_set
y_test = np.asarray(y_test, dtype=np.int32)

In [3]:
X_train.shape

(50000, 784)

In [4]:
def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    """Yield successive ``(inputs, targets)`` mini-batches of ``batchsize`` rows.

    Only complete batches are produced: trailing samples that cannot fill a
    whole batch are skipped. When ``shuffle`` is true, the sample order is
    permuted once (with ``np.random.shuffle``) before batching; otherwise
    batches are contiguous slices in the original order.

    ``inputs`` and ``targets`` must have the same length and support both
    slice indexing and indexing with an integer index array.
    """
    n_samples = len(inputs)
    assert n_samples == len(targets)

    order = None
    if shuffle:
        order = np.arange(n_samples)
        np.random.shuffle(order)

    # The last admissible start offset is the one that still leaves a full batch.
    last_start = n_samples - batchsize
    for begin in range(0, last_start + 1, batchsize):
        end = begin + batchsize
        rows = order[begin:end] if shuffle else slice(begin, end)
        yield inputs[rows], targets[rows]


# MLP with fully connected hidden layers (1 + `layers` of them, i.e. 9 in the run below), each with 1024 hidden units.

In [12]:
def build_model(input_var, hidden_neurons=1024, layers=8):
    """Build a fully connected classifier over 784-dimensional inputs.

    The network is: input -> one hidden dense layer -> ``layers`` further
    hidden dense layers -> dropout -> 10-way softmax output. There are
    therefore ``1 + layers`` hidden layers of ``hidden_neurons`` units each.
    Hidden layers use DenseLayer's default nonlinearity and He-normal
    weights with the 'relu' gain; the output layer uses He-normal weights
    with the default gain.

    Parameters:
        input_var: symbolic variable bound to the input layer.
        hidden_neurons: number of units in every hidden layer.
        layers: number of hidden layers added after the first one.

    Returns:
        The softmax output layer (the top of the Lasagne layer stack).
    """
    def hidden_dense(incoming):
        # One hidden layer; a fresh initializer object is created per layer.
        return DenseLayer(
            incoming,
            num_units=hidden_neurons,
            W=lasagne.init.HeNormal(gain='relu'),
        )

    net = InputLayer(shape=(None, 784), input_var=input_var)
    net = hidden_dense(net)
    for _ in range(layers):
        net = hidden_dense(net)

    # Dropout is applied only once, right before the classification layer.
    net = lasagne.layers.DropoutLayer(net)
    return DenseLayer(
        net,
        num_units=10,
        nonlinearity=lasagne.nonlinearities.softmax,
        W=lasagne.init.HeNormal(),
    )

# Train with adagrad algorithm

## final accuracy: 97.77%

In [14]:
# Train the plain MLP with Adagrad, then evaluate it on the test split.
num_epochs = 500
# Symbolic placeholders: a float matrix of inputs and an int32 vector of labels.
input_var = T.matrix('inputs')
target_var = T.ivector('targets')
network = build_model(input_var, 1024, 8)
# Training-time output and its mean categorical cross-entropy.
prediction = lasagne.layers.get_output(network)
loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
# The L2 penalty below is left disabled; the loss is the plain cross-entropy.
#l2_penalty = regularize_network_params(network, l2)
loss = loss.mean()
# Adagrad update rules for every trainable parameter.
params = lasagne.layers.get_all_params(network, trainable=True)
updates = lasagne.updates.adagrad(loss, params, learning_rate=0.01)
# Evaluation-time graph: deterministic=True requests the deterministic
# forward pass (per the Lasagne docs this disables the DropoutLayer).
test_prediction = lasagne.layers.get_output(network, deterministic=True)
test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,target_var)
test_loss = test_loss.mean()
# Accuracy = fraction of samples whose arg-max class equals the label.
test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                    dtype=theano.config.floatX)
# train_fn applies one Adagrad step and returns the batch loss;
# val_fn only evaluates and returns [loss, accuracy].
train_fn = theano.function([input_var, target_var], loss, updates=updates)
val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
for epoch in range(num_epochs):
    # In each epoch, we do a full pass over the training data:
    train_err = 0
    train_batches = 0
    start_time = time.time()
    # NOTE(review): the batch size is hard-coded to 500 here, while the
    # module-level `batch_size` is 1000 — confirm which one is intended.
    for batch in iterate_minibatches(X_train, y_train, 500, shuffle=True):
        inputs, targets = batch
        train_err += train_fn(inputs, targets)
        train_batches += 1
    # Then we print the results for this epoch:
    print("Epoch {} of {} took {:.3f}s".format(
        epoch + 1, num_epochs, time.time() - start_time))
    print("  training loss:\t\t{:.6f}".format(train_err / train_batches))

# Final evaluation: average loss and accuracy over the test mini-batches.
test_err = 0
# NOTE: this rebinds `test_acc` from the symbolic accuracy expression to a
# numeric accumulator; val_fn was compiled above, so it is unaffected.
test_acc = 0
test_batches = 0
for batch in iterate_minibatches(X_test, y_test, 500, shuffle=False):
    inputs, targets = batch
    err, acc = val_fn(inputs, targets)
    test_err += err
    test_acc += acc
    test_batches += 1
# Mean of the per-batch accuracies (every batch has the same size).
final_acc = test_acc / test_batches
print("Final results:")
print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
print("  test accuracy:\t\t{:.2f} %".format(
    final_acc * 100))

Epoch 1 of 500 took 2.511s
  training loss:		8.600414
Epoch 2 of 500 took 2.564s
  training loss:		0.935859
Epoch 3 of 500 took 2.384s
  training loss:		0.228391
Epoch 4 of 500 took 2.377s
  training loss:		0.123860
Epoch 5 of 500 took 2.394s
  training loss:		0.079580
Epoch 6 of 500 took 2.381s
  training loss:		0.056763
Epoch 7 of 500 took 2.383s
  training loss:		0.041604
Epoch 8 of 500 took 2.387s
  training loss:		0.024233
Epoch 9 of 500 took 2.460s
  training loss:		0.015570
Epoch 10 of 500 took 2.405s
  training loss:		0.009412
Epoch 11 of 500 took 2.408s
  training loss:		0.006705
Epoch 12 of 500 took 2.408s
  training loss:		0.003692
Epoch 13 of 500 took 2.401s
  training loss:		0.003016
Epoch 14 of 500 took 2.418s
  training loss:		0.029847
Epoch 15 of 500 took 2.402s
  training loss:		0.011502
Epoch 16 of 500 took 2.529s
  training loss:		0.002190
Epoch 17 of 500 took 2.420s
  training loss:		0.001141
Epoch 18 of 500 took 2.437s
  training loss:		0.000584
Epoch 19 of 500 too

In [15]:
# Report the total number of parameters in `network`.
lasagne.layers.count_params(network)

9210890

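# Add a bottleneck layer (128 units by default; the run below uses 64) before every 1024-unit hidden layer, i.e. between the input layer and hidden1, between hidden1 and hidden2, and so on.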

## final accuracy: 96.74%

In [16]:
def build_NIN_model(input_var, hidden_neurons=1024, bottle_neck=128, layers=8):
    """Build a dense classifier with a narrow layer before every hidden layer.

    The network is: input -> (bottleneck -> hidden) repeated ``1 + layers``
    times -> dropout -> 10-way softmax output. Bottleneck layers have
    ``bottle_neck`` units and hidden layers have ``hidden_neurons`` units.
    Every dense layer, including the output layer, uses He-normal weights
    with the 'relu' gain.

    Parameters:
        input_var: symbolic variable bound to the input layer.
        hidden_neurons: number of units in each wide hidden layer.
        bottle_neck: number of units in each narrow bottleneck layer.
        layers: number of (bottleneck, hidden) pairs added after the first.

    Returns:
        The softmax output layer (the top of the Lasagne layer stack).
    """
    def dense(incoming, width):
        # Dense layer with a fresh He-normal ('relu' gain) initializer.
        return DenseLayer(
            incoming,
            num_units=width,
            W=lasagne.init.HeNormal(gain='relu'),
        )

    def bottleneck_then_hidden(incoming):
        # Squeeze to the bottleneck width, then expand to the hidden width.
        narrow = dense(incoming, bottle_neck)
        return dense(narrow, hidden_neurons)

    net = InputLayer(shape=(None, 784), input_var=input_var)
    net = bottleneck_then_hidden(net)
    for _ in range(layers):
        net = bottleneck_then_hidden(net)

    # Dropout is applied only once, right before the classification layer.
    net = lasagne.layers.DropoutLayer(net)
    return DenseLayer(
        net,
        num_units=10,
        nonlinearity=lasagne.nonlinearities.softmax,
        W=lasagne.init.HeNormal(gain='relu'),
    )

In [17]:
# Train the bottleneck model with Adagrad, then evaluate it on the test split.
num_epochs = 500
# Symbolic placeholders: a float matrix of inputs and an int32 vector of labels.
input_var = T.matrix('inputs')
target_var = T.ivector('targets')
# NOTE: the bottleneck width passed here is 64, not the function default of 128.
network = build_NIN_model(input_var, 1024, 64, 8)
# Training-time output and its mean categorical cross-entropy.
prediction = lasagne.layers.get_output(network)
loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
# The L2 penalty below is left disabled; the loss is the plain cross-entropy.
#l2_penalty = regularize_network_params(network, l2)
loss = loss.mean()
# Adagrad update rules for every trainable parameter.
params = lasagne.layers.get_all_params(network, trainable=True)
updates = lasagne.updates.adagrad(loss, params, learning_rate=0.01)
# Evaluation-time graph: deterministic=True requests the deterministic
# forward pass (per the Lasagne docs this disables the DropoutLayer).
test_prediction = lasagne.layers.get_output(network, deterministic=True)
test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,target_var)
test_loss = test_loss.mean()
# Accuracy = fraction of samples whose arg-max class equals the label.
test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                    dtype=theano.config.floatX)
# train_fn applies one Adagrad step and returns the batch loss;
# val_fn only evaluates and returns [loss, accuracy].
train_fn = theano.function([input_var, target_var], loss, updates=updates)
val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
for epoch in range(num_epochs):
    # In each epoch, we do a full pass over the training data:
    train_err = 0
    train_batches = 0
    start_time = time.time()
    for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True):
        inputs, targets = batch
        train_err += train_fn(inputs, targets)
        train_batches += 1
    # Then we print the results for this epoch:
    print("Epoch {} of {} took {:.3f}s".format(
        epoch + 1, num_epochs, time.time() - start_time))
    print("  training loss:\t\t{:.6f}".format(train_err / train_batches))

# Final evaluation: average loss and accuracy over the test mini-batches.
test_err = 0
# NOTE: this rebinds `test_acc` from the symbolic accuracy expression to a
# numeric accumulator; val_fn was compiled above, so it is unaffected.
test_acc = 0
test_batches = 0
for batch in iterate_minibatches(X_test, y_test, batch_size, shuffle=False):
    inputs, targets = batch
    err, acc = val_fn(inputs, targets)
    test_err += err
    test_acc += acc
    test_batches += 1
# Mean of the per-batch accuracies (every batch has the same size).
final_acc = test_acc / test_batches
print("Final results:")
print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
print("  test accuracy:\t\t{:.2f} %".format(
    final_acc * 100))

Epoch 1 of 500 took 1.120s
  training loss:		20.279807
Epoch 2 of 500 took 1.078s
  training loss:		2.313354
Epoch 3 of 500 took 1.092s
  training loss:		2.301222
Epoch 4 of 500 took 1.065s
  training loss:		2.301220
Epoch 5 of 500 took 1.058s
  training loss:		2.301145
Epoch 6 of 500 took 1.062s
  training loss:		2.301182
Epoch 7 of 500 took 1.050s
  training loss:		2.301149
Epoch 8 of 500 took 1.055s
  training loss:		2.301151
Epoch 9 of 500 took 1.059s
  training loss:		2.301052
Epoch 10 of 500 took 1.085s
  training loss:		2.301137
Epoch 11 of 500 took 1.058s
  training loss:		2.301203
Epoch 12 of 500 took 1.050s
  training loss:		2.301084
Epoch 13 of 500 took 1.049s
  training loss:		2.301105
Epoch 14 of 500 took 1.061s
  training loss:		2.301051
Epoch 15 of 500 took 1.090s
  training loss:		2.301141
Epoch 16 of 500 took 1.116s
  training loss:		2.301084
Epoch 17 of 500 took 1.085s
  training loss:		2.301075
Epoch 18 of 500 took 1.107s
  training loss:		2.301039
Epoch 19 of 500 to