In [2]:
# Pre-requisites
import numpy as np
import time

# For plots
%matplotlib inline
import matplotlib.pyplot as plt

# To clear print buffer
from IPython.display import clear_output

# Importing functions

In [144]:
# Initializing weight matrices from layer sizes
def initializeWeights(layers):
    weights = [np.random.randn(o, i+1) for i, o in zip(layers[:-1], layers[1:])]
    return weights

# Add a bias term to every data point in the input
def addBiasTerms(X):
        # Make the input an np.array()
        X = np.array(X)
        
        # Forcing 1D vectors to be 2D matrices of 1xlength dimensions
        if X.ndim==1:
            X = np.reshape(X, (1, len(X)))
        
        # Inserting bias terms
        X = np.insert(X, 0, 1, axis=1)
        
        return X

# Sigmoid function
def sigmoid(a):
    return 1/(1 + np.exp(-a))

# Forward Propagation of outputs
def forwardProp(X, weights):
    # Initializing an empty list of outputs
    outputs = []
    
    # Assigning a name to reuse as inputs
    inputs = X
    
    # For each layer
    for w in weights:
        # Add bias term to input
        inputs = addBiasTerms(inputs)
        
        # Y = Sigmoid ( X .* W^T )
        outputs.append(sigmoid(np.dot(inputs, w.T)))
        
        # Input of next layer is output of this layer
        inputs = outputs[-1]
        
    return outputs

# Compute COST (J) of Neural Network
def nnCost(weights, X, Y):
    # Calculate yPred
    yPred = forwardProp(X, weights)[-1]
    
    # Compute J
    J = 0.5*np.sum((yPred-Y)**2)/len(Y)
    
    return J

# Evaluate the accuracy of weights for input X and desired outptut Y
def evaluate(weights, X, Y):
    yPreds = forwardProp(X, weights)[-1]
    # Check if maximum probability is from that neuron corresponding to desired class,
    # AND check if that maximum probability is greater than 0.5
    yes = sum( int( ( np.argmax(yPreds[i]) == np.argmax(Y[i]) ) and 
                    ( (yPreds[i][np.argmax(yPreds[i])]>0.5) == (Y[i][np.argmax(Y[i])]>0.5) ) )
              for i in range(len(Y)) )
    return yes


# TRAINING USING MINI-BATCH GRADIENT DESCENT
def trainUsingMinibatchGD(weights, X, Y, minibatchSize, nEpochs, learningRate=1.0, 
                          decay=None, decayRate=0.1, optimizer=None, mu=0.9, testX=None, testY=None):
    # If testing data is not provided, check accuracy on training data
    if testX is None:
        testX = X
        testY = Y
    
    # Check cost and accuracy
    # Initialize cost
    prevCost = nnCost(weights, testX, testY)
    yes = evaluate(weights, testX, testY)
    print("Before training: "+str(yes)+" of "+str(len(testY))+" = "+str(round(float(yes/len(testY)),4))+
          "; cost="+str(prevCost))
    
    # Backup weights to revert back in case cost increases
    oldWeights = [np.array(w) for w in weights]
    
    # To count the number of times learning rate had to be halved contiguously
    countLRHalf = 0
    
    # Initialize index for iteration through epochs
    epoch = 0
    
    # For nEpochs number of epochs:
    while epoch < nEpochs:
        # clear output
        #clear_output()
        
        # Make a list of all the indices
        fullIdx = list(range(len(Y)))
        
        # Shuffle the full index
        np.random.shuffle(fullIdx)
        
        # Count number of mini-batches
        nOfMinibatches = int(len(X)/minibatchSize)
        
        # For each mini-batch
        for m in range(nOfMinibatches):
            
            # Compute the starting index of this mini-batch
            startIdx = m*minibatchSize
            
            # Declare sampled inputs and outputs
            xSample = X[fullIdx[startIdx:startIdx+minibatchSize]]
            ySample = Y[fullIdx[startIdx:startIdx+minibatchSize]]

            # Run backprop, with an optimizer
            backProp(weights, xSample, ySample, learningRate, optimizer, mu)
        
        # Check cost and accuracy
        cost = nnCost(weights, testX, testY)
        yes = evaluate(weights, testX, testY)
        print("Epoch "+str(epoch+1)+" of "+str(nEpochs)+" : "+
              str(yes)+" of "+str(len(testY))+" = "+str(round(float(yes/len(testY)),4))+
              "; cost="+str(cost))
        
        # If decay type is 'step', when cost increases, revert back weights and halve learning rate 
        if decay is 'step':
            # If cost does not decrease
            if cost >= prevCost:
                # Revert weights back to those at the start of this epoch
                weights = [np.array(w) for w in oldWeights]
                
                # Recalculate prevCost
                cost = nnCost(weights, testX, testY)
                
                # Halve the learning rate
                learningRate = learningRate/2.0
                
                # Revert iteration number
                epoch -= 1
                
                # Increment the count of halving learning rate by 1
                countLRHalf += 1
                
                print("Halving learning rate to: "+str(learningRate)+", count="+str(countLRHalf))
            # If cost decreases, reset the count to 0
            else:
                countLRHalf = 0
        
        # If decay is 'time'
        if decay is 'time':
            learningRate *= np.exp(-decayRate)
        
        # If learningRate has been halved contiguously for too long, break
        if countLRHalf is 10:
            break
        
        # Set prevCost for next epoch
        prevCost = cost
        
        # Set oldWeights for next epoch
        oldWeights = [np.array(w) for w in weights]
        
        # Increase iteration number for epochs
        epoch += 1
    
    # If training was stopped because accuracy was not increasing
    if epoch < nEpochs:
        print("Training ended prematurely...")
    # If training ended in correct number of epochs
    else:
        print("Training complete.")
    
    # Printing training accuracy
    yes = evaluate(weights, X, Y)
    print("TRAINING ACCURACY, COST : "+str(yes)+" of "+str(len(Y))+
          " = "+str(round(float(yes/len(Y)),4)))
    
    # Printing test accuracy
    if testY is not Y:
        yes = evaluate(weights, testX, testY)
        print("TEST ACCURACY, COST : "+str(yes)+" of "+str(len(testY))+
              " = "+str(round(float(yes/len(testY)),4)))
    

# IMPLEMENTING BACK-PROPAGATION WITH LEARNING RATE, MOMENTUM, NAG, ADAGRAD
def backProp(weights, X, Y, learningRate, optimizer=None, mu=0.9):
    # Forward propagate to find outputs
    outputs = forwardProp(X, weights)
    
    # For the last layer, bpError = error = yPred - Y
    bpError = outputs[-1] - Y
    
    # Initialize velocity in the shape of weights for use with momentum and NAG
    v = [np.zeros(w.shape) for w in weights]
    prevV = [np.zeros(w.shape) for w in weights]
    
    # Initialize cache for use with Adagrad
    cache = [np.zeros(w.shape) for w in weights]
    
    # Back-propagating from the last layer to the first
    for l, w in enumerate(reversed(weights)):
        
        # Find yPred for this layer
        yPred = outputs[-l-1]
        
        # Calculate delta for this layer using bpError from next layer
        delta = np.multiply(np.multiply(bpError, yPred), 1-yPred)
        
        # Find input to the layer, by adding bias to the output of the previous layer
        # Take care, l goes from 0 to 1, while the weights are in reverse order
        if l==len(weights)-1: # If 1st layer has been reached
            xL = addBiasTerms(X)
        else:
            xL = addBiasTerms(outputs[-l-2])
        
        # Calculate the gradient for this layer
        grad = np.dot(delta.T, xL)/len(Y)
        
        # Calculate bpError for previous layer to be back-propagated
        bpError = np.dot(delta, w)
        
        # Ignore bias term in bpError
        bpError = bpError[:,1:]
        
        # CHANGE WEIGHTS of the current layer (W <- W + eta*deltaW)
        if optimizer is None:
            w += -learningRate * grad
        
        # Momentum
        if optimizer is 'momentum':
            v[-l-1] = mu * v[-l-1] - learningRate * grad
            w += v[-l-1]
        
        # Nesterov Momentum
        if optimizer is 'nag':
            prevV[-l-1] = np.array(v[-l-1]) # back this up
            v[-l-1] = mu * v[-l-1] - learningRate * grad # velocity update stays the same
            w += -mu * prevV[-l-1] + (1 + mu) * v[-l-1] # position update changes form
        
        # Adagrad
        if optimizer is 'adagrad':
            cache[-l-1] += grad**2
            w += - learningRate * grad / (np.sqrt(cache[-l-1]) + np.finfo(float).eps)

# Initialize network
layers = [2, 2, 1]
weights = initializeWeights(layers)

print("weights:")
for i in range(len(weights)):
    print(i+1); print(weights[i].shape); print(weights[i])

# Declare input and desired output for AND gate
X = np.array([[0,0], [0,1], [1,0], [1,1]])
Y = np.array([[0], [0], [0], [1]])

# Check current accuracy and cost
print("Cost: "+str(nnCost(weights, X, Y)))
yes = evaluate(weights, X, Y)
print("Accuracy: "+str(yes)+" of "+str(len(Y))+" = "+str(round(float(yes/len(Y)), 4)))
print(forwardProp(X, weights)[-1])

weights:
1
(2, 3)
[[ 0.50989498  0.90468367 -1.71849463]
 [ 0.50226972 -0.60305691  0.92248982]]
2
(1, 3)
[[ 1.74069311  0.36830228 -1.59473413]]
Cost: 0.204595059595
Accuracy: 1 of 4 = 0.25
[[ 0.72657099]
 [ 0.63178799]
 [ 0.7824104 ]
 [ 0.6876973 ]]


# Load MNIST data from npz, and format it for our network

In [5]:
# Load MNIST DATA
# Use numpy.load() to load the .npz file
f = np.load('mnist.npz')
# Saving the files
x_train = f['x_train']
y_train = f['y_train']
x_test = f['x_test']
y_test = f['y_test']
f.close()

# Reshaping x_train and x_test for our network with 784 inputs neurons
x_train = np.reshape(x_train, (len(x_train), 784))
x_test = np.reshape(x_test, (len(x_test), 784))

# Normalize x_train
x_train = x_train / 255.0
x_test = x_test / 255.0

# Make new y_train of nx10 elements
new_y_train = np.zeros((len(y_train), 10))
for i in range(len(y_train)):
    new_y_train[i, y_train[i]] = 1

del y_train
y_train = new_y_train

# Make new y_test of nx10 elements
new_y_test = np.zeros((len(y_test), 10))
for i in range(len(y_test)):
    new_y_test[i, y_test[i]] = 1

del y_test
y_test = new_y_test


# Baseline accuracy

Minimum accuracy that can be obtained

In [12]:
# TRAIN using Mini-batch Gradient Descent

# Initialize network
layers = [784, 30, 10]
weights = initializeWeights(layers)

# Take backup of weights to be used later for comparison
#initialWeights = [np.array(w) for w in weights]

# Set options of mini-batch gradient descent
minibatchSize = 10
nEpochs = 30
learningRate = 3.0
mu = 0.9

# Train
trainUsingMinibatchGD(weights, x_train, y_train, minibatchSize, nEpochs, learningRate,
                      decay='step', optimizer='nag', mu=mu, testX=x_test, testY=y_test)

Before training: 1470 of 10000 = 0.147; cost=1.75485656197
Epoch 1 of 30 : 8836 out of 10000 = 0.8836; cost=0.0703368832207
Epoch 2 of 30 : 9046 out of 10000 = 0.9046; cost=0.0582771715058
Epoch 3 of 30 : 9144 out of 10000 = 0.9144; cost=0.0536424597654
Epoch 4 of 30 : 9211 out of 10000 = 0.9211; cost=0.0493324090936
Epoch 5 of 30 : 9211 out of 10000 = 0.9211; cost=0.0494807448319
Halving learning rate to: 1.5, count=1
Epoch 5 of 30 : 9236 out of 10000 = 0.9236; cost=0.0449246405479
Epoch 6 of 30 : 9290 out of 10000 = 0.929; cost=0.0438768907748
Epoch 7 of 30 : 9313 out of 10000 = 0.9313; cost=0.0439081304674
Halving learning rate to: 0.75, count=1
Epoch 7 of 30 : 9324 out of 10000 = 0.9324; cost=0.042031568918
Epoch 8 of 30 : 9329 out of 10000 = 0.9329; cost=0.0413069393454
Epoch 9 of 30 : 9320 out of 10000 = 0.932; cost=0.0416932310691
Halving learning rate to: 0.375, count=1
Epoch 9 of 30 : 9340 out of 10000 = 0.934; cost=0.0410020648576
Epoch 10 of 30 : 9335 out of 10000 = 0.9335; 

## Save the above weights

In [8]:
np.save("nagWeights_mini10_epochs30_lr3.0_mu0.9", weights)

In [None]:
# 1. Train with 2 layers

In [13]:
# TRAIN using Mini-batch Gradient Descent

# Initialize network
layers = [784, 30, 30, 10]
weights = initializeWeights(layers)

# Set options of mini-batch gradient descent
minibatchSize = 10
nEpochs = 60
learningRate = 3.0
mu = 0.9

# Train
trainUsingMinibatchGD(weights, x_train, y_train, minibatchSize, nEpochs, learningRate,
                      decay=None, optimizer='nag', mu=mu, testX=x_test, testY=y_test)

Before training: 882 of 10000 = 0.0882; cost=1.66119949405
Epoch 1 of 60 : 8978 out of 10000 = 0.8978; cost=0.0660061413096
Epoch 2 of 60 : 8886 out of 10000 = 0.8886; cost=0.0681654748053
Epoch 3 of 60 : 9074 out of 10000 = 0.9074; cost=0.0586687958812
Epoch 4 of 60 : 9086 out of 10000 = 0.9086; cost=0.0591863168985
Epoch 5 of 60 : 9134 out of 10000 = 0.9134; cost=0.0545254522007
Epoch 6 of 60 : 9158 out of 10000 = 0.9158; cost=0.0539935307214
Epoch 7 of 60 : 9248 out of 10000 = 0.9248; cost=0.0481049381248
Epoch 8 of 60 : 9253 out of 10000 = 0.9253; cost=0.0482258093188
Epoch 9 of 60 : 9267 out of 10000 = 0.9267; cost=0.0477349709068
Epoch 10 of 60 : 9307 out of 10000 = 0.9307; cost=0.0462745291633
Epoch 11 of 60 : 9229 out of 10000 = 0.9229; cost=0.0483732879842
Epoch 12 of 60 : 9352 out of 10000 = 0.9352; cost=0.0443519019619
Epoch 13 of 60 : 9340 out of 10000 = 0.934; cost=0.0437223217595
Epoch 14 of 60 : 9353 out of 10000 = 0.9353; cost=0.0431274428996
Epoch 15 of 60 : 9308 out o

In [14]:
# TRAIN using Mini-batch Gradient Descent

# Initialize network
layers = [784, 30, 30, 10]
weights = initializeWeights(layers)

# Set options of mini-batch gradient descent
minibatchSize = 10
nEpochs = 60
learningRate = 1.0
mu = 0.9

# Train
trainUsingMinibatchGD(weights, x_train, y_train, minibatchSize, nEpochs, learningRate,
                      decay=None, optimizer='nag', mu=mu, testX=x_test, testY=y_test)

Before training: 974 of 10000 = 0.0974; cost=1.67281282913
Epoch 1 of 60 : 8745 out of 10000 = 0.8745; cost=0.074443859338
Epoch 2 of 60 : 8922 out of 10000 = 0.8922; cost=0.0611683263573
Epoch 3 of 60 : 9139 out of 10000 = 0.9139; cost=0.0549056154571
Epoch 4 of 60 : 9162 out of 10000 = 0.9162; cost=0.0502130992972
Epoch 5 of 60 : 9190 out of 10000 = 0.919; cost=0.0508331219877
Epoch 6 of 60 : 9263 out of 10000 = 0.9263; cost=0.0459753362356
Epoch 7 of 60 : 9229 out of 10000 = 0.9229; cost=0.048099056032
Epoch 8 of 60 : 9298 out of 10000 = 0.9298; cost=0.0443920915768
Epoch 9 of 60 : 9246 out of 10000 = 0.9246; cost=0.0454406838599
Epoch 10 of 60 : 9334 out of 10000 = 0.9334; cost=0.0437210506537
Epoch 11 of 60 : 9310 out of 10000 = 0.931; cost=0.0427986209207
Epoch 12 of 60 : 9263 out of 10000 = 0.9263; cost=0.0463192980383
Epoch 13 of 60 : 9334 out of 10000 = 0.9334; cost=0.0411114303676
Epoch 14 of 60 : 9355 out of 10000 = 0.9355; cost=0.0425227733942
Epoch 15 of 60 : 9366 out of 1

In [19]:
# TRAIN using Mini-batch Gradient Descent

# Initialize network
layers = [784, 30, 30, 10]
weights = initializeWeights(layers)

# Set options of mini-batch gradient descent
minibatchSize = 10
nEpochs = 100
learningRate = 1.0
decayRate = 0.1
nagMu = 0.9

# Train
trainUsingMinibatchGD(weights, x_train, y_train, minibatchSize, nEpochs, learningRate,
                      decay='time', decayRate=decayRate, optimizer='nag', mu=nagMu,
                      testX=x_test, testY=y_test)

Before training: 931 of 10000 = 0.0931; cost=1.61107296067
Epoch 1 of 100 : 8737 out of 10000 = 0.8737; cost=0.0783480782972
Epoch 2 of 100 : 8873 out of 10000 = 0.8873; cost=0.0645569387094
Epoch 3 of 100 : 9048 out of 10000 = 0.9048; cost=0.056951603865
Epoch 4 of 100 : 9144 out of 10000 = 0.9144; cost=0.0511362917512
Epoch 5 of 100 : 9200 out of 10000 = 0.92; cost=0.0497869089763
Epoch 6 of 100 : 9228 out of 10000 = 0.9228; cost=0.0480547534428
Epoch 7 of 100 : 9237 out of 10000 = 0.9237; cost=0.0477574263471
Epoch 8 of 100 : 9247 out of 10000 = 0.9247; cost=0.0464066937476
Epoch 9 of 100 : 9277 out of 10000 = 0.9277; cost=0.0453933840844
Epoch 10 of 100 : 9290 out of 10000 = 0.929; cost=0.0447885192122
Epoch 11 of 100 : 9309 out of 10000 = 0.9309; cost=0.0444222820193
Epoch 12 of 100 : 9294 out of 10000 = 0.9294; cost=0.0439987882411
Epoch 13 of 100 : 9311 out of 10000 = 0.9311; cost=0.0440451283848
Epoch 14 of 100 : 9326 out of 10000 = 0.9326; cost=0.0436575875899
Epoch 15 of 100 

In [149]:
# TRAIN using Mini-batch Gradient Descent

# Initialize network
layers = [784, 128, 10]
weights = initializeWeights(layers)

# Set options of mini-batch gradient descent
minibatchSize = 10
nEpochs = 100
learningRate = 3.0
nagMu = 0.9

# Train
trainUsingMinibatchGD(weights, x_train, y_train, minibatchSize, nEpochs, learningRate,
                      decay=None, optimizer='nag', mu=nagMu,
                      testX=x_test, testY=y_test)

Before training: 1516 of 10000 = 0.1516; cost=1.91848557009
Epoch 1 of 100 : 4815 of 10000 = 0.4815; cost=0.26115046542
Epoch 2 of 100 : 6253 of 10000 = 0.6253; cost=0.196657200378
Epoch 3 of 100 : 6445 of 10000 = 0.6445; cost=0.185546611304
Epoch 4 of 100 : 6450 of 10000 = 0.645; cost=0.183981785899
Epoch 5 of 100 : 6510 of 10000 = 0.651; cost=0.182547138359
Epoch 6 of 100 : 6594 of 10000 = 0.6594; cost=0.178527580912
Epoch 7 of 100 : 6584 of 10000 = 0.6584; cost=0.175072600092
Epoch 8 of 100 : 6606 of 10000 = 0.6606; cost=0.175843939101
Epoch 9 of 100 : 6659 of 10000 = 0.6659; cost=0.179198957328
Epoch 10 of 100 : 6605 of 10000 = 0.6605; cost=0.175178182316
Epoch 11 of 100 : 6707 of 10000 = 0.6707; cost=0.174422266315
Epoch 12 of 100 : 6641 of 10000 = 0.6641; cost=0.174000157486
Epoch 13 of 100 : 6648 of 10000 = 0.6648; cost=0.174920532667
Epoch 14 of 100 : 6668 of 10000 = 0.6668; cost=0.171294499136
Epoch 15 of 100 : 6597 of 10000 = 0.6597; cost=0.173744196854
Epoch 16 of 100 : 6672

In [None]:
# TRAIN using Mini-batch Gradient Descent

# Initialize network
layers = [784, 128, 10]
weights = initializeWeights(layers)

# Set options of mini-batch gradient descent
minibatchSize = 10
nEpochs = 100
learningRate = 1.0
nagMu = 0.9

# Train
trainUsingMinibatchGD(weights, x_train, y_train, minibatchSize, nEpochs, learningRate,
                      decay=None, optimizer='nag', mu=nagMu,
                      testX=x_test, testY=y_test)

Before training: 1205 of 10000 = 0.1205; cost=2.46936963982
Epoch 1 of 100 : 4170 of 10000 = 0.417; cost=0.293425313212
Epoch 2 of 100 : 4586 of 10000 = 0.4586; cost=0.284537827722
Epoch 3 of 100 : 4472 of 10000 = 0.4472; cost=0.28094716287
Epoch 4 of 100 : 4552 of 10000 = 0.4552; cost=0.276532039097
Epoch 5 of 100 : 4618 of 10000 = 0.4618; cost=0.27519764717
Epoch 6 of 100 : 5506 of 10000 = 0.5506; cost=0.233797128552
Epoch 7 of 100 : 5453 of 10000 = 0.5453; cost=0.231355488045
Epoch 8 of 100 : 5549 of 10000 = 0.5549; cost=0.230258491935
Epoch 9 of 100 : 5528 of 10000 = 0.5528; cost=0.230304760574
Epoch 10 of 100 : 5542 of 10000 = 0.5542; cost=0.226973430262
Epoch 11 of 100 : 5553 of 10000 = 0.5553; cost=0.226042940372
Epoch 12 of 100 : 5618 of 10000 = 0.5618; cost=0.226177398449
Epoch 13 of 100 : 5606 of 10000 = 0.5606; cost=0.225591245157
Epoch 14 of 100 : 5601 of 10000 = 0.5601; cost=0.225035983064
Epoch 15 of 100 : 5620 of 10000 = 0.562; cost=0.225802162087
Epoch 16 of 100 : 5589 

# KAGGLE DATA

## Load MNIST data from Kaggle

In [82]:
# Download "train.csv" and "test.csv" from https://www.kaggle.com/c/digit-recognizer/data

In [62]:
# Load training and test data
kaggleTrain = np.loadtxt(open("train.csv"), delimiter=',', skiprows=1)
kaggleTest = np.loadtxt(open("test.csv"), delimiter=',', skiprows=1)

In [73]:
#np.savez_compressed("kaggleData.npz", x=kaggleTest, y=kaggleTrain)

## Make training and test data into the right format

In [74]:
# Extract training inputs and outputs
kaggleTrainX = kaggleTrain[:, 1:]
kaggleTrainY = kaggleTrain[:, 0]

# Normalize x_train and x_test
kaggleTrainX = kaggleTrainX / 255.0
kaggleTestX = kaggleTest / 255.0

# Make new y_train of nx10 elements
new_y_train = np.zeros((len(kaggleTrainY), 10))
for i in range(len(kaggleTrainY)):
    new_y_train[i, int(kaggleTrainY[i])] = 1
del kaggleTrainY
kaggleTrainY = new_y_train

## Train model using Kaggle training data

In [83]:
# TRAIN using Mini-batch Gradient Descent

# Initialize network
layers = [784, 30, 30, 10]
weights = initializeWeights(layers)

# Set options of mini-batch gradient descent
minibatchSize = 10
nEpochs = 60
learningRate = 1.0
mu = 0.9

# Train
trainUsingMinibatchGD(weights, kaggleTrainX, kaggleTrainY, minibatchSize, nEpochs, learningRate,
                      decay=None, optimizer='nag', mu=mu)

Before training: 4232 of 42000 = 0.1008; cost=1.55663129091
Epoch 1 of 60 : 35443 of 42000 = 0.8439; cost=0.0900705110879
Epoch 2 of 60 : 37303 of 42000 = 0.8882; cost=0.0655559645305
Epoch 3 of 60 : 38012 of 42000 = 0.905; cost=0.0584415033671
Epoch 4 of 60 : 38498 of 42000 = 0.9166; cost=0.0491312638999
Epoch 5 of 60 : 38995 of 42000 = 0.9285; cost=0.0459145322529
Epoch 6 of 60 : 39082 of 42000 = 0.9305; cost=0.0443336530561
Epoch 7 of 60 : 39468 of 42000 = 0.9397; cost=0.0383742213748
Epoch 8 of 60 : 39285 of 42000 = 0.9354; cost=0.0402843166376
Epoch 9 of 60 : 39702 of 42000 = 0.9453; cost=0.0347878377413
Epoch 10 of 60 : 39712 of 42000 = 0.9455; cost=0.0341864112373
Epoch 11 of 60 : 39776 of 42000 = 0.947; cost=0.032362313093
Epoch 12 of 60 : 39960 of 42000 = 0.9514; cost=0.0307108280391
Epoch 13 of 60 : 39993 of 42000 = 0.9522; cost=0.0297291448352
Epoch 14 of 60 : 40258 of 42000 = 0.9585; cost=0.0271756510094
Epoch 15 of 60 : 40271 of 42000 = 0.9588; cost=0.0262856915832
Epoch 1

In [89]:
# Save weights
np.save("kaggleWeights_mini10_60epochs_lr1.0_decayNone_nagMu0.9_accuracy0.9807", weights)

In [90]:
# TRAIN using Mini-batch Gradient Descent

# Initialize network
layers = [784, 30, 30, 10]
weights = initializeWeights(layers)

# Set options of mini-batch gradient descent
minibatchSize = 10
nEpochs = 100
learningRate = 1.0
mu = 0.9

# Train
trainUsingMinibatchGD(weights, kaggleTrainX, kaggleTrainY, minibatchSize, nEpochs, learningRate,
                      decay=None, optimizer='nag', mu=mu)

Before training: 5091 of 42000 = 0.1212; cost=2.0068225263


KeyboardInterrupt: 

## Test model on Kaggle test data

In [116]:
# Make predictions
preds = forwardProp(kaggleTestX, weights)[-1]

# Record the digit numbers
testY = -np.ones((len(preds), 2))
for i in range(len(preds)):
    testY[i][0] = i+1
    testY[i][1] = np.argmax(preds[i])

In [115]:
print(preds[0])
testY[0]

[  4.27486584e-03   2.06595069e-04   9.69224712e-01   5.11547819e-03
   7.43323595e-04   4.39173581e-04   6.71424976e-03   3.33608838e-04
   4.63776709e-03   8.61203901e-03]


array([ 1.,  2.])

In [120]:
# Save predictions in the format dictated by Kaggle
np.savetxt("preds.csv", testY, fmt='%i', delimiter=',', header="ImageId,Label", comments='')

In [121]:
myPreds = np.loadtxt(open("preds.csv"), delimiter=',', skiprows=1)

In [122]:
myPreds.shape

(28000, 2)

In [123]:
myPreds[0]

array([ 1.,  2.])