In [3]:
import json
import random
import sys
import numpy as np
import gzip
import numpy as np
import cPickle


In [4]:
class costQuadratic(object):
    @staticmethod
    #y is the real value, a is the estimate value from neural network
    def realCost(a,y):
        return 0.5*np.linalg.norm(a-y)**2
    @staticmethod
    def delta(z,a,y):
        #z is the putput from the previous layer
        return (a-y)*dif_sigmoid(z)


In [5]:
def sigmoid(x):
    return 1.0/(1.0+np.exp(-x))

In [6]:
def diff_sigmoid(x):
    return sigmoid(x)*(1-sigmoid(x))

In [7]:
class costCrossEntropy(object):
    @staticmethod
    def realCost(a,y):
        return np.sum(np.nan_to_sum(-y*np.log(a)-(1-y)*np.log(1-a)))
    @staticmethod
    def delta(z,a,y):
        return (a-y)

In [8]:
def load_data():
    f=gzip.open('../mnist.pkl.gz', 'rb')
    training_data,validation_data,test_data= cPickle.load(f)
    f.close()
    return (training_data,validation_data, test_data)

In [9]:
#make the result into a evector
def vectorized_result(j):
    e = np.zeros((10, 1))
    e[j] = 1.0
    return e

In [10]:
def load_data_wrapper():
    tr_d, va_d, te_d = load_data()
    training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]
    training_results = [vectorized_result(y) for y in tr_d[1]]
    training_data = zip(training_inputs, training_results)
    validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]]
    validation_data = zip(validation_inputs, va_d[1])
    test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]]
    test_data = zip(test_inputs, te_d[1])
    return (training_data, validation_data, test_data) 
    

In [11]:
training_data, validation_data,test_data=load_data_wrapper()

In [12]:
import numpy as np

In [13]:
class Network(object):
    def __init__(self,sizes,cost=costCrossEntropy):
        #sizes is something like [784,20,10], so the first 
        #layer has 784 nodes, second layer has 20 layers
        #and output is 10 layers
        self.num_layers=len(sizes)
        self.sizes=sizes
        self.default_weight_initializer()
        #the above function will initialize the weight differently
        self.cost=cost
    def default_weight_initializer(self):
        self.biases=[np.random.randn(y,1) for y in self.sizes[1:]]
        
        
        self.weights = [np.random.randn(y, x)/np.sqrt(x)
                        for x, y in zip(self.sizes[:-1], self.sizes[1:])]
    def large_weight_initializer(self):
        self.biases=[np.random.randn(y,1) for y in self.sizes[1:]]
        self.weights=[np.random.randn(y, x)
                        for x, y in zip(self.sizes[:-1], self.sizes[1:])]
    def feedforward(self,a):
        #a is from the input layer,
        for b,w in zip(self.biases,self.weights):
            a=sigmoid(np.dot(w,a)+b)
        #after this loop,a is the output layer
        return a
    def SGD(self, training_data,epoches,mini_batch_size,ita,lamda=0.0,
           evaluation_data=None,monitor_evaluation_cost=False,
           monitor_evaluation_accuracy=False,monitor_training_cost=False,
           monitor_training_accuracy=False):
        #mini-batch size is the mini size for stochatic gradient descent
        if evaluation_data:n_data=len(evaluation_data)
        else:n_data=0
        n=len(training_data)
        training_cost, training_accuracy=[],[]
        evaluation_cost,evaluation_accuracy=[],[]
        for k in xrange(epoches):
            random.shuffle(training_data)
            mini_batch_array=[training_data[i:i+mini_batch_size]
                             for i in xrange(0,n,mini_batch_size)]
            for mini_batch in mini_batch_array:
                self.update_mini_parameters(mini_batch,ita,lamda,n_data)
            print "Epoch %s training finished" %k
            if monitor_training_cost:
                cost=self.total_cost(training_data,lamda)
                training_cost.append(cost)
            if monitor_training_accuracy:
                accuracy=self.accuracy(training_data,convert=True)
                training_accuracy.append(accuracy)
            if monitor_evaluation_cost:
                cost=self.total_cost(evalucation_data,lamda,convert=True)
                evaluation_cost.append(cost)
            if monitor_evaluation_accuracy:
                accuracy=self.accuracy(evaluation_data)
                evaluation_accuracy.append(accuracy)
                print "Accuracy on evaluation data: {} / {}".format(
                    self.accuracy(evaluation_data), n_data)
            print
        return evaluation_cost,evaluation_accuracy,training_cost,training_accuracy
    def update_mini_parameters(self,mini_batch,ita,lamda,wholeRecordCount):
        pre_update_b=[np.zeros(b.shape) for b in self.biases]
        #this is the way to create the same shape of matrix of biases and weight
        pre_update_w=[np.zeros(w.shape) for w in self.weights]
        
        for x,y in mini_batch:
            delta_pre_update_b,delta_pre_update_w=self.backProp(x,y)
            pre_update_b=[b+db for b,db in zip(pre_update_b,delta_pre_update_b)]
            pre_update_w=[w+dw for w,dw in zip(pre_update_w,delta_pre_update_w)]
        self.weights=[(1-ita*(lamda/wholeRecordCount))*w-(ita/len(mini_batch))*deltaW 
                       for w,deltaW in zip(self.weights,pre_update_w)]
        #here divide by len(min_batch) is to average the delta over the minibactch
        #then we do update weights and biases later
        #we don't do regularization for b, according to the reason in the book
        self.biases=[b-(ita/len(mini_batch))*deltaB 
                    for b, deltaB in zip(self.biases,pre_update_b)]
    
    def backProp(self,x,y):
        #return the gradient of weights and biases for cost function
        #this is just for one record, so in order to do derivative
        #on the average cost function then get the gradient, is
        #equivalent to do the gradient calculation one by one and then take the
        #average between the gradient,which will be done in update_mini_parameters
        pre_update_b=[np.zeros(b.shape) for b in self.biases]
        pre_update_w=[np.zeros(w.shape) for w in self.weights]
        #feedforward now
        activation=x
        activations=[x] 
        #this is a matrix to store all the act
        #ivations, layer by layer
        sumActions=[] 
        #matrix to store all the sum whcih is before we take the sigmoid function
        for b,w in zip(self.biases,self.weights):
            sumAction=np.dot(w,activation)+b
            sumActions.append(sumAction)
            activation=sigmoid(sumAction)
            activations.append(activation)
        #backward
        #the following structure set up in order for us to use 
        #different cost function,say quadratic cost function delta need to 
        #multiply the sigmoid_prime and crossEntropy don't need to do that
        delta_depend_costFunction=(self.cost).delta(sumActions[-1],activations[-1],y)
        pre_update_b[-1]=delta_depend_costFunction
        pre_update_w[-1]=np.dot(delta_depend_costFunction,activations[-2].transpose())
        #testing the feature of np.dot like this, so say I have 784,5,10 sizes
        #layerso now delta is 10*1 and activations[-2].transpose is 1*5 so we 
        #get a 10*5 matrix after the dot and delta for each node in the hidden layer will
        #be on the column of this matrix
        #the above is about the last layer so it is the initialization of delta
        #now we need to calculate delta for other layers
        for l in xrange(2,self.num_layers):
            sumAction=sumActions[-l]
            sp=diff_sigmoid(sumAction)
            #here weights.transpose shape is (5,10) and delta is (10,5),sp is (5,1),so we will get a (5,1)
            #this is the delta for the hidden layer example
            delta_depend_costFunction=np.dot(self.weights[-l+1].transpose(),delta_depend_costFunction)*sp
            pre_update_b[-l]=delta_depend_costFunction
            #delta is (5,1) and activations[-l-1] is (10,1), so we need a transpose,so delta_weight is (5,10),
            #the same as our weight matrix setup.(which the weight set up is the inverse of the formula set up)
            pre_update_w[-l]=np.dot(delta_depend_costFunction,activations[-l-1].transpose())
            #so the idea is the sigmoid_prime is from the current layer, x(i) is from the previous layer (-l-1) and
            #the w(jk) is from the later layer, look at the formula from Dr.Amy
        #pre_update_b has the same shape with biases
        #pre_update_w has the same shape with weights
        return (pre_update_b,pre_update_w)
       
    def accuracy(self,data,convert=False):
        #return the number of record the network predict correct
        #convert is False means this accuracy is for test or validation data
        #since the usual case is to see the accuracy on test data
        #teh set up for traning data is different from the set of testing data
        #since we need to do matrix multiplication on the output vector 
        #in the training data

        if convert:
            results=[(np.argmax(self.feedforward(x)),np.argmax(y)) for (x,y) in data]
        else:
            results=[(np.argmax(self.feedforward(x)),y) for (x,y) in data]
        
        return sum(int(x==y) for (x,y) in results)
    def total_cost(self,data,lamda,convert=False):
        #the default total cost is on the validation data
        #and this is the cost oevr all the data,not the stochastic gradient one
        cost=0.0
        for x,y in data:
            estimated=self.feedforward(x)
            if convert: y=vectorized_result(y)
            cost=cost+self.cost.realCost/len(data)
        #the following is to add in the regularization part
        cost=cost+0.5*(lamda/len(data))*sum(np.linalg.norm(w)**2 for w in self.weights)
        #so the above sum part is a good way to sum up all the element in a matrix
        return cost
    
    #save the training result to a file
    def save(self,filename):
        #put the data in the dictionary
        data={
            "sizes":self.sizes,
            "weights":[w.tolist() for w in self.weights],
            "biases":[b.tolist() for b in self.biases],
            "cost":str(self.cost.__name__)
            #above is kind of reflection to get the costFunctionName
        }
        f=open(filename,"w")
        json.dump(data,f)
        f.close()
    #the following is load in an already trained network
    def load(filename):
        #return an instance of the network
        f=open(filename,"r")
        data=json.load(f)
        f.close()
        cost=getattr(sys.modules[__name__],data["cost"])
        #load in the cost function for the network
        net=Network(data["sizes"],cost=cost)
        net.weights=[np.array(w) for w in data["weights"]]
        net.biases=[np.array(b) for b in data["biases"]]
        return net
    
    

In [14]:
data_test=load_data()

In [32]:
x=np.reshape(data_test[0][0][0],(784,1))

In [142]:
y=data_test[0][1][0]

In [163]:
net = Network([784, 5, 10], cost=costCrossEntropy)

In [180]:
net.weights[0].shape

(5, 784)

In [164]:
test=net.backProp(x,y)

In [169]:
test[0]

array([[-4.23556702],
       [-4.25745572],
       [-4.0979444 ],
       [-4.26282567],
       [-4.75326355],
       [-4.46624455],
       [-4.69541625],
       [-4.17206748],
       [-4.86077479],
       [-4.60826392]])

In [170]:
test[1]

array([[ 0.27473737],
       [ 0.57892272],
       [ 0.59674144],
       [ 0.71654454],
       [ 0.29883683]])

In [171]:
testdot=np.dot(test[0],test[1].transpose())

In [172]:
testdot

array([[-1.16366853, -2.45206598, -2.52753838, -3.03497241, -1.26574341],
       [-1.16968218, -2.46473784, -2.54060027, -3.05065663, -1.27228456],
       [-1.12585846, -2.37239312, -2.44541326, -2.93635967, -1.2246167 ],
       [-1.1711575 , -2.46784663, -2.54380474, -3.05450445, -1.2738893 ],
       [-1.30589912, -2.75177226, -2.83646935, -3.40592503, -1.4204502 ],
       [-1.22704427, -2.58561044, -2.66519322, -3.20026313, -1.33467835],
       [-1.2900063 , -2.71828314, -2.80194947, -3.36447486, -1.40316329],
       [-1.14622284, -2.41530465, -2.48964557, -2.98947216, -1.24676741],
       [-1.33543647, -2.81401296, -2.90062576, -3.48296162, -1.45257852],
       [-1.2660623 , -2.66782868, -2.74994206, -3.30202634, -1.37711897]])

In [175]:
testdot=np.dot(test[1],test[0].transpose())

In [176]:
testdot

array([[-1.16366853, -1.16968218, -1.12585846, -1.1711575 , -1.30589912,
        -1.22704427, -1.2900063 , -1.14622284, -1.33543647, -1.2660623 ],
       [-2.45206598, -2.46473784, -2.37239312, -2.46784663, -2.75177226,
        -2.58561044, -2.71828314, -2.41530465, -2.81401296, -2.66782868],
       [-2.52753838, -2.54060027, -2.44541326, -2.54380474, -2.83646935,
        -2.66519322, -2.80194947, -2.48964557, -2.90062576, -2.74994206],
       [-3.03497241, -3.05065663, -2.93635967, -3.05450445, -3.40592503,
        -3.20026313, -3.36447486, -2.98947216, -3.48296162, -3.30202634],
       [-1.26574341, -1.27228456, -1.2246167 , -1.2738893 , -1.4204502 ,
        -1.33467835, -1.40316329, -1.24676741, -1.45257852, -1.37711897]])

In [14]:
net = Network([784, 10, 10], cost=costCrossEntropy)
net.large_weight_initializer()
oneHidden10=net.SGD(training_data, 30, 10, 0.1, lamda = 5.0,
        evaluation_data=validation_data, 
        monitor_evaluation_accuracy=True)

Epoch 0 training finished
Accuracy on evaluation data: 8420 / 10000

Epoch 1 training finished
Accuracy on evaluation data: 8836 / 10000

Epoch 2 training finished
Accuracy on evaluation data: 8980 / 10000

Epoch 3 training finished
Accuracy on evaluation data: 9067 / 10000

Epoch 4 training finished
Accuracy on evaluation data: 9111 / 10000

Epoch 5 training finished
Accuracy on evaluation data: 9119 / 10000

Epoch 6 training finished
Accuracy on evaluation data: 9197 / 10000

Epoch 7 training finished
Accuracy on evaluation data: 9217 / 10000

Epoch 8 training finished
Accuracy on evaluation data: 9242 / 10000

Epoch 9 training finished
Accuracy on evaluation data: 9209 / 10000

Epoch 10 training finished
Accuracy on evaluation data: 9252 / 10000

Epoch 11 training finished
Accuracy on evaluation data: 9260 / 10000

Epoch 12 training finished
Accuracy on evaluation data: 9248 / 10000

Epoch 13 training finished
Accuracy on evaluation data: 9229 / 10000

Epoch 14 training finished
Acc

In [15]:
net = Network([784, 100, 10], cost=costCrossEntropy)
net.large_weight_initializer()
oneHidden100=net.SGD(training_data, 30, 10, 0.1, lamda = 5.0,
        evaluation_data=validation_data, 
        monitor_evaluation_accuracy=True)

Epoch 0 training finished
Accuracy on evaluation data: 9087 / 10000

Epoch 1 training finished
Accuracy on evaluation data: 9292 / 10000

Epoch 2 training finished
Accuracy on evaluation data: 9458 / 10000

Epoch 3 training finished
Accuracy on evaluation data: 9533 / 10000

Epoch 4 training finished
Accuracy on evaluation data: 9569 / 10000

Epoch 5 training finished
Accuracy on evaluation data: 9587 / 10000

Epoch 6 training finished
Accuracy on evaluation data: 9608 / 10000

Epoch 7 training finished
Accuracy on evaluation data: 9603 / 10000

Epoch 8 training finished
Accuracy on evaluation data: 9623 / 10000

Epoch 9 training finished
Accuracy on evaluation data: 9640 / 10000

Epoch 10 training finished
Accuracy on evaluation data: 9641 / 10000

Epoch 11 training finished
Accuracy on evaluation data: 9677 / 10000

Epoch 12 training finished
Accuracy on evaluation data: 9673 / 10000

Epoch 13 training finished
Accuracy on evaluation data: 9665 / 10000

Epoch 14 training finished
Acc

In [16]:
net = Network([784, 200, 10], cost=costCrossEntropy)
net.large_weight_initializer()
oneHidden200=net.SGD(training_data, 30, 10, 0.1, lamda = 5.0,
        evaluation_data=validation_data, 
        monitor_evaluation_accuracy=True)

Epoch 0 training finished
Accuracy on evaluation data: 9110 / 10000

Epoch 1 training finished
Accuracy on evaluation data: 9363 / 10000

Epoch 2 training finished
Accuracy on evaluation data: 9500 / 10000

Epoch 3 training finished
Accuracy on evaluation data: 9554 / 10000

Epoch 4 training finished
Accuracy on evaluation data: 9570 / 10000

Epoch 5 training finished
Accuracy on evaluation data: 9610 / 10000

Epoch 6 training finished
Accuracy on evaluation data: 9605 / 10000

Epoch 7 training finished
Accuracy on evaluation data: 9654 / 10000

Epoch 8 training finished
Accuracy on evaluation data: 9637 / 10000

Epoch 9 training finished
Accuracy on evaluation data: 9661 / 10000

Epoch 10 training finished
Accuracy on evaluation data: 9667 / 10000

Epoch 11 training finished
Accuracy on evaluation data: 9649 / 10000

Epoch 12 training finished
Accuracy on evaluation data: 9674 / 10000

Epoch 13 training finished
Accuracy on evaluation data: 9681 / 10000

Epoch 14 training finished
Acc

In [17]:
net = Network([784, 20,20, 10], cost=costCrossEntropy)
net.large_weight_initializer()
twoHidden2020=net.SGD(training_data, 30, 10, 0.1, lamda = 5.0,
        evaluation_data=validation_data, 
        monitor_evaluation_accuracy=True)

Epoch 0 training finished
Accuracy on evaluation data: 8621 / 10000

Epoch 1 training finished
Accuracy on evaluation data: 9090 / 10000

Epoch 2 training finished
Accuracy on evaluation data: 9218 / 10000

Epoch 3 training finished
Accuracy on evaluation data: 9343 / 10000

Epoch 4 training finished
Accuracy on evaluation data: 9374 / 10000

Epoch 5 training finished
Accuracy on evaluation data: 9402 / 10000

Epoch 6 training finished
Accuracy on evaluation data: 9437 / 10000

Epoch 7 training finished
Accuracy on evaluation data: 9442 / 10000

Epoch 8 training finished
Accuracy on evaluation data: 9490 / 10000

Epoch 9 training finished
Accuracy on evaluation data: 9497 / 10000

Epoch 10 training finished
Accuracy on evaluation data: 9535 / 10000

Epoch 11 training finished
Accuracy on evaluation data: 9491 / 10000

Epoch 12 training finished
Accuracy on evaluation data: 9504 / 10000

Epoch 13 training finished
Accuracy on evaluation data: 9489 / 10000

Epoch 14 training finished
Acc

In [18]:
net = Network([784, 50,20, 10], cost=costCrossEntropy)
net.large_weight_initializer()
twoHidden5020=net.SGD(training_data, 30, 10, 0.1, lamda = 5.0,
        evaluation_data=validation_data, 
        monitor_evaluation_accuracy=True)

Epoch 0 training finished
Accuracy on evaluation data: 8792 / 10000

Epoch 1 training finished
Accuracy on evaluation data: 9121 / 10000

Epoch 2 training finished
Accuracy on evaluation data: 9363 / 10000

Epoch 3 training finished
Accuracy on evaluation data: 9426 / 10000

Epoch 4 training finished
Accuracy on evaluation data: 9493 / 10000

Epoch 5 training finished
Accuracy on evaluation data: 9554 / 10000

Epoch 6 training finished
Accuracy on evaluation data: 9571 / 10000

Epoch 7 training finished
Accuracy on evaluation data: 9632 / 10000

Epoch 8 training finished
Accuracy on evaluation data: 9619 / 10000

Epoch 9 training finished
Accuracy on evaluation data: 9586 / 10000

Epoch 10 training finished
Accuracy on evaluation data: 9650 / 10000

Epoch 11 training finished
Accuracy on evaluation data: 9664 / 10000

Epoch 12 training finished
Accuracy on evaluation data: 9649 / 10000

Epoch 13 training finished
Accuracy on evaluation data: 9684 / 10000

Epoch 14 training finished
Acc

In [19]:
#no regularization case:
net = Network([784, 50,20, 10], cost=costCrossEntropy)
net.large_weight_initializer()
twoHidden5020_noReg=net.SGD(training_data, 30, 10, 0.1, lamda = 0.0,
        evaluation_data=validation_data, 
        monitor_evaluation_accuracy=True)

Epoch 0 training finished
Accuracy on evaluation data: 8515 / 10000

Epoch 1 training finished
Accuracy on evaluation data: 8887 / 10000

Epoch 2 training finished
Accuracy on evaluation data: 9089 / 10000

Epoch 3 training finished
Accuracy on evaluation data: 9171 / 10000

Epoch 4 training finished
Accuracy on evaluation data: 9229 / 10000

Epoch 5 training finished
Accuracy on evaluation data: 9263 / 10000

Epoch 6 training finished
Accuracy on evaluation data: 9310 / 10000

Epoch 7 training finished
Accuracy on evaluation data: 9332 / 10000

Epoch 8 training finished
Accuracy on evaluation data: 9357 / 10000

Epoch 9 training finished
Accuracy on evaluation data: 9394 / 10000

Epoch 10 training finished
Accuracy on evaluation data: 9392 / 10000

Epoch 11 training finished
Accuracy on evaluation data: 9383 / 10000

Epoch 12 training finished
Accuracy on evaluation data: 9416 / 10000

Epoch 13 training finished
Accuracy on evaluation data: 9434 / 10000

Epoch 14 training finished
Acc

In [14]:
print net.weights

[array([[  2.72626027e-04,   4.16905976e-06,  -1.02379946e-03, ...,
         -2.69032803e-04,   1.21669158e-04,   1.36562026e-03],
       [  1.15387630e-04,  -2.95285917e-05,  -1.44881540e-05, ...,
          1.15017348e-04,  -1.79013199e-04,  -8.09636463e-04],
       [ -3.09749864e-04,  -2.17425376e-05,  -9.27882443e-04, ...,
         -8.66207998e-05,  -5.97753853e-04,  -2.20254944e-04],
       ..., 
       [ -1.38442536e-04,   1.14865962e-03,  -1.76211855e-04, ...,
          5.02335012e-04,  -5.99548268e-04,  -3.13493347e-04],
       [ -7.86918172e-04,  -4.39724781e-04,   5.72829654e-04, ...,
         -1.66786869e-03,   3.63998348e-04,  -2.27458083e-06],
       [  2.36763553e-04,  -4.93328791e-04,  -3.40238454e-05, ...,
          3.88059737e-04,  -6.02968839e-04,  -8.03264401e-04]]), array([[-2.97531407, -1.65439102,  1.17551978, -2.16417665, -0.36744316,
         4.16211897, -2.43242933,  2.11827459, -1.5647029 , -3.72870387],
       [ 2.42087534, -1.35000517, -2.10363627, -0.3902552

In [15]:
print net.biases

[array([[-0.32876478],
       [ 0.36094367],
       [ 0.06721656],
       [-0.48018359],
       [ 0.65424671],
       [ 0.95461411],
       [-0.04264859],
       [ 0.95929357],
       [ 1.0122879 ],
       [ 0.1860195 ]]), array([[-3.68648201],
       [-5.35651112],
       [-4.71432446],
       [-4.11723353],
       [-8.30516597],
       [-6.25661023],
       [-2.76937582],
       [-9.36402472],
       [-4.67591271],
       [ 0.79349309]])]


In [28]:
from sklearn import svm
def svm_baseline():
    training_data, validation_data, test_data = load_data()
# train
    clf = svm.SVC()
    clf.fit(training_data[0], training_data[1])
# test
    predictions = [int(a) for a in clf.predict(test_data[0])]
    num_correct = sum(int(a == y) for a, y in zip(predictions, test_data[1]))
    print "Baseline classifier using an SVM."
    print "%s of %s values correct." % (num_correct, len(test_data[1]))

In [260]:
training_data, validation_data, test_data = load_data()

In [16]:
training_data[0].shape

(50000, 784)

(784,)

In [22]:
import matplotlib.pyplot as plt

In [21]:
def giveMeThePercentage(arrayToChange):
    return [round(x/10000.0,3) for x in arrayToChange]

In [31]:
oneHidden10Per=giveMeThePercentage(oneHidden10[1])
oneHidden100Per=giveMeThePercentage(oneHidden100[1])
oneHidden200Per=giveMeThePercentage(oneHidden200[1])
twoHidden2020Per=giveMeThePercentage(twoHidden2020[1])
twoHidden5020Per=giveMeThePercentage(twoHidden5020[1])
twoHidden5020Per_noReg=giveMeThePercentage(twoHidden5020_noReg[1])

In [36]:
plt.plot(oneHidden10Per,'r',label="1 hidden 10")
plt.plot(oneHidden100Per,'g',label="1 hidden 100")
plt.plot(oneHidden200Per,'b',label="1 hidden 200")
plt.plot(twoHidden2020Per,'y',label="2 hidden 20 20")
plt.plot(twoHidden5020Per,'k',marker="o",label="2 hidden 50 20")
plt.plot(twoHidden5020Per_noReg,'k',marker="*",label="2 hidden 50 20 no reg")
plt.legend(loc='best')
plt.show()

In [171]:
from sklearn import svm
def svm_baseline():
    training_data, validation_data, test_data = load_data()
# train
    clf = svm.SVC()
    clf.fit(training_data[0], training_data[1])
# test
    predictions = [int(a) for a in clf.predict(test_data[0])]
    num_correct = sum(int(a == y) for a, y in zip(predictions, test_data[1]))
    print "Baseline classifier using an SVM."
    print "%s of %s values correct." % (num_correct, len(test_data[1]))

In [172]:
%%time 
svm_baseline()

Baseline classifier using an SVM.
9435 of 10000 values correct.
CPU times: user 10min, sys: 1.72 s, total: 10min 2s
Wall time: 10min 2s


In [191]:
net = Network([784, 100, 10], cost=costCrossEntropy)
net.large_weight_initializer()
oneHidden10=net.SGD(training_data, 30, 10, 0.1, lamda = 5.0,
        evaluation_data=validation_data, 
        monitor_evaluation_accuracy=True)

Epoch 0 training finished
Accuracy on evaluation data: 9065 / 10000

Epoch 1 training finished
Accuracy on evaluation data: 9325 / 10000

Epoch 2 training finished
Accuracy on evaluation data: 9425 / 10000

Epoch 3 training finished
Accuracy on evaluation data: 9517 / 10000

Epoch 4 training finished
Accuracy on evaluation data: 9580 / 10000

Epoch 5 training finished
Accuracy on evaluation data: 9598 / 10000

Epoch 6 training finished
Accuracy on evaluation data: 9650 / 10000

Epoch 7 training finished
Accuracy on evaluation data: 9641 / 10000

Epoch 8 training finished
Accuracy on evaluation data: 9629 / 10000

Epoch 9 training finished
Accuracy on evaluation data: 9667 / 10000

Epoch 10 training finished
Accuracy on evaluation data: 9650 / 10000

Epoch 11 training finished
Accuracy on evaluation data: 9655 / 10000

Epoch 12 training finished
Accuracy on evaluation data: 9674 / 10000

Epoch 13 training finished
Accuracy on evaluation data: 9670 / 10000

Epoch 14 training finished
Acc

In [285]:
weights=net.weights
biases=net.biases
training_data, validation_data, test_data = load_data()

In [277]:
newWeights=net1.weights
newBiases=net1.biases


In [18]:
def getSigmoidForArray(array):
    return [sigmoid(x) for x in array]

In [222]:
newTest=[getSigmoidForArray(x) for x in newTrainingData]

In [252]:
firstLayerResult=[sigmoid(x)for x in ToGetTheLastHiddenLayerValueBeforeGoToSigmoidFunctionOneHidden(100,weights,biases)]

In [19]:
def ToGetTheLastHiddenLayerValueBeforeGoToSigmoidFunctionOneHidden(n_neurons,weights,biases):
    newTrainingData=[]
    for j in xrange(0,50000):
        tempArray=np.zeros((n_neurons,))
        for i in xrange(0,n_neurons):
            tempArray[i]=np.dot(training_data[0][j].transpose(),weights[0][i])+biases[0][i]
        newTrainingData.append(tempArray)
    return newTrainingData


In [20]:
def ToGetTheLastHiddenLayerValueBeforeGoToSigmoidFunctionTestDataOneHidden(n_neurons,weights,biases):
    newTrainingData=[]
    for j in xrange(0,10000):
        tempArray=np.zeros((n_neurons,))
        for i in xrange(0,n_neurons):
            tempArray[i]=np.dot(test_data[0][j].transpose(),weights[0][i])+biases[0][i]
        newTrainingData.append(tempArray)
    return newTrainingData


In [21]:
def ToGetTheLastHiddenLayerValueBeforeGoToSigmoidFunctionTwoHidden(n_neurons1,n_neurons2,weights,biases):
    firstLayerResult=[sigmoid(x) for x in ToGetTheLastHiddenLayerValueBeforeGoToSigmoidFunctionOneHidden(n_neurons1,weights,biases)]
    newTrainingData=[]
    for j in xrange(0,50000):
        tempArray=np.zeros((n_neurons2,))
        for i in xrange(0,n_neurons2):
            tempArray[i]=np.dot(firstLayerResult[j].transpose(),weights[1][i])+biases[1][i]
        newTrainingData.append(tempArray)
    return newTrainingData


In [22]:
def ToGetTheLastHiddenLayerValueBeforeGoToSigmoidFunctionTestDataTwoHidden(n_neurons1,n_neurons2,weights,biases):
    firstLayerResult=[sigmoid(x) for x in ToGetTheLastHiddenLayerValueBeforeGoToSigmoidFunctionTestDataOneHidden(n_neurons1,weights,biases)]
    newTrainingData=[]
    for j in xrange(0,10000):
        tempArray=np.zeros((n_neurons2,))
        for i in xrange(0,n_neurons2):
            tempArray[i]=np.dot(firstLayerResult[j].transpose(),weights[1][i])+biases[1][i]
        newTrainingData.append(tempArray)
    return newTrainingData


In [23]:
newTrainingData=ToGetTheLastHiddenLayerValueBeforeGoToSigmoidFunctionOneHidden(100,weights,biases)
newTestData=ToGetTheLastHiddenLayerValueBeforeGoToSigmoidFunctionTestDataOneHidden(100,weights,biases)

NameError: name 'weights' is not defined

In [237]:
newTrainingDataTwoLayers=ToGetTheLastHiddenLayerValueBeforeGoToSigmoidFunctionTwoHidden(50,20,newWeights,newBiases)
newTestDataTwoLayers=ToGetTheLastHiddenLayerValueBeforeGoToSigmoidFunctionTestDataTwoHidden(50,20,newWeights,newBiases)

In [26]:
from sklearn import svm
def svm_nn_combine():
    training_data, validation_data, test_data = load_data()
# train
    clf = svm.SVC()
    clf.fit(newTrainingData, training_data[1])
# test
    predictions = [int(a) for a in clf.predict(newTestData)]
    num_correct = sum(int(a == y) for a, y in zip(predictions, test_data[1]))
    print "Baseline classifier using an SVM."
    print "%s of %s values correct." % (num_correct, len(test_data[1]))

In [27]:
from sklearn import svm
def svm_nn_combine_2layers():
    training_data, validation_data, test_data = load_data()
# train
    clf = svm.SVC()
    clf.fit(newTrainingDataTwoLayers, training_data[1])
# test
    predictions = [int(a) for a in clf.predict(newTestDataTwoLayers)]
    num_correct = sum(int(a == y) for a, y in zip(predictions, test_data[1]))
    print "Baseline classifier using an SVM."
    print "%s of %s values correct." % (num_correct, len(test_data[1]))

In [33]:
from sklearn import svm
def svm_nn_combine_2layers_sigmoid_kernel():
    training_data, validation_data, test_data = load_data()
# train
    clf = svm.SVC(kernel='sigmoid')
    clf.fit(newTrainingDataTwoLayers, training_data[1])
# test
    predictions = [int(a) for a in clf.predict(newTestDataTwoLayers)]
    num_correct = sum(int(a == y) for a, y in zip(predictions, test_data[1]))
    print "Baseline classifier using an SVM."
    print "%s of %s values correct." % (num_correct, len(test_data[1]))


In [210]:
%%time
svm_nn_combine()

Baseline classifier using an SVM.
9757 of 10000 values correct.
CPU times: user 4min 52s, sys: 1.29 s, total: 4min 53s
Wall time: 4min 53s


compare to the pure nn which is 9680, so it does enhance it when add in svm but for 10 neurons in first hidden layer,
it decrease the performance.

In [290]:
%%time
svm_nn_combine_2layers()

Baseline classifier using an SVM.
7624 of 10000 values correct.
CPU times: user 10min 9s, sys: 3.3 s, total: 10min 12s
Wall time: 10min 12s


In [291]:
#not enhance the performance at this point

In [18]:
training_data, validation_data,test_data=load_data_wrapper()

In [34]:
net200 = Network([784,200,10], cost=costCrossEntropy)
net200.large_weight_initializer()
twoHidden5020=net200.SGD(training_data, 30, 10, 0.1, lamda = 5.0,
        evaluation_data=validation_data, 
        monitor_evaluation_accuracy=True)

Epoch 0 training finished
Accuracy on evaluation data: 9183 / 10000

Epoch 1 training finished
Accuracy on evaluation data: 9403 / 10000

Epoch 2 training finished
Accuracy on evaluation data: 9505 / 10000

Epoch 3 training finished
Accuracy on evaluation data: 9523 / 10000

Epoch 4 training finished
Accuracy on evaluation data: 9629 / 10000

Epoch 5 training finished
Accuracy on evaluation data: 9647 / 10000

Epoch 6 training finished
Accuracy on evaluation data: 9654 / 10000

Epoch 7 training finished
Accuracy on evaluation data: 9655 / 10000

Epoch 8 training finished
Accuracy on evaluation data: 9670 / 10000

Epoch 9 training finished
Accuracy on evaluation data: 9651 / 10000

Epoch 10 training finished
Accuracy on evaluation data: 9675 / 10000

Epoch 11 training finished
Accuracy on evaluation data: 9653 / 10000

Epoch 12 training finished
Accuracy on evaluation data: 9692 / 10000

Epoch 13 training finished
Accuracy on evaluation data: 9691 / 10000

Epoch 14 training finished
Acc

In [35]:
weights200=net200.weights
biases200=net200.biases
training_data, validation_data, test_data = load_data()

In [44]:
training_data, validation_data, test_data = load_data()

In [42]:
newTrainingData=ToGetTheLastHiddenLayerValueBeforeGoToSigmoidFunctionOneHidden(200,weights200,biases200)
newTestData=ToGetTheLastHiddenLayerValueBeforeGoToSigmoidFunctionTestDataOneHidden(200,weights200,biases200)

In [48]:
%%time
svm_nn_combine()

Baseline classifier using an SVM.
9798 of 10000 values correct.
CPU times: user 3min 34s, sys: 614 ms, total: 3min 34s
Wall time: 3min 34s


In [49]:
#compare to 9716 with 9798 so get enhanced

In [50]:
training_data, validation_data,test_data=load_data_wrapper()

In [51]:
net5020 = Network([784, 50,20, 10], cost=costCrossEntropy)
net5020.large_weight_initializer()
twoHidden5020=net5020.SGD(training_data, 30, 10, 0.1, lamda = 5.0,
        evaluation_data=validation_data, 
        monitor_evaluation_accuracy=True)

Epoch 0 training finished
Accuracy on evaluation data: 8761 / 10000

Epoch 1 training finished
Accuracy on evaluation data: 9139 / 10000

Epoch 2 training finished
Accuracy on evaluation data: 9322 / 10000

Epoch 3 training finished
Accuracy on evaluation data: 9440 / 10000

Epoch 4 training finished
Accuracy on evaluation data: 9475 / 10000

Epoch 5 training finished
Accuracy on evaluation data: 9537 / 10000

Epoch 6 training finished
Accuracy on evaluation data: 9577 / 10000

Epoch 7 training finished
Accuracy on evaluation data: 9638 / 10000

Epoch 8 training finished
Accuracy on evaluation data: 9629 / 10000

Epoch 9 training finished
Accuracy on evaluation data: 9662 / 10000

Epoch 10 training finished
Accuracy on evaluation data: 9618 / 10000

Epoch 11 training finished
Accuracy on evaluation data: 9653 / 10000

Epoch 12 training finished
Accuracy on evaluation data: 9658 / 10000

Epoch 13 training finished
Accuracy on evaluation data: 9668 / 10000

Epoch 14 training finished
Acc

In [52]:
weights5020=net5020.weights
biases5020=net5020.biases
training_data, validation_data, test_data = load_data()

In [53]:
newTrainingDataTwoLayers=ToGetTheLastHiddenLayerValueBeforeGoToSigmoidFunctionTwoHidden(50,20,weights5020,biases5020)
newTestDataTwoLayers=ToGetTheLastHiddenLayerValueBeforeGoToSigmoidFunctionTestDataTwoHidden(50,20,weights5020,biases5020)

In [54]:
%%time
svm_nn_combine_2layers()

Baseline classifier using an SVM.
9756 of 10000 values correct.
CPU times: user 16.7 s, sys: 80.5 ms, total: 16.8 s
Wall time: 16.8 s


In [None]:
#this is obvious faster than the 97.56 than 97.03

In [15]:
net2020 = Network([784, 20,20, 10], cost=costCrossEntropy)
net2020.large_weight_initializer()
oneHidden10=net2020.SGD(training_data, 30, 10, 0.1, lamda = 5.0,
        evaluation_data=validation_data, 
        monitor_evaluation_accuracy=True)

Epoch 0 training finished
Accuracy on evaluation data: 8591 / 10000

Epoch 1 training finished
Accuracy on evaluation data: 9081 / 10000

Epoch 2 training finished
Accuracy on evaluation data: 9227 / 10000

Epoch 3 training finished
Accuracy on evaluation data: 9328 / 10000

Epoch 4 training finished
Accuracy on evaluation data: 9363 / 10000

Epoch 5 training finished
Accuracy on evaluation data: 9438 / 10000

Epoch 6 training finished
Accuracy on evaluation data: 9449 / 10000

Epoch 7 training finished
Accuracy on evaluation data: 9446 / 10000

Epoch 8 training finished
Accuracy on evaluation data: 9478 / 10000

Epoch 9 training finished
Accuracy on evaluation data: 9527 / 10000

Epoch 10 training finished
Accuracy on evaluation data: 9538 / 10000

Epoch 11 training finished
Accuracy on evaluation data: 9547 / 10000

Epoch 12 training finished
Accuracy on evaluation data: 9529 / 10000

Epoch 13 training finished
Accuracy on evaluation data: 9541 / 10000

Epoch 14 training finished
Acc

In [17]:
weights2020=net2020.weights
biases2020=net2020.biases
training_data, validation_data, test_data = load_data()

In [24]:
newTrainingDataTwoLayers=ToGetTheLastHiddenLayerValueBeforeGoToSigmoidFunctionTwoHidden(20,20,weights2020,biases2020)
newTestDataTwoLayers=ToGetTheLastHiddenLayerValueBeforeGoToSigmoidFunctionTestDataTwoHidden(20,20,weights2020,biases2020)

In [28]:
%%time
svm_nn_combine_2layers()

Baseline classifier using an SVM.
9625 of 10000 values correct.
CPU times: user 18.8 s, sys: 103 ms, total: 18.9 s
Wall time: 18.9 s


In [34]:
%%time
svm_nn_combine_2layers_sigmoid_kernel()

Baseline classifier using an SVM.
1135 of 10000 values correct.
CPU times: user 4min 21s, sys: 2.92 s, total: 4min 24s
Wall time: 4min 24s


#obviously sigmoid kernel for svm is not good for this problem,
#the default kernel is 'rbf'

In [29]:
training_data, validation_data,test_data=load_data_wrapper()
net10 = Network([784, 10, 10], cost=costCrossEntropy)
net10.large_weight_initializer()
oneHidden10=net10.SGD(training_data, 30, 10, 0.1, lamda = 5.0,
        evaluation_data=validation_data, 
        monitor_evaluation_accuracy=True)


Epoch 0 training finished
Accuracy on evaluation data: 8376 / 10000

Epoch 1 training finished
Accuracy on evaluation data: 8806 / 10000

Epoch 2 training finished
Accuracy on evaluation data: 9043 / 10000

Epoch 3 training finished
Accuracy on evaluation data: 9128 / 10000

Epoch 4 training finished
Accuracy on evaluation data: 9193 / 10000

Epoch 5 training finished
Accuracy on evaluation data: 9179 / 10000

Epoch 6 training finished
Accuracy on evaluation data: 9209 / 10000

Epoch 7 training finished
Accuracy on evaluation data: 9280 / 10000

Epoch 8 training finished
Accuracy on evaluation data: 9262 / 10000

Epoch 9 training finished
Accuracy on evaluation data: 9246 / 10000

Epoch 10 training finished
Accuracy on evaluation data: 9287 / 10000

Epoch 11 training finished
Accuracy on evaluation data: 9297 / 10000

Epoch 12 training finished
Accuracy on evaluation data: 9275 / 10000

Epoch 13 training finished
Accuracy on evaluation data: 9262 / 10000

Epoch 14 training finished
Acc

In [30]:
weights10=net10.weights
biases10=net10.biases
training_data, validation_data, test_data = load_data()

In [31]:
newTrainingData=ToGetTheLastHiddenLayerValueBeforeGoToSigmoidFunctionOneHidden(10,weights10,biases10)
newTestData=ToGetTheLastHiddenLayerValueBeforeGoToSigmoidFunctionTestDataOneHidden(10,weights10,biases10)

In [32]:
%%time
svm_nn_combine()

Baseline classifier using an SVM.
8523 of 10000 values correct.
CPU times: user 7min 10s, sys: 3.46 s, total: 7min 14s
Wall time: 7min 13s
