# CONTENT

* **a. FNN with 1-by-1 Stochastic** 
* **b. FNN with Batch Stochastic** 
===> SOLVING VANISHING GRADIENT PROBLEM <===

# FNN with 1-by-1 Stochastic

### A. Code

In [870]:
import numpy as np
import theano
from theano import shared, function
import theano.tensor as T
from theano.tensor.nnet import sigmoid, softmax

In [871]:
class Layer1:
    """One fully connected layer acting on column-vector inputs.

    Stores a bias ``b`` and a weight matrix ``w`` as Theano shared variables
    and builds the symbolic expression ``activation(dot(w, x) + b)``.
    """

    def __init__(self, bShape, wShape, activation=sigmoid):
        """Randomly initialise the layer parameters.

        bShape     : (row, col) shape of the bias, sampled from N(0, 1).
        wShape     : (row, col) shape of the weights, sampled from a normal
                     with standard deviation sqrt(1 / wShape[0]).
        activation : callable applied to the pre-activation; pass None for a
                     purely linear layer.
        """
        assert len(bShape) == len(wShape) == 2 # bShape, wShape: (row, col) 2-tuples.
        self.b = shared(np.asarray(np.random.normal(loc=0.,scale=1.,size=bShape),
                                   dtype=theano.config.floatX), borrow=True)
        # NOTE(review): the scale uses wShape[0] (the row count); confirm that
        # this, rather than the column count, is the intended normaliser.
        self.w = shared(np.asarray(np.random.normal(loc=0.,scale=np.sqrt(1./wShape[0]),size=wShape),
                                   dtype=theano.config.floatX), borrow=True)
        self.params = [self.b, self.w]
        self.activation = activation

    def activate(self, x):
        """Return the symbolic output of this layer for the symbolic input x."""
        z = T.dot(self.w, x) + self.b
        # None (or anything that cannot be called) means "no non-linearity".
        if not callable(self.activation): return z
        # softmax is applied to the first row of z.T rather than to z itself.
        if self.activation is softmax: return self.activation(z.T[0])
        # Every other callable (sigmoid included) is applied to z directly;
        # previously anything but sigmoid/softmax was silently ignored.
        return self.activation(z)

In [872]:
class NNTheano1:
    
    def __init__(self, sizes, lr=.01, momentum=.9, hiddenActivation=sigmoid, outputActivation=sigmoid):
        self.numLayers = len(sizes)
        assert self.numLayers >= 3 # must have at least 1 hidden layer.
        self.layers = [ Layer1((nOut,1),(nOut,nIn),hiddenActivation) 
                        for nIn,nOut in zip(sizes[:-2],sizes[1:]) ]
        self.layers.append(Layer1((sizes[-1],1),(sizes[-1],sizes[-2]),outputActivation))
            # shapes of bias & weight matrices for layer L
            #  b: nrow(L) * 1
            #  w: nrow(L) * nrow(L-1)
        self.params = []
        for layer in self.layers:
            self.params += layer.params
        self.config(lr, momentum)
    
    def config(self, lr, momentum): 
        # create a symbolic graph for forward passing.
        # lightening fast for evaluation.
        x = T.matrix()
        y = T.vector()
        a = self.layers[0].activate(x)
        for i in range(1,len(self.layers)):
            a = self.layers[i].activate(a)
        cost = T.sum(np.nansum(-y*T.log(a) - (1.-y)*T.log(1-a)))
        self.predict = function([x], a) # <======================== forward pass function.
        self.cost = function([x, y], cost) # <===================== cost function.
        # sgd config
        updates = []
        for param in self.params:
            paramUpdate = shared(param.get_value()*0, broadcastable=param.broadcastable)
            updates.append((param, param-lr*paramUpdate))
            updates.append((paramUpdate, momentum*paramUpdate+(1.-momentum)*T.grad(cost,param)))
        self.run = function([x, y], cost, updates=updates) # <===== sgd function.
    
    def train(self, X, Y):
        counter = 0
        for x,y in zip(X,Y):
            self.run(x,y) 
            counter += 1
            if counter % 5000 == 0: print "Trained: %d data points" % counter
    
    def evaluate(self, X, Y):
        correct = 0
        for x,y in zip(X,Y):
            correct += 1 if np.argmax(self.predict(x))==np.argmax(y) else 0
        print "Accuracy: %d / %d (%.2f%%)" % (correct, len(X), (float(correct)/len(X))*100)
            

### B. Toy Example: Logic Gate OR

In [873]:
# Training data for the logical OR gate.
'''
x y | lor
0 0 | 0
0 1 | 1
1 0 | 1
1 1 | 1
'''
# Each input is a 2x1 column vector and each target a length-1 vector.
xy1 = np.array([[0.], [0.]])
xy2 = np.array([[0.], [1.]])
xy3 = np.array([[1.], [0.]])
xy4 = np.array([[1.], [1.]])
lor1 = np.array([0.])
lor2 = np.array([1.])
lor3 = np.array([1.])
lor4 = np.array([1.])
# Cycle through the four rows 12500 times (50000 samples in total).
X = 12500 * [xy1, xy2, xy3, xy4]
Y = 12500 * [lor1, lor2, lor3, lor4]

In [874]:
'''
fnn: [2, 1, 1]
'''
# Two inputs, one hidden unit, one output unit; step size 3.
fnn = NNTheano1(sizes=[2, 1, 1], lr=3)

In [875]:
%%time
# One SGD step per OR sample, in list order.
fnn.train(X, Y)

Trained: 5000 data points
Trained: 10000 data points
Trained: 15000 data points
Trained: 20000 data points
Trained: 25000 data points
Trained: 30000 data points
Trained: 35000 data points
Trained: 40000 data points
Trained: 45000 data points
Trained: 50000 data points
CPU times: user 1.92 s, sys: 45.7 ms, total: 1.97 s
Wall time: 1.96 s


In [876]:
print "OR Truth-Value Table"
print "0 0 |", "0" if fnn.predict(xy1)[0][0] < .5 else "1"
print "0 1 |", "0" if fnn.predict(xy2)[0][0] < .5 else "1"
print "1 0 |", "0" if fnn.predict(xy3)[0][0] < .5 else "1"
print "1 1 |", "0" if fnn.predict(xy4)[0][0] < .5 else "1"

OR Truth-Value Table
0 0 | 0
0 1 | 1
1 0 | 1
1 1 | 1


### C. MNIST

In [877]:
import os
# Directory holding mnist_loader.py. Set the MNIST_SRC_DIR environment
# variable to point at your own checkout instead of editing this cell; the
# default is the path originally hard-coded here.
MNIST_SRC_DIR = os.environ.get(
    "MNIST_SRC_DIR",
    "/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/NN/DATA/neural-networks-and-deep-learning-master/src")
if not os.path.isdir(MNIST_SRC_DIR):
    raise OSError("MNIST source directory not found: %r (set MNIST_SRC_DIR)" % MNIST_SRC_DIR)
# The working directory is switched because the loader only works from there.
os.chdir(MNIST_SRC_DIR)
import mnist_loader # works when under '~/src' path.
data_train, data_dev, data_test = mnist_loader.load_data_wrapper()

In [878]:
# Inputs become (n, 1) column arrays; labels lose their singleton axes.
X_train = [np.asarray(sample[0]).reshape(len(sample[0]), 1)
           for sample in data_train]
Y_train = [np.squeeze(np.asarray(sample[1]))
           for sample in data_train]

In [879]:
# Same reshaping as the training split.
# NOTE(review): if the loader yields plain integer labels for this split,
# every entry of Y_test is a 0-d array -- confirm the scoring code copes.
X_test = [np.asarray(sample[0]).reshape(len(sample[0]), 1)
          for sample in data_test]
Y_test = [np.squeeze(np.asarray(sample[1]))
          for sample in data_test]

In [881]:
# 784 inputs, 30 sigmoid hidden units, softmax over 10 outputs.
fnn = NNTheano1(
    sizes=[784, 30, 10],
    lr=.5,
    hiddenActivation=sigmoid,
    outputActivation=softmax,
)

In [882]:
print "Pre-train cost: ", fnn.cost(X_train[0],Y_train[0])

Pre-train cost:  3.70402235535


In [883]:
%%time
# Single pass over the training split, one SGD step per sample.
fnn.train(X_train, Y_train)

Trained: 5000 data points
Trained: 10000 data points
Trained: 15000 data points
Trained: 20000 data points
Trained: 25000 data points
Trained: 30000 data points
Trained: 35000 data points
Trained: 40000 data points
Trained: 45000 data points
Trained: 50000 data points
CPU times: user 11.7 s, sys: 75.2 ms, total: 11.7 s
Wall time: 11.7 s


In [884]:
print "Post-train cost: ", fnn.cost(X_train[0],Y_train[0])

Post-train cost:  0.00475332078753


In [885]:
print "Pre-train Random Accuracy: ", fnn.evaluate(X_train, Y_train)
print "Pre-train Random Accuracy: ", fnn.evaluate(X_test, Y_test)

Pre-train Random Accuracy:  Accuracy: 35016 / 50000 (70.03%)
None
Pre-train Random Accuracy:  Accuracy: 1240 / 10000 (12.40%)
None


# FNN with Batch Stochastic

### A. Code

In [683]:
import numpy as np
import theano
from theano import shared, function, scan
import theano.tensor as T
from theano.tensor.nnet import sigmoid, softmax

In [699]:
class Layer2:
    """One fully connected layer acting on column-vector inputs.

    Stores a bias ``b`` and a weight matrix ``w`` as Theano shared variables
    and builds the symbolic expression ``activation(dot(w, x) + b)``.
    """

    def __init__(self, bShape, wShape, activation=sigmoid):
        """Randomly initialise the layer parameters.

        bShape     : (row, col) shape of the bias, sampled from N(0, 1).
        wShape     : (row, col) shape of the weights, sampled from a normal
                     with standard deviation sqrt(1 / wShape[0]).
        activation : callable applied to the pre-activation; pass None for a
                     purely linear layer.
        """
        assert len(bShape) == len(wShape) == 2 # bShape, wShape: (row, col) 2-tuples.
        self.b = shared(np.asarray(np.random.normal(loc=0.,scale=1.,size=bShape),
                                   dtype=theano.config.floatX), borrow=True)
        # NOTE(review): the scale uses wShape[0] (the row count); confirm that
        # this, rather than the column count, is the intended normaliser.
        self.w = shared(np.asarray(np.random.normal(loc=0.,scale=np.sqrt(1./wShape[0]),size=wShape),
                                   dtype=theano.config.floatX), borrow=True)
        self.params = [self.b, self.w]
        self.activation = activation

    def activate(self, x):
        """Return the symbolic output of this layer for the symbolic input x."""
        z = T.dot(self.w, x) + self.b
        # None (or anything that cannot be called) means "no non-linearity".
        if not callable(self.activation): return z
        # softmax is applied to the first row of z.T rather than to z itself.
        if self.activation is softmax: return self.activation(z.T[0])
        # Every other callable (sigmoid included) is applied to z directly;
        # previously anything but sigmoid/softmax was silently ignored.
        return self.activation(z)

In [1034]:
class NNTheano2:
    
    def __init__(self, sizes, hiddenActivation=sigmoid, outputActivation=sigmoid):
        self.numLayers = len(sizes)
        assert self.numLayers >= 3 # must have at least 1 hidden layer.
        self.layers = [ Layer1((nOut,1),(nOut,nIn),hiddenActivation) 
                        for nIn,nOut in zip(sizes[:-2],sizes[1:]) ]
        self.layers.append(Layer1((sizes[-1],1),(sizes[-1],sizes[-2]),outputActivation))
            # shapes of bias & weight matrices for layer L
            #  b: nrow(L) * 1
            #  w: nrow(L) * nrow(L-1)
        self.params = []
        for layer in self.layers:
            self.params += layer.params

    def predict(self, x):
        a = self.layers[0].activate(x)
        for i in range(1,len(self.layers)):
            a = self.layers[i].activate(a)
        return a
    
    def sgd(self, lr=.01, momentum=.0):
        # create symbolic graph for batch cost computation.
        X = T.tensor3()
        Y = T.matrix()
        batchSize = T.iscalar()
        A = scan(lambda x: self.predict(x), sequences=X)[0]
        C = scan(lambda a,y: T.sum(np.nansum(-y*T.log(a) - (1.-y)*T.log(1-a))), sequences=[A,Y])[0]
        cost = (1./batchSize) * T.sum(C)
        updates = []
        for param in self.params:
            paramUpdate = shared(param.get_value()*0, broadcastable=param.broadcastable)
            updates.append((param, param-lr*paramUpdate))
            updates.append((paramUpdate, momentum*paramUpdate+(1.-momentum)*T.grad(cost,param)))
        self.batch_update = function([X,Y,batchSize], cost, updates=updates)
    
    def train(self, data_train, epochs=1, batchSize=10):
        assert len(data_train) == 2
        X_train, Y_train = data_train
        for i in xrange(epochs): 
            c = 0
            batches = [ [X_train[k:k+batchSize],Y_train[k:k+batchSize]] 
                        for k in xrange(0,len(data_train),batchSize) ]
            for batch in batches:
                c = self.batch_update(np.array(batch[0]),np.array(batch[1]),len(batch[0]))
            if i!=0 and i%10==0: print "Epoch: %d | Cost: %.6f" % (i,c)
        

### B. Toy Example: Logic Gate OR

In [1035]:
# Training data for the logical OR gate.
'''
x y | lor
0 0 | 0
0 1 | 1
1 0 | 1
1 1 | 1
'''
# Inputs: 2x1 column vectors.
xy1 = np.array([[0.], [0.]])
xy2 = np.array([[0.], [1.]])
xy3 = np.array([[1.], [0.]])
xy4 = np.array([[1.], [1.]])
# Targets: length-1 vectors holding the OR of the two inputs.
lor1 = np.array([0.])
lor2 = np.array([1.])
lor3 = np.array([1.])
lor4 = np.array([1.])
# The four rows repeated 12500 times each, in table order.
X = 12500 * [xy1, xy2, xy3, xy4]
Y = 12500 * [lor1, lor2, lor3, lor4]

In [1036]:
# Two inputs, one hidden unit, one output unit.
fnn = NNTheano2(sizes=[2, 1, 1])

In [1037]:
# Compile the batch update with step size 3 and the default momentum.
fnn.sgd(lr=3.)

In [1038]:
# 100 epochs with the default batch size.
fnn.train([X, Y], epochs=100)

Epoch: 10 | Cost: 0.548807
Epoch: 20 | Cost: 0.325504
Epoch: 30 | Cost: 0.173902
Epoch: 40 | Cost: 0.103380
Epoch: 50 | Cost: 0.070442
Epoch: 60 | Cost: 0.052469
Epoch: 70 | Cost: 0.041450
Epoch: 80 | Cost: 0.034098
Epoch: 90 | Cost: 0.028881


In [1039]:
print "OR Truth-Value Table"
print "0 0 |", "0" if fnn.predict(xy1).eval()[0][0] < .5 else "1"
print "0 1 |", "0" if fnn.predict(xy2).eval()[0][0] < .5 else "1"
print "1 0 |", "0" if fnn.predict(xy3).eval()[0][0] < .5 else "1"
print "1 1 |", "0" if fnn.predict(xy4).eval()[0][0] < .5 else "1"

OR Truth-Value Table
0 0 | 0
0 1 | 1
1 0 | 1
1 1 | 1


### C. MNIST

In [993]:
import os
# Directory holding mnist_loader.py. Set the MNIST_SRC_DIR environment
# variable to point at your own checkout instead of editing this cell; the
# default is the path originally hard-coded here.
MNIST_SRC_DIR = os.environ.get(
    "MNIST_SRC_DIR",
    "/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/NN/DATA/neural-networks-and-deep-learning-master/src")
if not os.path.isdir(MNIST_SRC_DIR):
    raise OSError("MNIST source directory not found: %r (set MNIST_SRC_DIR)" % MNIST_SRC_DIR)
# The working directory is switched because the loader only works from there.
os.chdir(MNIST_SRC_DIR)
import mnist_loader # works when under '~/src' path.
data_train, data_dev, data_test = mnist_loader.load_data_wrapper()

In [994]:
# Inputs become (n, 1) column arrays; labels lose their singleton axes.
X_train = [np.asarray(sample[0]).reshape(len(sample[0]), 1)
           for sample in data_train]
Y_train = [np.squeeze(np.asarray(sample[1]))
           for sample in data_train]

In [995]:
# Same reshaping as the training split.
# NOTE(review): if the loader yields plain integer labels for this split,
# every entry of Y_test is a 0-d array -- confirm before scoring with argmax.
X_test = [np.asarray(sample[0]).reshape(len(sample[0]), 1)
          for sample in data_test]
Y_test = [np.squeeze(np.asarray(sample[1]))
          for sample in data_test]

In [1040]:
# 784 inputs, 30 sigmoid hidden units, softmax over 10 outputs.
fnn = NNTheano2(
    sizes=[784, 30, 10],
    hiddenActivation=sigmoid,
    outputActivation=softmax,
)

In [1041]:
%%time
# Compile the batch update with step size 0.5 and the default momentum.
fnn.sgd(lr=.5)

CPU times: user 10.5 s, sys: 64.3 ms, total: 10.5 s
Wall time: 10.6 s


In [1042]:
%%time
# 100 epochs with the default batch size.
fnn.train([X_train, Y_train], epochs=100)

Epoch: 10 | Cost: 0.937225
Epoch: 20 | Cost: 0.218448
Epoch: 30 | Cost: 0.119370
Epoch: 40 | Cost: 0.081350
Epoch: 50 | Cost: 0.061352
Epoch: 60 | Cost: 0.049066
Epoch: 70 | Cost: 0.040775
Epoch: 80 | Cost: 0.034817
Epoch: 90 | Cost: 0.030334
CPU times: user 133 ms, sys: 7.92 ms, total: 141 ms
Wall time: 134 ms


In [1043]:
print sum(np.argmax(fnn.predict(X_train[i]).eval())==np.argmax(Y_train[i]) for i in range(100))

43
