In [1]:
import numpy as np
csv = np.genfromtxt ('train.csv', delimiter=",")
test_csv = np.genfromtxt ('test.csv', delimiter=",")
# each row is 784 + 1 cols separated by commas
# column 785 contains the corresponding label (0 to 9), last col is y_vect
train_x, train_y = csv[:,:-1], csv[:,-1]
test_x, test_y = test_csv[:,:-1], test_csv[:,-1]
# import params
alpha = np.genfromtxt ('alpha1.txt', delimiter=",")
beta = np.genfromtxt ('alpha2.txt', delimiter=",")
bias1 = np.genfromtxt ('beta1.txt', delimiter=",")
bias2 = np.genfromtxt ('beta2.txt', delimiter=",")
alpha = np.concatenate((bias1.reshape(bias1.shape[0], 1), alpha), axis=1)
beta = np.concatenate((bias2.reshape(bias2.shape[0], 1), beta), axis=1)

In [2]:
train_y
# z = 1/(1 + np.exp(-x))

array([0., 2., 9., ..., 7., 9., 2.])

In [3]:
class LinearLayer1:
    def __init__(self,x_vect,alpha):
        # add x_0 to x for the forward pass
        #self.x_star = x_vect.T
        self.x_vect = x_vect.T
        self.x_vect = np.insert(self.x_vect, 0, 1, axis=0)
        #print(self.x_star.shape, "x_star")
        #print(self.x_vect.shape, "x_vect")
        # add col to beg of a of all 1s
        # concat alpha and beta together, beta is bias
        self.alpha_star = alpha[:,1:]
        #self.alpha = np.concatenate((bias.reshape(bias.shape[0], 1), alpha), axis=1)
        self.alpha = alpha
        #print(self.alpha.shape, "alpha")

    def forward(self):
        #print(self.x_star.shape,self.alpha.shape, self.x_vect.shape)
        
        #print((np.dot(self.alpha, self.x_vect)).shape)
        return np.dot(self.alpha, self.x_vect)
    def backward(self, passed_in):
        # dl/d_alpha= dl/db * db/dz * dz/da * da/d_alpha
        # all but da/d_alpha will get passed in
        return np.dot(passed_in, self.x_vect.reshape(1, self.x_vect.shape[0]))
        #return np.dot(passed_in, self.x_star.reshape(1, self.x_star.shape[0]))

In [4]:
class SigmoidLayer:
    def __init__(self,A):
        #print(A.shape)
        self.A = A  
    def forward(self):
        return 1/(1 + np.exp(-self.A))
    def backward(self, passed_in):
        # dl/d_a= dl/db * db/dz * dz/da 
        sigmoid = 1/(1 + np.exp(-self.A))
        # all but dz/da will get passed in
        #dz_da = np.exp(-self.A) * (1+ np.exp(-self.A))**(-2)
        return np.array([sigmoid[i]*(1-sigmoid[i]) for i in range(passed_in.shape[0])])
        #return passed_in *dz_da[:, None]

In [5]:
class LinearLayer2:
    def __init__(self,Z,beta):
        # add 1 for z_0
        self.Z = Z
        self.Z = np.insert(self.Z, 0, 1, axis=0)
        self.beta_star = beta[:,1:]
        self.beta = beta 
    def forward(self):
        
        return np.dot(self.beta, self.Z)
    def backward(self, passed_in):
        # dl/dz= dl/db * db/dz
        # all but db/dz will get passed in
        #print(passed_in.shape, self.beta_star.T.shape, self.Z.shape)
        return np.dot(self.beta_star.T, passed_in)
    def beta_gradient(self, passed_in):
        # dl/dz= dl/db * db/dbeta
        # all but db/dbeta will get passed in
        return np.dot(passed_in.reshape(passed_in.shape[0], 1), self.Z.reshape(1, self.Z.shape[0]))

In [6]:
def translate_y_vect(y_vect, B):
    # translate the int at each element to a np array with a 1 at that index, zero ow
    # init matrix of zeros, directly use y_vect to index the matrix
    if len(y_vect.shape) != 1:
        return y_vect
    y = np.zeros((B.shape[0], y_vect.shape[0]))
    #print(y.shape)
    for row_index in range((y_vect.shape[0])):
        #print(y_vect[row_index])
        y[int(y_vect[row_index]), row_index] = 1
    return y

"""return np.sum(np.sum(np.dot(self.y, \
                                    (self.B - \
                                    np.log(np.sum(np.exp(self.B), axis =0))). \
                                    reshape(self.B.shape[1],self.B.shape[0] ))))"""

'return np.sum(np.sum(np.dot(self.y,                                     (self.B -                                     np.log(np.sum(np.exp(self.B), axis =0))).                                     reshape(self.B.shape[1],self.B.shape[0] ))))'

In [7]:
class SoftmaxCELayer:
    def __init__(self,B, y_vect):
        self.B = B
        #print(self.B.shape)
        self.y_hat_vect = np.exp(self.B)/ np.sum(np.exp(self.B), axis =0)
        #print(self.y_hat_vect.shape)
        self.y = translate_y_vect(y_vect, self.B)
        # was [0] before
        self.N = self.y.shape[0]
        #print(self.y_hat_vect.shape, self.y.shape, "y_hat, y")
    def forward(self):
        return sum(-sum([self.y[i] * np.log(self.y_hat_vect[i]) for i in range(self.y.shape[0])]))
        #np.sum(np.sum(np.dot(self.y, np.log((self.y_hat_vect.reshape(self.y.shape[1], self.y.shape[0],))))))

    def backward(self):
        # dl/db
        #sum( self.y_hat_vect[i] - self.y[i]for i in range(self.y.shape[0])])
        #print((-1/self.N)*np.sum(np.subtract(self.y_hat_vect, self.y), axis=1))
        return (-1/self.N)*np.sum(np.subtract(self.y_hat_vect, self.y), axis=1)

In [8]:
class Network:
    def __init__(self,X, y, alpha, beta):
        self.whole_X =X
        self.whole_y =y
        self.batch_x = 0
        self.batch_y = 0
        self.alpha =alpha
        self.beta =beta
        self.loss =  0
        self.dl_db = 0
        self.dl_dbeta = 0
        self.dl_dalpha = 0
        self.a = 0
        self.z = 0
        self.B = 0
        self.y_hat = 0
    def forward_pass(self):
        self.a = LinearLayer1(self.batch_x, self.alpha).forward()
        self.alpha = LinearLayer1(self.batch_x, self.alpha).alpha
        self.z = SigmoidLayer(self.a).forward()
        #print(self.z.shape)
        self.B = LinearLayer2(self.z, self.beta).forward()
        self.beta = LinearLayer2(self.z,self.beta).beta
        #print(self.a.shape, self.z.shape, self.B.shape)
        self.y_hat = SoftmaxCELayer(self.B, self.batch_y).y_hat_vect
        #print(translate_y_vect(self.batch_y, np.zeros((10,1))).shape, self.y_hat.shape)
        self.loss = SoftmaxCELayer(self.B, self.batch_y).forward()
            
    def backward_pass(self):
        self.dl_db = SoftmaxCELayer(self.B, self.batch_y).backward()
        #print(self.dl_db.shape, "dl_db")
        dl_dz = LinearLayer2(self.z,self.beta).backward(self.dl_db)
        #print(dl_dz.shape, "dl_dz")
        self.dl_dbeta = LinearLayer2(self.z,self.beta).beta_gradient(self.dl_db)
        #print(self.dl_db.shape, "dl_dbeta")
        dl_da = SigmoidLayer(self.a).backward(dl_dz)
        #print(dl_da.shape, "dl_da")
        self.dl_dalpha = LinearLayer1(self.batch_x, self.alpha).backward(dl_da)
        #print(self.dl_dalpha.shape, "dl_dalpha")
        
    def update(self, learning_rate):
        self.alpha = self.alpha - learning_rate*self.dl_dalpha
        self.beta = self.beta - learning_rate*self.dl_dbeta
            
    def get_test_loss(self, test_x, test_y):
        self.batch_x = test_x
        #print(test_y.shape, "before trans")
        self.batch_y = translate_y_vect(test_y, np.zeros((10,1)))
        #print(self.batch_y.shape, "after trans")
        self.forward_pass()
        #print(self.y_hat)
        return self.loss
    
    def test_acccuracy(self, test_y):
        #print(test_y.shape, "before trans", self.y_hat.shape)
        self.batch_y = translate_y_vect(test_y, np.zeros((10,1)))
        #print(self.batch_y.shape, "after trans", self.y_hat.shape)
        correct_prediction = 0
        incorrect_prediction = 0
        # correct prediction
        #(10, 1000) after trans (10, 1000)
        for row in range((self.batch_y.shape[1])):
            A = (self.batch_y[:,row]- self.y_hat[:,row])
            B = np.all(np.zeros((10,1)))
            if (A==B).all():
                correct_prediction += 1
            else:
                incorrect_prediction += 1
        # correct / correct+incorrect
        return correct_prediction/ (correct_prediction+ incorrect_prediction)
            
    def train(self, test_x, test_y, learning_rate, epochs, batch_size):
        loss_list = []
        test_list = []
        accuracy_list = []
        # modify alpha and beta from beginning
        for epoch in range(epochs):
            for batch_index in range(int(len(self.whole_X)/batch_size)):
                
                self.batch_x = self.whole_X[batch_index*batch_size:(batch_index+1)*batch_size]
                self.batch_y = self.whole_y[batch_index*batch_size:(batch_index+1)*batch_size]
                #print(self.alpha.shape, self.beta.shape, self.loss)
                #print(self.batch_x.shape,self.batch_y.shape)
                self.forward_pass()
                #print(self.alpha.shape, self.beta.shape, self.loss)
                
                #print(self.y_hat)
                self.backward_pass()
                #print(self.alpha.shape, self.beta.shape, self.loss)
                self.update(learning_rate)
                #print(self.beta)
                #print(self.alpha)
            if epoch == 2:
                print(self.beta[:,0])
            # compute training loss for epoch
            loss_list.append(np.array(self.loss))
            # compute test loss
            (test_list.append(np.array(self.get_test_loss(test_x, test_y))))
            # compute test accuracy
            (accuracy_list.append(np.array(self.test_acccuracy(test_y))))
        return loss_list, test_list, accuracy_list

In [9]:
learning_rate = 0.01
epochs = 15
batch_size = 1
nn = Network(train_x, train_y, alpha, beta)
loss_list, test_list, accuracy_list = nn.train(test_x, test_y, learning_rate, epochs, batch_size)

[-1.80039478 -1.52145295 -1.69371438 -0.34914953 -0.82852895  7.58397735
 -1.3999061  -0.7435914   1.18845782 -0.45260243]


In [10]:
accuracy_list

[array(0.),
 array(0.),
 array(0.),
 array(0.),
 array(0.),
 array(0.),
 array(0.),
 array(0.),
 array(0.),
 array(0.),
 array(0.),
 array(0.),
 array(0.),
 array(0.),
 array(0.)]

In [11]:
(test_list)


[array(8157.77111597),
 array(14639.89443956),
 array(7840.27042446),
 array(10362.07394332),
 array(13040.53663109),
 array(15729.48857137),
 array(18422.42304541),
 array(21117.43142475),
 array(23813.70331287),
 array(26510.78661003),
 array(29208.4004221),
 array(31906.38258513),
 array(34604.63718856),
 array(37303.10651587),
 array(40001.75510842)]

In [12]:
"""

learning_rate = 0.01
epochs = 100
batch_size = 1
nn = Network(train_x, train_y, alpha, bias1, beta, bias2)
loss_list, test_list, accuracy_list = nn.train(test_x, test_y, learning_rate, epochs, batch_size)

"""

'\n\nlearning_rate = 0.01\nepochs = 100\nbatch_size = 1\nnn = Network(train_x, train_y, alpha, bias1, beta, bias2)\nloss_list, test_list, accuracy_list = nn.train(test_x, test_y, learning_rate, epochs, batch_size)\n\n'

In [13]:
"""
learning_rate = 0.01
epochs = 100
batch_size_ls = [1, 10, 50, 100]
loss_ls_ls = np.array([])
test_loss_ls_ls = np.array([])
accuracy_ls_ls = np.array([])
for batch_size in batch_size_ls:
    nn = Network(train_x, train_y, alpha, bias1, beta, bias2)
    loss_list, test_list, accuracy_list = nn.train(test_x, test_y, learning_rate, epochs, batch_size)
    np.vstack(loss_ls_ls, loss_list)
    np.vstack(test_loss_ls_ls, test_list)
    np.vstack(accuracy_ls_ls, accuracy_list)
"""

'\nlearning_rate = 0.01\nepochs = 100\nbatch_size_ls = [1, 10, 50, 100]\nloss_ls_ls = np.array([])\ntest_loss_ls_ls = np.array([])\naccuracy_ls_ls = np.array([])\nfor batch_size in batch_size_ls:\n    nn = Network(train_x, train_y, alpha, bias1, beta, bias2)\n    loss_list, test_list, accuracy_list = nn.train(test_x, test_y, learning_rate, epochs, batch_size)\n    np.vstack(loss_ls_ls, loss_list)\n    np.vstack(test_loss_ls_ls, test_list)\n    np.vstack(accuracy_ls_ls, accuracy_list)\n'