In [1]:
import numpy as np
np.random.seed(0)

In [2]:
import scipy.io as sio
import matplotlib.pyplot as plt
import scipy
from tqdm.notebook import tqdm

In [3]:
def data_loader(file_name):
    mat = scipy.io.loadmat(file_name)
    Xtrain = mat.get('Yt')
    # for the bias
    Xtrain = np.vstack([Xtrain, np.ones(Xtrain.shape[1])]) 
    Ytrain = mat.get('Ct')
    Xtest = mat.get('Yv')
    Xtest = np.vstack([Xtest, np.ones(Xtest.shape[1])])
    Ytest = mat.get('Cv')
    return Xtrain.T, Ytrain.T, Xtest.T, Ytest.T

datasets = ["GMMData","PeaksData","SwissRollData"]

In [4]:

def softmax(X, w, eta = True):
    product_Xw = X.T @ w
    if eta==True:
        exp = np.exp(product_Xw - np.max(product_Xw ))
    else:
        exp = np.exp(product_Xw )
    div = np.divide(exp, np.sum(exp, axis = 1).reshape(-1,1))
    return div

def softmax_loss(X, C, w, eta = True):
    sm = softmax(X, w, eta = eta)
    log  = np.log(sm)
    m = len(X[0])
    return -np.sum(C*log)/m  

def softmax_loss_gradient_W(X, C, w, eta = True):
    #cross entrophy loss wrt W
    sm= softmax(X, w, eta = eta)
    m = len(X[0])
    gradient = (1/m)*X @ (sm - C)
    return gradient

def softmax_loss_gradient_X(X, C, w, eta = True):
    #cross entrophy loss wrt X
    sm= softmax(X, w, eta = eta)
    m = len(X[0])
    gradient = (1/m)*w @ (sm - C).T
    return gradient


In [5]:
class Layer:
    def __init__(self):
        self.input = None
        self.output = None
    def forward(self,input):
        pass
    def backward(self,output_grad,lr):
        pass

In [6]:
class Dense(Layer):
    def __init__(self,input_size,output_size):
        self.weights = 0.10 * np.random.randn(output_size,input_size)
        self.biases = np.random.randn(output_size,1)
        self.inputs = None
        
    def forward(self,inputs):
        self.inputs = inputs
        print('input dense'+str(self.inputs.shape))
        self.output = np.dot(self.inputs,self.weights) + self.biases
        print('dense'+str(self.output.shape))
        return self.output
    
    def backward(self,output_grad,lr):
        #Gradients on parametrs
        w_grad = np.dot(output_grad,self.inputs.T)
        self.weights -= lr* w_grad
        print('bias shape' + str(self.biases.shape))

        print('output_grad shape' + str(output_grad.shape))

        self.biases -= lr* output_grad
        return np.dot(self.weights.T, output_grad)

    
class Softmax_Layer(Layer):
    def __init__(self,input_size,output_size):
        self.weights = 0.10 * np.random.randn(output_size,input_size) 
        
    def forward(self,inputs):
        self.inputs = inputs
        print('input - softmax'+str(self.inputs.shape))
        self.output = softmax(inputs,self.weights.T)
        
        print('softmax'+str(self.output.shape))
        return self.output
    
    def calc_loss(self, C):
        #cross entrophy loss
        return softmax_loss(self.output, C, self.weights.T)
    
    def backward(self,c,lr):
        # the only backward that applies on inputs (because the softmax_loss_g.. making the softmax inside of it)
        grad_W = softmax_loss_gradient_W(self.inputs, c, self.weights.T).T
        grad_X = softmax_loss_gradient_X(self.inputs, c, self.weights.T)
        
        print('grad_W ' + str(grad_W.shape ))
        print('self.weights' + str(self.weights.shape))

        self.weights -= grad_W*lr
        return grad_X
        #the time reducing version from the notes

In [7]:
class Activation(Layer):
    def __init__(self,activation,activation_prime):
        self.activation = activation
        self.activation_prime = activation_prime
        
    def forward(self,inputs):
        self.inputs = inputs        
        print('tanh'+str(self.activation(self.inputs).shape))
        return self.activation(self.inputs)
    
    def backward(self,output_grad,lr):
        return np.multiply(output_grad,self.activation_prime(self.inputs))
    

class Tanh(Activation):
    def __init__(self):
        tanh = lambda x: np.tanh(x)
        tanh_prime = lambda x:1-np.tanh(x)**2
        super().__init__(tanh,tanh_prime)


In [8]:
class Net:
    def __init__(self,layers,lr=0.01):
        self.lr = lr
        self.layers = layers
        self.loss = []
    
    def forward(self,X,C):
        '''
        forward pass of the network
        '''
        output = X
        for layer in self.layers:
            output = layer.forward(output)
            
            
        self.loss.append(self.layers[-1].calc_loss(C))
        #the output without softmax(just the logits)
        return output
    

    def backward(self,c):
        '''
        backward pass through the network + updating params
        '''
        g_x = self.layers[-1].backward(c,self.lr)
        
        for layer in np.flip(self.layers[:-1]):
            print('g_x' + str(g_x.shape))
            g_x = layer.backward(g_x, self.lr)


    
        

In [9]:
nn = Net(layers = [Dense(6,6),Tanh(),Softmax_Layer(6,5)], lr = 0.1)

In [10]:
Xtrain, Ytrain, Xtest, Ytest = data_loader(datasets[0])

In [11]:
Xtrain, Ytrain, Xtest, Ytest = data_loader(datasets[0])
examples_num, feature_num = Xtrain.shape
labels_num = Ytrain.shape[1]

train_acc=[]
test_acc=[]
train_loss = []

In [12]:
epocs = 100
lr = 0.1
batch_size = 1

In [13]:
indices = np.arange(examples_num)
np.random.shuffle(indices)

In [14]:
for i in [Xtrain, Ytrain, Xtest, Ytest]:
    print(i.shape)


(25000, 6)
(25000, 5)
(6250, 6)
(6250, 5)


In [15]:
# Train loop
for epoc in tqdm(range(epocs)):
    # Shuffle train data
    indices = np.arange(examples_num)
    np.random.shuffle(indices)
    Xtrain = Xtrain[indices]
    Ytrain = Ytrain[indices]
    i = 0
    while i * batch_size < examples_num:
        # Obtain minibatch
        batch_start = i * batch_size
        batch_end = min((i + 1) * batch_size, examples_num)
        minix = Xtrain[batch_start:batch_end]
        miniy = Ytrain[batch_start:batch_end]
        
        #forward pass
        nn.forward(minix,miniy)

        #Backward+update nn params
        nn.backward(miniy)
        
        #next batch...
        i += 1

  0%|          | 0/100 [00:00<?, ?it/s]

input dense(1, 6)
dense(6, 6)
tanh(6, 6)
input - softmax(6, 6)
softmax(6, 5)
grad_W (5, 6)
self.weights(5, 6)
g_x(6, 6)
g_x(6, 6)
bias shape(6, 1)
output_grad shape(6, 6)


ValueError: non-broadcastable output operand with shape (6,1) doesn't match the broadcast shape (6,6)