In [1]:
import numpy as np

In [2]:
class Layer:
    def __init__(self):
        self.input_data = None 
        self.output_data = None
    def forward(self,X):
        pass
    def compute_gradient(self,output_error):
        pass
    def apply_gradient(self,learning_rate,gradients=[]):
        pass
    def get_gradients(self):
        pass

In [3]:
class Activation(Layer):
    def __init__(self):
        super().__init__()
        
    def activation(self, X):
        raise NotImplementedError("Function must be implemented in subclass")

    def d_activation(self, X):
        raise NotImplementedError("Function must be implemented in subclass")

    def apply_gradient(self,learning_rate,gradients=[]):
        pass

    def forward(self, X):
        self.input_data = X
        self.output_data = self.activation(X)  # This should correctly update output_data in child class
        return self.output_data

    def compute_gradient(self, output_error):
        input_error = self.d_activation(self.input_data) * output_error
        return {'input_error': input_error, 'gradients': []}

In [4]:
class Linear(Layer):
    def __init__(self,input_size,output_size):
        super().__init__()

        self.input_size = input_size
        self.output_size = output_size

        self.W = np.random.uniform(size=(self.input_size,self.output_size)) -0.5
        self.b = np.zeros((1,self.output_size))
        
        self.dW = np.ones(shape=(self.input_size,self.output_size))
        self.db = np.ones(shape=(1,self.output_size))

    def forward(self,X):
        self.input_data = X
        self.output_data = np.dot(X,self.W) + self.b
        return self.output_data

    def get_gradients(self):
        return [self.dW,self.db]

    def compute_gradient(self,output_error):
        self.dW = np.dot(self.input_data.transpose(),output_error)
        self.db = np.sum(output_error,axis=0).reshape(1,self.output_data.shape[1])
        input_error = np.dot(output_error,self.W.transpose()) 
        return {"input_error" : input_error, "gradients" : [self.dW,self.db]}
                

    def apply_gradient(self,learning_rate,gradients=None):
        #Check if the optimizer is sending you custom gradients or use the custom ones
        if gradients is None:
            self.W = self.W - (learning_rate*self.dW)
            self.b = self.b - (learning_rate*self.db)
        else:
            dW = gradients[0]
            db = gradients[1]
            self.W = self.W - (learning_rate*dW)
            self.b = self.b - (learning_rate*db)

In [5]:
class ReLU(Activation):
    def __init__(self):
        super().__init__()
        
    def activation(self,X):
        return X*(X>0)

    def d_activation(self,X):
        return (X>0).astype(int)

In [6]:
class Sigmoid(Activation):        
    def __init__(self):
        super().__init__()
        
    def activation(self, X):
        return 1 / (1 + np.exp(-X))

    def d_activation(self, X):
        s = self.output_data
        return s * (1 - s)

In [7]:
class Tanh(Activation):
    def __init__(self):
        super().__init__()
        
    def activation(self,X):
        return np.tanh(X)
        
    def d_activation(self,X):
        t = self.output_data
        return  1. -(t**2)

In [8]:
class NeuralNetwork:
    def __init__(self,layers):
        self.layers = layers

    def forward(self,X):
        output = X
        for layer in self.layers:
            output = layer.forward(output)
        return output

In [9]:
class MSELoss:
    def __init__(self):
        pass

    def mse(self,y_true,y_pred):
        out = y_pred - y_true
        out = out**2
        return np.mean(out)

    def compute_gradient(self,y_true,y_pred):
        out = (2/len(y_true))*(y_pred-y_true)
        return out

    def compute_loss(self,y_true,y_pred):
        return self.mse(y_true,y_pred)

    def loss(self,y_true,y_pred):
        return (self.compute_loss(y_true,y_pred),self.compute_gradient(y_true,y_pred))

In [10]:
class CELoss:
    def __init__(self, epsilon=1e-15):
        self.epsilon = epsilon

    def _clip_probabilities(self, y_pred):
        return np.clip(y_pred, self.epsilon, 1 - self.epsilon)

    def compute_loss(self, y_true, y_pred):
        y_pred = self._clip_probabilities(y_pred)
        return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

    def compute_gradient(self, y_true, y_pred):
        y_pred = self._clip_probabilities(y_pred)
        return (y_pred - y_true) / (y_pred * (1 - y_pred))

    def loss(self,y_true,y_pred):
        return (self.compute_loss(y_true,y_pred),self.compute_gradient(y_true,y_pred))

In [11]:
class SimpleOptimizer:
    def __init__(self,layers,lr=0.01):
        self.lr = lr
        self.layers = layers
    
    def step(self,error):
        output_error = error
        for i in range(len(self.layers)-1,-1,-1):
            output_error = self.layers[i].compute_gradient(output_error)['input_error']
            
        for i in range(1,len(self.layers)+1):
            self.layers[-i].apply_gradient(self.lr)

In [12]:
class MomentumOptimizer:
    def __init__(self,layers,lr=0.01,beta=0.9):
        self.lr = lr
        self.layers = layers
        self.beta = beta
        self.V_gradients = {i:layers[i].get_gradients() for i in range(len(layers))}
    
    def step(self,error):
        output_error = error
        for i in range(len(self.layers)-1,-1,-1):
            layer = self.layers[i]
            out = layer.compute_gradient(output_error)
            output_error = out['input_error']
            gradients = out['gradients']
            for j in range(len(gradients)):
                self.V_gradients[i][j] = self.beta*self.V_gradients[i][j] + (1-self.beta)*gradients[j]
                
        for i in range(len(self.layers)-1,-1,-1):
            self.layers[i].apply_gradient(self.lr,self.V_gradients[i])

In [13]:
class RMSProp:
    def __init__(self,layers,lr=0.01,beta=0.9):
        self.lr = lr
        self.layers = layers
        self.beta = beta
        self.epsilon = 1e-8
        self.S_gradients = {i:layers[i].get_gradients() for i in range(len(layers))}
        
    def step(self,error):
        output_error = error
        final_gradients = {i:[] for i in range(len(layers))}
        
        for i in range(len(self.layers)-1,-1,-1):
            layer = self.layers[i]
            out = layer.compute_gradient(output_error)
            output_error = out['input_error']
            gradients = out['gradients']
            
            for j in range(len(gradients)):
                self.S_gradients[i][j] = self.beta*self.S_gradients[i][j] + (1-self.beta)*(gradients[j]**2)
                final_gradients[i].append(gradients[j]/(np.sqrt(self.S_gradients[i][j])) + self.epsilon)

        for i in range(len(self.layers)-1,-1,-1):
            self.layers[i].apply_gradient(self.lr,final_gradients[i])

In [14]:
class Adam:
    def __init__(self,layers,lr=0.01,beta_s=0.99,beta_v=0.9):
        self.lr = lr
        self.layers = layers
        
        self.beta_s = beta_s
        self.beta_v = beta_v
        
        self.t = 1
        
        self.epsilon = 1e-8
        
        self.S_gradients = {i:layers[i].get_gradients() for i in range(len(layers))}
        self.V_gradients = {i:layers[i].get_gradients() for i in range(len(layers))}

        
    def step(self,error):
        output_error = error
        final_gradients = {i:[] for i in range(len(layers))}
        
        for i in range(len(self.layers)-1,-1,-1):
            layer = self.layers[i]
            out = layer.compute_gradient(output_error)
            output_error = out['input_error']
            gradients = out['gradients']
            
            for j in range(len(gradients)):
                
                self.S_gradients[i][j] = (self.beta_s*self.S_gradients[i][j]) + ((1-self.beta_s)*(gradients[j]**2))
                corrected_s = self.S_gradients[i][j]/(1-(self.beta_s**self.t))

                self.V_gradients[i][j] = (self.beta_v*self.V_gradients[i][j]) + ((1-self.beta_v)*gradients[j])
                corrected_v = self.V_gradients[i][j]/(1-(self.beta_v**self.t))

                final_gradients[i].append(corrected_v/(np.sqrt(corrected_s) + self.epsilon))
            
        for i in range(len(self.layers)-1,-1,-1):
            self.layers[i].apply_gradient(self.lr,final_gradients[i])
        self.t += 1

<h3>Solving the Xor problem</h3>

In [15]:
X = np.array([[0,0],[1,1],[0,1],[1,0]])
y = np.array([0,0,1,1]).reshape(4,1)

<h4>Here we can see how Logistic Regression fails to solve the Xor Problem</h4>

In [16]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X,y.reshape(4,))
lr_pred = lr.predict(X)
print(lr_pred)

[0 0 0 0]


<h4>Now let's check our Neural Network</h4>

In [17]:
def train(model,optimizer,loss,epochs,print_every):
    for e in range(epochs):
        output = model.forward(X)
        error,error_gradient = loss.loss(y,output)
        optimizer.step(error_gradient)
        if e%print_every == 0:
            print("epoch: ", e , "\t Model error: ", np.round(error,5))

In [18]:
layers = [Linear(2,16), 
          ReLU(),
          Linear(16,8) ,
          ReLU(),
          Linear(8,1) ,
          Sigmoid()]

NN = NeuralNetwork(layers=layers)
optim = SimpleOptimizer(NN.layers,lr=0.01)
loss = CELoss()

In [19]:
train(NN,optim,loss,2_000,200)

epoch:  0 	 Model error:  0.67846
epoch:  200 	 Model error:  0.58271
epoch:  400 	 Model error:  0.35974
epoch:  600 	 Model error:  0.14253
epoch:  800 	 Model error:  0.05072
epoch:  1000 	 Model error:  0.02397
epoch:  1200 	 Model error:  0.01418
epoch:  1400 	 Model error:  0.00958
epoch:  1600 	 Model error:  0.00704
epoch:  1800 	 Model error:  0.00546


In [20]:
print("Prediction is : " , np.round(NN.forward(X).reshape(4,)).astype(int))
print("y is :          " , y.reshape(4,))

Prediction is :  [0 0 1 1]
y is :           [0 0 1 1]


<h3>Good Job!</h3>

<h3>Comparing SimpleOptimizer to MomentumOptimizer to RMSPropOptimizer vs Adam</h3>

<h4>Simple optimizer</h4>

In [21]:
layers = [Linear(2,16), 
          ReLU(),
          Linear(16,8) ,
          ReLU(),
          Linear(8,1) ,
          Sigmoid()]

NN = NeuralNetwork(layers=layers)
optim = SimpleOptimizer(NN.layers,lr=0.01)
loss = CELoss()
train(NN,optim,loss,2_000,200)

epoch:  0 	 Model error:  0.70924
epoch:  200 	 Model error:  0.61952
epoch:  400 	 Model error:  0.43136
epoch:  600 	 Model error:  0.18784
epoch:  800 	 Model error:  0.06411
epoch:  1000 	 Model error:  0.02912
epoch:  1200 	 Model error:  0.01672
epoch:  1400 	 Model error:  0.0111
epoch:  1600 	 Model error:  0.00808
epoch:  1800 	 Model error:  0.00622


<h4>Using Momentum</h4>

In [22]:
momentum_layers = [Linear(2,16), 
          ReLU(),
          Linear(16,8) ,
          ReLU(),
          Linear(8,1) ,
          Sigmoid()]

momentum_NN = NeuralNetwork(layers=momentum_layers)
momentum_optim = MomentumOptimizer(momentum_NN.layers,lr=0.01)
momentum_loss = CELoss()
train(momentum_NN,momentum_optim,momentum_loss,2_000,200)

epoch:  0 	 Model error:  0.70478
epoch:  200 	 Model error:  0.69316
epoch:  400 	 Model error:  0.69315
epoch:  600 	 Model error:  0.69315
epoch:  800 	 Model error:  0.69315
epoch:  1000 	 Model error:  0.69315
epoch:  1200 	 Model error:  0.69315
epoch:  1400 	 Model error:  0.69315
epoch:  1600 	 Model error:  0.69315
epoch:  1800 	 Model error:  0.69315


<h4>RMSProp</h4>

In [23]:
rmsprop_layers = [Linear(2,16), 
          ReLU(),
          Linear(16,8) ,
          ReLU(),
          Linear(8,1) ,
          Sigmoid()]

rmsprop_NN = NeuralNetwork(layers=rmsprop_layers)
rmsprop_optim = RMSProp(rmsprop_NN.layers,lr=0.01)
rmsprop_loss = CELoss()
train(rmsprop_NN,rmsprop_optim,rmsprop_loss,2_000,200)

epoch:  0 	 Model error:  0.69107
epoch:  200 	 Model error:  0.00035
epoch:  400 	 Model error:  0.0
epoch:  600 	 Model error:  0.0
epoch:  800 	 Model error:  0.0
epoch:  1000 	 Model error:  0.0
epoch:  1200 	 Model error:  0.0
epoch:  1400 	 Model error:  0.0
epoch:  1600 	 Model error:  0.0
epoch:  1800 	 Model error:  0.0


<h4>Adam</h4>

In [24]:
adam_layers = [Linear(2,16), 
          ReLU(),
          Linear(16,8) ,
          ReLU(),
          Linear(8,1) ,
          Sigmoid()]

adam_NN = NeuralNetwork(layers=adam_layers)
adam_optim = Adam(adam_NN.layers,lr=0.01)
adam_loss = CELoss()
train(adam_NN,adam_optim,adam_loss,2_000,200)

epoch:  0 	 Model error:  0.67777
epoch:  200 	 Model error:  0.55636
epoch:  400 	 Model error:  0.02015
epoch:  600 	 Model error:  0.0019
epoch:  800 	 Model error:  0.00039
epoch:  1000 	 Model error:  0.0001
epoch:  1200 	 Model error:  3e-05
epoch:  1400 	 Model error:  1e-05
epoch:  1600 	 Model error:  0.0
epoch:  1800 	 Model error:  0.0


<h3>We can see how the Models using RMSProp or Adam optimizers converge much faster!</h3>

<h4>Verifying that the model works with multiple outputs (we simply expanded y to 2 classes)</h4>

In [25]:
X = np.array([[0,0],[1,1],[0,1],[1,0]])
y = np.array([[1,0],[1,0],[0,1],[0,1]])

In [26]:
layers = [Linear(2,16), 
          ReLU(),
          Linear(16,8) ,
          ReLU(),
          Linear(8,2) ,
          Sigmoid()]

NN = NeuralNetwork(layers=layers)
optim = SimpleOptimizer(NN.layers,lr=0.1)
loss = MSELoss()
train(NN,optim,loss,1_000,200)

epoch:  0 	 Model error:  0.24924
epoch:  200 	 Model error:  0.0958
epoch:  400 	 Model error:  0.0225
epoch:  600 	 Model error:  0.0088
epoch:  800 	 Model error:  0.00428


In [27]:
print("Prediction is : \n" , np.round(NN.forward(X)).astype(int))
print("y is :          \n" , y)

Prediction is : 
 [[1 0]
 [1 0]
 [0 1]
 [0 1]]
y is :          
 [[1 0]
 [1 0]
 [0 1]
 [0 1]]


In [28]:
layers = [Linear(2,16), 
          ReLU(),
          Linear(16,8) ,
          ReLU(),
          Linear(8,2) ,
          Sigmoid()]

NN = NeuralNetwork(layers=layers)
optim = RMSProp(NN.layers,lr=0.1)
loss = CELoss()
train(NN,optim,loss,1_000,200)

epoch:  0 	 Model error:  0.69014
epoch:  200 	 Model error:  0.0
epoch:  400 	 Model error:  0.0
epoch:  600 	 Model error:  0.0
epoch:  800 	 Model error:  0.0


In [29]:
print("Prediction is : \n" , np.round(NN.forward(X)).astype(int))
print("y is :          \n" , y)

Prediction is : 
 [[1 0]
 [1 0]
 [0 1]
 [0 1]]
y is :          
 [[1 0]
 [1 0]
 [0 1]
 [0 1]]
