In [1]:
import numpy as np

In [2]:
class Layer:
    def __init__(self):
        self.is_parameterized = False
    def forward(self,X):
        pass
    def compute_gradient(self,output_error):
        pass
    def apply_gradient(self,learning_rate,gradients=[]):
        pass
    def get_gradients(self):
        return []

In [3]:
class Linear(Layer):
    def __init__(self,input_size,output_size):
        super().__init__()
        self.input_data = None 
        self.output_data = None
        self.input_size = input_size
        self.output_size = output_size
        self.is_parameterized = True

        
        self.W = np.random.uniform(size=(self.input_size,self.output_size))-0.5
        self.b = np.random.uniform(size=(1,self.output_size))-0.5
        
        self.dW = np.ones(shape=(self.input_size,self.output_size))
        self.db = np.ones(shape=(1,self.output_size))

    def forward(self,X):
        self.input_data = X
        self.output_data = np.dot(X,self.W) + self.b
        return self.output_data

    def get_gradients(self):
        return [self.dW,self.db]

    def compute_gradient(self,output_error):
        self.dW = np.dot(self.input_data.transpose(),output_error)
        self.db = np.sum(output_error,axis=0).reshape(1,self.output_data.shape[1])
        return {"input_error" : np.dot(output_error,self.W.transpose()) , "gradients" : [self.dW,self.db]}
                

    def apply_gradient(self,learning_rate,gradients=None):
        if gradients is None:
            self.W = self.W - (learning_rate*self.dW)
            self.b = self.b - (learning_rate*self.db)
        else:
            dW = gradients[0]
            db = gradients[1]
            self.W = self.W - (learning_rate*dW)
            self.b = self.b - (learning_rate*db)

In [4]:
class ReLU(Layer):
    def __init__(self):
        super().__init__()
        self.input_data = None 

    def _relu(self,X):
        return X*(X>0)

    def _d_relu(self,X):
        return (X>0).astype(int)
        
    def forward(self,X):
        self.input_data = X
        return self._relu(X)

    def compute_gradient(self,output_error):
        return {'input_error' : self._d_relu(self.input_data) * output_error , 'gradients' : []}


In [5]:
class Sigmoid(Layer):
    def __init__(self):
        super().__init__()
        self.input_data = None 
        self.output_data = None
        
    def _sigmoid(self,X):
        return 1/(1+(np.exp(-X)))

    def _d_sigmoid(self,s):
        return s*(1-s)
            
    def forward(self,X):
        self.input_data = X
        self.output_data = self._sigmoid(X)
        return self.output_data
        
    def compute_gradient(self,output_error):
        return {'input_error' : self._d_sigmoid(self.output_data)*output_error , 'gradients' : []}

In [6]:
class NeuralNetwork:
    def __init__(self,layers):
        self.layers = layers

    def forward(self,X):
        output = X
        for layer in self.layers:
            output = layer.forward(output)
        return output

In [7]:
class MSELoss:
    def __init__(self):
        pass

    def mse(self,y_true,y_pred):
        out = y_pred - y_true
        out = out**2
        return np.mean(out)

    def compute_gradient(self,y_true,y_pred):
        out = (2/len(y_true))*(y_pred-y_true)
        #print(out.shape)
        return out

    def compute_loss(self,y_true,y_pred):
        return self.mse(y_true,y_pred)

    def loss(self,y_true,y_pred):
        return (self.compute_loss(y_true,y_pred),self.compute_gradient(y_true,y_pred))

In [8]:
class CELoss:
    def __init__(self, epsilon=1e-15):
        self.epsilon = epsilon

    def _clip_probabilities(self, y_pred):
        return np.clip(y_pred, self.epsilon, 1 - self.epsilon)

    def compute_loss(self, y_true, y_pred):
        y_pred = self._clip_probabilities(y_pred)
        return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

    def compute_gradient(self, y_true, y_pred):
        y_pred = self._clip_probabilities(y_pred)
        return (y_pred - y_true) / (y_pred * (1 - y_pred))

    def loss(self,y_true,y_pred):
        return (self.compute_loss(y_true,y_pred),self.compute_gradient(y_true,y_pred))

In [9]:
class SimpleOptimizer:
    def __init__(self,layers,lr=0.01):
        self.lr = lr
        self.layers = layers
    
    def step(self,error):
        output_error = error
        for i in range(len(self.layers)-1,-1,-1):
            output_error = self.layers[i].compute_gradient(output_error)['input_error']
            
        for i in range(1,len(self.layers)+1):
            self.layers[-i].apply_gradient(self.lr)

In [10]:
class MomentumOptimizer:
    def __init__(self,layers,lr=0.01,beta=0.9):
        self.lr = lr
        self.layers = layers
        self.beta = beta
        self.V_gradients = {i:layers[i].get_gradients() for i in range(len(layers))}
    
    def step(self,error):
        output_error = error
        for i in range(len(self.layers)-1,-1,-1):
            layer = self.layers[i]
            out = layer.compute_gradient(output_error)
            output_error = out['input_error']
            gradients = out['gradients']
            for j in range(len(gradients)):
                self.V_gradients[i][j] = self.beta*self.V_gradients[i][j] + (1-self.beta)*gradients[j]
                
        for i in range(len(self.layers)-1,-1,-1):
            self.layers[i].apply_gradient(self.lr,self.V_gradients[i])

In [11]:
class RMSProp:
    def __init__(self,layers,lr=0.01,beta=0.9):
        self.lr = lr
        self.layers = layers
        self.beta = beta
        self.epsilon = 1e-8
        self.S_gradients = {i:layers[i].get_gradients() for i in range(len(layers))}
        
    def step(self,error):
        output_error = error
        final_gradients = {i:[] for i in range(len(layers))}
        
        for i in range(len(self.layers)-1,-1,-1):
            layer = self.layers[i]
            out = layer.compute_gradient(output_error)
            output_error = out['input_error']
            gradients = out['gradients']
            
            for j in range(len(gradients)):
                self.S_gradients[i][j] = self.beta*self.S_gradients[i][j] + (1-self.beta)*(gradients[j]**2)
                final_gradients[i].append(gradients[j]/(np.sqrt(self.S_gradients[i][j])) + self.epsilon)

        for i in range(len(self.layers)-1,-1,-1):
            self.layers[i].apply_gradient(self.lr,final_gradients[i])

<h3>Solving the Xor problem</h3>

In [12]:
X = np.array([[0,0],[1,1],[0,1],[1,0]])
y = np.array([0,0,1,1]).reshape(4,1)

<h4>Here we can see how Logistic Regression fails to solve the Xor Problem</h4>

In [13]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X,y.reshape(4,))
lr_pred = lr.predict(X)
print(lr_pred)

[0 0 0 0]


<h4>Now let's check our Neural Network</h4>

In [14]:
layers = [Linear(2,16), 
          ReLU(),
          Linear(16,8) ,
          ReLU(),
          Linear(8,1) ,
          Sigmoid()]

NN = NeuralNetwork(layers=layers)
optim = SimpleOptimizer(NN.layers,lr=0.01)
loss = CELoss()

In [15]:
for e in range(1_000):
    output = NN.forward(X)
    error,error_gradient = loss.loss(y,output)
    optim.step(error_gradient)
    if e%100 == 0:
        print("epoch: ", e , "\n \t train error: ", np.round(error,5))

epoch:  0 
 	 train error:  0.68313
epoch:  100 
 	 train error:  0.62579
epoch:  200 
 	 train error:  0.53624
epoch:  300 
 	 train error:  0.39467
epoch:  400 
 	 train error:  0.22658
epoch:  500 
 	 train error:  0.11362
epoch:  600 
 	 train error:  0.06189
epoch:  700 
 	 train error:  0.03794
epoch:  800 
 	 train error:  0.02596
epoch:  900 
 	 train error:  0.01886


In [16]:
print("Prediction is : " , np.round(NN.forward(X).reshape(4,)).astype(int))
print("y is :          " , y.reshape(4,))

Prediction is :  [0 0 1 1]
y is :           [0 0 1 1]


<h3>Good Job!</h3>

<h3>Comparing regular optimizer to Momentum Optimizer to RMSProp Optimizer</h3>

In [17]:
layers = [Linear(2,16), 
          ReLU(),
          Linear(16,8) ,
          ReLU(),
          Linear(8,1) ,
          Sigmoid()]

NN = NeuralNetwork(layers=layers)
optim = SimpleOptimizer(NN.layers,lr=0.01)
loss = CELoss()

momentum_layers = [Linear(2,16), 
          ReLU(),
          Linear(16,8) ,
          ReLU(),
          Linear(8,1) ,
          Sigmoid()]

momentum_NN = NeuralNetwork(layers=momentum_layers)
momentum_optim = MomentumOptimizer(momentum_NN.layers,lr=0.01)
momentum_loss = CELoss()

rmsprop_layers = [Linear(2,16), 
          ReLU(),
          Linear(16,8) ,
          ReLU(),
          Linear(8,1) ,
          Sigmoid()]

rmsprop_NN = NeuralNetwork(layers=rmsprop_layers)
rmsprop_optim = RMSProp(rmsprop_NN.layers,lr=0.01)
rmsprop_loss = CELoss()

In [18]:
for e in range(1_000):
    output = NN.forward(X)
    error,error_gradient = loss.loss(y,output)
    optim.step(error_gradient)

    momentum_output = momentum_NN.forward(X)
    momentum_error,momentum_error_gradient = loss.loss(y,momentum_output)
    momentum_optim.step(momentum_error_gradient)
    
    rmsprop_output = rmsprop_NN.forward(X)
    rmsprop_error,rmsprop_error_gradient = loss.loss(y,rmsprop_output)
    rmsprop_optim.step(rmsprop_error_gradient)
    
    if e%50 == 0:
        print("epoch: ", e , "\n \t Model with regular optimizer train error: ", np.round(error,5), 
              "\n \t Model with momentum optimizer train error: ", np.round(momentum_error,5),
        "\n \t Model with rmsprop optimizer train error: ", np.round(rmsprop_error,5))

epoch:  0 
 	 Model with regular optimizer train error:  0.71302 
 	 Model with momentum optimizer train error:  0.69004 
 	 Model with rmsprop optimizer train error:  0.69658
epoch:  50 
 	 Model with regular optimizer train error:  0.69395 
 	 Model with momentum optimizer train error:  0.65487 
 	 Model with rmsprop optimizer train error:  0.61106
epoch:  100 
 	 Model with regular optimizer train error:  0.68529 
 	 Model with momentum optimizer train error:  0.62673 
 	 Model with rmsprop optimizer train error:  0.16216
epoch:  150 
 	 Model with regular optimizer train error:  0.67623 
 	 Model with momentum optimizer train error:  0.59469 
 	 Model with rmsprop optimizer train error:  0.01128
epoch:  200 
 	 Model with regular optimizer train error:  0.66257 
 	 Model with momentum optimizer train error:  0.55294 
 	 Model with rmsprop optimizer train error:  0.00067
epoch:  250 
 	 Model with regular optimizer train error:  0.64865 
 	 Model with momentum optimizer train error:

<h3>We can see how the Model using the RMSProp optimizer converges faster!</h3>

<h4>Verifying that the model works with multiple outputs (we simply expanded y to 2 classes)</h4>

In [19]:
X = np.array([[0,0],[1,1],[0,1],[1,0]])
y = np.array([[1,0],[1,0],[0,1],[0,1]])

In [20]:
layers = [Linear(2,16), 
          ReLU(),
          Linear(16,8) ,
          ReLU(),
          Linear(8,2) ,
          Sigmoid()]

NN = NeuralNetwork(layers=layers)
optim = SimpleOptimizer(NN.layers,lr=0.1)
loss = MSELoss()

In [21]:
for e in range(1_000):
    output = NN.forward(X)
    error,error_gradient = loss.loss(y,output)
    optim.step(error_gradient)
    if e%100 == 0:
        print("epoch: ", e , "\n \t train error: ", np.round(error,5))

epoch:  0 
 	 train error:  0.28318
epoch:  100 
 	 train error:  0.22913
epoch:  200 
 	 train error:  0.14376
epoch:  300 
 	 train error:  0.03114
epoch:  400 
 	 train error:  0.00906
epoch:  500 
 	 train error:  0.00433
epoch:  600 
 	 train error:  0.00264
epoch:  700 
 	 train error:  0.00183
epoch:  800 
 	 train error:  0.00137
epoch:  900 
 	 train error:  0.00108


In [22]:
print("Prediction is : \n" , np.round(NN.forward(X)).astype(int))
print("y is :          \n" , y)

Prediction is : 
 [[1 0]
 [1 0]
 [0 1]
 [0 1]]
y is :          
 [[1 0]
 [1 0]
 [0 1]
 [0 1]]


In [23]:
layers = [Linear(2,16), 
          ReLU(),
          Linear(16,8) ,
          ReLU(),
          Linear(8,2) ,
          Sigmoid()]

NN = NeuralNetwork(layers=layers)
optim = RMSProp(NN.layers,lr=0.1)
loss = CELoss()

In [24]:
for e in range(1_000):
    output = NN.forward(X)
    error,error_gradient = loss.loss(y,output)
    optim.step(error_gradient)
    if e%100 == 0:
        print("epoch: ", e , "\n \t train error: ", np.round(error,5))

epoch:  0 
 	 train error:  0.71649
epoch:  100 
 	 train error:  9e-05
epoch:  200 
 	 train error:  0.0
epoch:  300 
 	 train error:  0.0
epoch:  400 
 	 train error:  0.0
epoch:  500 
 	 train error:  0.0
epoch:  600 
 	 train error:  0.0
epoch:  700 
 	 train error:  0.0
epoch:  800 
 	 train error:  0.0
epoch:  900 
 	 train error:  0.0


In [25]:
print("Prediction is : \n" , np.round(NN.forward(X)).astype(int))
print("y is :          \n" , y)

Prediction is : 
 [[1 0]
 [1 0]
 [0 1]
 [0 1]]
y is :          
 [[1 0]
 [1 0]
 [0 1]
 [0 1]]
