# Plain SGD

In [1]:
import numpy as np

In [2]:
class Optimizer_SGD():
    """Vanilla stochastic gradient descent with a fixed step size.

    Each update moves a layer's parameters against their gradients,
    scaled by ``learning_rate``.
    """

    def __init__(self, learning_rate):
        # Step size applied to every gradient; required, no default.
        self.learning_rate = learning_rate

    def update_params(self, layer):
        """Apply one gradient-descent step to ``layer`` in place.

        Reads ``layer.dweights`` / ``layer.dbiases`` and adds the scaled
        negative gradients onto ``layer.weights`` / ``layer.biases``.
        The parameters are presumably NumPy arrays, in which case the
        augmented assignment mutates them without rebinding.
        """
        step_scale = -self.learning_rate
        layer.weights += step_scale * layer.dweights
        layer.biases += step_scale * layer.dbiases

# SGD with learning rate decay
The learning rate is decayed as a function of the number of training iterations completed so far:
$$
\text{learning\_rate} = \text{learning\_rate} \times \left( \frac{1}{1 + \text{decay} \cdot \text{iterations}} \right)
$$


In [None]:
class Optimizer_SGD_Decay():
    """Stochastic gradient descent with inverse-time learning-rate decay.

    The step size used for iteration ``t`` is::

        current_learning_rate = learning_rate / (1 + decay * t)

    ``learning_rate`` always holds the initial (base) rate; the decayed
    value lives in ``current_learning_rate``.

    Expected call order per training step: ``pre_update_params()``, then
    ``update_params(layer)`` for each layer, then ``post_update_params()``.
    """

    def __init__(self, learning_rate=1, decay=0):
        # Base rate, never modified after construction.
        self.learning_rate = learning_rate
        # Effective rate used by update_params; refreshed every step.
        self.current_learning_rate = learning_rate
        self.decay = decay
        # Number of completed update steps.
        self.iterations = 0

    def pre_update_params(self):
        """Recompute the effective learning rate for the upcoming step."""
        if self.decay:
            # Always decay from the base rate. Decaying the previously
            # decayed value would compound the schedule every step.
            self.current_learning_rate = self.learning_rate * (
                1. / (1. + self.decay * self.iterations)
            )

    def update_params(self, layer):
        """Apply one gradient step to ``layer`` in place.

        Reads ``layer.dweights`` / ``layer.dbiases`` and updates
        ``layer.weights`` / ``layer.biases`` using the current
        (possibly decayed) learning rate.
        """
        layer.weights += -self.current_learning_rate * layer.dweights
        layer.biases += -self.current_learning_rate * layer.dbiases

    def post_update_params(self):
        """Advance the iteration counter once all layers are updated."""
        self.iterations += 1
    

# SGD with Momentum
$$
\text{momentum} = \text{momentum} \cdot \text{momentum coefficient} - \text{learning rate} \cdot \nabla J(\text{weights})
$$

$$
\text{weights} = \text{weights} + \text{momentum}
$$


In [5]:
class Optimizer_SGD_Momentum():
    """SGD with optional inverse-time decay and momentum.

    With a non-zero ``momentum`` coefficient each parameter keeps a
    running update vector::

        update  = momentum * previous_update - current_learning_rate * grad
        param  += update

    With ``momentum == 0`` this reduces to plain SGD. When ``decay`` is
    non-zero the step size for iteration ``t`` is
    ``learning_rate / (1 + decay * t)``; ``learning_rate`` itself keeps
    the initial value.

    Expected call order per training step: ``pre_update_params()``, then
    ``update_params(layer)`` for each layer, then ``post_update_params()``.
    """

    def __init__(self, learning_rate=1, decay=0, momentum=0):
        # Base rate, never modified after construction.
        self.learning_rate = learning_rate
        # Effective rate used by update_params; refreshed every step.
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.momentum = momentum
        # Number of completed update steps.
        self.iterations = 0

    def pre_update_params(self):
        """Recompute the effective learning rate for the upcoming step."""
        if self.decay:
            # Write to current_learning_rate (the value update_params
            # reads) and always decay from the untouched base rate.
            self.current_learning_rate = self.learning_rate * (
                1. / (1. + self.decay * self.iterations)
            )

    def update_params(self, layer):
        """Apply one (momentum) gradient step to ``layer`` in place.

        Reads ``layer.dweights`` / ``layer.dbiases`` and updates
        ``layer.weights`` / ``layer.biases``. When momentum is enabled,
        the per-layer state is stored on the layer itself as
        ``weight_momentums`` / ``bias_momentums``.
        """
        if self.momentum:
            # Lazily create the momentum buffers the first time this
            # layer is seen; they start at zero.
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                layer.bias_momentums = np.zeros_like(layer.biases)

            weight_updates = (
                self.momentum * layer.weight_momentums
                - self.current_learning_rate * layer.dweights
            )
            layer.weight_momentums = weight_updates

            bias_updates = (
                self.momentum * layer.bias_momentums
                - self.current_learning_rate * layer.dbiases
            )
            layer.bias_momentums = bias_updates
        else:
            weight_updates = -self.current_learning_rate * layer.dweights
            bias_updates = -self.current_learning_rate * layer.dbiases

        layer.weights += weight_updates
        layer.biases += bias_updates

    def post_update_params(self):
        """Advance the iteration counter once all layers are updated."""
        self.iterations += 1
        

# AdaGrad
Short for adaptive gradient is a method to 