# Gradient Descent Optimization Algorithms
In this notebook you can find a collection of gradient descent (GD) based optimization algorithms used for deep learning. Each algorithm is accompanied by an explanatory YouTube video, linked here:
- [Stochastic Gradient Descent](https://youtu.be/IH9kqpMORLM)
- [Stochastic Gradient Descent + Momentum](https://youtu.be/7EuiXb6hFAM)
- [Adagrad](https://youtu.be/EGt-UOIIdDk)

## Tests
In order to demonstrate the algorithms' capability to optimize a function, we use this simple test setup:
- learning various linear functions of the form `f(x) = w0 + w1*x` with the squared error. This is a simple sanity check, as the gradients are simple to calculate and the test data is also easy to generate.

In [38]:
import numpy as np
from numpy.random import permutation

class Line:
    """
        Line: a linear model yhat = w0 + w1*x
        w0: the intercept, stored as a one element numpy array
        w1: the slope, stored as a one element numpy array
    """
    def __init__(self):
        # Both weights start at a random value drawn uniformly from [0, 1)
        self.w0 = np.random.uniform(low=0, high=1, size=1)
        self.w1 = np.random.uniform(low=0, high=1, size=1)

    def evaluate(self,x):
        """
            evaluate: compute the prediction yhat of the line at x
            x: a point in the plane

            return w0 + w1*x
        """
        slope_term = self.w1*x
        return self.w0 + slope_term

    def dx_w0(self, x, y):
        """
            dx_w0: partial derivative of the squared error (yhat - y)^2
            with respect to the weight w0
            x: a point in the plane
            y: the response of the point x

            return the gradient for w0 at this x and y
        """
        residual = self.evaluate(x) - y
        return 2*residual

    def dx_w1(self, x, y):
        """
            dx_w1: partial derivative of the squared error (yhat - y)^2
            with respect to the weight w1
            x: a point in the plane
            y: the response of the point x

            return the gradient for w1 at this x and y
        """
        residual = self.evaluate(x) - y
        return 2*x*residual

    def __str__(self):
        # The weights are one element arrays, so index 0 gives the bare number
        return f"y = {self.w0[0]} + {self.w1[0]}*x"
    
    
#################### Helper functions ######################
def stochastic_sample(xs, ys):
    """
        stochastic_sample: pick one x and its matching y uniformly at random
        (successive calls are independent, i.e. sampling with replacement)
        xs: all point on the plane
        ys: all response on the plane, ys[i] being the response of xs[i]
        
        return the randomly selected x and y point

        raises IndexError when xs is empty
    """
    n = len(xs)
    if n == 0:
        raise IndexError("cannot sample from an empty data set")

    # Only one index is needed, so draw it directly instead of building and
    # shuffling a whole permutation of the data on every call.
    idx = np.random.randint(n)

    return xs[idx], ys[idx]
    
    
def gradient(dx, xs, ys):
    """
        gradient: estimate the mean gradient over all point for one weight
        dx: partial derivative function used to evaluate the gradient,
            called as dx(x, y)
        xs: all point on the plane
        ys: all response on the plane
        
        return the mean of dx(x, y) over all paired x and y

        raises ZeroDivisionError when there is no (x, y) pair to average
    """
    # Pair the data once so the divisor is exactly the number of gradients
    # that were summed; zip stops at the shorter input, so len(ys) alone
    # would give a wrong mean whenever ys is longer than xs.
    pairs = list(zip(xs, ys))

    total = sum(dx(x, y) for x, y in pairs)

    return total/len(pairs)

################## Optimization Functions #####################

def gd(model, xs, ys, learning_rate = 0.01, max_num_iteration = 1000):
    """
        gd: will estimate the parameters w0 and w1 with batch gradient descent
        (here it uses least square cost function)
        model: the model we are trying to optimize using gradient descent;
               it must expose w0, w1, dx_w0 and dx_w1
        xs: all point on the plane
        ys: all response on the plane
        learning_rate: the learning rate for the step that weights update will take
        max_num_iteration: the number of iteration before we stop updating

        The model is updated in place and nothing is returned. The iteration
        number and the model are printed every 100 iterations.
    """    

    for i in range(max_num_iteration):
        # Evaluate both partial derivatives at the current weights before
        # changing either of them, so that the step for w1 is not computed
        # from an already updated w0.
        g0 = gradient(model.dx_w0, xs, ys)
        g1 = gradient(model.dx_w1, xs, ys)

        model.w0 = model.w0 - learning_rate*g0
        model.w1 = model.w1 - learning_rate*g1
        
        if i % 100 == 0:
            print(f"Iteration {i}")
            print(model)
            
def sgd(model, xs, ys, learning_rate = 0.01, max_num_iteration = 1000):
    """
        sgd: will estimate the parameters w0 and w1 with stochastic gradient
        descent, using one randomly selected point per iteration
        (here it uses least square cost function)
        model: the model we are trying to optimize using sgd;
               it must expose w0, w1, dx_w0 and dx_w1
        xs: all point on the plane
        ys: all response on the plane
        learning_rate: the learning rate for the step that weights update will take
        max_num_iteration: the number of iteration before we stop updating

        The model is updated in place and nothing is returned. The iteration
        number and the model are printed every 100 iterations.
    """       
    
    for i in range(max_num_iteration):
        
        # Select a random x and y
        x, y = stochastic_sample(xs, ys)
        
        # Evaluate both partial derivatives at the current weights before
        # changing either of them, so that the step for w1 is not computed
        # from an already updated w0.
        g0 = model.dx_w0(x, y)
        g1 = model.dx_w1(x, y)

        # Updating the model parameters
        model.w0 = model.w0 - learning_rate*g0
        model.w1 = model.w1 - learning_rate*g1
        
        if i % 100 == 0:
            print(f"Iteration {i}")
            print(model)
            
def sgd_momentum(model, xs, ys, learning_rate = 0.01, decay_factor = 0.9, max_num_iteration = 1000):
    """
        sgd_momentum: will estimate the parameters w0 and w1 with stochastic
        gradient descent plus a momentum term
        (here it uses least square cost function)
        model: the model we are trying to optimize; it must expose
               w0, w1, dx_w0 and dx_w1
        xs: all point on the plane
        ys: all response on the plane
        learning_rate: the learning rate for the step that weights update will take
        decay_factor: how much of the previous step is carried over into the current step
        max_num_iteration: the number of iteration before we stop updating

        The model is updated in place and nothing is returned. The iteration
        number and the model are printed every 100 iterations.
    """

    # Running step of each weight; it doubles as the memory of the previous step
    step_w0 = 0
    step_w1 = 0

    for i in range(max_num_iteration):

        # One random training point drives this iteration
        x, y = stochastic_sample(xs, ys)

        # New step = decayed previous step minus the scaled current gradient.
        # Both steps are computed before any weight is modified.
        step_w0 = decay_factor*step_w0 - learning_rate*model.dx_w0(x,y)
        step_w1 = decay_factor*step_w1 - learning_rate*model.dx_w1(x,y)

        # Move the weights along their steps
        model.w0 = model.w0 + step_w0
        model.w1 = model.w1 + step_w1

        if i % 100 != 0:
            continue
        print(f"Iteration {i}")
        print(model)
            
            
def adagrad(model, xs, ys, learning_rate = 0.1, max_num_iteration = 10000, eps=0.0000001):
    """
        adagrad: will estimate the parameters w0 and w1 with Adagrad, where
        each weight's step is scaled by its own accumulated squared gradients
        (here it uses least square cost function)
        model: the model we are trying to optimize; it must expose
               w0, w1, dx_w0 and dx_w1
        xs: all point on the plane
        ys: all response on the plane
        learning_rate: the learning rate for the step that weights update will take
        max_num_iteration: the number of iteration before we stop updating
        eps: is a numerical safety to avoid division by 0

        The model is updated in place and nothing is returned. The iteration
        number and the model are printed every 100 iterations.
    """
    # Sum of squared gradients seen so far, one accumulator per weight
    sq_sum_w0 = 0
    sq_sum_w1 = 0

    for i in range(max_num_iteration):

        # One random training point drives this iteration
        x, y = stochastic_sample(xs, ys)

        grad_w0 = model.dx_w0(x, y)
        grad_w1 = model.dx_w1(x, y)

        sq_sum_w0 = sq_sum_w0 + grad_w0*grad_w0
        sq_sum_w1 = sq_sum_w1 + grad_w1*grad_w1

        # Per weight learning rate: shrinks as that weight's squared gradients pile up
        rate_w0 = learning_rate/np.sqrt(sq_sum_w0 + eps)
        rate_w1 = learning_rate/np.sqrt(sq_sum_w1 + eps)

        model.w0 = model.w0 - rate_w0 * grad_w0
        model.w1 = model.w1 - rate_w1 * grad_w1

        if i % 100 != 0:
            continue
        print(f"Iteration {i}")
        print(model)

In [39]:
# Target line: intercept = 0 and slope = 1, so every response equals its x
xs = list(range(1, 8))
ys = list(xs)

# Fit a freshly initialised model with each optimizer in turn and print
# the line it ends up with
for title, optimize in (("Gradient Descent: ", gd),
                        ("Stochastic Gradient Descent: ", sgd)):
    model = Line()
    print(title)
    optimize(model, xs, ys)
    print(model)

Gradient Descent: 
Iteration 0
y = 0.1484230627743015 + 0.6232288191403248*x
Iteration 100
y = 0.14660574025182332 + 0.9705053945109118*x
Iteration 200
y = 0.0991315358376387 + 0.9800564047762639*x
Iteration 300
y = 0.06703053632585736 + 0.9865145851638741*x
Iteration 400
y = 0.04532455552278566 + 0.9908814628825817*x
Iteration 500
y = 0.03064745481598692 + 0.9938342483214537*x
Iteration 600
y = 0.020723126258254662 + 0.9958308560603568*x
Iteration 700
y = 0.01401251635720002 + 0.9971809177379064*x
Iteration 800
y = 0.009474951424502668 + 0.9980937993709253*x
Iteration 900
y = 0.006406751093678868 + 0.9987110695962501*x
y = 0.004349087072231295 + 0.9991250369377569*x
Stochastic Gradient Descent: 
Iteration 0
y = 0.5688755138166692 + 0.5107534412165847*x
Iteration 100
y = 0.4425741941184875 + 0.9361376701582683*x
Iteration 200
y = 0.314871672875429 + 0.9469777307465838*x
Iteration 300
y = 0.21571168770394028 + 0.9649569866226333*x
Iteration 400
y = 0.15004050569910096 + 0.96965671489744

In [None]:
# Here we have a simple line with intercept = 0 and slope = 2
xs = [1,2,3,4,5,6,7]
ys = [2,4,6,8,10,12,14]

# Each optimizer starts from its own randomly initialised model and the
# fitted line is printed once it has finished.

# Gradient Descent (the function defined in this notebook is named gd)
model = Line()
print("Gradient Descent: ")
gd(model, xs, ys)
print(model)

# Stochastic Gradient Descent (the function defined in this notebook is named sgd)
model = Line()
print("Stochastic Gradient Descent: ")
sgd(model, xs, ys)
print(model)

# Stochastic Gradient Descent with Momentum
model = Line()
print("SGD + Momentum: ")
sgd_momentum(model, xs, ys)
print(model)

# Adagrad
model = Line()
print("Adagrad")
adagrad(model, xs, ys)
print(model)

In [22]:
# Here we have a simple line with intercept = 1 and slope = 2
xs = [1,2,3,4,5,6,7]
ys = [3,5,7,9,11,13,15]

# Each optimizer starts from its own randomly initialised model and the
# fitted line is printed once it has finished.

# Gradient Descent (the function defined in this notebook is named gd)
model = Line()
print("Gradient Descent: ")
gd(model, xs, ys)
print(model)

# Stochastic Gradient Descent (the function defined in this notebook is named sgd)
model = Line()
print("Stochastic Gradient Descent: ")
sgd(model, xs, ys)
print(model)

# Stochastic Gradient Descent with Momentum
model = Line()
print("SGD + Momentum: ")
sgd_momentum(model, xs, ys)
print(model)

# Adagrad
model = Line()
print("Adagrad")
adagrad(model, xs, ys)
print(model)

Gradient Descent: 
Iteration 0
y = 0.665189371668105 + 0.9077676569256095*x
Iteration 100
y = 0.9281890927973836 + 2.0144471449352293*x
Iteration 200
y = 0.9514430641756475 + 2.009768837587384*x
Iteration 300
y = 0.96716688162708 + 2.006605470370549*x
Iteration 400
y = 0.9777989767313627 + 2.004466471924208*x
Iteration 500
y = 0.9849881626053194 + 2.0030201288221177*x
Iteration 600
y = 0.9898493299503595 + 2.002042143834544*x
Iteration 700
y = 0.9931363430239945 + 2.001380852171081*x
Iteration 800
y = 0.9953589480444256 + 2.0009337014788704*x
Iteration 900
y = 0.9968618240495349 + 2.0006313481413165*x
y = 0.9978697158267908 + 2.000428577292816*x
Stochastic Gradient Descent: 
Iteration 0
y = 0.3411892027614143 + 1.770977165160403*x
Iteration 100
y = 0.6039940977243495 + 2.102681740225482*x
Iteration 200
y = 0.7205112221946766 + 2.0588950933611394*x
Iteration 300
y = 0.8011732178881769 + 2.0392396655904825*x
Iteration 400
y = 0.8719426224199744 + 2.024874311328534*x
Iteration 500
y = 0.9