# Gradient Descent Optimization Algorithms
In this notebook you can find a collection of GD based optimization algorithm used for deep learning. The code is always accompanied by a explanatory youtube video which are linked here:
- [Stochastic Gradient Descent](https://youtu.be/IH9kqpMORLM)
- [Stochastic Gradient Descent + Momentum](https://youtu.be/7EuiXb6hFAM)
- [Adagrad](https://youtu.be/EGt-UOIIdDk)
- [RMSprop](https://youtu.be/nLCuzsQaAKE)
- AdaDelta

## Tests
In order to demonstrate the algorithms capabilities to optimize a function we used these simple test setup:
- learning various linear function of the form `f(x) = w0 + w1*x` with the squared error. This is a simple sanity check as the gradient are simple to calculate and the test data is also easy to generate.

In [5]:
import numpy as np
from numpy.random import permutation

class Line():
    def __init__(self):
        self.w0 = np.random.uniform(0,1,1)
        self.w1 = np.random.uniform(0,1,1)
    
    def evaluate(self,x):
        """
            evaluate: will evaluate the line yhate given x
            x: a point in the plane

            return the result of the function evalutation
        """
        return self.w0 + self.w1*x
    
    def dx_w0(self, x, y):
        """
            dx_w0: partial derivative of the weight w0
            x: a point in the plane
            y: the response of the point x

            return the gradient at that point for this x and y for w0
        """
        yhat = self.evaluate(x)
        return 2*(yhat - y)
        
    
    def dx_w1(self, x, y):
        """
            dx_w1: partial derivative of the weight w1 for a linear function
            x: a point in the plane
            y: the response of the point x

            return the gradient at that point for this x and y for w1
        """  
        yhat = self.evaluate(x)
        return 2*x*(yhat - y)

    def __str__(self):
        return f"y = {self.w0[0]} + {self.w1[0]}*x"
    
    
#################### Helper functions ######################
def stochastic_sample(xs, ys):
    """
        stochastic_sample: sample with replacement one x and one y
        xs: all point on the plane
        ys: all response on the plane
        
        return the randomly selected x and y point
    """
    perm = permutation(len(xs))
    x = xs[perm[0]]
    y = ys[perm[0]]

    return x, y
    
    
def gradient(dx, xs, ys):
    """
        gradient: estimate mean gradient over all point for w1
        dx: partial derivative function used to evaluate the gradient
        xs: all point on the plane
        ys: all response on the plane
        
        return the mean gradient all x and y for w1
    """         
    N = len(ys)
    
    total = 0
    for x,y in zip(xs,ys):
        total = total + dx(x, y)
    
    gradient = total/N
    return gradient

################## Optimization Functions #####################

def gd(model, xs, ys, learning_rate = 0.01, max_num_iteration = 1000):
    """
        gd: will estimate the parameters w1 and w2 (here it uses least square cost function)
        model: the model we are trying to optimize using gradient descent
        xs: all point on the plane
        ys: all response on the plane
        learning_rate: the learning rate for the step that weights update will take
        max_num_iteration: the number of iteration before we stop updating
    """    

    for i in range(max_num_iteration):
        model.w0 = model.w0 - learning_rate*gradient(model.dx_w0, xs, ys)
        model.w1 = model.w1 - learning_rate*gradient(model.dx_w1, xs, ys)
        
        if i % 100 == 0:
            print(f"Iteration {i}")
            print(model)
            
def sgd(model, xs, ys, learning_rate = 0.01, max_num_iteration = 1000):
    """
        sgd: will estimate the parameters w0 and w1 
        (here it uses least square cost function)
        model: the model we are trying to optimize using sgd
        xs: all point on the plane
        ys: all response on the plane
        learning_rate: the learning rate for the step that weights update will take
        max_num_iteration: the number of iteration before we stop updating
    """       
    
    for i in range(max_num_iteration):
        
        # Select a random x and y
        x, y = stochastic_sample(xs, ys)
        
        # Updating the model parameters
        model.w0 = model.w0 - learning_rate*model.dx_w0(x, y)
        model.w1 = model.w1 - learning_rate*model.dx_w1(x, y)
        
    
        if i % 100 == 0:
            print(f"Iteration {i}")
            print(model)
            
def sgd_momentum(model, xs, ys, learning_rate = 0.01, decay_factor = 0.9, max_num_iteration = 1000):
    """
        sgd_momentum: will estimate the parameters w0 and w1 
        (here it uses least square cost function)
        model: the model we are trying to optimize using sgd
        xs: all point on the plane
        ys: all response on the plane
        learning_rate: the learning rate for the step that weights update will take
        decay_factor: determines the relative contribution of the current gradient and earlier gradients to the weight change
        max_num_iteration: the number of iteration before we stop updating
    """
    
    # These are needed to keep track of the previous gradient
    prev_g0 = 0
    prev_g1 = 0
    
    for i in range(max_num_iteration):
        
        # Select a random x and y
        x, y = stochastic_sample(xs, ys)

        g0 = decay_factor*prev_g0 - learning_rate*model.dx_w0(x,y)
        g1 = decay_factor*prev_g1 - learning_rate*model.dx_w1(x,y)
        
        # Updating the model parameters
        model.w0 = model.w0 + g0
        model.w1 = model.w1 + g1
        
        # swap previous gradient
        prev_g0, prev_g1 = g0, g1
    
        if i % 100 == 0:
            print(f"Iteration {i}")
            print(model)
            
            
def adagrad(model, xs, ys, learning_rate = 0.1, max_num_iteration = 10000, eps=0.0000001):
    """
        adagrad: will estimate the parameters w0 and w1 
        (here it uses least square cost function)
        model: the model we are trying to optimize using sgd
        xs: all point on the plane
        ys: all response on the plane
        learning_rate: the learning rate for the step that weights update will take
        max_num_iteration: the number of iteration before we stop updating
        eps: is a numerical safety to avoid division by 0
    """         
    # Here only the diagonal matter
    G = [[0,0],
         [0,0]]
    
    for i in range(max_num_iteration):
        
        # Select a random x and y
        x, y = stochastic_sample(xs, ys)
        
        g0 = model.dx_w0(x, y)
        g1 = model.dx_w1(x, y)
        
        G[0][0] = G[0][0] + g0*g0
        G[1][1] = G[1][1] + g1*g1
        
        model.w0 = model.w0 - (learning_rate/np.sqrt(G[0][0] + eps)) * g0
        model.w1 = model.w1 - (learning_rate/np.sqrt(G[1][1] + eps)) * g1
    
        if i % 100 == 0:
            print(f"Iteration {i}")
            print(model)
            
def RMSprop(model, xs, ys, learning_rate = 0.01, decay_factor = 0.9, max_num_iteration = 10000, eps=0.0000001):
    """
        RMSprop: will estimate the parameters w0 and w1 
        (here it uses least square cost function)
        model: the model we are trying to optimize using sgd
        xs: all point on the plane
        ys: all response on the plane
        learning_rate: the learning rate for the step that weights update will take
        decay_factor: the parameter used in the running averaging
        max_num_iteration: the number of iteration before we stop updating
        eps: is a numerical safety to avoid division by 0
    """         
    
    # Running average
    E = [0,0]
    
    for i in range(max_num_iteration):
        
        # Select a random x and y
        x, y = stochastic_sample(xs, ys)
        
        g0 = model.dx_w0(x, y)
        g1 = model.dx_w1(x, y)
        
        E[0] = decay_factor*E[0] + (1-decay_factor)*g0*g0
        E[1] = decay_factor*E[1] + (1-decay_factor)*g1*g1
        
        model.w0 = model.w0 - (learning_rate/np.sqrt(E[0] + eps)) * g0
        model.w1 = model.w1 - (learning_rate/np.sqrt(E[1] + eps)) * g1
    
        if i % 100 == 0:
            print(f"Iteration {i}")
            print(model)

def adadelta(model, xs, ys, decay_factor = 0.9, max_num_iteration = 10000, eps=0.0000001):
    """
        Adadelta: will estimate the parameters w0 and w1
        model: the model we are trying to optimize using sgd
        xs: all point on the plane
        ys: all response on the plane
        decay_factor: the parameter used in the running averaging
        max_num_iteration: the number of iteration before we stop updating
        eps: is a numerical safety to avoid division by 0
    """         
    
    # Running average
    E_g = [0,0] # for gradient
    E_p = [0,0] # for parameters
    delta_p = [0,0] #delta for parameter
    
    for i in range(max_num_iteration):
        
        # Select a random x and y
        x, y = stochastic_sample(xs, ys)
        
        g0 = model.dx_w0(x, y)
        g1 = model.dx_w1(x, y)
        
        # Get the running average for the gradient
        E_g[0] = decay_factor*E_g[0] + (1-decay_factor)*g0*g0
        E_g[1] = decay_factor*E_g[1] + (1-decay_factor)*g1*g1
        
        # Get the running average for the parameters
        E_p[0] = decay_factor*E_p[0] + (1-decay_factor)*delta_p[0]*delta_p[0]
        E_p[1] = decay_factor*E_p[1] + (1-decay_factor)*delta_p[1]*delta_p[1]
        
        # Calculate the gradient difference
        delta_p[0] = - np.sqrt(E_p[0] + eps) / np.sqrt(E_g[0] + eps) * g0
        delta_p[1] = - np.sqrt(E_p[1] + eps) / np.sqrt(E_g[1] + eps) * g1
        
        # update the models
        model.w0 = model.w0 + delta_p[0]
        model.w1 = model.w1 + delta_p[1]
        
        
        if i % 100 == 0:
            print(f"Iteration {i}")
            print(model)

In [6]:
# Here we have a simple line with intercept = 0 and slope = 1
xs = [1,2,3,4,5,6,7]
ys = [1,2,3,4,5,6,7]

# Gradient Descent
model = Line()
print("Gradient Descent: ")
gd(model, xs, ys)
print(model)

# Stochastic Gradient Descent
model = Line()
print("Stochastic Gradient Descent: ")
sgd(model, xs, ys)
print(model)

# Stochastic Gradient Descent with Momentum
model = Line()
print("SGD + Momentum: ")
sgd_momentum(model, xs, ys)
print(model)

# Adagrad
model = Line()
print("Adagrad")
adagrad(model, xs, ys)
print(model)

# RMSprop
model = Line()
print("RMSprop")
RMSprop(model, xs, ys)
print(model)

# Adadelta
model = Line()
print("Adadelta")
adadelta(model, xs, ys)
print(model)

Gradient Descent: 
Iteration 0
y = 0.15080809254765393 + 0.6790908370317389*x
Iteration 100
y = 0.14070772861001737 + 0.9716919751062303*x
Iteration 200
y = 0.09514343174678791 + 0.980858744143106*x
Iteration 300
y = 0.06433386917675844 + 0.9870571091711977*x
Iteration 400
y = 0.04350112926625649 + 0.9912483055313238*x
Iteration 500
y = 0.029414494599109095 + 0.9940822991489187*x
Iteration 600
y = 0.01988942602444484 + 0.9959985825044247*x
Iteration 700
y = 0.013448786830212731 + 0.9972943306231896*x
Iteration 800
y = 0.009093770075728168 + 0.9981704866376216*x
Iteration 900
y = 0.006149004756654367 + 0.9987629238177406*x
y = 0.00417412143896754 + 0.9991602370760215*x
Stochastic Gradient Descent: 
Iteration 0
y = 0.15993686766783655 + 0.1833518650965968*x
Iteration 100
y = 0.21828157996824554 + 0.9594516741177225*x
Iteration 200
y = 0.1451844200607905 + 0.9741557959554868*x
Iteration 300
y = 0.1023440101887112 + 0.9800172156010936*x
Iteration 400
y = 0.07450363961602721 + 0.98933111138

Iteration 1200
y = -0.0007664471045705575 + 1.0002642184776036*x
Iteration 1300
y = -0.021958706880734306 + 1.003498640175805*x
Iteration 1400
y = -0.015584512173962473 + 0.9848432560044733*x
Iteration 1500
y = -0.00013040805631450714 + 0.9996115058478232*x
Iteration 1600
y = 0.002924356459606971 + 1.0017756413448418*x
Iteration 1700
y = 0.00028922302118505097 + 0.999895063767121*x
Iteration 1800
y = 0.004173454138749314 + 0.9977761099422883*x
Iteration 1900
y = 0.008285548513884912 + 0.9972693671576253*x
Iteration 2000
y = 0.0037519184753459795 + 0.9991495583156801*x
Iteration 2100
y = -0.0009917925918777797 + 1.0006309126324515*x
Iteration 2200
y = -0.005797292073705403 + 1.001185732126061*x
Iteration 2300
y = -0.0024461824844422613 + 1.0008023430416744*x
Iteration 2400
y = -0.0018481187011078282 + 0.996790630504706*x
Iteration 2500
y = 0.015076565167656184 + 0.9940245585355336*x
Iteration 2600
y = -0.00336552791882152 + 1.000816589542082*x
Iteration 2700
y = -0.007711886970819105 + 

Iteration 4900
y = -0.00018293553284564853 + 1.000084353469475*x
Iteration 5000
y = -0.00021086202011657087 + 1.000599467500034*x
Iteration 5100
y = -0.0002698118770557987 + 1.0002670984772302*x
Iteration 5200
y = -0.000174041228809895 + 1.0000873631031082*x
Iteration 5300
y = -0.000368303888004153 + 0.9988172336751648*x
Iteration 5400
y = 3.23267012324346e-08 + 1.0000015527829704*x
Iteration 5500
y = -0.0007458676117796859 + 1.0002497117687457*x
Iteration 5600
y = -0.0002750920909796192 + 0.9999100367963439*x
Iteration 5700
y = -0.0002603746682872479 + 1.000235272525123*x
Iteration 5800
y = 0.0006660635547060155 + 0.9997308283380719*x
Iteration 5900
y = 0.0007499650195894795 + 0.9998309531696397*x
Iteration 6000
y = 0.00026793080973281725 + 0.9999254737646864*x
Iteration 6100
y = -7.108162650266238e-05 + 1.0000467252894638*x
Iteration 6200
y = 2.7908004826124246e-05 + 0.9999831010503192*x
Iteration 6300
y = 7.319170492363453e-05 + 0.9999735463240744*x
Iteration 6400
y = 0.000101980212

In [7]:
# Here we have a simple line with intercept = 0 and slope = 2
xs = [1,2,3,4,5,6,7]
ys = [2,4,6,8,10,12,14]

# Gradient Descent
model = Line()
print("Gradient Descent: ")
gd(model, xs, ys)
print(model)

# Stochastic Gradient Descent
model = Line()
print("Stochastic Gradient Descent: ")
sgd(model, xs, ys)
print(model)

# Stochastic Gradient Descent with Momentum
model = Line()
print("SGD + Momentum: ")
sgd_momentum(model, xs, ys)
print(model)

# Adagrad
model = Line()
print("Adagrad")
adagrad(model, xs, ys)
print(model)

# RMSprop
model = Line()
print("RMSprop")
RMSprop(model, xs, ys)
print(model)

# Adadelta
model = Line()
print("Adadelta")
adadelta(model, xs, ys)
print(model)

Gradient Descent: 
Iteration 0
y = 0.6965931374130392 + 0.7601122786401455*x
Iteration 100
y = 0.617623544918069 + 1.8757445461082207*x
Iteration 200
y = 0.4176232831814616 + 1.9159812299345367*x
Iteration 300
y = 0.28238756130710596 + 1.9431883792444125*x
Iteration 400
y = 0.1909441786231191 + 1.961585247554065*x
Iteration 500
y = 0.12911220020207081 + 1.9740248001050498*x
Iteration 600
y = 0.0873027937338816 + 1.9824361484423931*x
Iteration 700
y = 0.059032204406802784 + 1.988123714820858*x
Iteration 800
y = 0.039916261646207894 + 1.9919695205124175*x
Iteration 900
y = 0.02699048696926038 + 1.9945699686536877*x
y = 0.01832191952439833 + 1.9963139384089141*x
Stochastic Gradient Descent: 
Iteration 0
y = 0.9619258265142657 + 1.087046466635935*x
Iteration 100
y = 0.7920127114741361 + 1.8730271390796145*x
Iteration 200
y = 0.546286989963847 + 1.8525856253438988*x
Iteration 300
y = 0.3750536398579547 + 1.9349436535828175*x
Iteration 400
y = 0.2635085249438779 + 1.9449615901242314*x
Iterat

Iteration 1000
y = -0.039749095115406675 + 1.987267555742025*x
Iteration 1100
y = 0.005145669157455596 + 1.9984807726790454*x
Iteration 1200
y = -0.0018140213445952532 + 2.0037880205456013*x
Iteration 1300
y = -0.01278392583858953 + 2.0039946207483874*x
Iteration 1400
y = 0.0006377257328787176 + 1.999931949377187*x
Iteration 1500
y = 0.0009988082670943522 + 1.9943520576447586*x
Iteration 1600
y = -0.01806246811933287 + 1.990932459663341*x
Iteration 1700
y = 0.003213682754240674 + 1.999003713429964*x
Iteration 1800
y = 0.011017420383813752 + 1.9970627341422795*x
Iteration 1900
y = -0.009959479920880443 + 2.0064445494038146*x
Iteration 2000
y = 0.005708949443180763 + 1.998207036748069*x
Iteration 2100
y = -0.0018150808151644426 + 2.0003408117602763*x
Iteration 2200
y = -0.00026099203332354817 + 1.998588111772451*x
Iteration 2300
y = -0.008553796181148434 + 1.9855130641651118*x
Iteration 2400
y = 0.00021324745550753877 + 2.0011468628037545*x
Iteration 2500
y = 0.0003281239054911537 + 1.99

Iteration 4100
y = 1.1703668469299928 + 1.7578849874677622*x
Iteration 4200
y = 1.1299443751200116 + 1.7596081596761088*x
Iteration 4300
y = 1.0857688501669025 + 1.7603755539483317*x
Iteration 4400
y = 1.067999887278753 + 1.7764118268018219*x
Iteration 4500
y = 1.027562399896915 + 1.781172047294927*x
Iteration 4600
y = 1.0135197357347023 + 1.8003402340929056*x
Iteration 4700
y = 0.9867339323786954 + 1.8049278823229686*x
Iteration 4800
y = 0.9570536816927211 + 1.8130726368629162*x
Iteration 4900
y = 0.916086185933129 + 1.8121264546965077*x
Iteration 5000
y = 0.8769579109672395 + 1.8178313845022325*x
Iteration 5100
y = 0.8375535213298363 + 1.8197866853883014*x
Iteration 5200
y = 0.8102089699606075 + 1.830294102584346*x
Iteration 5300
y = 0.7689489406316838 + 1.8354861163350307*x
Iteration 5400
y = 0.7308673994842835 + 1.8382178533040052*x
Iteration 5500
y = 0.6944176540689229 + 1.8482404095568499*x
Iteration 5600
y = 0.658045742524041 + 1.8554471817071359*x
Iteration 5700
y = 0.628446273

In [8]:
# Here we have a simple line with intercept = 1 and slope = 2
xs = [1,2,3,4,5,6,7]
ys = [3,5,7,9,11,13,15]

# Gradient Descent
model = Line()
print("Gradient Descent: ")
gd(model, xs, ys)
print(model)

# Stochastic Gradient Descent
model = Line()
print("Stochastic Gradient Descent: ")
sgd(model, xs, ys)
print(model)

# Stochastic Gradient Descent with Momentum
model = Line()
print("SGD + Momentum: ")
sgd_momentum(model, xs, ys)
print(model)

# Adagrad
model = Line()
print("Adagrad")
adagrad(model, xs, ys)
print(model)

# RMSprop
model = Line()
print("RMSprop")
RMSprop(model, xs, ys)
print(model)

# Adadelta
model = Line()
print("Adadelta")
adadelta(model, xs, ys)
print(model)

Gradient Descent: 
Iteration 0
y = 0.3758312748922319 + 0.9861076428125647*x
Iteration 100
y = 0.7298486719499636 + 2.0543498967332554*x
Iteration 200
y = 0.8173296897367237 + 2.036750189498243*x
Iteration 300
y = 0.8764824052780473 + 2.0248496595087433*x
Iteration 400
y = 0.9164801538689702 + 2.0168027862204627*x
Iteration 500
y = 0.9435257404950816 + 2.0113616697512993*x
Iteration 600
y = 0.9618133637168664 + 2.0076825079986067*x
Iteration 700
y = 0.9741790471729282 + 2.0051947407767163*x
Iteration 800
y = 0.9825404468737576 + 2.0035125679976087*x
Iteration 900
y = 0.9881942390968437 + 2.0023751202356674*x
y = 0.9919859096488965 + 2.001612299988075*x
Stochastic Gradient Descent: 
Iteration 0
y = 0.9153217017005606 + 0.34378094132412806*x
Iteration 100
y = 1.1818667130118197 + 1.9585589470138083*x
Iteration 200
y = 1.117518306355778 + 1.9755644836754853*x
Iteration 300
y = 1.0847443749817014 + 1.9853921780077008*x
Iteration 400
y = 1.0553999353012187 + 1.9821880215759884*x
Iteration 5

Iteration 1300
y = 1.013456608574291 + 2.003760445243725*x
Iteration 1400
y = 1.000604429686155 + 1.999814973888711*x
Iteration 1500
y = 0.9983353003157828 + 2.002377400504234*x
Iteration 1600
y = 1.0079070299852295 + 1.99911172255804*x
Iteration 1700
y = 1.0057974653348982 + 2.0092064057307333*x
Iteration 1800
y = 1.0012182835397407 + 1.9912445656314368*x
Iteration 1900
y = 0.9935730533373829 + 2.004907314371268*x
Iteration 2000
y = 1.0068544594217217 + 1.9945796914142868*x
Iteration 2100
y = 1.0034692386766948 + 1.999877157270015*x
Iteration 2200
y = 1.0238524454768172 + 2.003215919656625*x
Iteration 2300
y = 1.0006305118741923 + 2.000139723628984*x
Iteration 2400
y = 1.0133749706891342 + 2.0028699564784285*x
Iteration 2500
y = 1.0065502964187343 + 1.998588577290943*x
Iteration 2600
y = 1.0001875380771728 + 2.0005311718749885*x
Iteration 2700
y = 0.9983088913058962 + 1.9998294101697964*x
Iteration 2800
y = 1.0024019194786755 + 1.9994309554909746*x
Iteration 2900
y = 0.986840323977595

Iteration 6900
y = 1.0000222206592522 + 1.9999761252125086*x
Iteration 7000
y = 0.999549383094565 + 1.99884719313683*x
Iteration 7100
y = 0.9995616976239795 + 2.0000340912511376*x
Iteration 7200
y = 1.000060767627944 + 1.9999566175420058*x
Iteration 7300
y = 1.0004173904765594 + 2.000108873603106*x
Iteration 7400
y = 0.9998062765670366 + 1.9999941183689391*x
Iteration 7500
y = 0.9996904393857947 + 2.000100981808597*x
Iteration 7600
y = 1.0001020411022337 + 2.0001841864843892*x
Iteration 7700
y = 0.9994753953994944 + 1.9984761540171951*x
Iteration 7800
y = 0.9990845626216417 + 1.999888433343906*x
Iteration 7900
y = 0.9997245929413028 + 1.999346802722747*x
Iteration 8000
y = 1.0002529886696372 + 1.999916494460583*x
Iteration 8100
y = 1.0004261759731738 + 2.000501330332923*x
Iteration 8200
y = 0.9998512618662146 + 2.0000518470701882*x
Iteration 8300
y = 1.0001164682137489 + 1.999937125270592*x
Iteration 8400
y = 0.9997953649417827 + 2.0001169420946123*x
Iteration 8500
y = 0.99895627907883