# Gradient Descent Optimization Algorithms
In this notebook you can find a collection of GD based optimization algorithm used for deep learning. The code is always accompanied by a explanatory youtube video which are linked here:
- [Stochastic Gradient Descent](https://youtu.be/IH9kqpMORLM)
- [Stochastic Gradient Descent + Momentum](https://youtu.be/7EuiXb6hFAM)
- [Adagrad](https://youtu.be/EGt-UOIIdDk)

## Tests
In order to demonstrate the algorithms capabilities to optimize a function we used these simple test setup:
- learning various linear function of the form `f(x) = w0 + w1*x` with the squared error. This is a simple sanity check as the gradient are simple to calculate and the test data is also easy to generate.

In [43]:
import numpy as np
from numpy.random import permutation

class Line():
    def __init__(self):
        self.w0 = np.random.uniform(0,1,1)
        self.w1 = np.random.uniform(0,1,1)
    
    def evaluate(self,x):
        """
            evaluate: will evaluate the line yhate given x
            x: a point in the plane

            return the result of the function evalutation
        """
        return self.w0 + self.w1*x
    
    def dx_w0(self, x, y):
        """
            dx_w0: partial derivative of the weight w0
            x: a point in the plane
            y: the response of the point x

            return the gradient at that point for this x and y for w0
        """
        yhat = self.evaluate(x)
        return 2*(yhat - y)
        
    
    def dx_w1(self, x, y):
        """
            dx_w1: partial derivative of the weight w1 for a linear function
            x: a point in the plane
            y: the response of the point x

            return the gradient at that point for this x and y for w1
        """  
        yhat = self.evaluate(x)
        return 2*x*(yhat - y)

    def __str__(self):
        return f"y = {self.w0[0]} + {self.w1[0]}*x"
    
    
#################### Helper functions ######################
def stochastic_sample(xs, ys):
    """
        stochastic_sample: sample with replacement one x and one y
        xs: all point on the plane
        ys: all response on the plane
        
        return the randomly selected x and y point
    """
    perm = permutation(len(xs))
    x = xs[perm[0]]
    y = ys[perm[0]]

    return x, y
    
    
def gradient(dx, xs, ys):
    """
        gradient: estimate mean gradient over all point for w1
        dx: partial derivative function used to evaluate the gradient
        xs: all point on the plane
        ys: all response on the plane
        
        return the mean gradient all x and y for w1
    """         
    N = len(ys)
    
    total = 0
    for x,y in zip(xs,ys):
        total = total + dx(x, y)
    
    gradient = total/N
    return gradient

################## Optimization Functions #####################

def gd(model, xs, ys, learning_rate = 0.01, max_num_iteration = 1000):
    """
        gd: will estimate the parameters w1 and w2 (here it uses least square cost function)
        model: the model we are trying to optimize using gradient descent
        xs: all point on the plane
        ys: all response on the plane
        learning_rate: the learning rate for the step that weights update will take
        max_num_iteration: the number of iteration before we stop updating
    """    

    for i in range(max_num_iteration):
        model.w0 = model.w0 - learning_rate*gradient(model.dx_w0, xs, ys)
        model.w1 = model.w1 - learning_rate*gradient(model.dx_w1, xs, ys)
        
        if i % 100 == 0:
            print(f"Iteration {i}")
            print(model)
            
def sgd(model, xs, ys, learning_rate = 0.01, max_num_iteration = 1000):
    """
        sgd: will estimate the parameters w0 and w1 
        (here it uses least square cost function)
        model: the model we are trying to optimize using sgd
        xs: all point on the plane
        ys: all response on the plane
        learning_rate: the learning rate for the step that weights update will take
        max_num_iteration: the number of iteration before we stop updating
    """       
    
    for i in range(max_num_iteration):
        
        # Select a random x and y
        x, y = stochastic_sample(xs, ys)
        
        # Updating the model parameters
        model.w0 = model.w0 - learning_rate*model.dx_w0(x, y)
        model.w1 = model.w1 - learning_rate*model.dx_w1(x, y)
        
    
        if i % 100 == 0:
            print(f"Iteration {i}")
            print(model)
            
def sgd_momentum(model, xs, ys, learning_rate = 0.01, decay_factor = 0.9, max_num_iteration = 1000):
    """
        sgd_momentum: will estimate the parameters w0 and w1 
        (here it uses least square cost function)
        model: the model we are trying to optimize using sgd
        xs: all point on the plane
        ys: all response on the plane
        learning_rate: the learning rate for the step that weights update will take
        decay_factor: determines the relative contribution of the current gradient and earlier gradients to the weight change
        max_num_iteration: the number of iteration before we stop updating
    """
    
    # These are needed to keep track of the previous gradient
    prev_g0 = 0
    prev_g1 = 0
    
    for i in range(max_num_iteration):
        
        # Select a random x and y
        x, y = stochastic_sample(xs, ys)

        g0 = decay_factor*prev_g0 - learning_rate*model.dx_w0(x,y)
        g1 = decay_factor*prev_g1 - learning_rate*model.dx_w1(x,y)
        
        # Updating the model parameters
        model.w0 = model.w0 + g0
        model.w1 = model.w1 + g1
        
        # swap previous gradient
        prev_g0, prev_g1 = g0, g1
    
        if i % 100 == 0:
            print(f"Iteration {i}")
            print(model)
            
            
def adagrad(model, xs, ys, learning_rate = 0.1, max_num_iteration = 10000, eps=0.0000001):
    """
        adagrad: will estimate the parameters w0 and w1 
        (here it uses least square cost function)
        model: the model we are trying to optimize using sgd
        xs: all point on the plane
        ys: all response on the plane
        learning_rate: the learning rate for the step that weights update will take
        max_num_iteration: the number of iteration before we stop updating
        eps: is a numerical safety to avoid division by 0
    """         
    # Here only the diagonal matter
    G = [[0,0],
         [0,0]]
    
    for i in range(max_num_iteration):
        
        # Select a random x and y
        x, y = stochastic_sample(xs, ys)
        
        g0 = model.dx_w0(x, y)
        g1 = model.dx_w1(x, y)
        
        G[0][0] = G[0][0] + g0*g0
        G[1][1] = G[1][1] + g1*g1
        
        model.w0 = model.w0 - (learning_rate/np.sqrt(G[0][0] + eps)) * g0
        model.w1 = model.w1 - (learning_rate/np.sqrt(G[1][1] + eps)) * g1
    
        if i % 100 == 0:
            print(f"Iteration {i}")
            print(model)
            
def RMSprop(model, xs, ys, learning_rate = 0.01, decay_factor = 0.9, max_num_iteration = 10000, eps=0.0000001):
    """
        adagrad: will estimate the parameters w0 and w1 
        (here it uses least square cost function)
        model: the model we are trying to optimize using sgd
        xs: all point on the plane
        ys: all response on the plane
        learning_rate: the learning rate for the step that weights update will take
        decay_factor: the parameter used in the running averaging
        max_num_iteration: the number of iteration before we stop updating
        eps: is a numerical safety to avoid division by 0
    """         
    
    # Running average
    E = [0,0]
    
    for i in range(max_num_iteration):
        
        # Select a random x and y
        x, y = stochastic_sample(xs, ys)
        
        g0 = model.dx_w0(x, y)
        g1 = model.dx_w1(x, y)
        
        E[0] = decay_factor*E[0] + (1-decay_factor)*g0*g0
        E[1] = decay_factor*E[1] + (1-decay_factor)*g1*g1
        
        model.w0 = model.w0 - (learning_rate/np.sqrt(E[0] + eps)) * g0
        model.w1 = model.w1 - (learning_rate/np.sqrt(E[1] + eps)) * g1
    
        if i % 100 == 0:
            print(f"Iteration {i}")
            print(model)

In [44]:
# Here we have a simple line with intercept = 0 and slope = 1
xs = [1,2,3,4,5,6,7]
ys = [1,2,3,4,5,6,7]

# Gradient Descent
model = Line()
print("Gradient Descent: ")
gd(model, xs, ys)
print(model)

# Stochastic Gradient Descent
model = Line()
print("Stochastic Gradient Descent: ")
sgd(model, xs, ys)
print(model)

# Stochastic Gradient Descent with Momentum
model = Line()
print("SGD + Momentum: ")
sgd_momentum(model, xs, ys)
print(model)

# Adagrad
model = Line()
print("Adagrad")
adagrad(model, xs, ys)
print(model)

# RMSprop
model = Line()
print("RMSprop")
RMSprop(model, xs, ys)
print(model)

Gradient Descent: 
Iteration 0
y = 0.43977339981685476 + 0.9281497279610761*x
Iteration 100
y = 0.29514879790543636 + 0.940621033393055*x
Iteration 200
y = 0.19957304254759445 + 0.9598492654783679*x
Iteration 300
y = 0.13494684577527907 + 0.9728509676953854*x
Iteration 400
y = 0.0912480511006594 + 0.9816424291147179*x
Iteration 500
y = 0.061699899555516934 + 0.9875870194920031*x
Iteration 600
y = 0.041720097681443725 + 0.9916066190862188*x
Iteration 700
y = 0.028210200714882196 + 0.9943245827931141*x
Iteration 800
y = 0.01907510932621575 + 0.9961624093076331*x
Iteration 900
y = 0.01289816401820672 + 0.9974051066581902*x
y = 0.008755645032385488 + 0.998238511700912*x
Stochastic Gradient Descent: 
Iteration 0
y = 0.40415139138878375 + 0.9502270760087869*x
Iteration 100
y = 0.27331002363870227 + 0.9368261614451874*x
Iteration 200
y = 0.19039023932798 + 0.9595425266093232*x
Iteration 300
y = 0.131001775359636 + 0.9682276045899648*x
Iteration 400
y = 0.0976414198558026 + 0.9768105555303794*

Iteration 1200
y = 0.004337033192637736 + 0.9995453114338523*x
Iteration 1300
y = -0.002232503788417964 + 0.9996675704629407*x
Iteration 1400
y = -0.003175714566848448 + 1.001726748333116*x
Iteration 1500
y = 0.0017922623781967137 + 1.002896444822712*x
Iteration 1600
y = 0.006419973569301237 + 1.0207976854719463*x
Iteration 1700
y = -0.0016089425038976848 + 1.002064305305371*x
Iteration 1800
y = -0.001312944159504584 + 1.0004771032103619*x
Iteration 1900
y = 0.0018832995488001374 + 0.9996762870157483*x
Iteration 2000
y = 0.0004614877804068817 + 0.9998463190222946*x
Iteration 2100
y = 0.004545353246595934 + 0.9978728416847239*x
Iteration 2200
y = -0.008540644051944516 + 0.9983230001019476*x
Iteration 2300
y = -0.0006291195425393184 + 0.999974652122385*x
Iteration 2400
y = 2.9993538141773482e-05 + 1.0000295585623757*x
Iteration 2500
y = 0.0034695048547479716 + 1.0048621076105955*x
Iteration 2600
y = -0.011291441807548248 + 1.0043660446455576*x
Iteration 2700
y = 0.00692214513795806 + 0.9

In [45]:
# Here we have a simple line with intercept = 0 and slope = 2
xs = [1,2,3,4,5,6,7]
ys = [2,4,6,8,10,12,14]

# Gradient Descent
model = Line()
print("Gradient Descent: ")
gradient_descent(model, xs, ys)
print(model)

# Stochastic Gradient Descent
model = Line()
print("Stochastic Gradient Descent: ")
stochastic_gradient_descent(model, xs, ys)
print(model)

# Stochastic Gradient Descent with Momentum
model = Line()
print("SGD + Momentum: ")
sgd_momentum(model, xs, ys)
print(model)

# Adagrad
model = Line()
print("Adagrad")
adagrad(model, xs, ys)
print(model)

# RMSprop
model = Line()
print("RMSprop")
RMSprop(model, xs, ys)
print(model)

Gradient Descent: 
Iteration 0
y = 0.2789325278004371 + 1.3358568567918874*x
Iteration 100
y = 0.2696616613331869 + 1.9457486159621535*x
Iteration 200
y = 0.1823392085369287 + 1.96331642259197*x
Iteration 300
y = 0.12329371111006296 + 1.975195382102839*x
Iteration 400
y = 0.08336846101979738 + 1.9832276699139633*x
Iteration 500
y = 0.05637189626488761 + 1.988658924008373*x
Iteration 600
y = 0.038117420540419854 + 1.9923314170429465*x
Iteration 700
y = 0.02577415068366662 + 1.9948146750261944*x
Iteration 800
y = 0.017427906559414653 + 1.9964937987585774*x
Iteration 900
y = 0.011784362199611246 + 1.997629184823043*x
y = 0.007999564295136265 + 1.9983906224097068*x
Stochastic Gradient Descent: 
Iteration 0
y = 0.7308891888267327 + 1.8762419506553245*x
Iteration 100
y = 0.4975943009913024 + 1.9076720943802694*x
Iteration 200
y = 0.3474687676852617 + 1.9061960148650012*x
Iteration 300
y = 0.240941969023524 + 1.9654983735485392*x
Iteration 400
y = 0.1679475126203722 + 1.9664212852345133*x
Ite

Iteration 2100
y = -0.005964468467718368 + 2.0023425819307072*x
Iteration 2200
y = -0.006794529475300516 + 2.003588930913294*x
Iteration 2300
y = 0.0008427397724754686 + 1.9996573747572184*x
Iteration 2400
y = 0.005685151593749328 + 1.9985801702079358*x
Iteration 2500
y = 0.01870763759046995 + 2.0038587846730262*x
Iteration 2600
y = 0.0036807255375019676 + 2.012878853497082*x
Iteration 2700
y = -0.006363308902545225 + 1.9980854492835023*x
Iteration 2800
y = -0.005603542350455338 + 2.0016564656151545*x
Iteration 2900
y = 0.0043663000043937476 + 1.9928981789933766*x
Iteration 3000
y = -0.0019430511767457477 + 1.9832186800120126*x
Iteration 3100
y = 0.010357198193816497 + 2.013058606693389*x
Iteration 3200
y = -0.0007620524102915793 + 1.9996594153676133*x
Iteration 3300
y = 0.013818754012331099 + 1.9962823799904879*x
Iteration 3400
y = -0.012039259985344893 + 1.9937149502422402*x
Iteration 3500
y = 0.004094262002633386 + 1.9984871875180579*x
Iteration 3600
y = 0.0011615686487377727 + 1.99

In [46]:
# Here we have a simple line with intercept = 1 and slope = 2
xs = [1,2,3,4,5,6,7]
ys = [3,5,7,9,11,13,15]

# Gradient Descent
model = Line()
print("Gradient Descent: ")
gradient_descent(model, xs, ys)
print(model)

# Stochastic Gradient Descent
model = Line()
print("Stochastic Gradient Descent: ")
stochastic_gradient_descent(model, xs, ys)
print(model)

# Stochastic Gradient Descent with Momentum
model = Line()
print("SGD + Momentum: ")
sgd_momentum(model, xs, ys)
print(model)

# Adagrad
model = Line()
print("Adagrad")
adagrad(model, xs, ys)
print(model)

# RMSprop
model = Line()
print("RMSprop")
RMSprop(model, xs, ys)
print(model)

Gradient Descent: 
Iteration 0
y = 0.7406505784360102 + 1.061529569230414*x
Iteration 100
y = 0.9566930775180532 + 2.008712623334922*x
Iteration 200
y = 0.9707168237023819 + 2.005891281820767*x
Iteration 300
y = 0.9801993684858387 + 2.0039835535357744*x
Iteration 400
y = 0.986611254039696 + 2.0026935901651224*x
Iteration 500
y = 0.9909468282230623 + 2.0018213456685054*x
Iteration 600
y = 0.9938784469086396 + 2.0012315533696023*x
Iteration 700
y = 0.9958607421604649 + 2.0008327489550206*x
Iteration 800
y = 0.9972011260530711 + 2.00056308629346*x
Iteration 900
y = 0.9981074637834889 + 2.0003807464085916*x
y = 0.9987152919361768 + 2.0002584616226216*x
Stochastic Gradient Descent: 
Iteration 0
y = 0.8004437429705904 + 2.0062370180114977*x
Iteration 100
y = 0.86857928026988 + 2.0254292343549656*x
Iteration 200
y = 0.9102812816021776 + 2.018083143283916*x
Iteration 300
y = 0.9334817642389961 + 2.013862892751264*x
Iteration 400
y = 0.9514917304943428 + 2.0116075054713884*x
Iteration 500
y = 0

Iteration 2400
y = 0.9993541814265278 + 2.000320188160971*x
Iteration 2500
y = 0.9991930642104807 + 2.000403955560601*x
Iteration 2600
y = 0.9884193383858861 + 2.0037612828442994*x
Iteration 2700
y = 0.9934887753613368 + 2.0004912593732627*x
Iteration 2800
y = 1.0064595528928055 + 1.9980949213660453*x
Iteration 2900
y = 1.0007562408422417 + 2.0002820716179213*x
Iteration 3000
y = 1.0017162163431843 + 1.9994301048622989*x
Iteration 3100
y = 0.9991440328217209 + 2.0004284270575767*x
Iteration 3200
y = 1.0030353399620067 + 1.9994828452026163*x
Iteration 3300
y = 0.9804975053946241 + 1.984953533109832*x
Iteration 3400
y = 1.0015952328192343 + 1.999192511700589*x
Iteration 3500
y = 1.0127455021550353 + 1.9966990273757355*x
Iteration 3600
y = 1.0000358258171977 + 1.9999355639133525*x
Iteration 3700
y = 0.999169967918656 + 1.9995066047746435*x
Iteration 3800
y = 1.0047423424401194 + 2.016269316768565*x
Iteration 3900
y = 0.9894310955279736 + 1.9974922489325602*x
Iteration 4000
y = 0.999263796