# Stochastic Gradient Descent with Momentum
In this notebook we will implement the Stochastic Gradient Descent algorithm with Momentum from scratch in Python, and we will visualize how it behaves when given a simple learning task.

## Pseudo Code
In a nutshell, the algorithm works like this:
- Initialize the update from the previous iteration (the "previous gradient") to 0
- Repeat the steps below until convergence or until we run out of iterations:
    - Calculate the current update (the momentum-smoothed "current gradient") as: `exponential_decay_factor*previous_gradient - learning_rate*derivative_of_cost`
    - Update the weight by adding the current update
    - Set the previous update to be the current update

In [22]:
import numpy as np
from numpy.random import permutation

def f(w1,w2,x):
    '''
        f: the line whose parameters we want to estimate
        w1: bias (intercept of the line)
        w2: slope of the line
        x: the input value at which the line is evaluated
        
        return the prediction w1 + w2*x, our estimate of y for this x
    '''
    slope_term = w2*x
    return w1 + slope_term

def dx_w1(w1,w2,x,y):
    '''
        dx_w1: partial derivative, with respect to the bias w1, of the
               squared error (f(w1,w2,x) - y)**2
        w1: bias
        w2: slope
        x: a training input
        y: the response observed for x
        
        return 2*(f(w1,w2,x) - y), the derivative for w1 at this x and y
    '''
    # Signed error of the current prediction
    residual = f(w1,w2,x) - y
    return 2*residual

def dx_w2(w1,w2,x,y):
    '''
        dx_w2: partial derivative, with respect to the slope w2, of the
               squared error (f(w1,w2,x) - y)**2
        w1: bias
        w2: slope
        x: a training input
        y: the response observed for x
        
        return 2*x*(f(w1,w2,x) - y), the derivative for w2 at this x and y
    '''
    # Signed error of the current prediction; the extra factor x below comes
    # from the chain rule because w2 multiplies x inside f
    residual = f(w1,w2,x) - y
    return 2*x*residual


def stochastic_gradient_descent(xs, ys, learning_rate = 0.01, decay_factor = 0.9, max_num_iteration = 1000):
    '''
        stochastic_gradient_descent: estimate the bias and slope of the line
                                     f(w1,w2,x) = w1 + w2*x using stochastic
                                     gradient descent with momentum
        xs: the training points (an array, or any sequence numpy can convert)
        ys: the responses of the points in xs, same length as xs
        learning_rate: step size applied to the derivative of the cost
        decay_factor: exponential decay factor applied to the previous update
        max_num_iteration: number of updates to perform; every update uses a
                           single point drawn uniformly at random
        
        return (w1,w2) the estimated bias and slope; both start out as arrays
               holding one value because they are initialized with
               np.random.uniform(0,1,1)
        raise ValueError if xs is empty or if xs and ys differ in length
        
        The current weights are printed every 100 iterations.
    '''
    # Accept plain Python lists as well as numpy arrays
    xs = np.asarray(xs)
    ys = np.asarray(ys)

    num_samples = len(xs)
    if num_samples == 0:
        raise ValueError("xs and ys must contain at least one point")
    if len(ys) != num_samples:
        raise ValueError(f"xs and ys must have the same length, got {num_samples} and {len(ys)}")

    # Randomly initialize the weight w1 and w2
    w1 = np.random.uniform(0,1,1)
    w2 = np.random.uniform(0,1,1)

    # Momentum terms: the update applied at the previous iteration
    # (the "previous gradient" of the pseudo code)
    prev_update_w1 = 0
    prev_update_w2 = 0

    iteration = 0

    while iteration < max_num_iteration:

        # Draw one training point uniformly at random; a single index is
        # enough, there is no need to shuffle the whole data set for it
        index = np.random.randint(num_samples)
        x = xs[index]
        y = ys[index]

        # Keep a decayed fraction of the previous update and take a step
        # against the derivative of the cost at the sampled point
        update_w1 = decay_factor*prev_update_w1 - learning_rate*dx_w1(w1,w2,x,y)
        update_w2 = decay_factor*prev_update_w2 - learning_rate*dx_w2(w1,w2,x,y)

        w1 = w1 + update_w1
        w2 = w2 + update_w2

        prev_update_w1 = update_w1
        prev_update_w2 = update_w2

        iteration = iteration + 1

        # Report progress every 100 iterations
        if iteration % 100 == 0:
            print(f"Iteration {iteration}")
            print(f"W1 = {w1}")
            print(f"W2 = {w2}")

    return (w1,w2)
        

In [23]:
# Training data sampled from the line y = 0 + 1*x (intercept = 0, slope = 1)
xs = np.array([1,2,3,4,5,6,7])
ys = 0 + 1*xs
w1, w2 = stochastic_gradient_descent(xs,ys)
print(w1,w2)

Iteration 100
W1 = [0.04679401]
W2 = [0.99677495]
Iteration 200
W1 = [-0.00639827]
W2 = [1.01086287]
Iteration 300
W1 = [-0.00034862]
W2 = [0.99859601]
Iteration 400
W1 = [0.0003179]
W2 = [1.00034575]
Iteration 500
W1 = [0.00016487]
W2 = [0.9998794]
Iteration 600
W1 = [0.00011044]
W2 = [1.00008353]
Iteration 700
W1 = [-3.26907409e-05]
W2 = [0.99994309]
Iteration 800
W1 = [-6.35786955e-06]
W2 = [1.00000022]
Iteration 900
W1 = [4.92323847e-07]
W2 = [0.99999987]
Iteration 1000
W1 = [2.37054786e-07]
W2 = [0.9999993]
[2.37054786e-07] [0.9999993]


In [24]:
# Training data sampled from the line y = 1 + 2*x (intercept = 1, slope = 2)
xs = np.array([1,2,3,4,5,6,7])
ys = 1 + 2*xs
w1, w2 = stochastic_gradient_descent(xs,ys)
print(w1,w2)

Iteration 100
W1 = [0.66366045]
W2 = [2.11993004]
Iteration 200
W1 = [1.0184634]
W2 = [1.99783607]
Iteration 300
W1 = [0.98708618]
W2 = [2.00054736]
Iteration 400
W1 = [0.99356234]
W2 = [2.00020221]
Iteration 500
W1 = [1.00019198]
W2 = [2.00012619]
Iteration 600
W1 = [1.00010617]
W2 = [1.99997991]
Iteration 700
W1 = [0.99999758]
W2 = [2.00000108]
Iteration 800
W1 = [1.00002325]
W2 = [2.00000186]
Iteration 900
W1 = [0.99999682]
W2 = [1.99999403]
Iteration 1000
W1 = [0.99999579]
W2 = [1.99998935]
[0.99999579] [1.99998935]


In [25]:
# Training data sampled from the line y = 0 + 2*x (intercept = 0, slope = 2)
xs = np.array([1,2,3,4,5,6,7])
ys = 0 + 2*xs
w1, w2 = stochastic_gradient_descent(xs,ys)
print(w1,w2)

Iteration 100
W1 = [0.94753198]
W2 = [2.37083653]
Iteration 200
W1 = [-0.07306528]
W2 = [2.05079373]
Iteration 300
W1 = [0.00029184]
W2 = [1.99904906]
Iteration 400
W1 = [0.00320269]
W2 = [1.99677224]
Iteration 500
W1 = [8.62550934e-05]
W2 = [2.00011322]
Iteration 600
W1 = [-0.0002954]
W2 = [1.99969239]
Iteration 700
W1 = [2.13294674e-05]
W2 = [1.99990829]
Iteration 800
W1 = [-7.56718947e-05]
W2 = [2.00013542]
Iteration 900
W1 = [9.2616811e-06]
W2 = [2.00004751]
Iteration 1000
W1 = [0.00012359]
W2 = [1.99901294]
[0.00012359] [1.99901294]
