# AdaGrad
In this notebook we will code the AdaGrad algorithm from scratch using Python, and we will visualize how it behaves when given a simple learning task.

This algorithm estimates the gradient using a randomly selected subset of the training data and makes use of past gradients to scale the learning rate, which is usually set to 0.01.

For the update, if we have multiple features, we need to take the partial derivative with respect to the weight of each feature of the function we are trying to estimate.

In [5]:
# f(x) = w0 + w1*x
# we are trying to fit the best w0 and w1 we can on the dataset
# (x1,x2,...,xn) with (y1,y2,...,yn)
# we are using least squares
# We need to minimize: Sum[i=1:N](yhat_i - yi)^2
# Which translates to Sum[i=1:N](w0 + w1*xi - yi)^2

# The gradients of the mean squared error L = (1/N) * Sum[i=1:N](w0 + w1*xi - yi)^2 are then the following
# dL/d(w0) = (1/N) * (Sum[i=1:N] 2*(w0 + w1*xi - yi))
# dL/d(w1) = (1/N) * (Sum[i=1:N] 2*xi*(w0 + w1*xi - yi))


import numpy as np
from numpy.random import permutation

def f(w0, w1, x):
    '''
        f: the line whose parameters we are trying to estimate
        w0: intercept (bias) of the line
        w1: slope of the line
        x: input value at which the line is evaluated

        return the prediction w0 + w1*x for this x
    '''
    return w0 + w1 * x

def dx_w0(w0, w1, x, y):
    '''
        dx_w0: derivative of the squared error (f(w0,w1,x) - y)^2
               with respect to the bias w0, for a single sample
        w0: current bias
        w1: current slope
        x: input of the sample
        y: observed response for x

        return 2 * (prediction - y)
    '''
    residual = f(w0, w1, x) - y
    return 2 * residual

def dx_w1(w0, w1, x, y):
    '''
        dx_w1: derivative of the squared error (f(w0,w1,x) - y)^2
               with respect to the slope w1, for a single sample
        w0: current bias
        w1: current slope
        x: input of the sample
        y: observed response for x

        return 2 * x * (prediction - y)
    '''
    residual = f(w0, w1, x) - y
    return 2 * x * residual


def adagrad(xs, ys, learning_rate = 0.1, max_num_iteration = 10000, eps=0.0000001, print_every=100):
    '''
        adagrad: fit the line f(w0,w1,x) = w0 + w1*x to the points (xs, ys)
                 with AdaGrad, using one randomly drawn sample per iteration
        xs: sequence or numpy array of inputs
        ys: sequence or numpy array of responses, same length as xs
        learning_rate: base step size; each weight's step is this value divided
                       by the square root of its accumulated squared gradients
        max_num_iteration: number of single-sample updates to perform
        eps: small constant added under the square root to avoid dividing by zero
        print_every: print the current weights every this many iterations;
                     pass 0 or None to disable the progress output

        return (w0, w1), each a numpy array of shape (1,)

        raise ValueError if xs is empty or if xs and ys differ in length
    '''
    # Accept plain Python sequences as well as numpy arrays
    xs = np.asarray(xs)
    ys = np.asarray(ys)

    num_samples = len(xs)
    if num_samples == 0:
        raise ValueError("xs and ys must contain at least one sample")
    if len(ys) != num_samples:
        raise ValueError("xs and ys must have the same length")

    # Randomly initialize the weights w0 and w1
    w0 = np.random.uniform(0,1,1)
    w1 = np.random.uniform(0,1,1)

    # Running sums of squared gradients, one per weight
    # (only the diagonal of the AdaGrad matrix G is needed)
    sq_grad_w0 = 0.0
    sq_grad_w1 = 0.0

    iteration = 0
    while iteration < max_num_iteration:

        # Draw a single sample uniformly at random; no need to shuffle
        # and copy the whole dataset just to read one element
        idx = np.random.randint(num_samples)
        x = xs[idx]
        y = ys[idx]

        g0 = dx_w0(w0,w1,x,y)
        g1 = dx_w1(w0,w1,x,y)

        sq_grad_w0 = sq_grad_w0 + g0*g0
        sq_grad_w1 = sq_grad_w1 + g1*g1

        # Per-weight step: the learning rate shrinks as squared gradients accumulate
        w0 = w0 - (learning_rate/np.sqrt(sq_grad_w0 + eps)) * g0
        w1 = w1 - (learning_rate/np.sqrt(sq_grad_w1 + eps)) * g1

        iteration = iteration + 1

        if print_every and iteration % print_every == 0:
            print(f"Iteration {iteration}")
            print(f"W0 = {w0}")
            print(f"W1 = {w1}")

    return (w0,w1)
        

In [6]:
# Noise-free points on the line y = x (intercept = 0, slope = 1)
xs = np.arange(1, 8)
ys = xs.copy()
w0, w1 = adagrad(xs, ys)
print(w0, w1)

Iteration 100
W0 = [0.76577277]
W1 = [0.8204972]
Iteration 200
W0 = [0.68532368]
W1 = [0.85604548]
Iteration 300
W0 = [0.62264082]
W1 = [0.87799457]
Iteration 400
W0 = [0.54867631]
W1 = [0.88596567]
Iteration 500
W0 = [0.47402276]
W1 = [0.8992883]
Iteration 600
W0 = [0.41726199]
W1 = [0.91260877]
Iteration 700
W0 = [0.37381124]
W1 = [0.91855069]
Iteration 800
W0 = [0.3391826]
W1 = [0.93010449]
Iteration 900
W0 = [0.29938186]
W1 = [0.9371298]
Iteration 1000
W0 = [0.27645289]
W1 = [0.94754331]
Iteration 1100
W0 = [0.23263306]
W1 = [0.94611994]
Iteration 1200
W0 = [0.21864874]
W1 = [0.95882055]
Iteration 1300
W0 = [0.19137292]
W1 = [0.96069542]
Iteration 1400
W0 = [0.1674093]
W1 = [0.96569289]
Iteration 1500
W0 = [0.14785636]
W1 = [0.9682544]
Iteration 1600
W0 = [0.13297494]
W1 = [0.97199049]
Iteration 1700
W0 = [0.12085064]
W1 = [0.97517561]
Iteration 1800
W0 = [0.10259883]
W1 = [0.97624921]
Iteration 1900
W0 = [0.0964831]
W1 = [0.98124711]
Iteration 2000
W0 = [0.08563508]
W1 = [0.982220

In [7]:
# Noise-free points on the line y = 2*x (intercept = 0, slope = 2)
xs = np.arange(1, 8)
ys = 2 * xs
w0, w1 = adagrad(xs, ys)
print(w0, w1)

Iteration 100
W0 = [1.4810176]
W1 = [1.29291814]
Iteration 200
W0 = [1.69183505]
W1 = [1.55412882]
Iteration 300
W0 = [1.68290487]
W1 = [1.6145571]
Iteration 400
W0 = [1.65296181]
W1 = [1.64697463]
Iteration 500
W0 = [1.58662324]
W1 = [1.64872187]
Iteration 600
W0 = [1.541732]
W1 = [1.66921681]
Iteration 700
W0 = [1.50502462]
W1 = [1.6888788]
Iteration 800
W0 = [1.4466929]
W1 = [1.70218591]
Iteration 900
W0 = [1.3817204]
W1 = [1.70813252]
Iteration 1000
W0 = [1.33259133]
W1 = [1.72359061]
Iteration 1100
W0 = [1.27470273]
W1 = [1.72165704]
Iteration 1200
W0 = [1.24212978]
W1 = [1.74061829]
Iteration 1300
W0 = [1.18240306]
W1 = [1.74083412]
Iteration 1400
W0 = [1.15248899]
W1 = [1.76084473]
Iteration 1500
W0 = [1.11915712]
W1 = [1.77422631]
Iteration 1600
W0 = [1.07640569]
W1 = [1.77243442]
Iteration 1700
W0 = [1.03392179]
W1 = [1.77732933]
Iteration 1800
W0 = [1.01003706]
W1 = [1.78626542]
Iteration 1900
W0 = [0.96993452]
W1 = [1.78829135]
Iteration 2000
W0 = [0.94964321]
W1 = [1.806908

In [8]:
# Noise-free points on the line y = 2*x + 1 (intercept = 1, slope = 2)
xs = np.arange(1, 8)
ys = 2 * xs + 1
w0, w1 = adagrad(xs, ys)
print(w0, w1)

Iteration 100
W0 = [2.23024861]
W1 = [1.32212727]
Iteration 200
W0 = [2.43033016]
W1 = [1.56587716]
Iteration 300
W0 = [2.46657328]
W1 = [1.65047771]
Iteration 400
W0 = [2.43182151]
W1 = [1.67408372]
Iteration 500
W0 = [2.39775408]
W1 = [1.69972672]
Iteration 600
W0 = [2.36403531]
W1 = [1.71247205]
Iteration 700
W0 = [2.32251528]
W1 = [1.72531943]
Iteration 800
W0 = [2.26872862]
W1 = [1.72968101]
Iteration 900
W0 = [2.22650601]
W1 = [1.74256662]
Iteration 1000
W0 = [2.19732753]
W1 = [1.76025476]
Iteration 1100
W0 = [2.14733383]
W1 = [1.76821815]
Iteration 1200
W0 = [2.1049319]
W1 = [1.77276331]
Iteration 1300
W0 = [2.05728389]
W1 = [1.77199014]
Iteration 1400
W0 = [2.03096112]
W1 = [1.78417196]
Iteration 1500
W0 = [1.98693093]
W1 = [1.78716033]
Iteration 1600
W0 = [1.95648765]
W1 = [1.79780196]
Iteration 1700
W0 = [1.92065035]
W1 = [1.80482458]
Iteration 1800
W0 = [1.87797392]
W1 = [1.80593045]
Iteration 1900
W0 = [1.84780761]
W1 = [1.81188903]
Iteration 2000
W0 = [1.81807638]
W1 = [1.