In [1]:
import numpy as np

$$x_{\text{new}} = x - \alpha \cdot \nabla f(x)$$

In [2]:
def gradient_descent(x0, f, gradient, alpha, num_iters, tol):
    """Minimize ``f`` with fixed-step gradient descent.

    Every iteration proposes ``x_new = x - alpha * gradient(x)`` and stops
    early once the objective value changes by less than ``tol`` between the
    current point and the proposed one.

    Parameters
    ----------
    x0 : starting point.
    f : callable returning the objective value at a point.
    gradient : callable returning the gradient of ``f`` at a point.
    alpha : step size.
    num_iters : maximum number of iterations.
    tol : threshold on ``abs(f(x_new) - f(x))`` used as the stopping test.

    Returns
    -------
    dict
        ``'converged'`` - True if the stopping test passed within the budget.
        ``'iter'`` - zero-based index of the last iteration executed
        (0 if ``num_iters <= 0`` and no iteration ran).
        ``'x'`` - last accepted point. When the stopping test passes, the
        proposed step is not applied, so this is the point before it.
    """
    result = {}
    result['converged'] = False

    x = x0
    # Bind the counter up front: with an empty range the for-loop never
    # assigns it, and reading it below would raise UnboundLocalError.
    i = 0
    for i in range(num_iters):
        x_new = x - alpha * gradient(x)
        if abs(f(x_new) - f(x)) < tol:
            result['converged'] = True
            break
        x = x_new

    result['iter'] = i
    result['x'] = x

    return result

In [3]:
def f(x):
    """Evaluate the quadratic objective 0.5 * (x[0]**2 + 10 * x[1]**2)."""
    first_sq = x[0]**2
    second_sq = x[1]**2
    return 0.5*(first_sq + 10*second_sq)

In [4]:
def gradient(x):
    """Return the partial derivatives [x[0], 10*x[1]] as a NumPy array."""
    # returns f'x(x, y), f'y(x, y)
    d_first = x[0]
    d_second = 10*x[1]
    return np.array([d_first, d_second])

In [5]:
# Starting point and settings; later cells reuse these names.
x0 = np.array([700, 400])
alpha = 0.01        # step size
num_iters = 10_000  # iteration cap
tol = 1e-5          # threshold on the change in f between iterates

gradient_descent(x0, f, gradient, alpha, num_iters, tol)

{'converged': True, 'iter': 996, 'x': array([3.14595031e-02, 1.06561271e-43])}

$$\text{inertia} = \beta \cdot \text{inertia} - \alpha \cdot \nabla f(x)$$
$$x_{\text{new}} = x + \text{inertia}$$

In [6]:
def momentum(x0, f, gradient, alpha, num_iters, tol, beta=0.9):
    """Minimize ``f`` with gradient descent plus a momentum (inertia) term.

    Every iteration updates ``inertia = beta * inertia - alpha * gradient(x)``
    and proposes ``x_new = x + inertia``. It stops early once the objective
    value changes by less than ``tol`` between the current and proposed point.

    Parameters
    ----------
    x0 : starting point.
    f : callable returning the objective value at a point.
    gradient : callable returning the gradient of ``f`` at a point.
    alpha : step size applied to the gradient.
    num_iters : maximum number of iterations.
    tol : threshold on ``abs(f(x_new) - f(x))`` used as the stopping test.
    beta : factor applied to the previous inertia (default 0.9).

    Returns
    -------
    dict
        ``'converged'`` - True if the stopping test passed within the budget.
        ``'iter'`` - zero-based index of the last iteration executed
        (0 if ``num_iters <= 0`` and no iteration ran).
        ``'x'`` - last accepted point. When the stopping test passes, the
        proposed step is not applied, so this is the point before it.
    """
    result = {}
    result['converged'] = False

    x = x0
    inertia = 0  # accumulated step; starts at rest
    # Bind the counter up front: with an empty range the for-loop never
    # assigns it, and reading it below would raise UnboundLocalError.
    i = 0
    for i in range(num_iters):
        inertia = beta * inertia - alpha * gradient(x)
        x_new = x + inertia
        if abs(f(x_new) - f(x)) < tol:
            result['converged'] = True
            break
        x = x_new

    result['iter'] = i
    result['x'] = x

    return result

In [7]:
momentum(x0, f, gradient, alpha=alpha, num_iters=num_iters, tol=tol, beta=0.9)

{'converged': True, 'iter': 194, 'x': array([-0.01804139,  0.01335318])}

$$\text{inertia} = \beta \cdot \text{inertia} - \alpha \cdot \nabla f(x + \beta \cdot \text{inertia})$$
$$x_{\text{new}} = x + \text{inertia}$$

In [8]:
def nesterov(x0, f, gradient, alpha, num_iters, tol, beta=0.9):
    """Minimize ``f`` with a momentum step whose gradient is taken at a look-ahead point.

    Every iteration updates
    ``inertia = beta * inertia - alpha * gradient(x + beta * inertia)`` and
    proposes ``x_new = x + inertia``. It stops early once the objective value
    changes by less than ``tol`` between the current and proposed point.

    Parameters
    ----------
    x0 : starting point.
    f : callable returning the objective value at a point.
    gradient : callable returning the gradient of ``f`` at a point.
    alpha : step size applied to the gradient.
    num_iters : maximum number of iterations.
    tol : threshold on ``abs(f(x_new) - f(x))`` used as the stopping test.
    beta : factor applied to the previous inertia (default 0.9).

    Returns
    -------
    dict
        ``'converged'`` - True if the stopping test passed within the budget.
        ``'iter'`` - zero-based index of the last iteration executed
        (0 if ``num_iters <= 0`` and no iteration ran).
        ``'x'`` - last accepted point. When the stopping test passes, the
        proposed step is not applied, so this is the point before it.
    """
    result = {}
    result['converged'] = False

    x = x0
    inertia = 0  # accumulated step; starts at rest
    # Bind the counter up front: with an empty range the for-loop never
    # assigns it, and reading it below would raise UnboundLocalError.
    i = 0
    for i in range(num_iters):
        # Same update as plain momentum, except the gradient is evaluated
        # at the look-ahead point x + beta*inertia.
        inertia = beta * inertia - alpha * gradient(x + beta*inertia)
        x_new = x + inertia
        if abs(f(x_new) - f(x)) < tol:
            result['converged'] = True
            break
        x = x_new

    result['iter'] = i
    result['x'] = x

    return result

In [9]:
nesterov(x0, f, gradient, alpha=alpha, num_iters=num_iters, tol=tol, beta=0.9)

{'converged': True,
 'iter': 172,
 'x': array([ 1.05179270e-03, -2.69569084e-06])}

$$m = \beta_1 \cdot m + (1 - \beta_1) \cdot \nabla f(x)$$
$$v = \beta_2 \cdot v + (1 - \beta_2) \cdot (\nabla f(x))^2$$

$$\hat{m} = \frac{m}{1 - \beta_1^i}$$
$$\hat{v} = \frac{v}{1 - \beta_2^i}$$


$$x_{\text{new}} = x - \alpha \cdot \frac{\hat{m}}{\sqrt{\hat{v}} + \epsilon}$$

In [25]:
def adam(x0, f, gradient, num_iters, tol, beta1, beta2, eps, alpha):
    """Minimize ``f`` with the Adam update rule.

    Every iteration ``i`` (counted from 1) updates running averages of the
    gradient (``m``) and of its square (``v``), divides each by
    ``1 - beta**i``, and proposes
    ``x_new = x - alpha * m_hat / (sqrt(v_hat) + eps)``. It stops early once
    the objective value changes by less than ``tol`` between the current and
    proposed point.

    Parameters
    ----------
    x0 : starting point.
    f : callable returning the objective value at a point.
    gradient : callable returning the gradient of ``f`` at a point.
    num_iters : maximum number of iterations.
    tol : threshold on ``abs(f(x_new) - f(x))`` used as the stopping test.
    beta1 : decay factor of the running gradient average ``m``.
    beta2 : decay factor of the running squared-gradient average ``v``.
    eps : constant added to ``sqrt(v_hat)`` in the denominator.
    alpha : step size.

    Returns
    -------
    dict
        ``'converged'`` - True if the stopping test passed within the budget.
        ``'iter'`` - one-based number of the last iteration executed
        (0 if ``num_iters <= 0`` and no iteration ran).
        ``'x'`` - last accepted point. When the stopping test passes, the
        proposed step is not applied, so this is the point before it.
    """
    result = {}
    result['converged'] = False

    x = x0
    m = 0  # running average of the gradient
    v = 0  # running average of the squared gradient
    # Bind the counter up front: with an empty range the for-loop never
    # assigns it, and reading it below would raise UnboundLocalError.
    i = 0
    # Counting from 1 keeps 1 - beta**i non-zero in the first iteration.
    for i in range(1, num_iters+1):
        grad = gradient(x)

        m = beta1 * m + (1 - beta1) * grad
        v = beta2 * v + (1 - beta2) * grad**2

        # Bias correction: m and v start at 0.
        m_hat = m / (1 - beta1**i)
        v_hat = v / (1 - beta2**i)

        x_new = x - alpha * m_hat / (np.sqrt(v_hat) + eps)

        if abs(f(x_new) - f(x)) < tol:
            result['converged'] = True
            break
        x = x_new

    result['iter'] = i
    result['x'] = x

    return result

In [32]:
adam(
    x0,
    f,
    gradient,
    num_iters=num_iters,
    tol=tol,
    beta1=0.9,
    beta2=0.999,
    eps=1e-6,
    alpha=0.5,
)

{'converged': True, 'iter': 4117, 'x': array([4.11236404e-02, 1.14925986e-09])}