# Adaptive methods of Gradient Decent

In [1]:
import import_ipynb
import compute_gradients
import numpy as np
import math
import random

# Prepare Dataset
data = np.random.rand(500,2) # 500 rows and 2 columns

# Initiliaze the weights and bias
theta = np.zeros(2)

importing Jupyter notebook from compute_gradients.ipynb


In [13]:
# Setting Learning Rate adaptively using Adagrad
def AdaGrad(data, theta, lr = 1e-2, epsilon = 1e-8, num_iterations = 10000):
    
    # Initiliaze the gradient sum
    gradient_sum = np.zeros(theta.shape[0])
    
    for t in range(num_iterations):
        # Compute Gradient
        gradients = compute_gradients.compute_gradients(data,theta)
        # Get the sum of gradient squares in each iterations
        gradient_sum += gradients ** 2
        # Update the gradient
        gradient_update = gradients/np.sqrt(gradient_sum + epsilon)
        # Update the Model Parameters
        theta = theta - (lr*gradient_update)
        
    return theta
    
AdaGrad(data,theta,lr = 1e-2, epsilon = 1e-8, num_iterations = 10000)

array([ 2.29600325, -2.25983234])

In [14]:
# Doing away with learning rate using Adadelta
def AdaDelta(data, theta, gamma = 0.9, epsilon = 1e-8, num_iterations = 1000):
    
    # Initiliaze the Running Average of gradients
    E_grad2 = np.zeros(theta.shape[0])
    # Initiliaze the average of parameter update
    E_delta_theta2 = np.zeros(theta.shape[0])
    
    for t in range(num_iterations):
        # Compute Gradient
        gradients = compute_gradients.compute_gradients(data,theta)
        # Update the running average of gradients
        E_grad2 = (gamma * E_grad2) + ((1 - gamma) * (gradients ** 2))
        # Compute the units of parameter updates
        delta_theta = - (np.sqrt(E_delta_theta2 + epsilon) / np.sqrt(E_grad2 + epsilon)) * gradients
        # Compute the running average of parameter update
        E_delta_theta2 = (gamma * E_delta_theta2) + ((1-gamma)*(delta_theta ** 2))
        
        # Update the Model Parameters
        theta = theta + delta_theta
        
    return theta

AdaDelta(data, theta)

array([ 0.64107713, -0.45835509])

In [15]:
# Overcoming limitations of Adagrad using RMSProp
def RMSProp(data, theta, lr = 1e-2, gamma = 0.9, epsilon = 1e-8, num_iterations = 1000):
    
    # Initiliaze the Running Average of gradients
    E_grad2 = np.zeros(theta.shape[0])
    
    for t in range(num_iterations):
        # Compute Gradient
        gradients = compute_gradients.compute_gradients(data,theta)
        # Update the running average of gradients
        E_grad2 = (gamma * E_grad2) + ((1 - gamma) * (gradients ** 2))
        
        # Update the Model Parameters
        theta = theta - (lr / (np.sqrt(E_grad2 + epsilon)) * gradients)
        
    return theta

RMSProp(data,theta)

array([ 10.26238654, -10.02607312])

In [16]:
# Adaptive Moment Estimation
def Adam(data, theta, lr = 1e-2, beta1 = 0.9, beta2 = 0.9, epsilon = 1e-6, num_iterations = 1000):
    
    # Initiliaze the first and second moments
    mt = np.zeros(theta.shape[0])
    vt = np.zeros(theta.shape[0])
    
    for t in range(num_iterations):
        # Compute Gradient
        gradients = compute_gradients.compute_gradients(data,theta)
        # Update the first moment
        mt = (beta1 * mt) + ((1-beta1)*(gradients))
        # Update the second moment
        vt = (beta2 * vt) + ((1-beta2)*(gradients ** 2))
        # Compute the bias corrected estimate of first moment
        mt_hat = mt / (1 - beta1 ** (t+1))
        # Compute the bias corrected estimate of second moment
        vt_hat = vt / (1 - beta2 ** (t+1))
        
        # Update the Model Parameters
        theta = theta - (lr / (np.sqrt(vt_hat) + epsilon)) * mt_hat
        
    return theta

Adam(data,theta)

array([ 9.99450953, -9.93641256])

In [3]:
# Adaptive Moment estimation with AMSGrad
def AMSGrad(data, theta, lr = 1e-2, beta1 = 0.9, beta2 = 0.9, epsilon = 1e-6, num_iterations = 1000):
    
    # Initiliaze the first and second moments
    mt = np.zeros(theta.shape[0])
    vt = np.zeros(theta.shape[0])
    # Initiliaze the second order biased corrected estimates
    vt_hat = np.zeros(theta.shape[0])
    
    for t in range(num_iterations):
        # Compute Gradient
        gradients = compute_gradients.compute_gradients(data,theta)
        # Update the first moment
        mt = (beta1 * mt) + ((1-beta1)*(gradients))
        # Update the second moment
        vt = (beta2 * vt) + ((1-beta2)*(gradients ** 2))
        # Compute the bias corrected estimate of first moment
        mt_hat = mt / (1 - beta1 ** (t+1))
        # Compute the bias corrected estimate of second moment
        vt_hat = np.maximum(vt,vt_hat)
        
        # Update the Model Parameters
        theta = theta - (lr / (np.sqrt(vt_hat) + epsilon)) * mt_hat
        
    return theta

AMSGrad(data,theta)
        

array([ 10.08599393, -10.01494278])

In [7]:
# NADAM - Adding NAG to ADAM
def nadam(data, theta, lr = 1e-2, beta1 = 0.9, beta2 = 0.9, epsilon = 1e-6, num_iterations = 1000):
    
    # Initiliaze the first and second moments
    mt = np.zeros(theta.shape[0])
    vt = np.zeros(theta.shape[0])
    # To be use in mt_hat
    beta_prod = 1
    
    for t in range(num_iterations):
        # Compute Gradient
        gradients = compute_gradients.compute_gradients(data,theta)
        # Update the first moment
        mt = (beta1 * mt) + ((1-beta1)*(gradients))
        # Update the second moment
        vt = (beta2 * vt) + ((1-beta2)*(gradients ** 2))
        # Update the beta_prod
        beta_prod = beta_prod * beta1
        # Compute the mt_hat by adding NAG in first moment estimation
        mt_hat = mt / 1 - beta_prod
        g_hat = gradients / 1 - beta_prod
        vt_hat = vt / (1 - beta2 ** (t))
        mt_tilde = ((1-beta1**t+1) * mt_hat) + ((beta1**t)*g_hat)
        
        # Update the Model Parameters
        theta = theta - (lr / (np.sqrt(vt_hat) + epsilon)) * mt_tilde
        
        return theta
    
nadam(data, theta)
        



array([0., 0.])