<a href="https://colab.research.google.com/github/thapaliya123/cat_dog_predictions/blob/master/optimizers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#gradient_descent_update_rule
def update_parameters(parameters, grads, learning_rate):
    """
    Apply one plain gradient-descent step to every layer's weights and biases.

    Arguments:
    parameters -- python dictionary holding "W1", "b1", ..., "WL", "bL"
    grads -- python dictionary holding the matching gradients
             "dW1", "db1", ..., "dWL", "dbL"
    learning_rate -- step size that multiplies each gradient

    Returns:
    parameters -- the same dictionary object, where every entry has been rebound to
                  parameters[key] - learning_rate * grads["d" + key]
    """
    # Every layer contributes exactly two entries (one "W", one "b").
    num_layers = len(parameters) // 2

    # Layers are keyed starting from 1, so iterate over 1..num_layers directly.
    for layer in range(1, num_layers + 1):
        for prefix in ("W", "b"):
            key = prefix + str(layer)
            parameters[key] = parameters[key] - learning_rate * grads["d" + key]

    return parameters

In [None]:

import numpy as np
#initialization_for_adam_optimization
def initialize_adam(parameters):
  """
  Build the two zero-filled accumulator dictionaries used by Adam.

  Both dictionaries use the gradient keys "dW1", "db1", ..., "dWL", "dbL".
  Each value is a float64 numpy array of zeros with the same shape as the
  corresponding entry of `parameters`. The full shape is copied, so
  parameters of any rank (1-D biases, 2-D dense weights, 4-D convolution
  kernels, ...) are supported.

  Arguments:
  parameters -- python dictionary containing your parameters
                ("W1", "b1", ..., "WL", "bL").

  Returns:
  v -- python dictionary that will contain the exponentially weighted average of the gradients
    v["dW"+str(l)]=....
    v["db"+str(l)]=....

  s -- python dictionary that will contain the exponentially weighted average of the square of the gradients
    s["dW"+str(l)]=....
    s["db"+str(l)]=....
  """
  # Every layer contributes exactly two entries (one "W", one "b").
  num_layers = len(parameters) // 2
  v = {}
  s = {}
  for layer in range(1, num_layers + 1):
    for prefix in ("W", "b"):
      param_key = prefix + str(layer)
      grad_key = "d" + param_key
      # Use the complete shape rather than only the first two axes so that
      # non-2-D parameters get a correctly shaped accumulator.
      shape = np.shape(parameters[param_key])
      # v and s must be distinct arrays: they are updated independently.
      v[grad_key] = np.zeros(shape)
      s[grad_key] = np.zeros(shape)
  return v, s


In [None]:
#adam_optimizer_update_rules
def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8):
  """
  Update parameters using one step of the Adam optimizer.

  For every key "W1", "b1", ..., "WL", "bL" the step is:
    v = beta1 * v + (1 - beta1) * grad
    s = beta2 * s + (1 - beta2) * grad ** 2
    v_hat = v / (1 - beta1 ** t)
    s_hat = s / (1 - beta2 ** t)
    param = param - learning_rate * v_hat / (sqrt(s_hat) + epsilon)

  Arguments:
  parameters -- python dictionary containing your parameters
  grads -- python dictionary containing your gradients ("dW1", "db1", ...)
  v -- Adam variable, moving average of the gradients (first moment), python dictionary
  s -- Adam variable, moving average of the squared gradients (second moment), python dictionary
  t -- Adam step counter used for bias correction; must be >= 1
  learning_rate -- the learning_rate scalar.
  beta1 -- Exponential decay hyperparameter for the first moment estimates
  beta2 -- Exponential decay hyperparameter for the second moment estimates
  epsilon -- hyperparameter preventing division by zero in the Adam updates

  Returns:
  parameters -- python dictionary containing your updated parameters
  v -- Adam variable, updated moving average of the gradients, python dictionary
  s -- Adam variable, updated moving average of the squared gradients, python dictionary

  The `parameters`, `v` and `s` dictionaries passed in are updated in place
  (their entries are rebound) and the same objects are returned.

  Raises:
  ValueError -- if t < 1, because the bias-correction denominators
                1 - beta ** t would be zero at t == 0.
  """
  if t < 1:
    raise ValueError("t must be >= 1 for Adam bias correction, got {!r}".format(t))

  # Every layer contributes exactly two entries (one "W", one "b").
  num_layers = len(parameters) // 2

  # The bias-correction denominators depend only on t, so compute them once.
  first_moment_correction = 1 - np.power(beta1, t)
  second_moment_correction = 1 - np.power(beta2, t)

  for layer in range(1, num_layers + 1):
    for prefix in ("W", "b"):
      param_key = prefix + str(layer)
      grad_key = "d" + param_key
      grad = grads[grad_key]

      # Moving averages of the gradient and of the squared gradient.
      v[grad_key] = (beta1 * v[grad_key]) + ((1 - beta1) * grad)
      s[grad_key] = beta2 * s[grad_key] + (1 - beta2) * np.power(grad, 2)

      # Bias-corrected moment estimates.
      v_corrected = v[grad_key] / first_moment_correction
      s_corrected = s[grad_key] / second_moment_correction

      # epsilon is added AFTER the square root, as in the Adam update rule;
      # adding it inside the root would inflate the effective stabiliser
      # to sqrt(epsilon).
      parameters[param_key] = parameters[param_key] - learning_rate * (
          v_corrected / (np.sqrt(s_corrected) + epsilon)
      )
  return parameters, v, s