In [1]:
import numpy as np

In [3]:
def make_diagonal(x):
    """
    Convert a vector into a diagonal matrix.

    Parameters
    ----------
    x : sequence or 1-D array of real numbers, length n

    Returns
    -------
    numpy.ndarray of shape (n, n), dtype float64, with ``x`` on the main
    diagonal and zeros elsewhere. An empty ``x`` yields a (0, 0) array.
    """
    values = np.asarray(x, dtype=float)
    n = len(values)
    m = np.zeros((n, n))
    # One vectorized assignment over the diagonal; with n == 0 the index
    # arrays are empty, so nothing is written and no row is ever indexed.
    idx = np.arange(n)
    m[idx, idx] = values
    return m

In [4]:
def normalize(X, axis=-1, order=2):
    """
    Scale X so that its slices along ``axis`` have unit norm.

    The norm of type ``order`` is computed along ``axis`` with
    ``np.linalg.norm``. Slices whose norm is exactly zero are divided by 1
    instead, so they are returned unchanged rather than producing a
    division by zero.
    """
    norms = np.atleast_1d(np.linalg.norm(X, ord=order, axis=axis))
    # Replace zero norms with 1 so all-zero slices do not divide by zero.
    zero_mask = norms == 0
    norms[zero_mask] = 1
    # Re-insert the reduced axis so the norms broadcast against X.
    divisor = np.expand_dims(norms, axis)
    return X / divisor

In [14]:
class StochasticGradientDescent():
    """Gradient descent with an optional exponentially averaged momentum term."""

    def __init__(self, learning_rate=0.01, momentum=0):
        self.learning_rate = learning_rate
        self.momentum = momentum
        # Running update direction; allocated on the first call to update().
        self.w_updt = None

    def update(self, w, grad_wrt_w):
        """Return the weights after one step against the averaged gradient."""
        # Lazily create the running update with the same shape as w.
        if self.w_updt is None:
            self.w_updt = np.zeros(np.shape(w))
        # Blend the previous update with the new gradient, weighted by
        # momentum and (1 - momentum) respectively.
        carried_over = self.momentum * self.w_updt
        fresh = (1 - self.momentum) * grad_wrt_w
        self.w_updt = carried_over + fresh
        # Step opposite to the update direction to reduce the loss.
        step = self.learning_rate * self.w_updt
        return w - step
    


In [15]:
class Adagrad():
    """Adagrad optimizer: scales each step by the accumulated squared gradients."""

    def __init__(self, learning_rate=0.01):
        self.learning_rate = learning_rate
        # Small constant added under the square root to avoid dividing by zero.
        self.eps = 1e-8
        # Running sum of squared gradients; allocated on the first update().
        self.G = None

    def update(self, w, grad_wrt_w):
        """Return the weights after one Adagrad step."""
        # Lazily create the accumulator with the same shape as w.
        if self.G is None:
            self.G = np.zeros(np.shape(w))
        # Accumulate the element-wise squared gradient in place.
        self.G += np.power(grad_wrt_w, 2)
        # Entries with a large gradient history get a smaller effective step.
        scale = np.sqrt(self.G + self.eps)
        return w - self.learning_rate * grad_wrt_w / scale

In [16]:
class Adam():
    """
    Adam optimizer (adaptive moment estimation).

    Keeps exponentially decaying averages of the gradient (first moment,
    ``m``) and of the squared gradient (second moment, ``v``), corrects both
    for their zero initialisation, and scales each step by
    ``m_hat / (sqrt(v_hat) + eps)``.

    Parameters
    ----------
    learning_rate : float
        Step size.
    b1 : float
        Decay rate of the first-moment estimate.
    b2 : float
        Decay rate of the second-moment estimate.
    """

    def __init__(self, learning_rate=0.001, b1=0.9, b2=0.999):
        self.learning_rate = learning_rate
        self.eps = 1e-8
        # Moment estimates; allocated on the first call to update().
        self.m = None
        self.v = None
        # Decay rates
        self.b1 = b1
        self.b2 = b2
        # Number of updates performed so far, used for bias correction.
        self.t = 0
        # Most recent step that was subtracted from the weights.
        self.w_updt = None

    def update(self, w, grad_wrt_w):
        """
        Return the weights after one Adam step.

        Parameters
        ----------
        w : array_like
            Current weights.
        grad_wrt_w : array_like
            Gradient of the loss with respect to ``w``.

        Returns
        -------
        The updated weights, ``w - self.w_updt``.
        """
        # If not initialized
        if self.m is None:
            self.m = np.zeros(np.shape(grad_wrt_w))
            self.v = np.zeros(np.shape(grad_wrt_w))
            self.t = 0
        self.t += 1

        self.m = self.b1 * self.m + (1 - self.b1) * grad_wrt_w
        # The second moment tracks the SQUARED gradient; using the raw
        # gradient lets v go negative and makes the sqrt below return NaN.
        self.v = self.b2 * self.v + (1 - self.b2) * np.power(grad_wrt_w, 2)

        # Bias correction depends on the step count: the averages start at
        # zero, and after t steps they are scaled by (1 - b**t).
        m_hat = self.m / (1 - self.b1 ** self.t)
        v_hat = self.v / (1 - self.b2 ** self.t)

        self.w_updt = self.learning_rate * m_hat / (np.sqrt(v_hat) + self.eps)
        return w - self.w_updt