In [None]:
def adagrad(eta0=1e-2, epsilon=1e-8):
    """Generator that turns a stream of gradients into AdaGrad learning rates.

    Protocol: prime the generator with ``next(gen)``; ``gen.send(gradient)``
    then returns the learning rate for that gradient. Call ``next(gen)``
    once more before sending the following gradient.

    Parameters
    ----------
    eta0 : float, optional
        Initial learning rate.
    epsilon : float, optional
        Small constant added under the square root as a safe-guard against
        division by zero.

    Yields
    ------
    ``eta0 / sqrt(epsilon + sum of squared gradients so far)``, with the
    same shape as the gradients that were sent in.
    """
    # accumulated sum of squared gradients
    g2_t = 0
    while True:
        gradient = yield
        # Rebind instead of adding in place, so an integer-typed first
        # gradient does not fix the accumulator's dtype for later steps.
        g2_t = g2_t + gradient**2
        yield eta0 / np.sqrt(epsilon + g2_t)
    
def rmsprop(eta0=1e-3, beta=0.9, epsilon=1e-8):
    """Generator that turns a stream of gradients into RMSProp learning rates.

    Protocol: prime the generator with ``next(gen)``; ``gen.send(gradient)``
    then returns the learning rate for that gradient. Call ``next(gen)``
    once more before sending the following gradient.

    Parameters
    ----------
    eta0 : float, optional
        Initial learning rate.
    beta : float, optional
        Decay constant of the moving average of the squared gradient.
    epsilon : float, optional
        Small constant added under the square root as a safe-guard against
        division by zero.

    Yields
    ------
    ``eta0 / sqrt(epsilon + moving average of squared gradients)``, with the
    same shape as the gradients that were sent in.
    """
    # moving average of second moment of gradient
    g2_t_avg = 0

    while True:
        gradient = yield
        g2_t_avg = beta * g2_t_avg + (1 - beta) * gradient**2
        yield eta0 / np.sqrt(epsilon + g2_t_avg)
        
def adam(eta0=1e-3, beta1=0.9, beta2=0.999, epsilon=1e-8):
    """Generator that turns a stream of gradients into Adam step factors.

    Protocol: prime the generator with ``next(gen)``; ``gen.send(gradient)``
    then returns a factor ``rate`` for that gradient. Call ``next(gen)``
    once more before sending the following gradient.

    The factor is built so that ``rate * gradient`` equals the Adam step
    ``eta0 * m_hat / (sqrt(v_hat) + epsilon)``. This lets a caller that
    applies a plain SGD-style update (learning rate times gradient) perform
    an Adam update.

    Parameters
    ----------
    eta0 : float, optional
        Initial learning rate.
    beta1 : float, optional
        Decay constant for the moving average of the gradient (first moment).
    beta2 : float, optional
        Decay constant for the moving average of the squared gradient
        (second moment).
    epsilon : float, optional
        Small constant added to the denominator as a safe-guard against
        division by zero.

    Yields
    ------
    The per-component factor described above, with the same shape as the
    gradient. Components whose gradient is exactly zero get a factor of 0,
    because a "rate times gradient" update cannot move them anyway.
    """
    # moving average of first moment of gradient
    g_t_avg = 0
    # moving average of second moment of gradient
    g2_t_avg = 0
    # number of gradients seen so far; drives the bias correction
    t = 0

    while True:
        gradient = yield
        t += 1
        g_t_avg = beta1 * g_t_avg + (1 - beta1) * gradient
        g2_t_avg = beta2 * g2_t_avg + (1 - beta2) * gradient**2

        # Bias correction uses the decay constants raised to the current
        # step count, not a fixed power of two.
        g_t_avg_corr = g_t_avg / (1 - beta1**t)
        g2_t_avg_corr = g2_t_avg / (1 - beta2**t)

        # The actual Adam step for this gradient.
        step = eta0 * g_t_avg_corr / (np.sqrt(g2_t_avg_corr) + epsilon)

        # Divide out the gradient so that rate * gradient == step. Zero
        # gradient components are swapped for 1 in the divisor and then
        # masked to 0, which avoids inf/nan from a division by zero.
        grad = np.asarray(gradient, dtype=float)
        nonzero = grad != 0
        rate = np.where(nonzero, step / np.where(nonzero, grad, 1.0), 0.0)
        # [()] turns a 0-d result back into a scalar; arrays pass through.
        yield rate[()]

In [None]:
# Design matrices: a column of ones placed in front of the raw features (x)
# and of the scaled features (x_scaled). X_scaled is built here but is not
# used again within this cell.
X, X_scaled = (
    np.c_[np.ones((m, 1)), x],
    np.c_[np.ones((m, 1)), x_scaled],
)

# Configure the SGD estimator. NOTE(review): validation_fraction=0.2
# presumably reserves 20% of the data for early stopping -- confirm against
# the StochasticGradientDescent class definition.
sgd_own = StochasticGradientDescent(
    batches=5,
    momentum=0.6,
    tol=1e-4,
    early_stopping=True,
    validation_fraction=0.2,
    verbose=True,
)

# Fit on the unscaled design matrix with a flattened target vector, then
# print the params attribute and the score on that same data.
sgd_own.fit(X, y.ravel())
print(sgd_own.params)
print(sgd_own.score(X, y.ravel()))