For this project we will:
    - implement SGD
    - implement varying step sizes
    - implement batch SGD
    - explore other variations of acceleration

In [None]:
import numpy as np

In [None]:
def f1(x):
    """Evaluate f(x, y, z) = y^2 + z.

    Args:
        x: 2-D array whose rows 0, 1, 2 hold the x, y, z coordinates.

    Returns:
        Array with a single row (the row axis is kept by the slicing) holding
        ``y**2 + z`` for each column of ``x``.
    """
    y_row = x[1:2, :]
    z_row = x[2:3, :]
    return y_row ** 2 + z_row

def df1(x):
    """Return the gradient of f1(x, y, z) = y^2 + z as a column vector.

    Args:
        x: array that squeezes to a 1-D vector (x, y, z), e.g. a 3x1 column.

    Returns:
        3x1 array holding the partial derivatives (0, 2*y, 1).
    """
    coords = x.squeeze()
    # Build a single row and transpose it into a column.
    return np.array([[0, 2 * coords[1], 1]]).T

def f2(x):
    """Evaluate f(x, y, z) = x * y.

    Args:
        x: 2-D array whose rows 0 and 1 hold the x and y coordinates.

    Returns:
        Array with a single row (the row axis is kept by the slicing) holding
        the elementwise product of rows 0 and 1.
    """
    x_row, y_row = x[0:1, :], x[1:2, :]
    return x_row * y_row

def df2(x):
    """Return the gradient of f2(x, y, z) = x * y as a column vector.

    Args:
        x: array that squeezes to a 1-D vector (x, y, z), e.g. a 3x1 column.

    Returns:
        3x1 array holding the partial derivatives (y, x, 0).
    """
    coords = x.squeeze()
    # f2 does not depend on z, so the third partial derivative is 0.
    return np.array([[coords[1], coords[0], 0]]).T

def rv(values):
    """Wrap ``values`` in one extra leading axis of length 1.

    For a flat sequence of n numbers this yields a 1 x n row vector.
    """
    return np.array(values)[np.newaxis]

def cv(values):
    """Build a column vector: the transpose of ``rv(values)``.

    For a flat sequence of n numbers this yields an n x 1 array.
    """
    row = rv(values)
    return row.T

In [None]:
def mse(X, Y, th, th0):
    """Mean squared error of the linear-regression prediction against ``Y``.

    The prediction comes from ``lin_reg(X, th, th0)``. The squared residuals
    are averaged along axis 0 and that axis is kept with length 1.
    NOTE(review): the data layout expected by ``lin_reg`` is not visible
    here -- confirm that axis 0 is the intended averaging axis.
    """
    residual = Y - lin_reg(X, th, th0)
    return np.mean(residual ** 2, axis=0, keepdims=True)

def ridge_obj(X, Y, th, th0, lam):
    """Ridge objective: ``mse(X, Y, th, th0)`` plus ``lam * ||th||^2``.

    Only ``th`` enters the penalty; the offset ``th0`` is not regularized.
    """
    penalty = lam * np.linalg.norm(th) ** 2
    return mse(X, Y, th, th0) + penalty

def J(Xi, yi, w):
    """Evaluate the unregularized ridge objective in 1-augmented form.

    Translates from the (1-augmented X, y, theta) format to the separated
    (X, y, th, th0) format used by ``ridge_obj``: the last column of ``Xi``
    is dropped, the last row of ``w`` becomes the offset and the remaining
    rows become the weights. The regularization strength is fixed at 0.
    """
    features = Xi[:, :-1]
    weights, offset = w[:-1, :], w[-1:, :]
    return ridge_obj(features, yi, weights, offset, 0)

def dJ(Xi, yi, w):
    """Numerically estimate the gradient of ``J(Xi, yi, .)`` at ``w``.

    Freezes the data arguments, hands the resulting one-argument function to
    ``make_num_grad_fn`` and evaluates the returned gradient function at ``w``.
    """
    numeric_grad = make_num_grad_fn(lambda weights: J(Xi, yi, weights))
    return numeric_grad(w)

In [None]:
def objective_func(X, Y, lam):
    """Build the ridge-regularized mean-squared-error objective for fixed data.

    Args:
        X: data matrix, used as ``X @ theta``
            (presumably one example per row -- confirm against callers).
        Y: targets; ``len(Y)`` is the divisor of the squared-error term.
        lam: weight on the squared-norm penalty ``theta.T @ theta``.

    Returns:
        A function ``J(theta)`` that returns the objective as a Python float.
    """
    def J(theta):
        err = X @ theta - Y
        reg = lam * (theta.T @ theta)
        total = (err.T @ err) / len(Y) + reg
        # With column-vector inputs `total` is a (1, 1) array. Calling float()
        # on an array with ndim > 0 is deprecated in NumPy, so pull out the
        # single element explicitly; this also handles 0-d / scalar results.
        return float(np.asarray(total).item())

    return J

def objective_func_grad(X, Y, lam):
    """Build the gradient function for fixed data ``X``, ``Y`` and weight ``lam``.

    Returns:
        A function ``dJ(theta)`` computing
        ``(2 / len(Y)) * X.T @ (X @ theta - Y) + 2 * lam * theta``.
    """
    def dJ(theta):
        residual = X @ theta - Y
        # Scale X.T first, then multiply -- the same evaluation order as the
        # left-to-right `2/len(Y)*X.T@(...)` expression.
        data_term = (2 / len(Y) * X.T) @ residual
        penalty_term = 2 * lam * theta
        return data_term + penalty_term

    return dJ

In [None]:
def make_num_grad_fn(f, delta=1e-6):
    """Build a central-difference numerical gradient function for ``f``.

    Args:
        f: function of a 2-D array whose result supports ``.item()``
            (i.e. holds a single element).
        delta: perturbation applied to one entry at a time.

    Returns:
        A function ``df_num(x)`` that, for each row ``i`` of ``x``, perturbs
        entry ``[i, 0]`` by ``+delta`` and ``-delta`` and returns the slopes
        ``(f(x + e) - f(x - e)) / (2 * delta)`` stacked via ``cv``. Only
        column 0 of ``x`` is perturbed.
    """
    def df_num(x):
        def slope_along(i):
            step = np.zeros_like(x, dtype=float)
            step[i, 0] = delta
            slope = (f(x + step) - f(x - step)) / (2 * delta)
            return slope.item()

        return cv([slope_along(i) for i in range(x.shape[0])])

    return df_num

In [None]:
def sgd(X, y, J, dJ, w0, step_size_fn, num_iter):
    """Run stochastic gradient descent with one random example per step.

    Args:
        X: data, indexed by example along the first axis (``X[i]``).
        y: targets; ``len(y)`` is the number of examples, indexed as ``y[i]``.
        J: objective function. Accepted for interface compatibility; the
            update loop does not call it.
        dJ: gradient function called as ``dJ(Xi, yi, w)``, where ``Xi`` is the
            sampled example wrapped in a leading axis of length 1.
        w0: initial weights. Not modified.
        step_size_fn: maps the iteration index (starting at 0) to a step size.
        num_iter: number of update steps.

    Returns:
        The weights after ``num_iter`` updates, as a new floating-point array.
    """
    n = len(y)
    # Work on a copy so the caller's w0 is not mutated by the in-place update,
    # and promote integer weights so `-=` with a float gradient cannot fail.
    w = np.array(w0)
    if not np.issubdtype(w.dtype, np.inexact):
        w = w.astype(float)
    for i in range(num_iter):
        idx = np.random.randint(n)
        Xi = np.array([X[idx]])  # same leading-axis wrapping that rv() applies
        w -= step_size_fn(i) * dJ(Xi, y[idx], w)
    return w