In [3]:
import numpy as np
import math 

############################################################
# Optimization problem

# Ground-truth weights used to synthesize the regression targets.
trueW = np.array([1, 2, 3, 4, 5])


def generate():
    """Draw one synthetic example ``(x, y)``.

    ``x`` holds one standard-normal sample per component of ``trueW``;
    ``y`` is the dot product of ``trueW`` with ``x`` plus one further
    standard-normal noise sample.
    """
    features = np.random.randn(len(trueW))
    noise = np.random.randn()
    target = np.dot(trueW, features) + noise
    return features, target


# Training set: 1000 independent calls to generate().
trainExamples = [generate() for _ in range(1000)]

def phi(x):
    """Feature map: return the input ``x`` converted to a NumPy array."""
    features = np.array(x)
    return features

def initialWeightVector():
    """Return an all-zeros weight vector with one entry per component of ``trueW``."""
    dimension = len(trueW)
    return np.zeros(dimension)

def trainLoss(w):
    """Average squared residual of weights ``w`` over all of ``trainExamples``."""
    scale = 1.0 / len(trainExamples)
    total = 0
    for x, y in trainExamples:
        residual = w.dot(phi(x)) - y
        total = total + residual ** 2
    return scale * total

def gradientTrainLoss(w):
    """Gradient with respect to ``w`` of the average squared residual over ``trainExamples``."""
    scale = 1.0 / len(trainExamples)
    total = 0
    for x, y in trainExamples:
        features = phi(x)
        residual = w.dot(features) - y
        # Per-example gradient of the squared residual: 2 * residual * phi(x).
        total = total + 2 * residual * features
    return scale * total

def loss(w, i):
    """Squared residual of weights ``w`` on the ``i``-th entry of ``trainExamples``."""
    inputs, target = trainExamples[i]
    prediction = w.dot(phi(inputs))
    return (prediction - target) ** 2

def gradientLoss(w, i):
    """Gradient with respect to ``w`` of the squared residual on the ``i``-th training example."""
    inputs, target = trainExamples[i]
    features = phi(inputs)
    residual = w.dot(features) - target
    return 2 * residual * features

############################################################
# Optimization algorithm

def gradientDescent(F, gradientF, initialWeightVector, eta=0.1, numIters=500):
    """Minimize ``F`` with fixed-step gradient descent and return the final weights.

    Args:
        F: callable mapping a weight vector to the objective value.
        gradientF: callable mapping a weight vector to the gradient of ``F``.
        initialWeightVector: zero-argument callable returning the starting weights.
        eta: constant step size (default 0.1).
        numIters: number of update steps to take (default 500).

    Returns:
        The weights after ``numIters`` updates (the starting weights when
        ``numIters`` is 0).
    """
    w = initialWeightVector()
    for t in range(numIters):
        value = F(w)
        gradient = gradientF(w)
        w = w - eta * gradient
        # The reported objective and gradient were evaluated before this
        # step's update; the reported w is the post-update value.
        print(f'epoch {t}: w = {w}, F(w) = {value}, gradientF = {gradient}')
    return w

def stochasticGradientDescent(f, gradientf, n, initialWeightVector, numEpochs=500):
    """Minimize a sum of per-example losses with SGD and return the final weights.

    Each epoch visits example indices ``0 .. n-1`` in order. The step size
    for the k-th update overall (counting from 1) is ``1 / sqrt(k)``.

    Args:
        f: callable ``f(w, i)`` giving the loss of weights ``w`` on example ``i``.
        gradientf: callable ``gradientf(w, i)`` giving the gradient of that loss.
        n: number of examples; indices ``range(n)`` are passed to ``f``/``gradientf``.
        initialWeightVector: zero-argument callable returning the starting weights.
        numEpochs: number of passes over the examples (default 500).

    Returns:
        The weights after the last update (the starting weights if no update
        was performed).
    """
    w = initialWeightVector()
    numUpdates = 0
    # Pre-bind so the per-epoch report below still works when n == 0
    # (the inner loop would otherwise never assign these names).
    value = gradient = None
    for t in range(numEpochs):
        for i in range(n):
            value = f(w, i)
            gradient = gradientf(w, i)
            numUpdates += 1
            # Decaying step size: 1/sqrt of the number of updates so far.
            eta = 1.0 / math.sqrt(numUpdates)
            w = w - eta * gradient
        # Note: the values labelled F(w) / gradientF are the loss and gradient
        # of the last example visited this epoch, evaluated before its update.
        print(f'epoch {t}: w = {w}, F(w)  = {value}, gradientF = {gradient}')
    return w

# Alternative run: full-batch gradient descent on the average training loss.
#gradientDescent(trainLoss, gradientTrainLoss, initialWeightVector)
stochasticGradientDescent(
    f=loss,
    gradientf=gradientLoss,
    n=len(trainExamples),
    initialWeightVector=initialWeightVector,
)

epoch 0: w = [1.05296019 1.9647289  2.59362708 4.34739214 5.09384811], F(w)  = 1.0258253140057605, gradientF = [-1.01393343  1.44351618  0.56649065 -0.83226204  3.02171761]
epoch 1: w = [1.04615132 1.96529979 2.71417191 4.28741561 5.10575441], F(w)  = 1.0720724077610915, gradientF = [-1.03653697  1.47569638  0.57911938 -0.85081559  3.08908053]
epoch 2: w = [1.03255603 1.9634665  2.7770284  4.25452455 5.1014431 ], F(w)  = 1.078440557748461, gradientF = [-1.03961094  1.48007273  0.58083683 -0.85333879  3.09824157]
epoch 3: w = [1.02077406 1.96150318 2.81678294 4.2318722  5.0951256 ], F(w)  = 1.076328385493385, gradientF = [-1.03859238  1.47862263  0.58026775 -0.85250273  3.09520606]
epoch 4: w = [1.01115781 1.95961737 2.84467558 4.21469533 5.0891705 ], F(w)  = 1.0723457979471496, gradientF = [-1.03666912  1.47588452  0.57919322 -0.85092407  3.08947437]
epoch 5: w = [1.0033324  1.95788687 2.86556059 4.20098116 5.08400957], F(w)  = 1.0683184373549743, gradientF = [-1.03472061  1.47311046  