# Optimizer (November 17, 2024)

## Optimize the following function

$$ f(w_1, w_2) = 0.2w_1^2 + 2w_2^2 $$

### With `Gradient Descent`

> $$ W = W - \alpha * dW $$

>\begin{align*}
>    W & = [w_1, w_2] \\
>    \Rightarrow dW & = [0.4w_1, 4w_2]
>\end{align*}

Initial values:
$$
w_1 = -5\\
w_2 = -2\\
\alpha = 0.4
$$

Epoch 1:
$$
\begin{align*}
dW & \leftarrow [0.4w_1, 4w_2] \\
   & = [0.4*(-5), 4*(-2)] \\
   & = [-2, -8] \\
W & \leftarrow W - \alpha*dW\\
  & = [-5,-2] - 0.4*[-2,-8]\\
  & = [-4.2, 1.2]
\end{align*}
$$

Epoch 2:
$$
\begin{align*}
dW & \leftarrow [0.4w_1, 4w_2] \\
   & = [0.4*(-4.2), 4*(1.2)] \\
   & = [-1.68, 4.8] \\
W & \leftarrow W - \alpha*dW \\
  & = [-4.2, 1.2] - 0.4*[-1.68, 4.8] \\
  & = [-3.528, -0.72]
\end{align*}
$$

In [None]:
import numpy as np

In [None]:
def df_W(W):
    """Return the gradient vector ``[0.4 * w1, 4 * w2]`` evaluated at ``W``.

    Args:
        W: Weights ``[w1, w2]``; anything that broadcasts against a
            length-2 NumPy array.

    Returns:
        The element-wise product of the fixed coefficients ``[0.4, 4]``
        with ``W``.
    """
    coefficients = np.array([0.4, 4])
    return coefficients * W

In [None]:
def sgd(W, dW, lr):
    """Apply one plain gradient-descent step.

    Args:
        W: Current weights.
        dW: Gradient evaluated at ``W``.
        lr: Learning rate (step size).

    Returns:
        The updated weights ``W - lr * dW``.
    """
    return W - lr * dW

In [None]:
def train_p1(optimizer, lr, epochs):
    """Run ``epochs`` optimizer steps starting from the point ``[-5, -2]``.

    Args:
        optimizer: Callable invoked as ``optimizer(W, dW, lr)`` that returns
            the updated weights.
        lr: Learning rate forwarded to ``optimizer``.
        epochs: Number of update steps to perform.

    Returns:
        A list holding the initial weights followed by the weights obtained
        after each step (``epochs + 1`` entries in total).
    """
    weights = np.array([-5, -2], dtype=np.float32)
    history = [weights]
    for _ in range(epochs):
        # The gradient is always evaluated at the most recent weights.
        weights = optimizer(weights, df_W(weights), lr)
        history.append(weights)
    return history

In [None]:
# Plain gradient descent: learning rate 0.4, 30 update steps.
train_p1(optimizer=sgd, lr=0.4, epochs=30)

[array([-5., -2.], dtype=float32),
 array([-4.2,  1.2]),
 array([-3.528, -0.72 ]),
 array([-2.96352,  0.432  ]),
 array([-2.4893568, -0.2592   ]),
 array([-2.09105971,  0.15552   ]),
 array([-1.75649016, -0.093312  ]),
 array([-1.47545173,  0.0559872 ]),
 array([-1.23937946, -0.03359232]),
 array([-1.04107874,  0.02015539]),
 array([-0.87450614, -0.01209324]),
 array([-0.73458516,  0.00725594]),
 array([-0.61705154, -0.00435356]),
 array([-0.51832329,  0.00261214]),
 array([-0.43539156, -0.00156728]),
 array([-0.36572891,  0.00094037]),
 array([-0.30721229, -0.00056422]),
 array([-0.25805832,  0.00033853]),
 array([-2.16768990e-01, -2.03119913e-04]),
 array([-1.82085951e-01,  1.21871948e-04]),
 array([-1.52952199e-01, -7.31231688e-05]),
 array([-1.28479847e-01,  4.38739013e-05]),
 array([-1.07923072e-01, -2.63243408e-05]),
 array([-9.06553802e-02,  1.57946045e-05]),
 array([-7.61505194e-02, -9.47676268e-06]),
 array([-6.39664363e-02,  5.68605761e-06]),
 array([-5.37318065e-02, -3.41163

### With `Gradient Descent + Momentum`

> \begin{align*}
>  V_t & = \beta V_{t-1}+(1-\beta)dW_t \\
>  W_t & = W_{t-1} - \alpha*V_t
> \end{align*}

>\begin{align*}
>    W & = [w_1, w_2] \\
>    \Rightarrow dW & = [0.4w_1, 4w_2]
>\end{align*}

Initial values:
$$ \begin{align*}
W & = [w_1, w_2] = [-5,-2]\\
V & = [v_1, v_2] = [0, 0]\\
\alpha & = 0.6 \\
\beta & = 0.5
\end{align*} $$

Epoch 1:
$$
\begin{align*}
t & \leftarrow 1\\
dW_1 & \leftarrow [0.4w_1, 4w_2] \\
   & = [0.4*(-5), 4*(-2)] \\
   & = [-2, -8] \\
V_1 & \leftarrow \beta V_{0}+(1-\beta)dW_1\\
    & = 0.5*0 + (1-0.5)*[-2, -8] \\
    & = [-1, -4] \\
W_1 & \leftarrow W_0 - \alpha V_1 \\
    & = [-5, -2] - 0.6*[-1,-4] \\
    & = [-4.4, 0.4]
\end{align*}
$$

Epoch 2:
$$
\begin{align*}
t & \leftarrow 2\\
dW_2 & \leftarrow [0.4w_1, 4w_2] \\
   & = [0.4*(-4.4), 4*(0.4)] \\
   & = [-1.76, 1.6] \\
V_2 & \leftarrow \beta V_{1}+(1-\beta)dW_2\\
    & = 0.5*[-1,-4] + (1-0.5)*[-1.76, 1.6] \\
    & = [-1.38, -1.2 ] \\
W_2 & \leftarrow W_1 - \alpha V_2 \\
    & = [-4.4, 0.4] - 0.6*[-1.38, -1.2 ] \\
    & = [-3.572,  1.12 ]
\end{align*}
$$

In [None]:
def df_W(W):
    """Return the gradient vector ``[0.4 * w1, 4 * w2]`` evaluated at ``W``.

    Args:
        W: Weights ``[w1, w2]``; anything that broadcasts against a
            length-2 NumPy array.

    Returns:
        The element-wise product of the fixed coefficients ``[0.4, 4]``
        with ``W``.
    """
    scale = np.array([0.4, 4])
    return scale * W


def sgd_momentum(W, dW, V, lr, beta):
    """Apply one gradient-descent-with-momentum step.

    Args:
        W: Current weights.
        dW: Gradient evaluated at ``W``.
        V: Velocity (running average of gradients) from the previous step.
        lr: Learning rate.
        beta: Weight given to the previous velocity; the new gradient
            receives ``1 - beta``.

    Returns:
        A tuple ``(V_new, W_new)`` with the updated velocity and weights.
    """
    carried = beta * V
    fresh = (1 - beta) * dW
    velocity = carried + fresh
    # The step uses the freshly updated velocity, not the raw gradient.
    return velocity, W - lr * velocity


def train_p2(optimizer, lr, epochs):
    """Run ``epochs`` momentum-style steps starting from ``[-5, -2]``.

    Args:
        optimizer: Callable invoked as ``optimizer(W, dW, V, lr, beta)`` that
            returns ``(V_new, W_new)``.
        lr: Learning rate forwarded to ``optimizer``.
        epochs: Number of update steps to perform.

    Returns:
        A list holding the initial weights followed by the weights obtained
        after each step.
    """
    momentum = 0.5
    weights = np.array([-5, -2], dtype=np.float32)
    velocity = np.zeros_like(weights)  # velocity starts at zero
    history = [weights]
    for _ in range(epochs):
        grad = df_W(weights)
        velocity, weights = optimizer(weights, grad, velocity, lr, momentum)
        history.append(weights)
    return history


# Gradient descent with momentum: learning rate 0.6, 30 update steps.
train_p2(optimizer=sgd_momentum, lr=0.6, epochs=30)

[array([-5., -2.], dtype=float32),
 array([-4.4,  0.4]),
 array([-3.572,  1.12 ]),
 array([-2.72936,  0.136  ]),
 array([-1.9805168, -0.5192   ]),
 array([-1.36843318, -0.22376   ]),
 array([-0.89817939,  0.192472  ]),
 array([-0.55527097,  0.1696216 ]),
 array([-0.31718424, -0.04534952]),
 array([-0.16007877, -0.09841566]),
 array([-0.06231658, -0.00684994]),
 array([-0.0059575 ,  0.04715285]),
 array([0.02293694, 0.01757082]),
 array([ 0.03463173, -0.01830518]),
 array([ 0.03632332, -0.01427696]),
 array([0.03281031, 0.0048695 ]),
 array([0.02711657, 0.00859933]),
 array([0.02101571, 0.00014505]),
 array([ 0.0154434 , -0.00425615]),
 array([ 0.01080403, -0.00134937]),
 array([0.00718787, 0.00172326]),
 array([0.00451724, 0.00119166]),
 array([ 0.00263986, -0.00050413]),
 array([ 0.00138438, -0.00074707]),
 array([5.90519957e-04, 2.79448010e-05]),
 array([0.00012273, 0.00038192]),
 array([-0.0001259,  0.0001006]),
 array([-0.0002351 , -0.00016078]),
 array([-2.61492134e-04, -9.8535334

### With `RMSProp`

> \begin{align*}
> S_t & = \gamma S_{t-1} + (1-\gamma)dW_t^2 \\
> W_t & = W_{t-1} - \alpha*\dfrac{dW_t}{\sqrt{S_t+\epsilon}}
> \end{align*}

Initial values:
$$ \begin{align*}
W_0 & = [w_1, w_2] = [-5,-2]\\
S_0 & = [s_1, s_2] = [0, 0]\\
\alpha & = 0.3 \\
\gamma & = 0.9 \\
\epsilon & = 10^{-6}
\end{align*} $$

Epoch 1:
$$
\begin{align*}
t & \leftarrow 1\\
dW_1 & \leftarrow [0.4w_1, 4w_2] \\
   & = [0.4*(-5), 4*(-2)] \\
   & = [-2, -8] \\
\Rightarrow dW_1^2 & = dW_1 \odot dW_1 \\
                   & = [-2, -8] \odot [-2, -8] \\
                   & = [4, 64] \\
S_1 & \leftarrow \gamma S_{0}+(1-\gamma)dW_1^2\\
    & = 0.9*[0, 0] + (1-0.9)*[4, 64] \\
    & = [0.4, 6.4] \\
W_1 & \leftarrow W_0 - \alpha * \dfrac{dW_1}{\sqrt{S_1+\epsilon}} \\
    & = [-5,-2] - 0.3*[\dfrac{1}{\sqrt{0.4+10^{-6}}} * (-2), \dfrac{1}{\sqrt{6.4+10^{-6}}} * (-8)] \\
    & \approx [-4.0513, -1.0513]
\end{align*}
$$

Epoch 2:
$$
\begin{align*}
t & \leftarrow 2\\
dW_2 & \leftarrow [0.4w_1, 4w_2] \\
   & = [0.4*(-4.0513), 4*(-1.0513)] \\
   & \approx [-1.6205, -4.2052] \\
\Rightarrow dW_2^2 & = dW_2 \odot dW_2 \\
                   & = [-1.6205, -4.2052] \odot [-1.6205, -4.2052] \\
                   & \approx [2.6261, 17.6837] \\
S_2 & \leftarrow \gamma S_1+(1-\gamma)dW_2^2\\
    & = 0.9*[0.4, 6.4] + (1-0.9)*[2.6261, 17.6837] \\
    & \approx [0.6226, 7.5284] \\
W_2 & \leftarrow W_1 - \alpha * \dfrac{dW_2}{\sqrt{S_2+\epsilon}} \\
    & = [-4.0513, -1.0513] - 0.3*[\dfrac{1}{\sqrt{0.6226+10^{-6}}} * (-1.6205), \dfrac{1}{\sqrt{7.52837+10^{-6}}} * (-4.2052)] \\
    & \approx [-3.4352, -0.5915]
\end{align*}
$$

In [None]:
def df_W(W):
    """Return the gradient vector ``[0.4 * w1, 4 * w2]`` evaluated at ``W``.

    Args:
        W: Weights ``[w1, w2]``; anything that broadcasts against a
            length-2 NumPy array.

    Returns:
        The element-wise product of the fixed coefficients ``[0.4, 4]``
        with ``W``.
    """
    grad_coeffs = np.array([0.4, 4])
    return grad_coeffs * W


def rmsprop(W, dW, S, lr, gamma, epsilon):
    """Apply one RMSProp step.

    Args:
        W: Current weights.
        dW: Gradient evaluated at ``W``.
        S: Running average of squared gradients from the previous step.
        lr: Learning rate.
        gamma: Decay rate of the running average.
        epsilon: Small constant added under the square root.

    Returns:
        A tuple ``(S_new, W_new)`` with the updated running average and
        weights.
    """
    decayed = gamma * S
    squared_term = (1 - gamma) * dW * dW
    second_moment = decayed + squared_term
    # epsilon sits inside the square root, matching the formula used here.
    scale = np.sqrt(second_moment + epsilon)
    return second_moment, W - lr * dW / scale


def train_p3(optimizer, lr, epochs):
    """Run ``epochs`` RMSProp-style steps starting from ``[-5, -2]``.

    Args:
        optimizer: Callable invoked as
            ``optimizer(W, dW, S, lr, gamma, epsilon)`` that returns
            ``(S_new, W_new)``.
        lr: Learning rate forwarded to ``optimizer``.
        epochs: Number of update steps to perform.

    Returns:
        A list holding the initial weights followed by the weights obtained
        after each step.
    """
    decay, eps = 0.9, 1e-6
    weights = np.array([-5, -2], dtype=np.float32)
    sq_avg = np.zeros_like(weights)  # squared-gradient average starts at zero
    history = [weights]
    for _ in range(epochs):
        grad = df_W(weights)
        sq_avg, weights = optimizer(weights, grad, sq_avg, lr, decay, eps)
        history.append(weights)
    return history


# RMSProp: learning rate 0.3, 30 update steps.
train_p3(optimizer=rmsprop, lr=0.3, epochs=30)

[array([-5., -2.], dtype=float32),
 array([-4.05131789, -1.05131678]),
 array([-3.43519281, -0.59152343]),
 array([-2.95893161, -0.3294394 ]),
 array([-2.56545722, -0.17756482]),
 array([-2.22919966, -0.09163256]),
 array([-1.93626156, -0.04494499]),
 array([-1.67817088, -0.02081423]),
 array([-1.44934392, -0.00903559]),
 array([-1.24587616, -0.00364591]),
 array([-1.06489733, -0.00135351]),
 array([-9.04196768e-01, -4.56444431e-04]),
 array([-7.61991235e-01, -1.37562928e-04]),
 array([-6.36773507e-01, -3.62601019e-05]),
 array([-5.27210543e-01, -8.11337456e-06]),
 array([-4.32074135e-01, -1.47473412e-06]),
 array([-3.50194482e-01, -2.02783991e-07]),
 array([-2.80430983e-01, -1.84231187e-08]),
 array([-2.21656535e-01, -7.67742748e-10]),
 array([-1.72752581e-01,  7.80451998e-12]),
 array([-1.32612565e-01, -5.05794800e-13]),
 array([-1.00151561e-01,  6.19123501e-14]),
 array([-7.43198850e-02, -1.13373781e-14]),
 array([-5.41185515e-02,  2.80166702e-15]),
 array([-3.86146217e-02, -8.81341

### With `Adam`

> \begin{align*}
> V_t & = \beta_1V_{t-1} + (1-\beta_1)dW_t  \\
> S_t & = \beta_2 S_{t-1} + (1-\beta_2)dW_t^2 \\
> \hat{V}_t & = \dfrac{V_t}{1-\beta_1^t} \\
> \hat{S}_t & = \dfrac{S_t}{1-\beta_2^t} \\
> W_t & = W_{t-1} - \alpha*\dfrac{\hat{V}_t}{\sqrt{\hat{S}_t}+\epsilon}
> \end{align*}

Initial values:
$$ \begin{align*}
W_0 & = [w_1, w_2] = [-5,-2]\\
V_0 & = [v_1, v_2] = [0, 0]\\
S_0 & = [s_1, s_2] = [0, 0]\\
\alpha & = 0.2 \\
\beta_1 & = 0.9 \\
\beta_2 & = 0.999 \\
\epsilon & = 10^{-6}
\end{align*} $$

Epoch 1:
$$
\begin{align*}
t & \leftarrow 1\\
dW_1 & \leftarrow [0.4w_1, 4w_2] \\
   & = [0.4*(-5), 4*(-2)] \\
   & = [-2, -8] \\
\Rightarrow dW_1^2 & = dW_1 \odot dW_1 \\
                   & = [-2, -8] \odot [-2, -8] \\
                   & = [4, 64] \\
V_1 & \leftarrow \beta_1 V_{0}+(1-\beta_1)dW_1\\
    & = 0.9*[0, 0] + (1-0.9)*[-2, -8] \\
    & = [-0.2, -0.8] \\
\Rightarrow \hat{V}_1 & = \dfrac{V_1}{1-\beta_1^t} \\
    & = \left[\dfrac{-0.2}{1-0.9^1}, \dfrac{-0.8}{1-0.9^1}\right] \\
    & = [-2, -8] \\
S_1 & \leftarrow \beta_2 S_{0}+(1-\beta_2)dW_1^2\\
    & = 0.999*[0, 0] + (1-0.999)*[4, 64] \\
    & = [0.004, 0.064] \\
\Rightarrow \hat{S}_1 & \leftarrow \dfrac{S_1}{1-\beta_2^t} \\
    & = \left[\dfrac{0.004}{1-0.999^1}, \dfrac{0.064}{1-0.999^1}\right] \\
    & = [4, 64] \\
W_1 & \leftarrow W_0 - \alpha * \dfrac{\hat{V}_1}{\sqrt{\hat{S}_1}+\epsilon} \\
    & = [-5,-2] - 0.2*\left[\dfrac{1}{\sqrt{4}+10^{-6}} * (-2), \dfrac{1}{\sqrt{64}+10^{-6}} * (-8)\right] \\
    & \approx [-4.8, -1.8]
\end{align*}
$$

Epoch 2:
$$
\begin{align*}
t & \leftarrow 2\\
dW_2 & \leftarrow [0.4w_1, 4w_2] \\
   & = [0.4*(-4.8), 4*(-1.8)] \\
   & = [-1.92, -7.2] \\
\Rightarrow dW_2^2 & = dW_2 \odot dW_2 \\
                   & = [-1.92, -7.2] \odot [-1.92, -7.2] \\
                   & \approx [3.6864, 51.84] \\
V_2 & \leftarrow \beta_1 V_{1}+(1-\beta_1)dW_2\\
    & = 0.9*[-0.2, -0.8] + (1-0.9)*[-1.92, -7.2] \\
    & = [-0.372, -1.44] \\
\Rightarrow \hat{V}_2 & = \dfrac{V_2}{1-\beta_1^t} \\
    & = \left[\dfrac{-0.372}{1-0.9^2}, \dfrac{-1.44}{1-0.9^2}\right] \\
    & \approx [-1.9579, -7.579] \\
S_2 & \leftarrow \beta_2 S_{1}+(1-\beta_2)dW_2^2\\
    & = 0.999*[0.004, 0.064] + (1-0.999)*[3.6864, 51.84] \\
    & \approx [0.0077, 0.1158] \\
\Rightarrow \hat{S}_2 & \leftarrow \dfrac{S_2}{1-\beta_2^t} \\
    & = \left[\dfrac{0.0077}{1-0.999^2}, \dfrac{0.1158}{1-0.999^2}\right] \\
    & \approx [3.8431, 57.917] \\
W_2 & \leftarrow W_1 - \alpha * \dfrac{\hat{V}_2}{\sqrt{\hat{S}_2}+\epsilon} \\
    & = [-4.8, -1.8] - 0.2*\left[\dfrac{1}{\sqrt{3.8431}+10^{-6}} * (-1.9579), \dfrac{1}{\sqrt{57.917}+10^{-6}} * (-7.579)\right] \\
    & \approx [-4.6003, -1.6008]
\end{align*}
$$

In [None]:
def df_W(W):
    """Return the gradient vector ``[0.4 * w1, 4 * w2]`` evaluated at ``W``.

    Args:
        W: Weights ``[w1, w2]``; anything that broadcasts against a
            length-2 NumPy array.

    Returns:
        The element-wise product of the fixed coefficients ``[0.4, 4]``
        with ``W``.
    """
    per_axis = np.array([0.4, 4])
    return per_axis * W


def adam(W, dW, t, V, S, lr, gamma_1, gamma_2, epsilon):
    """Apply one Adam step with bias-corrected moment estimates.

    Args:
        W: Current weights.
        dW: Gradient evaluated at ``W``.
        t: Step counter used in the bias correction. The denominators
            ``1 - gamma ** t`` are zero for ``t == 0``, so callers pass
            ``t >= 1``.
        V: First-moment (gradient) running average from the previous step.
        S: Second-moment (squared-gradient) running average from the
            previous step.
        lr: Learning rate.
        gamma_1: Decay rate of the first moment.
        gamma_2: Decay rate of the second moment.
        epsilon: Small constant added to the square root in the denominator.

    Returns:
        A tuple ``(V_new, S_new, W_new)``; the returned moments are the
        uncorrected running averages.
    """
    first_moment = gamma_1 * V + (1 - gamma_1) * dW
    second_moment = gamma_2 * S + (1 - gamma_2) * dW * dW

    # Bias correction compensates for the zero-initialised moments.
    first_hat = first_moment / (1 - gamma_1 ** t)
    second_hat = second_moment / (1 - gamma_2 ** t)

    # epsilon is added outside the square root here.
    denominator = np.sqrt(second_hat) + epsilon
    return first_moment, second_moment, W - lr * first_hat / denominator


def train_p4(optimizer, lr, epochs):
    """Run ``epochs`` Adam-style steps starting from ``[-5, -2]``.

    Args:
        optimizer: Callable invoked as
            ``optimizer(W, dW, t, V, S, lr, gamma_1, gamma_2, epsilon)`` that
            returns ``(V_new, S_new, W_new)``.
        lr: Learning rate forwarded to ``optimizer``.
        epochs: Number of update steps to perform.

    Returns:
        A list holding the initial weights followed by the weights obtained
        after each step.
    """
    decay_1, decay_2, eps = 0.9, 0.999, 1e-6
    weights = np.array([-5, -2], dtype=np.float32)
    first_moment = np.zeros_like(weights)
    second_moment = np.zeros_like(weights)
    history = [weights]
    # The step counter handed to the optimizer is 1-based.
    for step in range(1, epochs + 1):
        grad = df_W(weights)
        first_moment, second_moment, weights = optimizer(
            weights, grad, step, first_moment, second_moment,
            lr, decay_1, decay_2, eps,
        )
        history.append(weights)
    return history


# Adam: learning rate 0.2, 30 update steps.
train_p4(optimizer=adam, lr=0.2, epochs=30)

[array([-5., -2.], dtype=float32),
 array([-4.8000001 , -1.80000002]),
 array([-4.60025458, -1.60082451]),
 array([-4.40094817, -1.40317262]),
 array([-4.20227723, -1.20787822]),
 array([-4.00444981, -1.01592745]),
 array([-3.80768576, -0.82847307]),
 array([-3.61221659, -0.64684159]),
 array([-3.41828539, -0.47252765]),
 array([-3.22614645, -0.30716934]),
 array([-3.03606486, -0.15249855]),
 array([-2.8483159 , -0.01026326]),
 array([-2.66318417,  0.11787552]),
 array([-2.48096263,  0.23046161]),
 array([-2.30195133,  0.3263587 ]),
 array([-2.12645586,  0.40484195]),
 array([-1.95478566,  0.46564961]),
 array([-1.78725195,  0.50898799]),
 array([-1.62416544,  0.53549442]),
 array([-1.46583375,  0.54617144]),
 array([-1.3125587 ,  0.54230812]),
 array([-1.16463323,  0.52540206]),
 array([-1.02233827,  0.4970906 ]),
 array([-0.8859395,  0.4590951]),
 array([-0.755684  ,  0.41317781]),
 array([-0.631797 ,  0.3611089]),
 array([-0.51447868,  0.30464048]),
 array([-0.40390125,  0.24548409]