# Week 4
# day3: 31 Aug 2022

4. Learn about Gradient Descent and its below variants:

    Momentum

    Nesterov

    Adagrad

    RMSProp

    Adam
5. Implement all the above in Numpy
6. How does the "Exponential weighted average" lecture given in the readings   relate to some of the variants of Gradient Descent?

In [1]:
import numpy as np

In [2]:
x = np.array([1.0, 3.0, 7.0, 2.0, 5.0, 4.0])
y = np.array([4.0, 9.0, 20.0, 7.0, 15.0, 11.0])


def linear_regression(x, m, b):
    yhat = x * m + b
    return yhat

## RMSprop

In [3]:
# RMSProp
class RMSProp:
    def __init__(self):
        self.learning_rate = 0.1
        self.w = 9
        self.b = 0
        self.update_w = 0
        self.update_b = 0
        self.epsilon = 1e-6
        self.beta = 0.9

    def update_weight(self, x, y):
        yhat = self.w * x + self.b
        self.update_w = self.beta * self.update_w + (1 - self.beta) * (
            (-2 * sum(x * (y - yhat)).mean()) ** 2
        )
        self.update_b = self.beta * self.update_b + (1 - self.beta) * (
            (-2 * sum(y - yhat).mean()) ** 2
        )
        self.w -= (self.learning_rate / np.sqrt(self.update_w + self.epsilon)) * (
            -2 * sum(x * (y - yhat)).mean()
        )
        self.b -= (self.learning_rate / np.sqrt(self.update_b + self.epsilon)) * (
            -2 * sum(y - yhat).mean()
        )

    def MSE(self, y, yhat):
        return np.square(np.subtract(y, yhat)).mean()

    def fit(self, x, y, epochs=2000):
        history = []
        for e in range(epochs):
            self.update_weight(x, y)
            loss = self.MSE(y, (self.w * x + self.b))
            if e % 100 == 0:
                print(f"Epoch: {e}, Loss: {loss}")
                print(f"weight:{self.w},bias:{self.b}")
            history.append(loss)
            if loss <= 1:
                print(f"Epoch: {e}, Loss: {loss}")
                print(f"weight:{self.w},bias:{self.b}")
                return history
        return history

In [4]:
rms = RMSProp()
history = rms.fit(x, y)
print("Prediction:")
print(rms.w, rms.b)
print("yhat:", linear_regression(x, rms.w, rms.b))

Epoch: 0, Loss: 562.7006622555053
weight:8.683772233984152,bias:-0.31622776599415175
Epoch: 100, Loss: 2.324351041920164
weight:3.325713812214073,bias:-1.807915773506083
Epoch: 116, Loss: 0.9664660797786159
weight:3.0531544913639608,bias:-0.5783922218373122
Prediction:
3.0531544913639608 -0.5783922218373122
yhat: [ 2.47476227  8.58107125 20.79368922  5.52791676 14.68738023 11.63422574]


In [5]:
# Adam
class Adam:
    def __init__(self):
        self.learning_rate = 0.1
        self.w = 9
        self.b = 0
        self.update_m_w = 0
        self.update_m_b = 0
        self.update_v_w = 0
        self.update_v_b = 0
        self.epsilon = 1e-8
        self.beta1 = 0.9
        self.beta2 = 0.99

    def update_weight(self, x, y, e):
        yhat = self.w * x + self.b
        self.update_m_w = self.beta1 * self.update_m_w + (1 - self.beta1) * (
            -2 * sum(x * (y - yhat)).mean()
        )
        self.update_m_b = self.beta1 * self.update_m_b + (1 - self.beta1) * (
            -2 * sum(y - yhat).mean()
        )
        self.update_v_w = self.beta2 * self.update_v_w + (1 - self.beta2) * (
            (-2 * sum(x * (y - yhat)).mean()) ** 2
        )
        self.update_v_b = self.beta2 * self.update_v_b + (1 - self.beta2) * (
            (-2 * sum(y - yhat).mean()) ** 2
        )
        self.update_m_w = self.update_m_w / (1 - self.beta1**e + 1)
        self.update_m_b = self.update_m_b / (1 - self.beta1**e + 1)
        self.update_v_w = self.update_v_w / (1 - self.beta2**e + 1)
        self.update_v_b = self.update_v_b / (1 - self.beta2**e + 1)
        self.w -= (self.learning_rate / np.sqrt(self.update_v_w + self.epsilon)) * (
            self.update_m_w
        )
        self.b -= (self.learning_rate / np.sqrt(self.update_v_b + self.epsilon)) * (
            self.update_m_b
        )

    def MSE(self, y, yhat):
        return np.square(np.subtract(y, yhat)).mean()

    def fit(self, x, y, epochs=2000):
        history = []
        for e in range(epochs):
            self.update_weight(x, y, e)
            loss = self.MSE(y, (self.w * x + self.b))
            if e % 100 == 0:
                print(f"Epoch: {e}, Loss: {loss}")
                print(f"weight:{self.w},bias:{self.b}")
            history.append(loss)
            if loss <= 1:
                print(f"Epoch: {e}, Loss: {loss}")
                print(f"weight:{self.w},bias:{self.b}")
                return history
        return history

In [6]:
adam = Adam()
history = adam.fit(x, y)
print("Prediction:")
print(adam.w, adam.b)
print("yhat:", linear_regression(x, adam.w, adam.b))

Epoch: 0, Loss: 615.4566666667038
weight:8.90000000000003,bias:-0.09999999999928254
Epoch: 100, Loss: 5.544956832672793
weight:3.6919044593921315,bias:-3.619330884227773
Epoch: 140, Loss: 0.9998795863272743
weight:3.049258941689907,bias:-0.6192726745819843
Prediction:
3.049258941689907 -0.6192726745819843
yhat: [ 2.42998627  8.52850415 20.72553992  5.47924521 14.62702203 11.57776309]


### 6. How does the "Exponential weighted average" lecture given in the readings   relate to some of the variants of Gradient Descent?

    Weight updates are guided by the previous gradients.The most recent gradient provide more information so the older the gradient the lesser the weight should be given thus exponential weightd average is used 