In [4]:
import numpy as np

In [None]:
class Adadelta:
    def __init__(self, rho=0.95, epsilon=1e-6, num_iterations=1000):
        self.rho = rho
        self.epsilon = epsilon
        self.num_iterations = num_iterations

    def loss_function(self, X, y):
        m = len(y)
        predictions = self.theta * X
        return (1 / m) * np.sum((predictions - y) ** 2)

    def grad(self, X, y):
        return 2 / len(X) * X.T.dot(X.dot(self.theta) - y)    

    def fit(self, X, y):
        X = np.c_[np.ones((X.shape[0], 1)), X]

        self.theta = np.zeros(X.shape[1])

        Eg2 = np.zeros(X.shape[1])
        Edx2 = np.zeros(X.shape[1])

        for iteration in range(1, self.num_iterations + 1):
            gradients = self.grad(X, y)

            Eg2 = self.rho * Eg2 + (1 - self.rho) * gradients**2

            dx = -np.sqrt(Edx2 + self.epsilon) / np.sqrt(Eg2 + self.epsilon) * gradients
            Edx2 = self.rho * Edx2 + (1 - self.rho) * dx**2

            self.theta += dx

            if iteration % 100 == 0:
                loss = self.loss_function(X, y)

    def predict(self, X):
        X = np.c_[np.ones((X.shape[0], 1)), X]
        return X.dot(self.theta)

In [35]:
class RAdam:
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, eps=1e-8, weight_decay=0, num_iterations=1000):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.weight_decay = weight_decay
        self.num_iterations = num_iterations

    def loss_function(self, X, y):
        return np.mean((X.dot(self.theta) - y) ** 2)

    def grad(self, X, y):
        return 2 / len(X) * X.T.dot(X.dot(self.theta) - y)    

    def fit(self, X, y):
        X = np.c_[np.ones((X.shape[0], 1)), X]

        self.theta = np.zeros(X.shape[1])
        self.m = np.zeros(X.shape[1])
        self.v = np.zeros(X.shape[1])
        self.t = 0

        for iteration in range(1, self.num_iterations + 1):
            gradients = self.grad(X, y)

            self.t += 1
            self.m = self.beta1 * self.m + (1 - self.beta1) * gradients
            self.v = self.beta2 * self.v + (1 - self.beta2) * gradients**2

            m_hat = self.m / (1 - self.beta1**self.t)
            v_hat = self.v / (1 - self.beta2**self.t)

            rho_inf = 2 / (1 - self.beta2) - 1
            rho_t = rho_inf - 2 * self.t * (self.beta2**self.t) / (1 - self.beta2**self.t)

            if rho_t > 4:
                r_t = np.sqrt((rho_t - 4) * (rho_t - 2) * rho_inf / ((rho_inf - 4) * (rho_inf - 2) * rho_t))
                lr_t = self.learning_rate * r_t / (np.sqrt(v_hat) + self.eps)
            else:
                lr_t = self.learning_rate / (np.sqrt(v_hat) + self.eps)

            self.theta -= lr_t * m_hat

            if iteration % 100 == 0:
                loss = self.loss_function(X, y)

    def predict(self, X):
        X = np.c_[np.ones((X.shape[0], 1)), X]
        return X.dot(self.theta)

In [36]:
X = np.array([[1], [1]])
y = np.array([2, 4])

In [37]:
radam = RAdam(learning_rate=0.001, num_iterations=1)
radam.fit(X, y)

In [38]:
predictions = radam.predict(X)
print(f"Predictions: {predictions}")

Predictions: [0.002 0.002]


In [39]:
final_loss = radam.loss_function(np.c_[np.ones((X.shape[0], 1)), X], y)
print(f"Final Loss: {final_loss}")
print(f"Final Weights (Theta): {radam.theta}")

Final Loss: 9.988004000019988
Final Weights (Theta): [0.001 0.001]
