In [1]:
import numpy as np
from typing import Optional, Callable, Tuple

In [2]:
class NesterovOptimizer:
    """Gradient descent with Nesterov (look-ahead) momentum.

    Each step evaluates the gradient at the look-ahead point
    ``x + momentum * velocity`` instead of at ``x`` itself, then updates
    the velocity and the iterate::

        velocity <- momentum * velocity - learning_rate * grad(x_ahead)
        x        <- x + velocity

    Attributes:
        learning_rate: Step size applied to the gradient.
        momentum: Factor applied to the previous velocity.
        max_iters: Maximum number of steps performed by ``optimize``.
        tol: ``optimize`` stops once the gradient norm falls below this.
        velocity: Current velocity, or ``None`` before the first step.
        iter_count: Number of steps taken since construction or the last
            ``reset``.
    """

    def __init__(
        self,
        learning_rate: float = 0.01,
        momentum: float = 0.9,
        max_iters: int = 100,
        tol: float = 1e-6,
    ):
        """Store the hyperparameters and start with no accumulated state."""
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.max_iters = max_iters
        self.tol = tol

        self.velocity: Optional[np.ndarray] = None
        self.iter_count: int = 0

    def reset(self) -> None:
        """Discard the accumulated velocity and the step counter."""
        self.velocity = None
        self.iter_count = 0

    def step(
        self,
        grad_func: Callable[[np.ndarray], np.ndarray],
        x: np.ndarray,
    ) -> Tuple[np.ndarray, float]:
        """Perform a single Nesterov momentum update.

        Args:
            grad_func: Callable returning the gradient at a given point.
            x: Current iterate; it is not modified.

        Returns:
            A tuple ``(x_new, grad_norm)`` where ``x_new`` is the updated
            iterate and ``grad_norm`` is the norm of the gradient evaluated
            at the look-ahead point (not an objective value).
        """
        # (Re)initialise the velocity when there is none yet, or when it was
        # left over from a problem of a different shape; otherwise a stale
        # buffer would be silently broadcast against the new iterate.
        if self.velocity is None or np.shape(self.velocity) != np.shape(x):
            self.velocity = np.zeros_like(x)

        # Look ahead along the current velocity before taking the gradient.
        x_ahead = x + self.momentum * self.velocity

        grad = grad_func(x_ahead)

        self.velocity = self.momentum * self.velocity - self.learning_rate * grad

        x_new = x + self.velocity

        # Convergence measure: gradient norm at the look-ahead point.
        loss = np.linalg.norm(grad)

        self.iter_count += 1

        return x_new, loss

    def optimize(
        self,
        grad_func: Callable[[np.ndarray], np.ndarray],
        x0: np.ndarray,
    ) -> Tuple[np.ndarray, list]:
        """Run up to ``max_iters`` steps starting from ``x0``.

        Every call starts from a clean state, so repeated calls on the same
        instance (for example when re-running a notebook cell) give the same
        result for the same inputs.

        Args:
            grad_func: Callable returning the gradient at a given point.
            x0: Starting point; it is copied and left untouched.

        Returns:
            A tuple ``(x, history)`` where ``x`` is the final iterate and
            ``history`` lists the gradient norm recorded at each step.
        """
        # Drop velocity / counter left over from any previous run.
        self.reset()

        x = x0.copy()
        loss_history = []

        for _ in range(self.max_iters):
            x, loss = self.step(grad_func, x)
            loss_history.append(loss)

            # Stop as soon as the gradient norm is below the tolerance.
            if loss < self.tol:
                break

        return x, loss_history

In [3]:
def quadratic_loss(x):
    """Return the squared distance of ``x`` from 3 (element-wise for arrays)."""
    offset = x - 3
    return offset ** 2

def quadratic_grad(x):
    """Return ``2 * (x - 3)``, the derivative of ``(x - 3)**2`` with respect to ``x``."""
    offset = x - 3
    return 2 * offset

In [4]:
# Build the optimizer; max_iters and tol keep their constructor defaults.
optimizer = NesterovOptimizer(
    learning_rate=0.1,
    momentum=0.9,
)

In [5]:
# Starting point: a one-element float array holding 0.0.
x0 = np.zeros(1)

In [6]:
# Run the optimizer from x0 using the gradient of the quadratic.
x_opt, loss_history = optimizer.optimize(grad_func=quadratic_grad, x0=x0)

In [7]:
# Show the final iterate returned by optimizer.optimize().
print('X opt:', x_opt)

X opt: [2.99999962]


In [8]:
# Evaluate quadratic_loss at the returned point.
print('Min val:', quadratic_loss(x_opt))

Min val: [1.42710532e-13]
