In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker, cm, colors

# Problem 4: Rosenbrock's banana

Let us consider **Rosenbrock's banana function**

$$
f(x,y) = (x-1)^2 + 10(y-x^2)^2
$$

In [None]:
# the Rosenbrock function
def ros(x, y):
    """Evaluate Rosenbrock's banana function f(x, y) = (x-1)^2 + 10(y-x^2)^2.

    Parameters
    ----------
    x, y : scalars or NumPy arrays
        Coordinates at which to evaluate f. Only arithmetic operators are
        used, so array inputs (e.g. a meshgrid) are handled elementwise.

    Returns
    -------
    Same kind as the inputs: the value(s) of f. The value is 0 at (1, 1),
    since both squared terms vanish there.
    """
    return (x - 1)**2 + 10 * (y - x**2)**2

In [None]:
# Run this cell to draw the Rosenbrock function as a 3D surface.

# Kept from the original cell; presumably needed so that projection='3d'
# is available on older Matplotlib releases -- verify before removing.
from mpl_toolkits import mplot3d

# Sample the function on a 1000 x 1000 grid over [-2, 2] x [-1, 3].
# X, Y and Z are reused by the contour plots further down the notebook.
x = np.linspace(-2, 2, num=1000)
y = np.linspace(-1, 3, num=1000)
X, Y = np.meshgrid(x, y)
Z = ros(X, Y)

# Create the figure with a single 3D axes.
fig = plt.figure(figsize=(12, 7))
ax = fig.add_subplot(111, projection='3d')

# Draw the surface; the z-axis is clipped to [0, 200].
surf = ax.plot_surface(
    X, Y, Z,
    cmap=cm.gist_heat_r,
    linewidth=0,
    antialiased=False,
)
ax.set_zlim(0, 200)
fig.colorbar(surf, shrink=0.5, aspect=10)
plt.show()

The minimum value of the Rosenbrock function is zero.
That minimum is reached at the point $(1,1)$.
This minimum lies inside a long, narrow, banana-shaped flat valley.

In [None]:
# Run this cell to draw the contour lines of the Rosenbrock function.
# X, Y, Z are the grid and function values computed in the surface-plot cell.
plt.figure(figsize=(12, 5))
plt.contour(X, Y, Z, levels=200, cmap=cm.coolwarm)

# Mark the minimum at (1, 1) with a black dot and put its label just below.
plt.plot(1, 1, 'ko')
plt.annotate('minimum', xy=(1, 1), xytext=(1, 1 - 0.2), fontsize=15)

plt.colorbar()
plt.title('Contour plot of the Rosenbrock function')

Find the **gradient** of the Rosenbrock function

$$
\nabla f(x,y) = 
\begin{bmatrix}
\frac{\partial f}{\partial x} \\ \frac{\partial f}{\partial y}
\end{bmatrix}
$$

In [None]:
# gradient of the Rosenbrock function
def grad_ros(x, y):
    """Gradient of f(x, y) = (x-1)^2 + 10(y-x^2)^2.

    Differentiating term by term gives

        df/dx = 2(x - 1) - 40 x (y - x^2)
        df/dy = 20 (y - x^2)

    Parameters
    ----------
    x, y : scalars
        Point at which the gradient is evaluated.

    Returns
    -------
    numpy.ndarray
        The vector [df/dx, df/dy]. It is returned as an array so that it can
        be scaled by the learning rate and subtracted from the current
        iterate directly.
    """
    residual = y - x**2  # shared by both partial derivatives
    df_dx = 2 * (x - 1) - 40 * x * residual
    df_dy = 20 * residual
    return np.array([df_dx, df_dy])

Starting at the point $(x,y)=(-1.25,2.75)$ and using a learning rate $s=0.01$, how many iterations (epochs) does Gradient Descent need to find the minimum?

In [None]:
# this cell runs Gradient Descent
# Requires X, Y, Z (grid from the surface-plot cell) and grad_ros.

plt.figure(figsize=(12,5))
plt.contour(X,Y,Z,cmap=cm.coolwarm, levels=250)
plt.scatter(1,1,color='black')
plt.annotate('minimum', xy = (1,1),xytext=(1,1-0.2), fontsize=15)
plt.colorbar()
plt.title('Gradient Descent',fontsize = 20)

s = 0.01 #learning rate

x = np.array([-1.25,2.75]) # initial point

######################################
# Set number of epochs
# NOTE(review): placeholder so the cell runs on a fresh kernel; it is not a
# verified answer. Adjust it until the last iterate reaches the minimum (1,1).
epochs = 2000
######################################

# Record every iterate (starting point included) and draw the whole path
# once after the loop, instead of creating two new artists per epoch.
gd_steps = [x]
for epoch in range(epochs):
    x = x - s*grad_ros(x[0],x[1])  # plain gradient-descent update
    gd_steps.append(x)

gd_path = np.array(gd_steps)  # shape (epochs + 1, 2)
plt.plot(gd_path[:,0],gd_path[:,1],'r--')
plt.scatter(gd_path[:,0],gd_path[:,1],color='blue')

Add momentum to the Gradient Descent method. 
How many epochs does Gradient Descent with Momentum need to find the minimum?

In [None]:
# This cell runs Gradient Descent with Momentum.
# Requires X, Y, Z (grid from the surface-plot cell) and grad_ros.

plt.figure(figsize=(12,5))
plt.contour(X,Y,Z,cmap=cm.coolwarm, levels=250)
plt.scatter(1,1,color='black')
plt.annotate('minimum', xy = (1,1),xytext=(1,1-0.2), fontsize=15)
plt.colorbar()
plt.title('Gradient Descent with Momentum',fontsize = 20)

# learning rate
s = 0.01

######################################
# set momentum coefficient
# NOTE(review): placeholder so the cell runs; tune as part of the exercise.
beta  = 0.9
######################################

x = np.array([-1.25,2.75]) # initial point

# There is no previous step before the first epoch, so start from a zero
# step; the first update is then a plain gradient-descent step.
d_previous = np.zeros_like(x)

######################################
# set number of epochs
# NOTE(review): placeholder so the cell runs on a fresh kernel; it is not a
# verified answer. Adjust it until the last iterate reaches the minimum (1,1).
epochs = 200
######################################

# Record every iterate (starting point included) and draw the whole path
# once after the loop, instead of creating two new artists per epoch.
momentum_steps = [x]
for epoch in range(epochs):
    # New step = gradient step + beta times the previous step.
    d_current = -s*grad_ros(x[0],x[1]) + beta*d_previous
    x = x + d_current
    d_previous = d_current
    momentum_steps.append(x)

momentum_path = np.array(momentum_steps)  # shape (epochs + 1, 2)
plt.plot(momentum_path[:,0],momentum_path[:,1],'r--')
plt.scatter(momentum_path[:,0],momentum_path[:,1],color='blue')