In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Fix NumPy's global random seed so that every random draw made through
# np.random further down is identical from one run to the next
random_seed = 42
np.random.seed(random_seed)

# **Contours and gradients: a quick glance**

In [None]:
def f(x1, x2):
    """Return x1**2 + x2**2 (the squared Euclidean norm of the point (x1, x2))."""
    squared_x1 = x1**2
    squared_x2 = x2**2
    return squared_x1 + squared_x2

def grad_f(x1, x2):
    """Return the gradient of f at (x1, x2) as the NumPy array [2*x1, 2*x2]."""
    partial_x1 = 2 * x1
    partial_x2 = 2 * x2
    return np.array([partial_x1, partial_x2])

# Sample each axis with 100 evenly spaced values on the interval [-5, 5]
x1_range = np.linspace(start=-5, stop=5, num=100)
x2_range = np.linspace(start=-5, stop=5, num=100)

# Build the two 2-D coordinate arrays spanned by the axis samples
X1, X2 = np.meshgrid(x1_range, x2_range)

# Evaluate f at every point of the grid (these values feed the contour plot)
Z = f(X1, X2)

In [None]:
# Read the evaluation point from the user; each prompt expects a number
x1 = float(input("Enter the x1 coordinate of the point: "))
x2 = float(input("Enter the x2 coordinate of the point: "))

# Gradient of f at the chosen point
gradient = grad_f(x1, x2)

# Draw everything on one explicit pair of figure/axes
fig, ax = plt.subplots()

# Contour lines of f over the precomputed grid
ax.contour(X1, X2, Z, levels=25)
ax.set_xlabel('x1')
ax.set_ylabel('x2')

# Gradient as a red arrow anchored at (x1, x2); the 'xy' settings with scale=1
# draw the arrow in data units, so its length equals the gradient's length
ax.quiver(x1, x2, gradient[0], gradient[1], color='red', angles='xy', scale_units='xy', scale=1)

# Title, grid and equal axis scaling, then render the figure
ax.set_title('Contours of f(x1, x2) and Gradient at the Point')
ax.grid(True)
ax.axis('equal')
plt.show()

## **(Mini-batch) Stochastic Gradient Descent**
Let us implement a (mini-batch) stochastic gradient descent algorithm for the linear regression model

\begin{align}
f_{\boldsymbol{\theta}}(x) = \theta_0+\theta_1 x.
\end{align}

We will simulate a few data points, compute the empirical risk and minimize it using mini-batch SGD. We will investigate how the choice of the initial point of the algorithm, the learning rate, and the mini-batch size affects its results.

In [None]:
# Number of simulated observations; x and epsilon both use it so their lengths always agree
num_samples = 100

# Generate num_samples data points x from Unif(-5, 5)
x = np.random.uniform(-5, 5, num_samples)

# Compute the array y = x + epsilon, where epsilon has distribution N(0, 1)
epsilon = np.random.normal(0, 1, num_samples)
y = x + epsilon

def linear_model(x, theta0, theta1):
    """Evaluate the line with intercept theta0 and slope theta1 at x."""
    slope_term = theta1 * x
    return theta0 + slope_term

In [None]:
# Scatter the generated sample and overlay a hand-picked "trend line"
fig, ax = plt.subplots()
ax.scatter(x, y)

# intercept and slope of the line -- change these to try other candidates
theta0, theta1 = 0, 1
x_line = np.arange(-5, 6)   # integer abscissae -5, -4, ..., 5
y_line = theta0 + theta1 * x_line

# solid blue line of width 2 drawn on top of the points
ax.plot(x_line, y_line, 'b-', lw=2)
plt.show()

## Implementing the mini-batch SGD algorithm

In [None]:
def predict(x, theta0, theta1):
    """Prediction of the linear regression model: theta0 + theta1 * x."""
    intercept, slope = theta0, theta1
    return intercept + slope * x

def compute_gradients(x_batch, y_batch, theta0, theta1):
    """Return the mini-batch gradient with respect to (theta0, theta1).

    The residuals are the model predictions minus the targets. The first
    returned value is the batch mean of the residuals, the second the batch
    mean of residual * x. These are the partial derivatives of the batch
    average of 0.5 * (prediction - y)**2 (squared loss with a factor 1/2).
    """
    # residual of every observation in the batch
    residuals = predict(x_batch, theta0, theta1) - y_batch

    # partial derivatives w.r.t. the intercept and the slope
    return np.mean(residuals), np.mean(residuals * x_batch)

In [None]:
def mini_batch_sgd(start_theta0, start_theta1, x, y, learning_rate, num_iterations, batch_size):
    """Run mini-batch stochastic gradient descent on (theta0, theta1).

    Parameters
    ----------
    start_theta0, start_theta1 : initial values of the two parameters.
    x, y : NumPy arrays of inputs and targets, indexed along their first axis.
    learning_rate : step size multiplying the gradient in every update.
    num_iterations : number of SGD updates to perform.
    batch_size : number of observations drawn without replacement for each
        update; must lie between 1 and x.shape[0].

    Returns
    -------
    (theta0_values, theta1_values) : two lists of length num_iterations + 1
        holding the start value followed by the value after each update.

    Raises
    ------
    ValueError
        If x and y differ in length or batch_size is outside 1..x.shape[0].
    """
    num_points = x.shape[0]

    # fail early with a clear message instead of sampling from mismatched arrays
    if y.shape[0] != num_points:
        raise ValueError(
            f"x and y must have the same length, got {num_points} and {y.shape[0]}"
        )
    # an empty batch would silently turn the parameters into NaN
    if not 1 <= batch_size <= num_points:
        raise ValueError(
            f"batch_size must be between 1 and {num_points}, got {batch_size}"
        )

    # initialization of theta0, theta1
    theta0 = start_theta0
    theta1 = start_theta1

    # in these lists we collect the theta's for all SGD iterations/steps
    theta0_values, theta1_values = [theta0], [theta1]

    for _ in range(num_iterations):

        # mini-batch selection: an int population draws from arange(num_points),
        # which consumes the random stream exactly like passing range(num_points)
        batch_indices = np.random.choice(num_points, batch_size, replace=False)
        x_batch = x[batch_indices]
        y_batch = y[batch_indices]

        # gradient computation on the mini-batch
        grad_theta0, grad_theta1 = compute_gradients(x_batch, y_batch, theta0, theta1)

        # SGD update; rebinding (rather than `-=`) never mutates the caller's start
        # values in place and keeps every stored history entry a distinct object
        theta0 = theta0 - learning_rate * grad_theta0
        theta1 = theta1 - learning_rate * grad_theta1

        # store the updated values
        theta0_values.append(theta0)
        theta1_values.append(theta1)

    return theta0_values, theta1_values

In [None]:
# Try out the SGD function on the simulated data

# starting values drawn with np.random.rand (you can also specify your own values)
start_theta0 = np.random.rand()
start_theta1 = np.random.rand()

# remaining settings of the run
learning_rate, num_iterations = 0.01, 100
batch_size = 90       # maximum size = number sampled data points

# SGD: collect the full sequence of parameter values (start value included)
theta0_seq, theta1_seq = mini_batch_sgd(
    start_theta0=start_theta0,
    start_theta1=start_theta1,
    x=x,
    y=y,
    learning_rate=learning_rate,
    num_iterations=num_iterations,
    batch_size=batch_size,
)

# SGD: summary -- first entry is the initial value, last entry the final estimate
print('SGD: initialized theta0:', theta0_seq[0])
print('SGD: initialized theta1:', theta1_seq[0])
print('-------')
print('SGD: estimate of theta0:', theta0_seq[-1])
print('SGD: estimate of theta1:', theta1_seq[-1])

In [None]:
# Time series of the theta1 values produced by mini-batch SGD, one point per
# entry of theta1_seq (index 0 is the starting value)
fig, ax = plt.subplots()
ax.plot(theta1_seq, label='Mini-batch SGD output sequence')

# axis labels, title, legend and grid, then render the figure
ax.set_xlabel('iteration')
ax.set_ylabel('theta')
ax.set_title('SGD - Results')
ax.legend()
ax.grid(True)
plt.show()