# Lecture 8: Vector Calculus Preview for Machine Learning

[![Watch the Video](https://img.shields.io/badge/Watch%20on%20YouTube-FF0000?style=for-the-badge&logo=youtube&logoColor=white)](https://youtube.com/your-channel)

This lecture provides a preview of vector calculus concepts that are essential for understanding optimization in machine learning.

## Learning Objectives
- Understand derivatives in multiple dimensions
- Learn about gradients and their geometric interpretation
- Preview optimization concepts
- Connect vector calculus to machine learning

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and newer releases no longer accept the old names, so try the current name
# first and fall back to the legacy one on older installations.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
%matplotlib inline

def plot_gradient_field(f, grad_f, x_range, y_range, points=20):
    """Show a function's surface next to its normalized gradient field.

    Args:
        f: Callable ``f(X, Y)`` evaluated on the meshgrid arrays.
        grad_f: Callable ``grad_f(X, Y)`` returning the two gradient
            components evaluated on the same grid.
        x_range: Indexable pair; ``x_range[0]`` and ``x_range[1]`` bound the
            sampled x axis.
        y_range: Indexable pair; ``y_range[0]`` and ``y_range[1]`` bound the
            sampled y axis.
        points: Number of samples taken along each axis.

    Returns:
        ``(fig, (ax1, ax2))``: the figure, the 3D surface axes and the 2D
        quiver axes.
    """
    # Sample both axes and build the evaluation grid
    xs = np.linspace(x_range[0], x_range[1], points)
    ys = np.linspace(y_range[0], y_range[1], points)
    X, Y = np.meshgrid(xs, ys)

    heights = f(X, Y)
    grad_x, grad_y = grad_f(X, Y)

    # Rescale every arrow to (almost) unit length so the plot shows direction
    # only; the small epsilon avoids dividing by zero where the gradient
    # vanishes.
    scale = np.sqrt(grad_x**2 + grad_y**2) + 1e-6
    grad_x = grad_x / scale
    grad_y = grad_y / scale

    fig = plt.figure(figsize=(15, 5))

    # Left panel: the function as a 3D surface
    ax1 = fig.add_subplot(121, projection='3d')
    ax1.plot_surface(X, Y, heights, cmap='viridis')
    ax1.set_xlabel('x')
    ax1.set_ylabel('y')
    ax1.set_zlabel('f(x,y)')
    ax1.set_title('Function Surface')

    # Right panel: gradient directions as arrows
    ax2 = fig.add_subplot(122)
    ax2.quiver(X, Y, grad_x, grad_y)
    ax2.set_xlabel('x')
    ax2.set_ylabel('y')
    ax2.set_title('Gradient Field')

    plt.tight_layout()
    return fig, (ax1, ax2)

## 1. From Single to Multiple Variables

In machine learning, we often work with functions of many variables. The transition from single-variable to multi-variable calculus introduces new concepts:

1. Partial derivatives
2. Gradients
3. Directional derivatives
4. Chain rule in multiple dimensions

In [None]:
# Example: Simple quadratic function
def f(x, y):
    """Return x^2 + y^2; works elementwise on NumPy arrays as well as scalars."""
    x_squared = x**2
    y_squared = y**2
    return x_squared + y_squared

def grad_f(x, y):
    """Return the gradient of x^2 + y^2 as the tuple (∂f/∂x, ∂f/∂y)."""
    df_dx = 2*x
    df_dy = 2*y
    return df_dx, df_dy

# Visualize function and its gradients
x_range = (-2, 2)
y_range = (-2, 2)

plot_gradient_field(f, grad_f, x_range, y_range)
plt.show()

# Calculate partial derivatives at a point.
# Use grad_f itself rather than re-typing the derivative formula, so the
# printed values always agree with the gradient field plotted above.
point = (1, 1)
df_dx, df_dy = grad_f(*point)
print(f"At point {point}:")
print(f"∂f/∂x = {df_dx}")
print(f"∂f/∂y = {df_dy}")

## 2. The Gradient

The gradient is a vector of partial derivatives:

$\nabla f = \begin{bmatrix} 
\frac{\partial f}{\partial x_1} \\
\frac{\partial f}{\partial x_2} \\
\vdots \\
\frac{\partial f}{\partial x_n}
\end{bmatrix}$

Key properties:
1. Points in direction of steepest increase
2. Perpendicular to level curves/surfaces
3. Its magnitude is the rate of change of the function in that steepest direction

In [None]:
# Example: More complex function
def g(x, y):
    """Return sin(x) * cos(y), evaluated elementwise via NumPy."""
    sin_x = np.sin(x)
    cos_y = np.cos(y)
    return sin_x * cos_y

def grad_g(x, y):
    """Return the gradient of sin(x) * cos(y) as the tuple (∂g/∂x, ∂g/∂y)."""
    dg_dx = np.cos(x) * np.cos(y)
    dg_dy = -np.sin(x) * np.sin(y)
    return (dg_dx, dg_dy)

# Visualize
plot_gradient_field(g, grad_g, (-np.pi, np.pi), (-np.pi, np.pi))
plt.show()

# Plot level curves with gradient vectors
x = np.linspace(-np.pi, np.pi, 20)
y = np.linspace(-np.pi, np.pi, 20)
X, Y = np.meshgrid(x, y)
Z = g(X, Y)

plt.figure(figsize=(10, 8))
level_curves = plt.contour(X, Y, Z, levels=20)
U, V = grad_g(X, Y)
plt.quiver(X, Y, U, V)
plt.xlabel('x')
plt.ylabel('y')
plt.title('Level Curves and Gradient Vectors')
# Hand the contour set to colorbar explicitly: plt.quiver() was the last
# artist drawn and would otherwise be picked as the "current" mappable even
# though it carries no function values, so the bar would not match the curves.
plt.colorbar(level_curves, label='Function Value')
plt.axis('equal')
plt.show()

## 3. Optimization Preview

In machine learning, we often need to minimize or maximize functions. The gradient helps us find:
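1. Local minima and maxima (points where the gradient is zero)
2. Saddle points (the gradient is also zero there, but the point is neither a minimum nor a maximum)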
3. Directions of steepest descent/ascent

In [None]:
# Example: Gradient descent visualization
def h(x, y):
    """Return (x^2 + y^2) * exp(-0.1 * (x^2 + y^2)), evaluated elementwise."""
    radius_sq = x**2 + y**2
    return radius_sq * np.exp(-0.1*radius_sq)

def grad_h(x, y):
    """Return the analytic gradient of h as the tuple (∂h/∂x, ∂h/∂y).

    Each component comes from the product rule applied to
    (x^2 + y^2) * exp(-0.1 * (x^2 + y^2)).
    """
    radius_sq = x**2 + y**2
    decay = np.exp(-0.1*radius_sq)
    # First term: derivative of the polynomial factor; second term: derivative
    # of the exponential factor.
    dh_dx = 2*x * decay - 0.2*x*radius_sq * decay
    dh_dy = 2*y * decay - 0.2*y*radius_sq * decay
    return dh_dx, dh_dy

# Sample h on a 100 x 100 grid covering [-3, 3] x [-3, 3] for the plots below
x = np.linspace(start=-3, stop=3, num=100)
y = np.linspace(start=-3, stop=3, num=100)
X, Y = np.meshgrid(x, y)
Z = h(X, Y)

# Perform gradient descent
def gradient_descent(start_point, learning_rate=0.1, num_steps=50, grad_fn=None):
    """Run fixed-step gradient descent and return every visited point.

    Args:
        start_point: Pair ``(x, y)`` where the descent starts.
        learning_rate: Step size multiplying the gradient at each update.
        num_steps: Number of update steps to take.
        grad_fn: Callable ``grad_fn(x, y)`` returning the two gradient
            components. Defaults to ``grad_h`` so existing calls keep
            descending the surface ``h``.

    Returns:
        Array of shape ``(num_steps + 1, 2)`` whose first row is the start
        point and whose following rows are the successive iterates.
    """
    if grad_fn is None:
        grad_fn = grad_h

    # Work in floats from the start so the path has one dtype even when the
    # start point is given as integers.
    point = np.asarray(start_point, dtype=float)
    path = [point]

    for _ in range(num_steps):
        grad = np.asarray(grad_fn(point[0], point[1]), dtype=float)
        # Step against the gradient, i.e. in the direction of steepest descent
        point = point - learning_rate * grad
        path.append(point)

    return np.array(path)

# Run gradient descent from different starting points
start_points = [
    (-2, -2),
    (2, 2),
    (-2, 2),
    (2, -2)
]

# The descent is deterministic, so compute each path once and reuse it for
# both the 3D and the contour panel instead of re-running it per panel.
paths = [gradient_descent(start) for start in start_points]

plt.figure(figsize=(15, 5))

# Surface plot with paths
ax1 = plt.subplot(121, projection='3d')
ax1.plot_surface(X, Y, Z, cmap='viridis', alpha=0.8)
for path in paths:
    ax1.plot(path[:, 0], path[:, 1], h(path[:, 0], path[:, 1]), 'r.-')
ax1.set_xlabel('x')
ax1.set_ylabel('y')
ax1.set_zlabel('f(x,y)')
ax1.set_title('Gradient Descent Paths (3D)')

# Contour plot with paths
ax2 = plt.subplot(122)
contours = plt.contour(X, Y, Z, levels=20)
for path in paths:
    plt.plot(path[:, 0], path[:, 1], 'r.-')
# Name the contour set explicitly so the colorbar cannot pick up another artist
plt.colorbar(contours, label='Function Value')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Gradient Descent Paths (Contour)')
plt.axis('equal')

plt.tight_layout()
plt.show()

## 4. The Chain Rule in Multiple Dimensions

The chain rule is crucial for:
1. Backpropagation in neural networks
2. Computing gradients of composite functions
3. Understanding error propagation

In [None]:
# Example: Simple neural network layer
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) without overflow.

    Evaluating ``exp(-x)`` directly overflows for large negative ``x``.
    Here only ``exp(-|x|)`` is computed, which is always in (0, 1], and the
    algebraically equivalent form is chosen per sign of ``x``.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an array shaped like ``x``.
    """
    x = np.asarray(x, dtype=float)
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x), the same value
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional arrays as arrays.
    return result[()]

def d_sigmoid(x):
    """Return the sigmoid derivative at x, computed as sigmoid(x) * (1 - sigmoid(x))."""
    activation = sigmoid(x)
    complement = 1 - activation
    return activation * complement

# Forward pass: inputs and parameters of a single 2 -> 2 layer
x = np.array([1, 2])  # Input
W = np.array([
    [0.1, 0.2],
    [0.3, 0.4],
])  # Weights
b = np.array([0.1, 0.2])  # Biases

# Pre-activation, then element-wise sigmoid
z = x @ W + b
a = sigmoid(z)

# Backward pass: apply the chain rule one stage at a time
da = np.array([0.5, 0.5])  # Gradient from next layer
dz = da * d_sigmoid(z)      # through the sigmoid
dW = x[:, np.newaxis] * dz  # outer product of input and dz
db = dz                     # the bias enters z additively
dx = dz @ W.T               # back through the weights to the input

print("Forward pass:")
for label, value in (("z", z), ("a", a)):
    print(f"{label} = {value}")
print("\nBackward pass (gradients):")
print(f"dW =\n{dW}")
for label, value in (("db", db), ("dx", dx)):
    print(f"{label} = {value}")

# Visualize the computation graph
from graphviz import Digraph
dot = Digraph(comment='Computation Graph')
dot.attr(rankdir='LR')

# Add nodes (identifier, label). The z label is written as xW + b to match
# the forward pass above, which computes x @ W + b with x on the left.
graph_nodes = [
    ('x', 'x'),
    ('W', 'W'),
    ('b', 'b'),
    ('z', 'z = xW + b'),
    ('a', 'a = sigmoid(z)'),
]
for node_id, node_label in graph_nodes:
    dot.node(node_id, node_label)

# Add edges (source, target) following the data flow of the forward pass
graph_edges = [
    ('x', 'z'),
    ('W', 'z'),
    ('b', 'z'),
    ('z', 'a'),
]
for source, target in graph_edges:
    dot.edge(source, target)

# Render the graph to computation_graph.png; cleanup=True removes the
# intermediate DOT source file once the image is written
dot.render('computation_graph', format='png', cleanup=True)

## 5. Applications in Machine Learning

Vector calculus is fundamental in:

1. **Optimization Algorithms**
   - Gradient descent
   - Momentum methods
   - Adam optimizer

2. **Neural Networks**
   - Backpropagation
   - Weight updates
   - Learning rate scheduling

3. **Model Evaluation**
   - Loss function gradients
   - Performance metrics
   - Regularization

In [None]:
# Example: Different optimization methods on a simple loss landscape
def loss_surface(x, y):
    """Return 0.1 * (x^2 + y^2) + sin(x) * cos(y), evaluated elementwise."""
    bowl = 0.1 * (x**2 + y**2)
    ripple = np.sin(x) * np.cos(y)
    return bowl + ripple

def grad_loss(x, y):
    """Return the gradient of loss_surface as a NumPy array [∂L/∂x, ∂L/∂y]."""
    partials = [
        0.2 * x + np.cos(x) * np.cos(y),
        0.2 * y - np.sin(x) * np.sin(y),
    ]
    return np.array(partials)

# Different optimization methods
def gradient_descent_momentum(start, lr=0.1, momentum=0.9, steps=50, grad_fn=None):
    """Run gradient descent with momentum and return every visited point.

    Each step updates ``velocity = momentum * velocity - lr * grad`` and then
    moves the point by that velocity. With ``momentum=0`` this is plain
    gradient descent.

    Args:
        start: Pair ``(x, y)`` where the descent starts.
        lr: Learning rate multiplying the gradient.
        momentum: Fraction of the previous velocity carried into each step.
        steps: Number of update steps to take.
        grad_fn: Callable ``grad_fn(x, y)`` returning the two gradient
            components. Defaults to ``grad_loss`` so existing calls are
            unchanged.

    Returns:
        Array of shape ``(steps + 1, 2)`` whose first row is the start point.
    """
    if grad_fn is None:
        grad_fn = grad_loss

    # Work in floats from the start so the path has one dtype even when the
    # start point is given as integers.
    point = np.asarray(start, dtype=float)
    path = [point]
    velocity = np.zeros(2)

    for _ in range(steps):
        grad = np.asarray(grad_fn(point[0], point[1]), dtype=float)
        velocity = momentum * velocity - lr * grad
        point = point + velocity
        path.append(point)

    return np.array(path)

# Compare different optimization paths
x = np.linspace(-4, 4, 100)
y = np.linspace(-4, 4, 100)
X, Y = np.meshgrid(x, y)
Z = loss_surface(X, Y)

# Plot comparison
plt.figure(figsize=(15, 5))

# Standard gradient descent
# Both optimizers must descend the same surface for the comparison to mean
# anything. gradient_descent() follows the gradient of h, not of
# loss_surface, so plain descent is obtained here by switching momentum off:
# with momentum=0 the update reduces to point - lr * grad_loss(point).
plt.subplot(131)
contours_gd = plt.contour(X, Y, Z, levels=20)
path_gd = gradient_descent_momentum((-3, 3), lr=0.1, momentum=0.0)
plt.plot(path_gd[:, 0], path_gd[:, 1], 'r.-', label='Path')
plt.title('Standard Gradient Descent')
plt.xlabel('x')
plt.ylabel('y')
plt.colorbar(contours_gd, label='Loss')

# Momentum
plt.subplot(132)
contours_momentum = plt.contour(X, Y, Z, levels=20)
path_momentum = gradient_descent_momentum((-3, 3))
plt.plot(path_momentum[:, 0], path_momentum[:, 1], 'r.-', label='Path')
plt.title('Gradient Descent with Momentum')
plt.xlabel('x')
plt.ylabel('y')
plt.colorbar(contours_momentum, label='Loss')

# Loss history comparison
plt.subplot(133)
loss_gd = [loss_surface(px, py) for px, py in path_gd]
loss_momentum = [loss_surface(px, py) for px, py in path_momentum]
plt.plot(loss_gd, label='Standard GD')
plt.plot(loss_momentum, label='Momentum')
plt.xlabel('Step')
plt.ylabel('Loss')
plt.title('Loss History')
plt.legend()

plt.tight_layout()
plt.show()

## 6. Practice Exercises

1. Implement gradient descent from scratch
2. Visualize gradients for different loss functions
3. Implement backpropagation for a simple neural network
4. Compare different optimization methods

Write your solutions in the cell below:

In [None]:
# Your solution here


## Next Steps

In Module 2, we'll dive deep into matrices as transformations and their applications in machine learning.

### Preparation for Next Module
1. Review matrix operations
2. Practice with gradients and optimization
3. Think about how transformations can be represented by matrices

### Additional Resources
- [Interactive Gradient Visualization](../../resources/visualizations/gradients.html)
- [Vector Calculus Cheat Sheet](../../resources/cheat_sheets/vector_calculus.pdf)
- [3Blue1Brown: Derivatives and Gradients](https://www.3blue1brown.com/lessons/derivatives)