# Assignment 5: Gradient Descent - Linear Regression

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets
import sklearn.model_selection

# Load the diabetes regression dataset as a (features, targets) pair of arrays.
diabetes_X, diabetes_y = sklearn.datasets.load_diabetes(return_X_y=True)

# Pin random_state so the train/test partition -- and therefore every loss
# reported further down -- is the same on each Restart & Run All.
split = sklearn.model_selection.train_test_split(
    diabetes_X, diabetes_y, random_state=0
)
diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = split

## 1. Loss Functions

In this exercise we'll be considering a simple linear model:
$$y \approx \theta x$$
The hypothesis for the model is written as
$$h(\theta) = \theta x$$

### a. Fill in the following methods for the loss functions and their derivatives.


In [0]:
def squared_loss(h, y):
    """
    Mean squared error between predictions and targets.

    Input:
    h: n by 1 vector of predictions
    y: n by 1 vector of targets
    Output:
    loss: scalar, the average over all rows of (h - y)^2
    """
    # Element-wise residual; h and y are expected to have matching shapes.
    residual = np.subtract(h, y)
    return (residual * residual).mean()

In [0]:
def squared_deriv(h, y, x=None):
    """
    Returns the gradient wrt theta of the squared loss, averaged over the datapoints

    For the model h = theta * x the per-datapoint loss is (h - y)^2, so by the
    chain rule its derivative with respect to theta is 2 * (h - y) * x.

    Input:
    h: n by 1 vector of predictions (theta * x)
    y: n by 1 vector of targets
    x: optional n by 1 vector of the inputs that produced h. It is needed for
       the true derivative with respect to theta. When omitted, x is treated
       as all ones, i.e. the result is the mean of 2 * (h - y), the averaged
       derivative with respect to the prediction itself.
    Output:
    grad: scalar
    """
    # Flatten everything so that mixing (n,) and (n, 1) inputs cannot silently
    # broadcast into an n-by-n matrix.
    residual = np.ravel(h).astype(float) - np.ravel(y)
    per_point = 2.0 * residual
    if x is not None:
        per_point = per_point * np.ravel(x)
    return np.mean(per_point)

In [0]:
def abs_loss(h, y):
    """
    Returns the absolute value of the difference of each row, averaged over the datapoints

    Input:
    h: n by 1 vector of predictions
    y: n by 1 vector of targets
    Output:
    loss: scalar, the average over all rows of |h - y|
    """
    # Element-wise residual; h and y are expected to have matching shapes.
    loss = np.mean(np.abs(np.subtract(h, y)))
    return loss

In [0]:
def abs_deriv(h, y, x=None):
    """
    Returns the gradient wrt theta of the absolute loss, averaged over the datapoints

    For the model h = theta * x the per-datapoint loss is |h - y|, whose
    derivative with respect to theta is sign(h - y) * x. Where h == y the loss
    is not differentiable; np.sign yields 0 there, which is a valid subgradient.

    Input:
    h: n by 1 vector of predictions (theta * x)
    y: n by 1 vector of targets
    x: optional n by 1 vector of the inputs that produced h. It is needed for
       the true derivative with respect to theta. When omitted, x is treated
       as all ones, i.e. the result is the mean of sign(h - y), the averaged
       derivative with respect to the prediction itself.
    Output:
    grad: scalar
    """
    # Flatten everything so that mixing (n,) and (n, 1) inputs cannot silently
    # broadcast into an n-by-n matrix.
    direction = np.sign(np.ravel(h).astype(float) - np.ravel(y))
    if x is not None:
        direction = direction * np.ravel(x)
    return np.mean(direction)

### b. Plot the loss and the gradient for the provided data

In [0]:
# Data you'll use with the above methods

# Seed NumPy's global generator so the sampled slope and noise (and therefore
# every plot built from them) are identical on each Restart & Run All.
np.random.seed(0)

simple_x = np.arange(-20,20,0.5)

# yields a float between 3 and 7
true_theta = 4*np.random.random_sample()+3
# Targets follow the line y = true_theta * x plus Gaussian noise (std 10).
simple_y = true_theta*simple_x + np.random.normal(scale = 10, size=simple_x.shape)

plt.figure(figsize = (8,6))
plt.scatter(simple_x, simple_y, linewidths=0.5)
plt.title("Noisy samples of y = true_theta * x")
plt.xlabel("x")
plt.ylabel("y")
plt.show()

In [0]:
# Possible theta values (to iterate through)
pos_theta = np.arange(0, 10, 0.1)

# For convenience in using the above methods: turn both arrays into n-by-1
# column vectors. ndarray.reshape returns a NEW array rather than modifying
# in place, so the result has to be assigned back. The -1 lets NumPy infer n,
# and re-running this cell is harmless because the shape is already (n, 1).
simple_x = simple_x.reshape(-1, 1)
simple_y = simple_y.reshape(-1, 1)

In [0]:
# Plot squared loss and gradient as functions of theta.

# Flatten so the computation works whether the data is stored as (n,) or (n, 1).
sq_x = np.ravel(simple_x)
sq_y = np.ravel(simple_y)

# sq_residuals[i, j] = pos_theta[i] * x_j - y_j  (one row per candidate theta)
sq_residuals = np.outer(pos_theta, sq_x) - sq_y
# Mean over datapoints of (h - y)^2 and of its theta-derivative 2 * (h - y) * x.
sq_losses = np.mean(sq_residuals ** 2, axis=1)
sq_grads = np.mean(2 * sq_residuals * sq_x, axis=1)

plt.figure(figsize=(8,10))
plt.suptitle("Squared Loss Function")
plt.subplots_adjust(hspace=0.2)
plt.subplot(2,1,1)

plt.plot(pos_theta, sq_losses)
plt.ylabel("Mean squared loss")

plt.title("Loss")

plt.subplot(2,1,2)

plt.plot(pos_theta, sq_grads)
# The loss minimum is where the gradient crosses this zero line.
plt.axhline(0, color='gray', linewidth=0.8)
plt.ylabel("d(loss) / d(theta)")

plt.title("Gradient")
plt.xlabel("Theta")

plt.show()

In [0]:
# Plot absolute loss and gradient as functions of theta.

# Flatten so the computation works whether the data is stored as (n,) or (n, 1).
abs_x = np.ravel(simple_x)
abs_y = np.ravel(simple_y)

# abs_residuals[i, j] = pos_theta[i] * x_j - y_j  (one row per candidate theta)
abs_residuals = np.outer(pos_theta, abs_x) - abs_y
# Mean over datapoints of |h - y| and of its theta-(sub)derivative sign(h - y) * x.
abs_losses = np.mean(np.abs(abs_residuals), axis=1)
abs_grads = np.mean(np.sign(abs_residuals) * abs_x, axis=1)

plt.figure(figsize=(8,10))
plt.suptitle("Absolute Loss Function")
plt.subplots_adjust(hspace=0.2)
plt.subplot(2,1,1)

plt.plot(pos_theta, abs_losses)
plt.ylabel("Mean absolute loss")

plt.title("Loss")

plt.subplot(2,1,2)

plt.plot(pos_theta, abs_grads)
# The loss minimum is where the gradient crosses this zero line.
plt.axhline(0, color='gray', linewidth=0.8)
plt.ylabel("d(loss) / d(theta)")

plt.title("Gradient")
plt.xlabel("Theta")

plt.show()

### c. Given that the gradient descent algorithm uses the first derivative to find a local minimum, which of the above loss functions is preferable for linear regression using gradient descent? Briefly explain using the above plots.

#### Answer:

TODO:

## 2. Gradient Descent Linear Regression

Here you'll implement a linear regressor using gradient descent and the diabetes dataset initialized at the top of this assignment. Using the loss function you chose in 1c, the gradient descent algorithm will follow the formula given below to update the parameters and find the optimal solution.

The model:
$$ y \approx X w $$
Hypothesis:
$$ h(w) = Xw $$

Hint: w is a vector, thus the gradient will also need to be a vector.

**Gradient Descent Update Function:**

$$w_{n+1} = w_n - \alpha \nabla L(w_n) $$

Due to the relatively small size of the dataset, use all datapoints for computing the gradient. This is known as batch gradient descent; compare it with stochastic gradient descent, which estimates the gradient from a single datapoint (or a small random subset) at each step to make each update cheaper.

In [0]:
def gd_linreg(X, y, alpha, epsilon=0.001, max_iter=100000):
    """
    Performs linear regression on X and y using batch gradient descent on the
    mean squared loss L(w) = mean((Xw - y)^2), whose gradient is
    (2 / n) * X^T (Xw - y).

    Input:
    X: n x m matrix - n datapoints, m features
    y: n x 1 vector (a flat length-n array is accepted as well)
    alpha: step size for gradient descent update
    epsilon: maximum difference between the w_n+1 and w_n for convergence,
             measured as the Euclidean norm of (w_n+1 - w_n)
    max_iter: upper bound on the number of updates, so that a step size that
              is too small (or too large) cannot loop forever

    Output:
    w: m x 1 vector - weights for each feature of a data point. Returned with
       shape (m, 1) when y is 2-D and shape (m,) when y is 1-D, so that X @ w
       always has the same shape as y and can be passed straight to a loss.
    losses: array of losses at each step/iteration (the loss of the weights
            the step started from, so losses[0] is the loss at w = 0)

    Raises:
    ValueError: if alpha is None, X is not 2-D, X has no rows, or X and y
                disagree on the number of datapoints
    """
    if alpha is None:
        raise ValueError("alpha must be a number, got None")

    X = np.asarray(X, dtype=float)
    y_arr = np.asarray(y, dtype=float)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D (n x m) matrix")
    n, m = X.shape
    if n == 0:
        raise ValueError("X must contain at least one datapoint")

    # Work with flat vectors internally so (n,) and (n, 1) targets behave the
    # same and nothing broadcasts into an n-by-n matrix by accident.
    target = y_arr.reshape(-1)
    if target.shape[0] != n:
        raise ValueError("X and y must have the same number of datapoints")

    w = np.zeros(m)
    losses = []
    for _ in range(max_iter):
        residual = X @ w - target
        losses.append(np.mean(residual ** 2))

        grad = (2.0 / n) * (X.T @ residual)
        w_next = w - alpha * grad
        step = np.linalg.norm(w_next - w)
        w = w_next

        # Converged: the last update moved the weights by less than epsilon.
        if step < epsilon:
            break
        # Diverged (alpha too large): stop instead of iterating on inf/nan.
        if not np.isfinite(step):
            break

    if y_arr.ndim == 2:
        w = w.reshape(-1, 1)
    return w, np.array(losses)

In [0]:
# Step size for gradient descent.
# NOTE(review): 10.0 assumes the mean squared loss and the feature scaling
# documented for sklearn's load_diabetes (every column has unit sum of
# squares), which keeps the gradient small enough for a step this large to be
# stable. Re-tune with the loss plot if a different loss or dataset is used.
alpha = 10.0

diabetes_w, losses = gd_linreg(diabetes_X_train, diabetes_y_train, alpha)

In [0]:
# Plot losses (may help find a good value for alpha)
num_iter = len(losses)

plt.title("Loss over Iterations of Gradient Descent")
# Orange: the loss recorded at each gradient-descent iteration (1-based x axis).
plt.plot(np.arange(1, num_iter + 1), losses, color='orange')
# Blue: a constant horizontal reference level.
# NOTE(review): 26226.66 is not derived anywhere in this notebook -- presumably
# the loss a correct solution is expected to approach; confirm its origin.
plt.plot(np.arange(1, num_iter + 1), np.full(num_iter, 26226.66), color='blue');

## 3. Evaluate your Implementation

### a. Using the weights found above by gradient descent, compute your chosen loss on the test set.

In [0]:
# TODO: evaluate the chosen loss on the held-out test split using the weights
# returned by gradient descent, i.e. compare the predictions made from
# diabetes_X_test and diabetes_w against diabetes_y_test.
gd_test_loss = None  # placeholder until the evaluation above is filled in
# Bare expression so the notebook displays the value as the cell output.
gd_test_loss

### b. Using sklearn's OLS Linear Regression method, evaluate the loss of the out-of-the-box method.

Steps: Fit the regressor using the training data (X_train and y_train) then predict using X_test, then use your chosen loss method to evaluate the output against y_test.

In [0]:
from sklearn import linear_model
# fit_intercept=False: no bias term is fitted, which matches the notebook's
# intercept-free model h(w) = Xw and keeps the comparison with the
# gradient-descent weights like-for-like.
regr = linear_model.LinearRegression(fit_intercept=False)

In [0]:
# Fit using train and predict for test
regr.fit(diabetes_X_train, diabetes_y_train)

# Predictions (hypothesis values) of the fitted OLS model on the test features.
linreg_h_w = regr.predict(diabetes_X_test)

In [0]:
# TODO: evaluate the chosen loss of the sklearn predictions (linreg_h_w)
# against diabetes_y_test, using the same loss function as for gd_test_loss
# so the two numbers are directly comparable.
linreg_test_loss = None  # placeholder until the evaluation above is filled in
# Bare expression so the notebook displays the value as the cell output.
linreg_test_loss