<a href="https://colab.research.google.com/github/Data-Science-and-Data-Analytics-Courses/MITx---Machine-Learning-with-Python-From-Linear-Models-to-Deep-Learning-Jun-11-2019/blob/master/Project%202%3A%20Digit%20recognition%20(Part%201)/Multinomial_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multinomial (Softmax) Regression and Gradient Descent


---



## Setup

In [0]:
import os, sys
from pathlib import Path

# Notebook Library
# Clone the helper repository into /nblib and make its parent directory
# importable, so that `nblib` can be imported as a package further down.
url = "https://github.com/Data-Science-and-Data-Analytics-Courses/Notebook-Library"
repo = Path("/nblib")
!git clone "{url}" "{repo}"
if repo.parent.as_posix() not in sys.path:
  sys.path.append(repo.parent.as_posix())
# NOTE(review): presumably enables importing .ipynb notebooks as modules -- confirm
%run "{repo}/.Importable.ipynb"

from nblib import Git
# Remote
URL = "https://github.com/Data-Science-and-Data-Analytics-Courses/MITx---Machine-Learning-with-Python-From-Linear-Models-to-Deep-Learning-Jun-11-2019"
# NOTE(review): Git.clone appears to return the local checkout location as a
# Path-like object (it is used with .as_posix() and the "/" operator below) -- confirm
REPO = Git.clone(URL, dest="/content")
if REPO.as_posix() not in sys.path:
  sys.path.append(REPO.as_posix())

# Working directory, for running modules in part1
part1dir = REPO / "Project 2: Digit recognition (Part 1)/mnist/part1"
os.chdir(part1dir)

# Star import: names used by the cells below without a visible import
# (e.g. np, sparse) presumably come from here -- confirm against setup/Setup
from setup.Setup import *
import main

## Test Error on Softmax Regression



In [0]:
def compute_cost_function(X, Y, theta, lambda_factor, temp_parameter):
    """
    Evaluates the regularized softmax cost: the mean negative log-probability
    that the model assigns to each datapoint's true label, plus an L2 penalty
    on the parameters.

    Args:
        X - (n, d) NumPy array (n datapoints each with d features)
        Y - (n, ) NumPy array containing the labels (a number from 0-9) for each
            data point
        theta - (k, d) NumPy array, where row j represents the parameters of our
                model for label j
        lambda_factor - the regularization constant (scalar)
        temp_parameter - the temperature parameter of softmax function (scalar)

    Returns
        c - the cost value (scalar)
    """
    num_points, _ = X.shape

    # (k, n) matrix: entry [j, i] is the probability that X[i] has label j
    probabilities = compute_probabilities(X, theta, temp_parameter)

    # For every datapoint i keep only the probability of its true label Y[i]
    true_label_probs = probabilities[Y, np.arange(num_points)]
    avg_neg_log_likelihood = -np.mean(np.log(true_label_probs))

    # L2 regularization: (lambda / 2) * sum of squared parameters
    l2_penalty = lambda_factor / 2 * np.sum(np.square(theta))

    return avg_neg_log_likelihood + l2_penalty

In [0]:
def compute_probabilities(X, theta, temp_parameter):
    """
    Computes, for each datapoint X[i], the probability that X[i] is labeled as j
    for j = 0, 1, ..., k-1

    The softmax is evaluated for all datapoints at once (no Python-level loop
    over datapoints).

    Args:
        X - (n, d) NumPy array (n datapoints each with d features)
        theta - (k, d) NumPy array, where row j represents the parameters of our model for label j
        temp_parameter - the temperature parameter of softmax function (scalar)
    Returns:
        H - (k, n) NumPy array, where each entry H[j][i] is the probability that X[i] is labeled as j
    """
    # Linear scores for every (label, datapoint) pair: (k, d) . (d, n) -> (k, n)
    scores = theta.dot(X.T) / temp_parameter

    # Subtract each column's maximum before exponentiating. Softmax is invariant
    # to a per-column shift, and this keeps np.exp from overflowing.
    scores -= np.max(scores, axis=0, keepdims=True)

    # Softmax: normalize each column so the k label probabilities sum to 1
    H = np.exp(scores)
    H /= np.sum(H, axis=0, keepdims=True)

    return H

In [0]:
# Small hand-checkable problem: 3 datapoints, 5 features, 7 labels
n, d, k = 3, 5, 7
X = np.arange(0, n * d).reshape(n, d)
Y = np.arange(0, n)
zeros = np.zeros((k, d))
alpha = 2
temp = 0.2
lambda_factor = 0.5
theta = np.zeros((k, d))
# Reference value of theta after one gradient-descent step from all-zero
# parameters with the settings above; compare by eye with the cell output.
exp_res = np.array([
[ -7.14285714,  -5.23809524,  -3.33333333,  -1.42857143, 0.47619048],
[  9.52380952,  11.42857143,  13.33333333,  15.23809524, 17.14285714],
[ 26.19047619,  28.0952381 ,  30.        ,  31.9047619 , 33.80952381],
[ -7.14285714,  -8.57142857, -10.        , -11.42857143, -12.85714286],
[ -7.14285714,  -8.57142857, -10.        , -11.42857143, -12.85714286],
[ -7.14285714,  -8.57142857, -10.        , -11.42857143, -12.85714286],
[ -7.14285714,  -8.57142857, -10.        , -11.42857143, -12.85714286]
])
print(X)
print(Y)
run_gradient_descent_iteration(X, Y, theta, alpha, lambda_factor, temp)

In [0]:
def run_gradient_descent_iteration(X, Y, theta, alpha, lambda_factor, temp_parameter):
    """
    Runs one step of batch gradient descent

    Args:
        X - (n, d) NumPy array (n datapoints each with d features)
        Y - (n, ) NumPy array containing the labels (a number from 0-9) for each
            data point
        theta - (k, d) NumPy array, where row j represents the parameters of our
                model for label j
        alpha - the learning rate (scalar)
        lambda_factor - the regularization constant (scalar)
        temp_parameter - the temperature parameter of softmax function (scalar)

    Returns:
        theta - (k, d) NumPy array that is the final value of parameters theta.
                This is the same array object that was passed in: it is
                updated in place and then returned.
    """
    n = X.shape[0]
    k = theta.shape[0]

    # Labels in one-vs-all (one-hot) format: one_hot[j, i] == 1 iff Y[i] == j
    one_hot = sparse.coo_matrix(([1] * n, (Y, range(n))), shape=(k, n)).toarray()

    # residual[j, i] = [[Y[i] == j]] - P(label j | X[i])
    residual = one_hot - compute_probabilities(X, theta, temp_parameter)

    # Gradient of the averaged log-loss. The matrix product (k, n) . (n, d)
    # yields, in row j, sum_i residual[j, i] * X[i] for every label at once.
    # (Element-wise broadcasting of residual[:, None] against X does not line
    # up the datapoint axes and fails whenever n != d.)
    loss_grad = -residual.dot(X) / (temp_parameter * n)

    # Gradient of the L2 regularization term (lambda / 2) * ||theta||^2
    reg_grad = lambda_factor * theta

    # Update
    theta -= alpha * (loss_grad + reg_grad)

    return theta