# Softmax classification and categorical cross entropy loss

In [0]:
import numpy as np
from sklearn.utils import shuffle

In [2]:
# Load MNIST
from keras.datasets import mnist

(train_images_original, train_labels_original), (test_images_original, test_labels_original) = mnist.load_data()

Using TensorFlow backend.


Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz


In [3]:
print(train_images_original.shape)
print(train_labels_original.shape)
print(test_images_original.shape)
print(test_labels_original.shape)
print(train_labels_original[0])
print(train_labels_original[10])
print(train_labels_original[21])
print(train_labels_original[34])
print(test_labels_original[0])
print(test_labels_original[10])
#print(train_images_original[0])

(60000, 28, 28)
(60000,)
(10000, 28, 28)
(10000,)
5
3
0
0
7
0


# Data preprocessing

In [4]:
train_images_flat = train_images_original.reshape((60000, 28 * 28)).T
test_images_flat = test_images_original.reshape((10000, 28 * 28)).T
# Normalize data
train_images = train_images_flat / 255.
test_images = test_images_flat / 255.
train_labels = train_labels_original.reshape((train_labels_original.shape[0], 1)).T
test_labels = test_labels_original.reshape((test_labels_original.shape[0], 1)).T

print(train_images.shape)
print(test_images.shape)
print(train_labels.shape)
print(test_labels.shape)
print(train_labels)
print(train_labels[0])
print(train_labels[0][0])
print(train_labels[0][1])
print(train_labels[0][2])
print(train_labels[0, 2])
print(train_labels[0][10])
print(train_labels[0][21])
print(train_labels[0][34])
print(train_labels[0][59997])
print(train_labels[0][59998])
print(train_labels[0][59999])
print(test_labels[0][0])
print(test_labels[0][1])
print(test_labels[0][2])
print(test_labels[0][10])
#print(train_images[0])

(784, 60000)
(784, 10000)
(1, 60000)
(1, 10000)
[[5 0 4 ... 5 6 8]]
[5 0 4 ... 5 6 8]
5
0
4
4
3
0
0
5
6
8
7
2
1
0


**One-hot encoded labels**

In [5]:
one_hot_train_labels = np.zeros(shape=(10, train_labels.shape[1]), dtype = int)
print(one_hot_train_labels.shape)
print(one_hot_train_labels[:, 0])
print(one_hot_train_labels[:, 1])
print(one_hot_train_labels[:, 2])
print(one_hot_train_labels[:, 10])
print()

for i in range(train_labels.shape[1]):  
  one_hot_train_labels[train_labels[0, i], i] = 1
  
print(one_hot_train_labels[:, 0])
print(one_hot_train_labels[:, 1])
print(one_hot_train_labels[:, 2])
print(one_hot_train_labels[:, 10])

(10, 60000)
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]

[0 0 0 0 0 1 0 0 0 0]
[1 0 0 0 0 0 0 0 0 0]
[0 0 0 0 1 0 0 0 0 0]
[0 0 0 1 0 0 0 0 0 0]


# Softmax classification model

In [0]:
def stable_softmax(X):
    exps = np.exp(X - np.max(X))
    
    return exps / exps.sum(axis=1, keepdims=True)

In [7]:
nums = np.array([[1,2,3], [4,5,6], [7,9,10], [8,11,14], [12,14,16]])  
print(nums.shape)
print(stable_softmax(nums))

(5, 3)
[[0.09003057 0.24472847 0.66524096]
 [0.09003057 0.24472847 0.66524096]
 [0.03511903 0.25949646 0.70538451]
 [0.00235563 0.04731416 0.95033021]
 [0.01587624 0.11731043 0.86681333]]


In [0]:
def initialize_params(dim):

    # dim -- number of parameters
    
    w = np.zeros((dim, 10))
    b = 0.0

    assert(w.shape == (dim, 10))
    assert(isinstance(b, float) or isinstance(b, int))
    
    return w, b

**Categorical cross entropy loss**

In [0]:
def propagate(w, b, X, Y):
    
    # cost -- cross entropy loss
    # dw -- gradient of the loss with respect to w, thus same shape as w
    # db -- gradient of the loss with respect to b, thus same shape as b
    
    
    m = X.shape[1]

    # 1. FORWARD PROPAGATION

    A = (stable_softmax((np.dot(w.T, X) + b).T)).T # Apply softmax
    #print(A.shape)
    #print(Y.shape)
    
    # Y is one-hot encoded
    cost = 1/m * np.sum(-Y * np.log(A)) # cross entropy loss

    # 2. BACKWARD PROPAGATION
    # Compute gradients of loss function
    dw = 1/m * np.dot(X, (A - Y).T)
    db = 1/m * np.sum(A - Y)

    #print(dw.shape)
    #print(w.shape) # dw.shape = w.shape 
    assert(dw.shape == w.shape) #telling the program to test that condition, and trigger an error if the condition is false
    assert(db.dtype == float)
    cost = np.squeeze(cost)
    assert(cost.shape == ())
    grads = {"dw": dw,
             "db": db}
    
    return grads, cost

**Mini-batch gradient descent**

In [0]:
def optimize(w, b, X, Y, num_iters, learning_rate, batch_size, print_cost=False):
    
    # X, Y: training data and training labels
    
    costs = []
    #print(X.shape)
    #print(Y.shape)
    
    for iter in range(num_iters):
        # Randomize training data
        X, Y = shuffle(X.T, Y.T)
        minibatch_size = batch_size
        #if iter == 500:
          #print(X.shape)
          #print(Y.shape)
          
        # mini batch  
        for i in range(0, X.shape[0], minibatch_size):   
            X_mini = X[i:i + minibatch_size]
            Y_mini = Y[i:i + minibatch_size]
        #if iter == 500:
          #print(X_mini.shape)
          #print(Y_mini.shape)
 
        grads, cost = propagate(w, b, X_mini.T, Y_mini.T)                             
        
       
        # Retrieve gradients
        dw = grads["dw"]
        db = grads["db"]
        
        # Update w, b using gradient descent 
        w = w - learning_rate * dw                    
        b = b - learning_rate * db
        
        X, Y = X.T, Y.T
        #if iter == 500:
          #print(X.shape)
          #print(Y.shape)
          
        # Record the costs
        if iter % 100 == 0:
            costs.append(cost)
        
        # Print the cost every 200 training examples
        if print_cost and (iter % 200 == 0 or iter == num_iters - 1):
            print ("Cost after iteration %i: %f" % (iter, cost))
    
    params = {"w": w,
              "b": b}
    
    grads = {"dw": dw,
             "db": db}
    
    return params, grads, costs

In [0]:
def predict(w, b, X):
    
    m = X.shape[1]
    
    Y_pred = []
    
    # Apply w and b that are learned through training
    A = stable_softmax((np.dot(w.T, X) + b).T) 
    #print(A.shape)
    
    for i in range(A.shape[0]):
      Y_pred.append(np.argmax(A[i])) # Convert one-hot encoded labels to original labels (0-9)
    
    Y_pred = np.asarray(Y_pred)
    Y_pred = Y_pred.reshape((1, Y_pred.shape[0]))
    #print(Y_pred.shape)
    #print(Y_pred)
               
    assert(Y_pred.shape == (1, m))
    
    return Y_pred

In [0]:
def model(X_train, Y_train, X_test, Y_test, num_iters=2000, learning_rate=0.005, batch_size=32, print_cost=False):
        
    #Initialize parameters (with zeros)
    w, b = initialize_params(X_train.shape[0])                            

    # Gradient descent
    parameters, grads, costs = optimize(w, b, X_train, Y_train, num_iters, learning_rate, batch_size, print_cost)   
    
    # Retrieve trained w and b
    w = parameters["w"]
    b = parameters["b"]
    
    # Predict on test/train set
    Y_pred_test = predict(w, b, X_test)
    Y_pred_train = predict(w, b, X_train)
    
    count1 = 0
    count2 = 0
    # Compute training accuracy 
    for i in range(Y_train.shape[1]):
      if(Y_pred_train[0,i] == train_labels[0,i]): # not Y_train[0,i] (one-hot encoded), but the original labels
        count1 += 1
    train_accuracy = count1 / Y_train.shape[1]
    
    # Compute test accuracy
    for j in range(Y_test.shape[1]):
      if(Y_pred_test[0,j] == Y_test[0,j]):
        count2 += 1
    test_accuracy = count2 / Y_test.shape[1]
    
    # Print train/test accuracy
    print("")
    print("train accuracy: {} %".format(train_accuracy * 100))
    print("test accuracy: {} %".format(test_accuracy * 100))
    #print(w)
    #print(b)

    # d -- dictionary storing information about the model
    d = {"costs": costs,
         "Y_pred_test": Y_pred_test, 
         "Y_pred_train" : Y_pred_train, 
         "w" : w, 
         "b" : b,
         "learning_rate" : learning_rate,
         "num_iters": num_iters}
    
    return d

# MNIST digit classification

In [20]:
classifier = model(train_images, one_hot_train_labels, test_images, test_labels, num_iters=2000, learning_rate=0.1, batch_size=128, print_cost=True)
w = classifier["w"]
b = classifier["b"]

Cost after iteration 0: 2.302585
Cost after iteration 200: 0.397438
Cost after iteration 400: 0.384394
Cost after iteration 600: 0.271052
Cost after iteration 800: 0.374602
Cost after iteration 1000: 0.474772
Cost after iteration 1200: 0.480486
Cost after iteration 1400: 0.598959
Cost after iteration 1600: 0.313791
Cost after iteration 1800: 0.323261
Cost after iteration 1999: 0.231946

train accuracy: 91.04166666666667 %
test accuracy: 91.47 %
