In [1]:
import numpy as np
import pickle
import matplotlib.pyplot as plt

def unpickle(file):
    '''
    Load and return the single pickled object stored at path `file`.

    The file is opened in binary mode and read with encoding='bytes', so
    8-bit strings pickled by Python 2 come back as bytes objects (which is
    why the loaded batches are indexed with keys such as b'data').
    '''
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')

In [2]:
#load the five pickled batches (each is indexed below by b'data' and b'labels')
cifar_data_1, cifar_data_2, cifar_data_3, cifar_data_4, cifar_data_5 = (
    unpickle(f'cifar-10-python/data_batch_{i}') for i in range(1, 6)
)

#batches 1-4 form the training split, batch 5 is held out as the test split
train_batches = [cifar_data_1, cifar_data_2, cifar_data_3, cifar_data_4]
Xtr = np.concatenate([batch[b'data'] for batch in train_batches])
ytr = np.concatenate([batch[b'labels'] for batch in train_batches])
Xtst = cifar_data_5[b'data']
#labels are stored as a plain list; convert so ytst is an array like ytr
ytst = np.asarray(cifar_data_5[b'labels'])

#per-feature statistics are computed on the training split ONLY and reused for
#the test split, so both splits go through exactly the same transformation
#NOTE(review): the scale factor is the variance, not the standard deviation.
#Kept as-is because the learning rate used for training is presumably tuned
#against this feature scale - confirm whether dividing by std was intended.
train_mean = Xtr.mean(axis=0)
train_var = Xtr.var(axis=0)

Xtr_norm = (Xtr - train_mean) / train_var
Xtst_norm = (Xtst - train_mean) / train_var

#### Optimization: Gradient Descent

In [3]:
#implement linear classifier
class LinearClassifier:
    '''
    Linear classifier (scores = X @ W + b) with a multiclass SVM (hinge) loss
    and L2 regularization on both W and b.

    attrs:
        W = weight matrix (D x K)
        b = bias vector of dim K
    '''
    def __init__(self, D, K):
        '''
        D = data dimensionality
        K = number of output categories
        Parameters are drawn from a normal distribution and scaled by 1/sqrt(D) and 1/sqrt(K).
        '''
        self.W = np.random.normal(size=(D, K)) / D ** 0.5
        self.b = np.random.normal(size=(K, )) / K ** 0.5

    def forward(self, X):
        '''
        X = input data (N x D)
        returns the class scores, matrix of shape (N x K)
        '''
        return X @ self.W + self.b

    def loss(self, y_pred, y_true, delta, lmbda):
        '''
        args:
            y_pred = matrix of shape (N x K)
            y_true = vector of dim N (integer class indices)
            delta = scalar (required margin)
            lmbda = scalar (L2 regularization strength)
        returns:
            loss = sum over the batch of the hinge terms max(0, s_j - s_true + delta)
                   for every wrong class j, plus lmbda * (sum(W^2) + sum(b^2))
            dist = matrix of shape (N x K): subgradient of the data part of the loss
                   with respect to y_pred. It is 1 where a wrong class violates the
                   margin, minus the number of violations in the true-class column,
                   and 0 elsewhere. This is the matrix msvm_grad expects.
        '''
        y_true = np.asarray(y_true)
        rows = np.arange(y_pred.shape[0])
        yt_pred = y_pred[rows, y_true]

        #margin of every class against the true class; the true class itself is excluded
        margins = y_pred - yt_pred[:, np.newaxis] + delta
        margins[rows, y_true] = 0

        l2_reg = lmbda * (np.sum(self.W ** 2) + np.sum(self.b ** 2))
        loss = np.sum(np.maximum(margins, 0)) + l2_reg

        #only violated margins contribute to the gradient (the hinge is flat elsewhere);
        #each violation pushes the wrong class score down and the true class score up
        violated = margins > 0
        dist = violated.astype(float)
        dist[rows, y_true] = -violated.sum(axis=1)

        return loss, dist

    def msvm_grad(self, X, dist, lmbda):
        '''
        args:
            X = input data (N x D), where N = training set/batch size, D = data dimensionality
            dist = matrix (N x K) returned by loss(): subgradient of the data loss w.r.t. the scores,
                   where K = number of output categories
            lmbda = scalar (L2 regularization strength)
        algo:
            chain rule through scores = X @ W + b: matmul input data transpose against dist
            yields the W gradient (D x K); summing dist over the batch yields the b gradient (K).
            Both are summed (not averaged) over the batch to match the summed loss.
            L2 regularization gradient (2 * lmbda * param) is added to each.
        returns:
            W_grad (D x K), b_grad (K)
        '''
        W_grad = X.T @ dist + 2 * lmbda * self.W
        b_grad = dist.sum(axis=0) + 2 * lmbda * self.b
        return W_grad, b_grad

#### Training the Linear Classifier

In [58]:
#D = 3072 input features (32 * 32 * 3 flattened pixel values), K = 10 output categories
lc = LinearClassifier(D=3072, K=10)

In [59]:
#hyper params
DIST = 1.0        #margin (delta) passed to the SVM loss
LMBDA = 0.5       #L2 regularization strength
BATCH_SIZE = 64
N_ITERS = 500
lr = 0.005        #learning rate
SEED = 0          #seed for batch sampling

#seeded generator so the sequence of sampled batches is the same on every run
#(the classifier's initial weights come from np.random and are not covered by this seed)
rng = np.random.default_rng(SEED)

#run training loop
losses, W_grads, b_grads, Ws, bs = [], [], [], [], []
for _ in range(N_ITERS):
    #get batch data: BATCH_SIZE distinct row indices drawn from the training set
    batch_idx = rng.choice(Xtr_norm.shape[0], size=BATCH_SIZE, replace=False)
    X_batch = Xtr_norm[batch_idx]
    y_batch = ytr[batch_idx]

    #forward pass
    y_pred = lc.forward(X_batch)

    #backward pass
    loss, dist = lc.loss(y_pred, y_batch, DIST, LMBDA)
    W_grad, b_grad = lc.msvm_grad(X_batch, dist, LMBDA)

    #gradient descent step; deliberately builds NEW arrays instead of updating in place,
    #because the history lists below store references to lc.W / lc.b
    lc.W = lc.W - lr * W_grad
    lc.b = lc.b - lr * b_grad

    #logging changes
    losses.append(loss)
    W_grads.append(W_grad)
    b_grads.append(b_grad)
    Ws.append(lc.W)
    bs.append(lc.b)