In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [24]:
# some helper functions.....
def im2col_sliding(image, filter_height=3, filter_width=3, 
                   padding=0, stride=1):
    
    M, C, h, w, = image.shape
    x_padded = np.pad(image, ((0, 0), (0, 0), (padding, padding), (padding, padding)), 
                      mode='constant')    
    h_new = int((h - filter_height + 2*padding) / stride + 1)
    w_new = int((w - filter_width + 2*padding) / stride + 1)
    
    output_vectors = np.zeros((filter_width*filter_height*C, M*h_new*w_new), dtype=image.dtype)
    
    itr = 0
    for i in range(h_new):
        for j in range(w_new):
             for m in range(M):
                    start_i = stride * i
                    end_i = stride * i + filter_width
                    start_j = stride * j
                    end_j = stride * j + filter_height
                    output_vectors[:, itr] = x_padded[m, :, start_i:end_i, start_j:end_j].ravel()
                    itr += 1                    
    return output_vectors

def col2img_sliding(cols,  x_shape, filter_height=3, filter_width=3, 
                    padding=0, stride=1):
    N, C, H, W = x_shape
    H_padded, W_padded = H + 2 * padding, W + 2 * padding
    x_padded = np.zeros((N, C, H_padded, W_padded), dtype=cols.dtype)
    
    idx = 0
    for i in range(0, H_padded - filter_height + 1, stride):
        for j in range(0, W_padded - filter_width + 1, stride):
            for m in range(N):
                col = cols[:, idx]
                col = col.reshape((C, filter_height, filter_width))            
                x_padded[m, :, i:i+filter_height, j:j+filter_width] += col  
                idx += 1
    if padding > 0:
        return x_padded[:, :, padding:-padding, padding:-padding]
    else:
        return x_padded

# gradient checking utilities....
def eval_numerical_gradient_array(f, x, df, h=1e-5):
    """
    Evaluate a numeric gradient for a function that accepts a numpy
    array and returns a numpy array.
    """
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index

        oldval = x[ix]
        x[ix] = oldval + h
        pos = f(x).copy()
        x[ix] = oldval - h
        neg = f(x).copy()
        x[ix] = oldval
        grad[ix] = np.sum((pos - neg) * df) / (2 * h)
        it.iternext()
    return grad

def rel_error(x, y):
  """ returns relative error """
  return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))


def conv_forward(X, W, b, stride=1, padding=1):
    cache = W, b, stride, padding
    n_filters, d_filter, h_filter, w_filter = W.shape
    n_x, d_x, h_x, w_x = X.shape
    h_out = (h_x - h_filter + 2 * padding) / stride + 1
    w_out = (w_x - w_filter + 2 * padding) / stride + 1

    if not h_out.is_integer() or not w_out.is_integer():
        raise Exception('Invalid output dimension!')

    h_out, w_out = int(h_out), int(w_out)

    X_col = im2col_sliding(X, h_filter, w_filter, padding=padding, stride=stride)
    W_col = W.reshape(n_filters, -1)

    out = W_col @ X_col + b
    out = out.reshape(n_filters, h_out, w_out, n_x)
    out = out.transpose(3, 0, 1, 2)

    cache = (X, W, b, stride, padding, X_col)

    return out, cache


def conv_backward(dout, cache):
    X, W, b, stride, padding, X_col = cache
    n_filter, d_filter, h_filter, w_filter = W.shape

    db = np.sum(dout, axis=(0, 2, 3))
    db = db.reshape(n_filter, -1)

    dout_reshaped = dout.transpose(1, 2, 3, 0).reshape(n_filter, -1)
    dW = dout_reshaped @ X_col.T
    dW = dW.reshape(W.shape)

    W_reshape = W.reshape(n_filter, -1)
    dX_col = W_reshape.T @ dout_reshaped
    dX = col2im_indices(dX_col, X.shape, h_filter, w_filter, padding=padding, stride=stride)

    return dX, dW, db

In [25]:
# affine layer...

def affine_forward(x, w, b):
    """
    Computes the forward pass for an affine (fully-connected) layer.

    The input x has shape (N, d_1, ..., d_k) and contains a minibatch of N
    examples, where each example x[i] has shape (d_1, ..., d_k). We will
    reshape each input into a vector of dimension D = d_1 * ... * d_k, and
    then transform it to an output vector of dimension M.

    Inputs:
    - x: A numpy array containing input data, of shape (N, d_1, ..., d_k)
    - w: A numpy array of weights, of shape (D, M)
    - b: A numpy array of biases, of shape (M,)

    Returns a tuple of:
    - out: output, of shape (N, M)
    - cache: (x, w, b)
    """
    out = None
    ###########################################################################
    # TODO: Implement the affine forward pass. Store the result in out. You   #
    # will need to reshape the input into rows.                               #
    ###########################################################################
    out = x @ w + b
    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################
    cache = (x, w, b)
    return out, cache

def affine_backward(dout, cache):
    """
    Computes the backward pass for an affine layer.

    Inputs:
    - dout: Upstream derivative, of shape (N, M)
    - cache: Tuple of:
      - x: Input data, of shape (N, d_1, ... d_k)
      - w: Weights, of shape (D, M)

    Returns a tuple of:
    - dx: Gradient with respect to x, of shape (N, d1, ..., d_k)
    - dw: Gradient with respect to w, of shape (D, M)
    - db: Gradient with respect to b, of shape (M,)
    """
    x, w, b = cache
    dx, dw, db = None, None, None
    ###########################################################################
    # TODO: Implement the affine backward pass.                               #
    ###########################################################################
    dx = dout @ w.T
    dw = x.T @ dout 
    db = dout.sum(axis=0)
    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################
    return dx, dw, db

In [26]:
X = np.arange(10.0).reshape(2, 5)
W = np.random.randn(5, 3)
b = np.random.randn(3, )

out, cache = affine_forward(X, W, b)
dout = np.random.randn(2, 3)
dx, dw, db = affine_backward(dout, cache)

dx_num = eval_numerical_gradient_array(lambda x: affine_forward(X, W, b)[0], X, dout)
dw_num = eval_numerical_gradient_array(lambda x: affine_forward(X, W, b)[0], W, dout)
db_num = eval_numerical_gradient_array(lambda x: affine_forward(X, W, b)[0], b, dout)

rel_error(dx_num, dx) < 1e-5
rel_error(dw_num, dw) < 1e-5
rel_error(db_num, db) < 1e-5

True

In [27]:
def relu_forward(x):
    """
    Computes the forward pass for a layer of rectified linear units (ReLUs).

    Input:
    - x: Inputs, of any shape

    Returns a tuple of:
    - out: Output, of the same shape as x
    - cache: x
    """
    ###########################################################################
    # TODO: Implement the ReLU forward pass.                                  #
    ###########################################################################
    out = x.copy()  # Must use copy in numpy to avoid pass by reference.
    out = np.maximum(x, 0.0)
    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################
    cache = x
    return out, cache


def relu_backward(dout, cache):
    """
    Computes the backward pass for a layer of rectified linear units (ReLUs).

    Input:
    - dout: Upstream derivatives, of any shape
    - cache: Input x, of same shape as dout

    Returns:
    - dx: Gradient with respect to x
    """
    dx, x = None, cache
    ###########################################################################
    # TODO: Implement the ReLU backward pass.                                 #
    ###########################################################################
    #relu_mask = (x >= 0.0)
    #dx = dout * relu_mask
    dx = np.sign(np.maximum(x, 0)) * dout
    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################
    return dx

In [28]:
X = np.array([[-5.0, 1.0, 0.0, 13.0, 30.0], [6.0, -20, 0.0, 1.0, 0.0]]).reshape(2, 5)
#dout = np.random.randn(2, 5)
dout = np.array([[1.0, 1.0, 1.0, 1.0, 1.0], [1.0, 10, 1.0, 1.0, 1.0]]).reshape(2, 5)
out, cache = relu_forward(X)
d_x = relu_backward(dout, cache)

dx_num = eval_numerical_gradient_array(lambda x: relu_forward(X)[0], X, dout)
rel_error(dx_num, dx)

1.0

In [29]:
def softmax_loss(x, y):
    """
    Computes the loss and gradient for softmax classification.

    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth
      class for the ith input.
    - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
      0 <= y[i] < C

    Returns a tuple of:
    - loss: Scalar giving the loss
    - dx: Gradient of the loss with respect to x
    """
    shifted_logits = x - np.max(x, axis=1, keepdims=True)
    Z = np.sum(np.exp(shifted_logits), axis=1, keepdims=True)
    log_probs = shifted_logits - np.log(Z)
    probs = np.exp(log_probs)
    N = x.shape[0]
    loss = -np.sum(log_probs[np.arange(N), y]) / N
    dx = probs.copy()
    dx[np.arange(N), y] -= 1
    dx /= N
    return loss, dx

In [30]:
import numpy as np
import gzip
import pickle
import os


class MNIST:
    def __init__(self, batch_size):
        self.batch_size = batch_size

        train, valid, test = self._load_data()
        self.X_train, self.y_train = train[0], train[1]

        # encoding y_train using one-hot encoding
        #self.y_train_one_hot = np.zeros((self.y_train.shape[0], 10))
        #self.y_train_one_hot[np.arange(self.y_train.shape[0]), self.y_train] = 1

        self.X_valid, self.y_valid = valid[0], valid[1]
        self.X_test, self.y_test = test[0], test[1]

    def train_batch_generator(self):
        while True:
            rand_indices = np.random.choice(self.X_train.shape[0], self.batch_size, False)
            yield self.X_train[rand_indices], self.y_train[rand_indices] #self.y_train_one_hot[rand_indices]

    def validation(self):
        return self.X_valid, self.y_valid

    def testing(self):
        return self.X_test, self.y_test

    def num_features(self):
        return self.X_train.shape[1]

    def _load_data(self):
        script_dir = os.path.dirname('.')
        mnist_file = os.path.join(os.path.join(script_dir, 'data'), 'mnist.pkl.gz')

        with gzip.open(mnist_file, 'rb') as mnist_file:
            u = pickle._Unpickler(mnist_file)
            u.encoding = 'latin1'
            train, val, test = u.load()
        return train, val, test

In [31]:
mnist = MNIST(batch_size=32)
train_gen = mnist.train_batch_generator()

In [32]:
train, _ = next(train_gen)

In [126]:
class Network:
    def __init__(self, input_size=(1, 28, 28), num_filters=30, filter_size=5, output_size=10):
        self.input_size = input_size
        self.num_filters = num_filters 
        self.filter_size = filter_size
        self.output_size = output_size
        
        rand = np.random.RandomState(seed=1024)
        self.W1 = rand.normal(scale=0.01, size=(num_filters, 1, filter_size, filter_size))
        self.b1 = rand.normal(scale=0.01, size=(num_filters, 1))
        
        self.first_hid_h = int((28 - filter_size + 2*0) / 1) + 1
        self.first_hid_w = int((28 - filter_size + 2*0) / 1) + 1
        
        
        self.W2 = rand.normal(scale=0.01, size=(self.first_hid_h*self.first_hid_w*num_filters, output_size))
        print(self.W2.shape)
        self.b2 = rand.normal(scale=0.01, size=(output_size))
        
        self.mnist = MNIST(batch_size=64)        
        
    
    def train(self, num_iter=250):
        train_iter = self.mnist.train_batch_generator()
        for i in range(num_iter):
            X_train, y_train = next(train_gen)
            X_train = X_train.reshape((-1, 1, 28, 28))
            #print(X_train.shape)
            #print(y_train.shape)
            #print(self.W1.shape)
            
            # 
            out, conv_1_cache = conv_forward(X_train, self.W1, self.b1, padding=0)
            out, relu_cache = relu_forward(out)    
            #print(out.shape)
            out = out.reshape(32, -1)
            #print(out.shape)
            out, affine_cache = affine_forward(out, self.W2, self.b2)
            
            loss, dout = softmax_loss(out, y_train)
            if i % 250 == 0:
                print(loss)
            
            dout, dW2, db2 = affine_backward(dout, affine_cache)
            dout = dout.reshape(32, self.num_filters, self.first_hid_h, self.first_hid_w)
            #print(dout.shape)
            dout = relu_backward(dout, relu_cache)
            dout, dW1, db1 = conv_backward(dout, conv_1_cache)
            
            
            
            self.W1 = self.W1 - 0.001*dW1
            self.b1 = self.b1 - 0.001*db1
            
            self.W2 = self.W2 - 0.001*dW2
            self.b2 = self.b2 - 0.001*db2
        
    
    def test(self):
        X_test, y_test = self.mnist.testing()
        X_test = X_test.reshape((-1, 1, 28, 28))
        print(X_test.shape)
        out, conv_1_cache = conv_forward(X_test, self.W1, self.b1, padding=0)
        out, relu_cache = relu_forward(out)    
            #print(out.shape)
        out = out.reshape(10000, -1)
            #print(out.shape)
        out, affine_cache = affine_forward(out, self.W2, self.b2)
        
        correct = np.sum(np.equal(y_test, np.argmax(out, axis=1)))
        percentage = (correct / (y_test.shape[0])) * 100.00
        print(percentage)

In [127]:
network = Network()
network.train(num_iter=5000)
network.test()

(17280, 10)
2.3040088618
2.28791685391
2.24247573162
2.03836845755
1.5710882028
1.12664798764
0.712651984088
0.620957181107
0.505868698416
0.515338445754
0.628905424147
0.294336287199
0.329698561095
0.444646784282
0.271236667222
0.429907827419
0.369935553695
0.945401472879
0.321740119706
0.362152374742
(10000, 1, 28, 28)
89.9


In [122]:
class Network:
    def __init__(self, input_size=28*28, output_size=10):
        self.input_size = input_size
        self.output_size = output_size
        
        rand = np.random.RandomState(seed=1024)
        self.W1 = rand.normal(scale=0.01, size=(input_size, output_size))
        self.b1 = rand.normal(scale=0.01, size=(output_size))
              
        self.mnist = MNIST(batch_size=64)        
        
    
    def train(self, num_iter=250):
        train_iter = self.mnist.train_batch_generator()
        for i in range(num_iter):
            X_train, y_train = next(train_gen) 
            out, affine_cache = affine_forward(X_train, self.W1, self.b1)
            
            loss, dout = softmax_loss(out, y_train)
            if i % 250 == 0:
                print(loss)
            
            dout, dW1, db1 = affine_backward(dout, affine_cache)            
            
            
            self.W1 = self.W1 - 0.001*dW1
            self.b1 = self.b1 - 0.001*db1
            
            #self.W2 = self.W2 - 0.001*dW2
            #self.b2 = self.b2 - 0.001*db2
        
    
    def test(self):
        X_test, y_test = self.mnist.testing()
        out, affine_cache = affine_forward(X_test, self.W1, self.b1)
        
        correct = np.sum(np.equal(y_test, np.argmax(out, axis=1)))
        percentage = (correct / (y_test.shape[0])) * 100.00
        print(percentage)

In [123]:
network = Network()
network.train(num_iter=5000)
network.test()

2.31531253936
2.02714240171
1.94361258532
1.7470106729
1.64754674547
1.48137462908
1.29335680744
1.21296555738
1.13913115393
1.20271920262
0.949475155889
0.985219385182
1.13710925994
1.10417337316
0.891957873001
1.00166146398
0.931458211235
0.732332387641
0.869039140127
0.757504745317
84.96
