In [1]:
import numpy as np
import matplotlib.pyplot as plt

from __future__ import division

np.random.seed(1)
%matplotlib inline

## 1. Function define

### Util function

In [2]:
def one_hot_encode(x, n_class):
    """
    One Hot encoding
    
    Inputs:
    - x: N smaple vector
    - n_class: Number of class
    
    Returns:
    - en_1hot: Encoding matrix shape of (n_smaple, n_class)
    """
    
    en_1hot = np.zeros([len(x), n_class])
    
    for idx, cat in enumerate(x):
        en_1hot[idx, cat] = 1

    return en_1hot

def next_batch(X, y, batch_size, shuffle=True):
    """
    Get next batch data
    
    Inputs: 
    - X: input data
    - y: input data label
    - batch_size: s
    
    Outputs tuple of batch data 
    - X_batch: batch sampled X 
    - y_batch: batch sampled y
    """
    
    n_sample = X.shape[0]
    n_batch = n_sample // batch_size
    n_batch = n_batch + 1 if (n_sample % n_batch) != 0 else n_batch 
    idx = np.array(range(n_sample))
    
    if shuffle:
        np.random.shuffle(idx)
        
    for b_idx in range(n_batch):
        start, end = b_idx * batch_size, (b_idx + 1) * batch_size
        if end >= n_sample:
            sample_idx = idx[start:]
        else:
            sample_idx = idx[start:end] 
        
        X_batch, y_batch = X[sample_idx, :], y[sample_idx, :]
        yield X_batch, y_batch


def softmax(x):
    
    x_reshape = x.reshape(x.shape[0], -1)
    # trick to avoid numerical unstable
    probs = np.exp(x_reshape - np.max(x_reshape, axis=-1, keepdims=True))
    probs /= np.sum(probs, axis=-1, keepdims=True)
    
    return probs
    

def xavier_initializer(n_inputs, n_outputs, shape):
    init_range = np.sqrt(6.0 / (n_inputs + n_outputs))
    return  np.random.uniform(-init_range, init_range, shape)


def im2col_idx(x_shape, KH, KW, pad, stride):
    """
    Get im2col index based on kerenl size
    
    Inputs:
    - x_shape: Shape of input in (NH, NW, ND, NB) order
    - KH: Kernel height
    - KW: Kernel width
    - pad: Padding step
    - stride: Stride step
    
    Returns a tuple of idx
    - r_idx: Kernel slide row idx
    - c_idx: Kernel slide col idx
    - d_idx: Kernel slide depth idx
    """
    
    #(img_h, img_w, img_c, n_batch)
    NH, NW, ND, NB = x_shape
    OH = (NH + 2 * pad - KH) // stride + 1
    OW = (NW + 2 * pad - KW) // stride + 1
    
    # each block row had KH cols 
    # compute row idx 
    r_idx = np.repeat(np.arange(KH), KW)
    r_idx = np.tile(r_idx, ND)
    # compute row block shfit
    r_shift = stride * np.repeat(np.arange(OH), OW)
    r_idx = r_idx.reshape(-1, 1) + r_shift.reshape(1, -1)
    
    # compute col idx
    c_idx = np.tile(np.arange(KW), KH * ND)
    # compute col block shift
    c_shift = stride * np.tile(np.arange(OW), OH)
    c_idx = c_idx.reshape(-1, 1) + c_shift.reshape(1, -1)
    
    d_idx = np.repeat(np.arange(ND), KH * KW).reshape(-1, 1)
    
    return (r_idx, c_idx, d_idx)
    

def im2col(x, KH, KW, pad, stride):
    """
    Convert input x along kernel size
    
    Extract x into kernel patch cols 
    
    Inputs:
    - x_shape: Shape of input in (NH, NW, ND, NB) order
    - KH: Kernel height
    - KW: Kernel width
    - pad: Pad step
    - stride: Stride step
    
    Returns:
    - cols: Each cols contain KH * KW * ND elements
    """
    
    NH, NW, ND, NB = x.shape
    x_pad = np.pad(x, 
                   # only padding on height and weight
                   ((pad, pad),    # height
                    (pad, pad),    # width 
                    (0, 0),        # channel
                    (0, 0)),       # batch
                   'constant', constant_values=0)
    
    # get patch index
    r, c, d = im2col_idx(x.shape, KH, KW, pad, stride)
    # extract correspond patch to cols
    cols = x_pad[r, c, d, :]
    cols = cols.reshape(KH * KW * ND, -1)
    
    return cols


def col2im(cols, x_shape, KH, KW, pad, stride):
    """Put col kerenel patch back to x
    
    Inputs:
    - x_shape: Shape of input in (NH, NW, ND, NB) order
    - KH: Kernel height
    - KW: Kernel width
    - pad: Pad step
    - stride: Stride step
    
    Returns: 
    - x_pad: a pad x matrix accroding cols patch
    """
    
    NH, NW, ND, NB = x_shape
    OH, OW = NH + 2 * pad, NW + 2 * pad
    
    x_pad = np.zeros((OH, OW, ND, NB), dtype=cols.dtype)
    r, c, d = im2col_idx(x_shape, KH, KW, pad, stride)

    # add correspond patch back to location
    cols_reshape = cols.reshape(KH * KW * ND, -1, NB)    
    np.add.at(x_pad, (r, c, d, slice(None)), cols_reshape)
    if pad == 0:
        return x_pad
    return x_pad[pad:-pad, pad:-pad, :, :]


def get_output_size(input_size, ksize, n_filters, stride, pad):
    """
    Get output size of layer
    
    Inputs:
    - input_size: shape of input data
    - ksize: kernel size
    - n_filters: number of output channel
    - stride: Stride steps
    - pad: Pad steps
    
    Returns:
    - output_size: output size
    """
    
    NH, NW, ND = input_size
    KH, KW = ksize
    OH = (NH + 2 * pad - KH) // stride + 1
    OW = (NW + 2 * pad - KW) // stride + 1

    output_size = (OH, OW, n_filters)
    return output_size

### Foward and backward operations

In [3]:
def fc_foward(x, w, b):
    """
    Fully connected foward
    
    Inputs:
    - x: input data of shape (N, d_1, ... d_k)
    - w: weights of shape (D, M)
    - b: bias of shape (M, )
    
    Returns a tuple of:
    - out: output of shape (N, M)
    - cache: (x, w, b)
    """
    
    NB = x.shape[0]
    
    x_reshape = x.reshape(NB, -1)
    out = np.dot(x_reshape, w) + b

    cache = (x, w, b)
    
    return out, cache

def fc_backward(dout, cache):
    """
    Fully connected backward
    
    Inputs:
    - dout: Gradient of output, of shape (N, M)
    - cache: Tuple of:
      - x: Input data, of shape (N, d_1, ... d_k)
      - w: Weights, of shape (D, M)

    Returns a tuple of:
    - dx: Gradient of x, of shape (N, d1, ..., d_k)
    - dw: Gradient of w, of shape (D, M)
    - db: Gradient of b, of shape (M,)
    """
    
    x, w, b = cache
    NB = x.shape[0]
    
    x_reshape = x.reshape(NB, -1)
    
    dx = np.dot(dout, w.T)
    dx = dx.reshape(x.shape)
    dw = np.dot(x_reshape.T, dout)
    db = np.sum(dout, axis=0)
    
    return dx, dw, db

def conv_foward_navie(x, w, b, params):
    pass

def conv_backword_navie(dout, cache):
    pass
    
def maxpool_foward_navie(x, param):
    pass

def maxpool_backward_navie(dout, cache):
    pass

def conv_foward(x, w, b, params):
    """
    Fast conv layer foward implementation using im2col
    
    Inputs:
    - x: input data
    - w: weights
    - b: bias
    - params: conv config params
    
    Returns a tuple of:
    - out: conv output
    - cache: (x, w, b, params, x_cols)
    """
    
    NB, NH, NW, ND = x.shape
    KH, KW, ND, KF = w.shape
    stride = params['stride']
    pad = params['pad']
    
    OH = (NH + 2 * pad - KH) // stride + 1
    OW = (NW + 2 * pad - KW) // stride + 1
    
    # switch n_batch to last dim
    x = x.transpose(1, 2, 3, 0)
    x_cols = im2col(x, KH, KW, pad, stride)
    w_cols = w.transpose(3, 0, 1, 2).reshape(KF, -1)

    out = w_cols.dot(x_cols) + b.reshape(-1, 1)
    out = out.reshape(KF, OH, OW, NB)
    # swith n_batch to first dim
    x = x.transpose(3, 0, 1, 2)
    out = out.transpose(3, 1, 2, 0)
    
    cache = (x, w, b, params, x_cols)
    
    return out, cache
    
def conv_backward(dout, cache):
    """
    Fast conv layer backward implementation using im2col
    
    Inputs:
    - dout: Gradient of output
    - cache: Tuple of:
      - x: Input data
      - w: Weights
      - b: Bias
      - params: Conv config params
      - x_cols: Input data along extract along kernel size path matrix
    
    Returns a tuple of:
    - dx: Gradient of x
    - dw: Gradient of w
    - db: Gradient of b
    """

    x, w, b, params, x_cols = cache
    NB, NH, NW, ND = x.shape
    KH, KW, ND, KF = w.shape
    stride = params['stride']
    pad = params['pad']
    
    db = np.sum(dout, axis=(0, 1, 2))
    w_cols = w.transpose(3, 0, 1, 2).reshape(KF, -1)
    # switch [NB, NH, NW, NC] -> [NC, NH, NW, NB]
    dout = dout.transpose(3, 1, 2, 0)
    dout = dout.reshape(KF, -1)
    w_cols = w.transpose(3, 0, 1, 2).reshape(KF, -1)
    dx_cols = w_cols.T.dot(dout)
    dx = col2im(dx_cols, (NH, NW, ND, NB), KH, KW, pad, stride)
    
    dw_cols = dout.dot(x_cols.T)
    dw = dw_cols.reshape(KF, KH, KW, ND)
    dw = dw.transpose(1, 2, 3, 0)
    
    return dx, dw, db


def maxpool_foward(x, params):
    """
    Fast max pool layer foward implementation using im2col
    
    Inputs:
    - x: Input data
    - params: Maxpool config params
    
    Returns a tuple of:
    - out: Maxpool output
    - cache: (x, params, x_cols, x_cols_argmax)
    """
    
    NB, NH, NW, ND = x.shape
    stride = params['stride']
    pad = params['pad']
    KH, KW = params['ksize']
    
    OH = (NH + 2 * pad - KH) // stride + 1
    OW = (NW + 2 * pad - KW) // stride + 1
    
    # take batch sample as channels
    x_pool = x.transpose(1, 2, 3, 0)
    x_pool = x_pool.reshape(NH, NW, 1, -1)
    x_cols = im2col(x_pool, KH, KW, pad, stride)
    # perform max pool
    # get max value index
    x_cols_argmax = np.argmax(x_cols, axis=0)

    # x_col: [block_size, n_blocks]
    # each block get its max value
    x_cols_max = x_cols[x_cols_argmax, np.arange(x_cols.shape[1])]
    out = x_cols_max.reshape(OH, OW, ND, NB)
    out = out.transpose(3, 0, 1, 2)
    
    cache = (x, params, x_cols, x_cols_argmax)
    
    return out, cache

def maxpool_backward(dout, cache):
    """
    Fast conv layer backward implementation using im2col
    
    Inputs:
    - dout: Gradient of output
    - cache: Tuple of:
      - x: Input data
      - params: Conv config params
      - x_cols: Input data along extract along kernel size path matrix
      - x_cols_argmax: Max pool kernel path max idx
      
    Returns:
    - dx: Gradient of x
    """
    
    x, params, x_cols, x_cols_argmax = cache
    NB, NH, NW, ND = x.shape
    stride = params['stride']
    pad = params['pad']
    KH, KW = params['ksize']
    
    dout_reshape = dout.transpose(1, 2, 3, 0).ravel()
    dx_cols = np.zeros_like(x_cols)
    dx_cols[x_cols_argmax, np.arange(dx_cols.shape[1])] = dout_reshape
    dx = col2im(dx_cols, (NH, NW, 1, ND * NB), KH, KW, pad, stride)
    
    dx = dx.reshape(NH, NW, ND, NB)
    dx = dx.transpose(3, 0, 1, 2)
    
    return dx


def sigmoid_foward(x):
    
    cache = x
    
    out = 1. / (1. + np.exp(-x))
    # trick to avoid numerical unstable
    #out = np.exp(x) / (np.exp(x) + np.exp(0))
    
    return out, cache


def sigmoid_backward(dout, cache):
    
    cache = x
    dx = dout * (1. - dout)
    
    return dx


def relu_forward(x):
    
    cache = x
    out = np.maximum(x, 0)
    
    return out, cache


def relu_backward(dout, cache):
    
    x = cache
    dx = np.where(x > 0, dout, 0)
    
    return dx


def logistic_loss(x, y):
    """
    Compute logistic loss and gradient
    
    Inputs:
    - x: Conv network raw reslut after sigmoid prob
    - y: True label one hot encoding matrix
    
    Outpus:
    - loss: Logistic loss
    - dx: Gradient of x
    
    """
    
    loss = y * np.log(x) + (1 - y) * np.log(1 - x)
    loss = -np.mean(np.sum(loss, axis=-1, keepdims=True))
    
    dx = (x - y) / x.shape[0]
    
    return loss, dx

### Model foward and backward

In [4]:
def layer_foward(x, layer, layer_type):
    """
    Layer foward
    
    Wrap different type of foward operation
    
    Inputs:
    - x: input data
    - layer: Layer data structure in dictionary
    - layer_type: Layer type
    
    Outputs:
    - out: Layer fowrard output 
    - cache: Layer forward cache
    """
      
    out, cache = None, None
    if layer_type == 'conv':
        
        params = layer['conv_params']
        w, b = layer['w'], layer['b']
        out, cache = conv_foward(x, w, b, params)
    
    elif layer_type == 'pool':
    
        params = layer['pool_params']
        out, cache = maxpool_foward(x, params)
        
    elif layer_type == 'fc':
        
        w, b = layer['w'], layer['b']
        out, cache = fc_foward(x, w, b)
    
    elif layer_type == 'relu':
        
        out, cache = relu_forward(x)
        
    elif layer_type == 'sigmoid':
        
        out, cache = sigmoid_foward(x)
        
    return out, cache

def layer_backward(dout, cache, layer, layer_type):
    """
    Layer backward
    
    Wrap different type backward operations
    
    Inputs:
    - dout: Gradient of output
    - cache: Foward reslut cache
    - layer: Layer data structure in dictionary
    - layer_type: Layer type
    
    """
      
    dx = None
    if layer_type == 'conv':
        
        dx, dw, db = conv_backward(dout, cache)
        #layer['grad'] = {'dw': dw, 'db': db}
        grad = layer['grad']
        grad['dw'][:] = dw
        grad['db'][:] = db
        
        
    elif layer_type == 'pool':

        dx = maxpool_backward(dout, cache)
        
    elif layer_type == 'fc':
        
        dx, dw, db = fc_backward(dout, cache)
        #layer['grad'] = {'dw': dw, 'db': db}
        grad = layer['grad']
        grad['dw'][:] = dw
        grad['db'][:] = db
    
    elif layer_type == 'relu':
        
        dx = relu_backward(dout, cache)
        
    elif layer_type == 'sigmoid':
        
        dx = sigmoid_backward(dout, cache)
        
    return dx

def model_predict(model, x):
    
    out, _ = model_foward(model, x)
    prob = softmax(out)
    pred = np.argmax(prob, axis=1)
    
    return pred
    

def model_foward(model, x):
    """
    Model forward
    
    Inputs:
    - model: CNN Model
    - x : input data
    
    Outputs tuple of:
    - out: Model foward output
    - model_cache: Model intermedia result cache
    """
    
    model_cache = []
    
    for layer in model:
        layer_type = layer['name']
        sub_layers = layer_type.split('_')
        
        layer_cache = []
        for sub_layer in sub_layers:
            out, cache = layer_foward(x, layer, sub_layer)
            x = out
            layer_cache.append(cache)
        model_cache.append(layer_cache)
    
    return out, model_cache

def model_backward(model, dout, model_cache):
    """
    Model backward
    
    Inputs:
    - model: CNN Model
    - dout: Gradient of output
    
    """

    for layer, layer_cache in zip(reversed(model), reversed(model_cache)):
        layer_type = layer['name']
        sub_layers = layer_type.split('_')
                                  
        for sub_layer, cache in zip(reversed(sub_layers), reversed(layer_cache)):
            dx = layer_backward(dout, cache, layer, sub_layer)
            dout = dx
            
    return dx

def model_update(model, learning_rate):
    
    for layer in model:
        grad = layer['grad']
        if grad is not None:
            # TODO: l2 reg
            w, b = layer['w'], layer['b']
            
            w[:] = w - learning_rate * (grad['dw'])
            b[:] = b - learning_rate * (grad['db'])
            

def sgd(model, X, y, epochs=100, learning_rate=1e-2, batch_size=32, verbose=False):
    """
    Stochastic gradient descent
    
    Inputs:
    - model:
    - X:
    - y:
    - epochs:
    - learning_rate:
    - batch_size:
    """
    
    history_loss = []
    for e in range(epochs):
    
        batch_loss = []
        for X_batch, y_batch in next_batch(X, y, batch_size, shuffle=True):
            
            out, cache = model_foward(model, X_batch)

            # Model does not have activation in last layer
            # To use logistic loss we need to do sigmoid to covnert to prob
            # And can compute logistic loss y_hat - y
            prob, _ = sigmoid_foward(out)
            loss, dx = logistic_loss(prob, y_batch) 
            
            #print('loss: {:4.2f}'.format(loss))
            #print('true:', np.argmax(y_batch[:8], axis=1))
            #print('pred:', np.argmax(prob[:8], axis=1))
            
            # TODO: compute l2 reg here
            
            # TODO: Do we need return grad to support l2 reg ? 
            model_backward(model, dx, cache)
        
            model_update(model, learning_rate)
            
            batch_loss.append(loss)
            
        # compute avg loss
        avg_loss = np.mean(batch_loss)
        history_loss.append(avg_loss)
        
        if verbose:
            print('[{:3d}|{:3d}] loss: {:4.4f}'.format(e + 1, epochs, avg_loss))
        
    return model, history_loss

## 2. Load data

### Data normalize

In [5]:
def std_norm(X, mu=None, sigma=None):
    """
    Standard normalize input data X with mu and sigma.
    If mu and sigma not given compute from input data X
    
    Inputs:
    - X: Input data of shape (N, H, W, C)
    
    Returns a tuple of:
    - X_sc: Standard normalize of X 
    - mu: Mean of x of shape
    - sigma: standard deviation of x of shape
    
    """
    
    mu = np.mean(X, axis=0, keepdims=True)
    X_sc = X - mu
    sigma = np.std(X, axis=0, keepdims=True)
    X_sc = X_sc / sigma
    
    return X_sc, mu, sigma

In [6]:
X = np.load('ex5_train_x.npy')
y = np.load('ex5_train_y.npy')

In [7]:
y_en = one_hot_encode(y, 6)
X_sc, mu, sigma = std_norm(X)

## 3.  Initialize parameters (Weights, bias for each layer)

In [8]:
def layer_init(layer, input_size, layer_type):
    """
    Initialize layer weights and its output size
    
    Inputs:
    - layer: Layer data structure in dictionary
    - input_size: Input size of layer
    - layer type: Layer type
    
    Returns:
    - output_size: Layer output shapes
    """
    
    output_size = None
    # only conv and fc layer need weights initialize
    if layer_type == 'conv':  
        
        NH, NW, ND = input_size
        params = layer['conv_params']
        stride, pad = params['stride'], params['pad']
        KH, KW = params['ksize']
        KF = params['n_filters']
        
        output_size = \
            get_output_size(input_size, (KH, KW), KF, stride, pad)
        
        # weights initialize
        n_inputs = KH * KW * ND
        n_outputs = KH * KW * KF
        
        w = xavier_initializer(n_inputs, n_outputs, (KH, KW, ND, KF))
        b = np.zeros((1, 1, 1, KF))
        layer['w'], layer['b'] = w, b
        
        dw, db = np.zeros_like(w), np.zeros_like(b)
        layer['grad'] = {'dw': dw, 'db': db}

    elif layer_type == 'pool':
        
        NH, NW, ND = input_size
        params = layer['pool_params']
        stride, pad = params['stride'], params['pad']
        KH, KW = params['ksize']

        output_size = \
            get_output_size(input_size, (KH, KW), ND, stride, pad)
                
    elif layer_type == 'fc':
        
        params = layer['params']
        n_inputs = np.prod(input_size)
        n_outputs = params['n_outputs']
        
        output_size = n_outputs
        
        # weights initialize
        w = xavier_initializer(n_inputs, n_outputs, (n_inputs, n_outputs))
        b = np.zeros(n_outputs)
        layer['w'], layer['b'] = w, b
        
        dw, db = np.zeros_like(w), np.zeros_like(b)
        layer['grad'] = {'dw': dw, 'db': db}

    # other layer
    else:
        output_size = input_size
            
    return output_size
            
def model_init(model, input_size):
    """
    Initialize the cnn model
    
    Inputs:
    - input_size: Input data size
    
    """
    for layer in model:
        layer_type = layer['name']
        for sub_layer in layer_type.split('_'):
            input_size = layer_init(layer, input_size, sub_layer)
        layer['output_size'] = input_size
        
    return model

In [9]:
model = [
{
    # Using '_' to define sandwish layer
    'name': 'conv_relu_pool',
    'conv_params': 
        {'ksize': (3, 3), 'stride': 1, 'pad': 1, 'n_filters': 8},
    'pool_params':
        {'ksize': (2, 2), 'stride': 2, 'pad': 0},
    'grad': None,
},
{   
    'name': 'conv_relu_pool',
    'conv_params': 
        {'ksize': (3, 3), 'stride': 1, 'pad': 1, 'n_filters': 16},
    'pool_params': 
        {'ksize': (2, 2), 'stride': 2, 'pad': 0},
    'grad': None,
},
# {   
#     'name': 'conv_relu_pool',
#     'conv_params': 
#         {'ksize': (3, 3), 'stride': 1, 'pad': 1, 'n_filters': 32},
#     'pool_params': 
#         {'ksize': (2, 2), 'stride': 2, 'pad': 0},
#     'grad': None,
# },
# {   
#     'name': 'conv_relu_pool',
#     'conv_params': 
#         {'ksize': (3, 3), 'stride': 1, 'pad': 1, 'n_filters': 64},
#     'pool_params': 
#         {'ksize': (2, 2), 'stride': 2, 'pad': 0},
#     'grad': None,
# },
# {
#     'name': 'conv_relu',
#     'conv_params': 
#         {'ksize': (1, 1), 'stride': 1, 'pad': 0, 'n_filters': 8},
#     'grad': None,
# },
{
    'name': 'fc_relu',
    'params': {'n_outputs': 128},
    'grad': None
},
{    
    # Don't add activation in last layer just output raw neuron output
    # finally decide used softmax or logisitic
    'name': 'fc',
    'params': {'n_outputs': 6},
    'grad': None
},
]

In [10]:
input_size = (64, 64, 3)
model = model_init(model, input_size)

## 4.  Optimization of Convolution Neural Network model

In [11]:
def model_summary(model, input_size):
    
    msg_format = \
        '{type:6} | {sz:12s} | {ksize:12s} | {stride:6s} | {pad:6s}'
            
    msg_header = {
            'type': 'Type', 'sz': 'Output Size', 'ksize': 'Kernel Size',
            'stride': 'Stride', 'pad': 'Padding'
    }
    
    def layer_summary(input_size, layer, layer_type):
        
        layer_info = {
            'type': '', 
            'sz': '', 'chs': '', 'ksize': '',
            'stride': '', 'pad': ''
        }
        
        output_size = None
        
        if layer_type == 'conv':
            
            NH, NW, ND = input_size
            params = layer['conv_params']
            stride, pad = params['stride'], params['pad']
            KH, KW = params['ksize']
            KF = params['n_filters']
            channel = KF
            
            output_size = \
                get_output_size(input_size, [KH, KW], KF, stride, pad)
                
            layer_info['sz'] = str(output_size)
            layer_info['ksize'] = str(params['ksize'])
            layer_info['stride'] = str(stride)
            layer_info['pad'] = str(pad)
            
        elif layer_type == 'pool':
            
            NH, NW, ND = input_size
            params = layer['pool_params']
            stride, pad = params['stride'], params['pad']
            KH, KW = params['ksize']
            channel = ND
            
            output_size = \
                get_output_size(input_size, [KH, KW], ND, stride, pad)
            
            layer_info['sz'] = str(output_size)
            layer_info['ksize'] = str((KH, KW))
            layer_info['stride'] = str(stride)
            layer_info['pad'] = str(pad)
            
        elif layer_type == 'fc':
            
            params = layer['params']
            output_size = params['n_outputs']
            
            layer_info['sz'] = str(output_size)
            
        else:
            
            output_size = input_size
            layer_info['sz'] = str(output_size)
        
        layer_info['type'] = layer_type
        print(msg_format.format(**layer_info))
        
        return output_size
    
    print(msg_format.format(**msg_header))
    print('-'*len(msg_format))
    for layer in model:
        layer_type = layer['name']
        sub_layers = layer_type.split('_')
        
        for sub_layer in sub_layers:
            output_size = layer_summary(input_size, layer, sub_layer)
            input_size = output_size

In [12]:
input_size = (64, 64, 3)
model_summary(model, input_size)

Type   | Output Size  | Kernel Size  | Stride | Padding
----------------------------------------------------------
conv   | (64, 64, 8)  | (3, 3)       | 1      | 1     
relu   | (64, 64, 8)  |              |        |       
pool   | (32, 32, 8)  | (2, 2)       | 2      | 0     
conv   | (32, 32, 16) | (3, 3)       | 1      | 1     
relu   | (32, 32, 16) |              |        |       
pool   | (16, 16, 16) | (2, 2)       | 2      | 0     
fc     | 128          |              |        |       
relu   | 128          |              |        |       
fc     | 6            |              |        |       


In [13]:
# warm start 
# model, loss_history = sgd(model, X_sc[:64, :], y_en[:64, :], 
#                           epochs=10, learning_rate=1e-2, batch_size=8, verbose=True)

In [14]:
model, loss_history = sgd(model, X_sc, y_en, 
                          epochs=20, learning_rate=1e-1, batch_size=64, verbose=True)

[  1| 20] loss: 3.3474
[  2| 20] loss: 2.0274
[  3| 20] loss: 1.8821
[  4| 20] loss: 1.4822
[  5| 20] loss: 1.5801
[  6| 20] loss: 1.0147
[  7| 20] loss: 0.9734
[  8| 20] loss: 0.8089
[  9| 20] loss: 0.5287
[ 10| 20] loss: 0.5753
[ 11| 20] loss: 0.4980
[ 12| 20] loss: 0.1836
[ 13| 20] loss: 0.1858
[ 14| 20] loss: 0.2215
[ 15| 20] loss: 0.1620
[ 16| 20] loss: 0.0764
[ 17| 20] loss: 0.0541
[ 18| 20] loss: 0.0279
[ 19| 20] loss: 0.0163
[ 20| 20] loss: 0.0112


In [15]:
pred = model_predict(model, X_sc)

In [16]:
acc = np.mean(pred == y)

In [17]:
print('accuracy: ', acc * 100)

('accuracy: ', 100.0)


In [18]:
pred[:100]

array([4, 4, 4, 4, 0, 5, 0, 2, 0, 3, 5, 5, 2, 0, 4, 3, 1, 4, 4, 4, 1, 4,
       0, 4, 4, 1, 0, 1, 2, 0, 1, 5, 2, 1, 1, 2, 3, 1, 4, 1, 4, 1, 1, 5,
       5, 2, 2, 3, 0, 0, 1, 2, 1, 4, 4, 4, 0, 3, 4, 4, 0, 0, 5, 0, 4, 5,
       5, 1, 5, 4, 1, 1, 0, 3, 2, 2, 2, 2, 0, 5, 2, 5, 1, 4, 2, 5, 4, 5,
       1, 2, 3, 2, 4, 3, 3, 4, 4, 4, 1, 0])

In [19]:
y[:100]

array([4, 4, 4, 4, 0, 5, 0, 2, 0, 3, 5, 5, 2, 0, 4, 3, 1, 4, 4, 4, 1, 4,
       0, 4, 4, 1, 0, 1, 2, 0, 1, 5, 2, 1, 1, 2, 3, 1, 4, 1, 4, 1, 1, 5,
       5, 2, 2, 3, 0, 0, 1, 2, 1, 4, 4, 4, 0, 3, 4, 4, 0, 0, 5, 0, 4, 5,
       5, 1, 5, 4, 1, 1, 0, 3, 2, 2, 2, 2, 0, 5, 2, 5, 1, 4, 2, 5, 4, 5,
       1, 2, 3, 2, 4, 3, 3, 4, 4, 4, 1, 0])