In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
#
# These are taken from https://github.com/mila-udem/blocks
# 

class Constant():
    """Initialize parameters to a constant.
    The constant may be a scalar or a :class:`~numpy.ndarray` of any shape
    that is broadcastable with the requested parameter arrays.
    Parameters
    ----------
    constant : :class:`~numpy.ndarray`
        The initialization value to use. Must be a scalar or an ndarray (or
        compatible object, such as a nested list) that has a shape that is
        broadcastable with any shape requested by `initialize`.
    """
    def __init__(self, constant):
        self._constant = numpy.asarray(constant)

    def generate(self, rng, shape):
        dest = numpy.empty(shape, dtype=np.float32)
        dest[...] = self._constant
        return dest


class IsotropicGaussian():
    """Initialize parameters from an isotropic Gaussian distribution.
    Parameters
    ----------
    std : float, optional
        The standard deviation of the Gaussian distribution. Defaults to 1.
    mean : float, optional
        The mean of the Gaussian distribution. Defaults to 0
    Notes
    -----
    Be careful: the standard deviation goes first and the mean goes
    second!
    """
    def __init__(self, std=1, mean=0):
        self._mean = mean
        self._std = std

    def generate(self, rng, shape):
        m = rng.normal(self._mean, self._std, size=shape)
        #print(np.sum(m > 0.))
        #print(np.sum(m == 0.))
        #print(np.sum(m < 0.))
        return m.astype(np.float32)


class Uniform():
    """Initialize parameters from a uniform distribution.
    Parameters
    ----------
    mean : float, optional
        The mean of the uniform distribution (i.e. the center of mass for
        the density function); Defaults to 0.
    width : float, optional
        One way of specifying the range of the uniform distribution. The
        support will be [mean - width/2, mean + width/2]. **Exactly one**
        of `width` or `std` must be specified.
    std : float, optional
        An alternative method of specifying the range of the uniform
        distribution. Chooses the width of the uniform such that random
        variates will have a desired standard deviation. **Exactly one** of
        `width` or `std` must be specified.
    """
    def __init__(self, mean=0., width=None, std=None):
        if (width is not None) == (std is not None):
            raise ValueError("must specify width or std, "
                             "but not both")
        if std is not None:
            # Variance of a uniform is 1/12 * width^2
            self._width = numpy.sqrt(12) * std
        else:
            self._width = width
        self._mean = mean

    def generate(self, rng, shape):
        w = self._width / 2
        m = rng.uniform(self._mean - w, self._mean + w, size=shape)
        return m.astype(np.float32)

In [129]:
def get_im2col_indices(x_shape, field_height, field_width, padding=1, stride=1):
    # First figure out what the size of the output should be
    N, C, H, W = x_shape
    assert (H + 2 * padding - field_height) % stride == 0
    assert (W + 2 * padding - field_height) % stride == 0
    out_height = int((H + 2 * padding - field_height) / stride + 1)
    out_width = int((W + 2 * padding - field_width) / stride + 1)

    i0 = np.repeat(np.arange(field_height), field_width)
    i0 = np.tile(i0, C)
    i1 = stride * np.repeat(np.arange(out_height), out_width)
    j0 = np.tile(np.arange(field_width), field_height * C)
    j1 = stride * np.tile(np.arange(out_width), out_height)
    i = i0.reshape(-1, 1) + i1.reshape(1, -1)
    j = j0.reshape(-1, 1) + j1.reshape(1, -1)

    k = np.repeat(np.arange(C), field_height * field_width).reshape(-1, 1)

    return (k.astype(int), i.astype(int), j.astype(int))


def im2col_indices(x, field_height, field_width, padding=1, stride=1):
    """ An implementation of im2col based on some fancy indexing """
    # Zero-pad the input
    p = padding
    x_padded = np.pad(x, ((0, 0), (0, 0), (p, p), (p, p)), mode='constant')

    k, i, j = get_im2col_indices(x.shape, field_height, field_width, padding, stride)

    cols = x_padded[:, k, i, j]
    C = x.shape[1]
    cols = cols.transpose(1, 2, 0).reshape(field_height * field_width * C, -1)
    return cols

def col2im_indices(cols, x_shape, field_height=3, field_width=3, padding=1,
                   stride=1):
    """ An implementation of col2im based on fancy indexing and np.add.at """
    N, C, H, W = x_shape
    H_padded, W_padded = H + 2 * padding, W + 2 * padding
    x_padded = np.zeros((N, C, H_padded, W_padded), dtype=cols.dtype)
    k, i, j = get_im2col_indices(x_shape, field_height, field_width, padding, stride)
    cols_reshaped = cols.reshape(C * field_height * field_width, -1, N)
    cols_reshaped = cols_reshaped.transpose(2, 0, 1)
    np.add.at(x_padded, (slice(None), k, i, j), cols_reshaped)
    if padding == 0:
        return x_padded
    return x_padded[:, :, padding:-padding, padding:-padding]

class Layer(object):
    def __init__(self, rng=None):
        if rng is None:
            rng = numpy.random
        self.rng = rng
    
    @property
    def parameters(self):
        return []
    
    @property
    def parameter_names(self):
        return []
    
    def get_gradients(self, dLdY, fprop_context):
        return [] 

class AffineLayer(Layer):
    def __init__(self, num_in, num_out, weight_init=None, bias_init=None, **kwargs):
        super(AffineLayer, self).__init__(**kwargs)
        if weight_init is None:
            weight_init = IsotropicGaussian(std=0.2, mean=0.0)
        if bias_init is None:
            bias_init = Constant(0.0)

        self.W = weight_init.generate(self.rng, (num_out, num_in))
        self.b = bias_init.generate(self.rng, (num_out, 1))
    
    @property
    def parameters(self):
        return [self.W, self.b]
    
    @property
    def parameter_names(self):
        return ['W','b']
    
    def fprop(self, X):
        fprop_context = dict(X=X)
        Y = np.dot(self.W, X) + self.b
        return Y, fprop_context
    
    def bprop(self, dLdY, fprop_context):
        return self.W.T.dot(dLdY)
    
    def get_gradients(self, dLdY, fprop_context):
        X = fprop_context['X']
        dLdW = np.dot(dLdY, X.T)
        dLdb = dLdY.sum(1, keepdims=True)
        return [dLdW, dLdb]


class TanhLayer(Layer):
    def __init__(self, **kwargs):
        super(TanhLayer, self).__init__(**kwargs)
    
    def fprop(self, X):
        Y = np.tanh(X)
        fprop_context = dict(Y=Y)
        return Y, fprop_context
    
    def bprop(self, dLdY, fprop_context):
        Y = fprop_context['Y']
        return dLdY * (1.0 - Y**2)

    
class ReLULayer(Layer):
    def __init__(self, **kwargs):
        super(ReLULayer, self).__init__(**kwargs)
    
    def fprop(self, X):
        Y = np.maximum(X, 0.0)
        fprop_context = dict(Y=Y)
        return Y, fprop_context
    
    def bprop(self, dLdY, fprop_context):
        Y = fprop_context['Y']
        dLdX = dLdY.reshape(Y.shape) * (Y > 0)
        return dLdX

    
class SoftMaxLayer(Layer):
    def __init__(self, **kwargs):
        super(SoftMaxLayer, self).__init__(**kwargs)
    
    def compute_probabilities(self, X):
        O = X - X.max(axis=0, keepdims=True)
        O = np.exp(O)
        O /= O.sum(axis=0, keepdims=True)
        return O
    
    def fprop_cost(self, X, Y):
        NS = X.shape[1]
        O = self.compute_probabilities(X)
        Cost = -1.0/NS * np.log(O[Y.ravel(), range(NS)]).sum()
        return Cost, O, dict(O=O, X=X, Y=Y)
    
    def bprop_cost(self, fprop_context):
        X = fprop_context['X']
        Y = fprop_context['Y']
        O = fprop_context['O']
        NS = X.shape[1]
        dLdX = O.copy()
        dLdX[Y, range(NS)] -= 1.0
        dLdX /= NS
        return dLdX
    
class FeedForwardNet(object):
    def __init__(self, layers=None):
        if layers is None:
            layers = []
        self.layers = layers
    
    def add(self, layer):
        self.layers.append(layer)
    
    @property
    def parameters(self):
        params = []
        for layer in self.layers:
            params += layer.parameters
        return params
    
    @parameters.setter
    def parameters(self, values):
        for ownP, newP in zip(self.parameters, values):
            ownP[...] = newP
    
    @property
    def parameter_names(self):
        param_names = []
        for layer in self.layers:
            param_names += layer.parameter_names
        return param_names
    
    def fprop(self, X):
        for layer in self.layers[:-1]:
            X, fp_context = layer.fprop(X)
        return self.layers[-1].compute_probabilities(X)
    
    def get_cost_and_gradient(self, X, Y):
        fp_contexts = []
        for layer in self.layers[:-1]:
            X, fp_context = layer.fprop(X)
            fp_contexts.append(fp_context)
        
        L, O, fp_context = self.layers[-1].fprop_cost(X, Y)
        dLdX = self.layers[-1].bprop_cost(fp_context)
        
        dLdP = [] #gradient with respect to parameters
        for i in xrange(len(self.layers)-1):
            layer = self.layers[len(self.layers)-2-i]
            fp_context = fp_contexts[len(self.layers)-2-i]
            dLdP = layer.get_gradients(dLdX, fp_context) + dLdP
            dLdX = layer.bprop(dLdX, fp_context)
            
        return L, O, dLdP
    


In [147]:
   
class PoolLayer(Layer):
    def __init__(self, image_shape, **kwargs):
        super(PoolLayer, self).__init__(**kwargs)
        
        self.image_shape = image_shape
        self.poolsize = 2
        self.stride = 2
    
    def fprop(self, X):
        num_images, num_channels, img_w, img_h = self.image_shape

        Xr = X.reshape(num_images * num_filters, 1, img_h, img_w)
        X_pool = im2col_indices(Yr, self.poolsize, self.poolsize, stride = self.stride, padding=0)
        #print X_pool.shape

        max_idx = np.argmax(X_pool, axis=0)
        #print max_idx.shape
        Y_pool = X_pool[max_idx, range(max_idx.size)]
        #print Y_pool.shape

        Y_pool = Y_pool.reshape(img_h / self.stride, img_w / self.stride, n, d)
        #print Y_pool.shape

        Y_pool = Y_pool.transpose(2, 3, 0, 1)
        
        Y_pool = Y_pool.reshape(num_images, -1).T
        
        return Y_pool, dict(X=Xr, X_pool=X_pool, max_idx = max_idx)
    
    def bprop(self, dLdY, fprop_context):
        X = fprop_context['X']
        max_idx = fprop_context['max_idx']
        X_col = fprop_context['X_pool']

        n, d, w, h = X.shape

        dX_col = np.zeros_like(X_col)
        
        dout_col = dLdY.ravel()

        dX_col[max_idx, range(dout_col.size)] = dout_col
        dX = dX_col

        dX = col2im_indices(dX_col, (n * d, 1, h, w), self.poolsize, self.poolsize, padding=0, stride=self.stride)
        dX = dX.reshape(X.shape)

        return dX
    
class ConvPoolLayer(Layer):
    def __init__(self, image_shape, filter_shape, weight_init = None, bias_init = None, **kwargs):
        super(ConvPoolLayer, self).__init__(**kwargs)
        
        if weight_init is None:
            weight_init = IsotropicGaussian(std=0.2, mean=0.0)
        if bias_init is None:
            bias_init = Constant(0.0)
        
        self.image_shape = image_shape
        self.filter_shape = filter_shape
        
        self.W = weight_init.generate(self.rng, filter_shape)
        self.b = weight_init.generate(self.rng, (filter_shape[0],))
        
    @property
    def parameters(self):
        return [self.W, self.b]
    
    @property
    def parameter_names(self):
        return ['W','b']
    
    def fprop(self, X):
        num_images, num_channels, img_w, img_h = self.image_shape
        num_filters, num_channels2, filter_w, filter_h = self.filter_shape

        Xr = X.reshape(self.image_shape)
        X_col = im2col_indices(Xr, filter_w, filter_h, stride = 1, padding=0)

        #print X_col.shape

        W_col = W.reshape(num_filters, -1)
        #print W_col.shape

        Y = W_col.dot(X_col) + b
        #Y = np.maximum(Y, 0.0) # ReLU
        #print Y.shape

        w_out = img_w - filter_w + 1
        h_out = img_h - filter_h + 1

        Y = Y.reshape(num_filters, w_out, h_out, num_images)
        #print Y.shape

        Y = Y.transpose(3, 0, 1, 2)
        #print Y.shape
        
        return Y, dict(X=Xr, X_col=X_col)
    
    def bprop(self, dLdY, fprop_context):
        X = fprop_context['X']
        #Xr = fprop_context['Xr']
        X_col = fprop_context['X_col']
        #X, W, b, stride, padding, X_col = cache
        n_filter, d_filter, h_filter, w_filter = self.W.shape

        dout_reshaped = dLdY.transpose(1, 2, 3, 0).reshape(n_filter, -1)

        W_reshape = self.W.reshape(n_filter, -1)
        dX_col = W_reshape.T.dot(dout_reshaped)
        dX = col2im_indices(dX_col, X.shape, h_filter, w_filter, padding=0, stride=1)

        return dX #, dW, db
    
    def get_gradients(self, dLdY, fprop_context):
        X = fprop_context['X']
        X_col = fprop_context['X_col']
        #X, W, b, stride, padding, X_col = cache
        n_filter, d_filter, h_filter, w_filter = self.W.shape
        
        db = np.sum(dLdY, axis=(0, 2, 3))
        
        db = db.reshape(n_filter, -1).ravel()

        dout_reshaped = dLdY.transpose(1, 2, 3, 0).reshape(n_filter, -1)
        dW = dout_reshaped.dot(X_col.T)
        dW = dW.reshape(W.shape)

        return [dW, db]
 

In [149]:
weight_init = IsotropicGaussian(std=0.05, mean=0.0)
net = FeedForwardNet([
        ConvPoolLayer(image_shape=(100, 1, 28, 28), filter_shape=(20, 1, 5, 5)),
        ReLULayer(),
        PoolLayer(image_shape=(100, 20, 24, 24)),
        AffineLayer(20 * 12 * 12, 10, weight_init = weight_init),
        SoftMaxLayer()
    ])

SGD(net, mnist_train_stream, mnist_validation_stream, mnist_test_stream,
    print_debug = True,
    epochs = 2000,
    regularization_rate = 0.0,
    alpha_alg = AlphaAlgExp(initial = 5e-2, rate = 0.992),
    momentum_alg = MomentumAlg3())

Network configuration: 
W(20, 1, 5, 5), b(20,), W(10, 2880), b(10, 1)
At minibatch 100, batch loss 2.381078, batch error rate 90.000000%
At minibatch 200, batch loss 2.413829, batch error rate 93.000000%
At minibatch 300, batch loss 2.340055, batch error rate 91.000000%
At minibatch 400, batch loss 2.360413, batch error rate 91.000000%
At minibatch 500, batch loss 2.304931, batch error rate 86.000000%


ValueError: total size of new array must be unchanged

In [4]:
from fuel.datasets.mnist import MNIST
from fuel.transformers import ScaleAndShift, Cast, Flatten, Mapping
from fuel.streams import DataStream
from fuel.schemes import SequentialScheme, ShuffledScheme

MNIST.default_transformers = (
    (ScaleAndShift, [2.0 / 255.0, -1], {'which_sources': 'features'}),
    (Cast, [np.float32], {'which_sources': 'features'}), 
    (Flatten, [], {'which_sources': 'features'}),
    (Mapping, [lambda batch: (b.T for b in batch)], {}) )

mnist_train = MNIST(("train",), subset=slice(None,50000))
#this stream will shuffle the MNIST set and return us batches of 100 examples
mnist_train_stream = DataStream.default_stream(
    mnist_train,
    iteration_scheme=ShuffledScheme(mnist_train.num_examples, 100))
                                               
mnist_validation = MNIST(("train",), subset=slice(50000, None))

# We will use larger portions for testing and validation
# as these dont do a backward pass and reauire less RAM.
mnist_validation_stream = DataStream.default_stream(
    mnist_validation, iteration_scheme=SequentialScheme(mnist_validation.num_examples, 250))
mnist_test = MNIST(("test",))
mnist_test_stream = DataStream.default_stream(
    mnist_test, iteration_scheme=SequentialScheme(mnist_test.num_examples, 250))

In [5]:
print "The streams return batches containing %s" % (mnist_train_stream.sources,)

print "Each trainin batch consits of a tuple containing:"
for element in next(mnist_train_stream.get_epoch_iterator()):
    print " - an array of size %s containing %s" % (element.shape, element.dtype)
    
print "Validation/test batches consits of tuples containing:"
for element in next(mnist_test_stream.get_epoch_iterator()):
    print " - an array of size %s containing %s" % (element.shape, element.dtype)

The streams return batches containing (u'features', u'targets')
Each trainin batch consits of a tuple containing:
 - an array of size (784, 100) containing float32
 - an array of size (1, 100) containing uint8
Validation/test batches consits of tuples containing:
 - an array of size (784, 250) containing float32
 - an array of size (1, 250) containing uint8


In [144]:
from copy import deepcopy

def compute_error_rate(net, stream):
    num_errs = 0.0
    num_examples = 0
            
    for X, Y in stream.get_epoch_iterator():
        O = net.fprop(X)
        num_errs += (O.argmax(0) != Y).sum()
        num_examples += X.shape[1]
            
    return num_errs/num_examples

def print_stats(train_loss, train_errors, validation_errors):
    subplot(2,1,1)
    train_loss = np.array(train_loss)
    semilogy(train_loss[:,0], train_loss[:,1], label='batch train loss')
    legend()

    subplot(2,1,2)
    train_errors = np.array(train_errors)
    plot(train_errors[:,0], train_errors[:,1], label='batch train error rate')
    validation_errors = np.array(validation_errors)
    plot(validation_errors[:,0], validation_errors[:,1], label='validation error rate', color='r')
    ylim(0,0.2)
    legend()

class AlphaAlgExp:
    def __init__(self, initial = 5e-2, rate = 0.998):
        self.initial = initial
        self.rate = rate
    
    def __call__(self, i, e):
        return (self.initial * np.power(self.rate, e))
    
class AlphaAlgBct:
    def __init__(self, b = 1e3, c = 2e4):
        self.b = b
        self.c = c
    
    def __call__(self, i, e):
        return (self.b / (self.c + i))
    
class AlphaAlgConst:
    def __init__(self, constant = 1e-2):
        self.constant = constant
        
    def __call__(self, i, e):
        return self.constant
    
    
class MomentumAlgConst:
    def __init__(self, constant = 0.5):
        self.constant = constant
        
    def __call__(self, i):
        return self.constant
    
class MomentumAlg1:
    def __call__(self, i, e):
        return (1. - 3. / (5. + i))

class MomentumAlg2:
    def __call__(self, i, e, limit = 0.9):
        v = (1. - 3. / (5. + i))
        return (v if v < limit else limit)
    
class MomentumAlg3:
    def __init__(self, start = 0.5, stop = 0.9, epochs = 500):
        self.start = start
        self.stop = stop
        self.epochs = epochs
    
    def __call__(self, i, e):
        if e >= self.epochs:
            return self.stop
        else:
            c = e / self.epochs
            return c * self.start + (1. - c) * self.stop

def SGD(net, train_stream, validation_stream, test_stream,
       print_debug = True,
       epochs = 3,
       patience = 1.5,
       alpha_alg = AlphaAlgExp(),
       momentum_alg = MomentumAlg1(),
       regularization_rate = 1e-3,
       dropout = None):
    
    print "Network configuration: "
    print ", ".join([ "%s%s" %(N, P.shape) for P, N in zip(net.parameters, net.parameter_names) ])
    
    i=0
    e=0
    
    velocities = [np.zeros(P.shape) for P in net.parameters]
    
    best_valid_error_rate = np.inf
    best_params = deepcopy(net.parameters)
    best_params_epoch = 0
    
    alpha = None
    
    train_errors = []
    train_loss = []
    validation_errors = []
    
    number_of_epochs = epochs
    patience_expansion = patience
    
    try:
        while e<number_of_epochs: #This loop goes over epochs
            e += 1
            #First train on all data from this batch
            for X,Y in train_stream.get_epoch_iterator(): 
                i += 1
                L, O, gradients = net.get_cost_and_gradient(X, Y)
                err_rate = (O.argmax(0) != Y).mean()
                train_loss.append((i,L))
                train_errors.append((i,err_rate))
                if i % 100 == 0 and print_debug:
                    print "At minibatch %d, batch loss %f, batch error rate %f%%" % (i, L, err_rate*100)
                for P, V, G, N in zip(net.parameters, velocities, gradients, net.parameter_names):
                    if N=='W' and regularization_rate:
                        G += regularization_rate * P
                        
                    
                    alpha = alpha_alg(i, e)
                    
                    epsilon = momentum_alg(i, e)
                    
                    #V = epsilon * V - (1. - epsilon) * alpha * G
                    V = epsilon * V - alpha * G
                    
                    P += V
                    
            # After an epoch compute validation error
            val_error_rate = compute_error_rate(net, validation_stream)

            if val_error_rate < best_valid_error_rate:
                number_of_epochs = np.maximum(number_of_epochs, e * patience_expansion+1)
                best_valid_error_rate = val_error_rate
                best_params = deepcopy(net.parameters)
                best_params_epoch = e
                validation_errors.append((i,val_error_rate))
            if print_debug or (e % 20 == 1):
                print "After epoch %d: valid_err_rate: %f%% currently going ot do %d epochs" %(
                    e, val_error_rate, number_of_epochs)
        print "Finished with %d epochs (minibatch %d), with best valid error rate %f" % (e, i, best_valid_error_rate)
        print_stats(train_loss, train_errors, validation_errors)
    except KeyboardInterrupt:
        print "Setting network parameters from after epoch %d" %(best_params_epoch)
        net.parameters = best_params
        print_stats(train_loss, train_errors, validation_errors)

In [141]:
weight_init = IsotropicGaussian(std=0.05, mean=0.0)
net = FeedForwardNet([
        ConvPoolLayer(image_shape=(100, 1, 28, 28), filter_shape=(20, 1, 5, 5)),
        ReLULayer(),
        PoolLayer(image_shape=(100, 20, 24, 24)),
        AffineLayer(20 * 12 * 12, 10, weight_init = weight_init),
        SoftMaxLayer()
    ])

SGD(net, mnist_train_stream, mnist_validation_stream, mnist_test_stream,
    print_debug = True,
    epochs = 2000,
    regularization_rate = 0.0,
    alpha_alg = AlphaAlgExp(initial = 5e-2, rate = 0.992),
    momentum_alg = MomentumAlg3())

Network configuration: 
W(20, 1, 5, 5), b(20,), W(10, 2880), b(10, 1)
(2000, 1, 24, 24)
(100, 20, 24, 24)
shapes
(20, 1, 5, 5)
(20, 1, 5, 5)
(20, 1, 5, 5)
shapes
(20,)
(20,)
(20, 1)


ValueError: non-broadcastable output operand with shape (20,) doesn't match the broadcast shape (20,20)

In [89]:
train_stream = mnist_train_stream
X,Y = train_stream.get_epoch_iterator().next()

image_shape = (100, 1, 28, 28)
filter_shape = (20, 1, 5, 5)
poolsize = (2,2)


rng = np.random
weight_init = IsotropicGaussian(std=0.05, mean=0.0)
W = weight_init.generate(rng, filter_shape)
b = weight_init.generate(rng, (filter_shape[0],1))

num_images, num_channels, img_w, img_h = image_shape
num_filters, num_channels2, filter_w, filter_h = filter_shape

Xr = X.reshape(image_shape)
X_col = im2col_indices(Xr, filter_w, filter_h, stride = 1, padding=0)

print X_col.shape

W_col = W.reshape(num_filters, -1)
print W_col.shape

Y = W_col.dot(X_col) + b
print Y.shape
Y = np.maximum(Y, 0.0)
print Y.shape

        # Reshape back from 20x500 to 5x20x10x10
        # i.e. for each of our 5 images, we have 20 results with size of 10x10
w_out = img_w - filter_w + 1
h_out = img_h - filter_h + 1

Y = Y.reshape(num_filters, w_out, h_out, num_images)
print Y.shape

Y = Y.transpose(3, 0, 1, 2)
print Y.shape

### ^^^ was convolution

Yr = Y.reshape(num_images * num_filters, 1, h_out, w_out)
X_pool = im2col_indices(Yr, poolsize[0], poolsize[1], stride=2, padding=0)
print X_pool.shape

max_idx = np.argmax(X_pool, axis=0)
#print max_idx.shape
Y_pool = X_pool[max_idx, range(max_idx.size)]
print Y_pool.shape

Y_pool = Y_pool.reshape(12, 12, n, d)
print Y_pool.shape

Y_pool = Y_pool.transpose(2, 3, 0, 1)
print Y_pool.shape

### ^^^ was pooling










# return Y, dict(X=X, Xr=Xr, X_col=X_col)

(25, 57600)
(20, 25)
(20, 57600)
(20, 57600)
(20, 24, 24, 100)
(100, 20, 24, 24)
(4, 288000)
(288000,)
(12, 12, 100, 20)
(100, 20, 12, 12)
