In [1]:
# Copyright 2016 Matthieu Courbariaux

# This file is part of BinaryNet.

# BinaryNet is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# BinaryNet is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with BinaryNet.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import print_function

import sys
import os
import time

import numpy as np
np.random.seed(1234)  # for reproducibility

# specifying the gpu to use
# import theano.sandbox.cuda
# theano.sandbox.cuda.use('gpu1') 
import theano
import theano.tensor as T

import lasagne

import cPickle as pickle
import gzip

import binary_net

from pylearn2.datasets.mnist import MNIST
from pylearn2.utils import serial

from collections import OrderedDict

  "downsample module has been moved to the theano.tensor.signal.pool module.")


In [2]:
# BN parameters
batch_size = 100
print("batch_size = "+str(batch_size))
# alpha is the exponential moving average factor
# alpha = .15
alpha = .1
print("alpha = "+str(alpha))
epsilon = 1e-4
print("epsilon = "+str(epsilon))

# MLP parameters
num_units = 4096
print("num_units = "+str(num_units))
n_hidden_layers = 3
print("n_hidden_layers = "+str(n_hidden_layers))

# Training parameters
num_epochs = 1000
print("num_epochs = "+str(num_epochs))

# Dropout parameters
dropout_in = .2 # 0. means no dropout
print("dropout_in = "+str(dropout_in))
dropout_hidden = .5
print("dropout_hidden = "+str(dropout_hidden))

# BinaryOut
activation = binary_net.binary_tanh_unit
print("activation = binary_net.binary_tanh_unit")
# activation = binary_net.binary_sigmoid_unit
# print("activation = binary_net.binary_sigmoid_unit")

# BinaryConnect
binary = True
print("binary = "+str(binary))
stochastic = False
print("stochastic = "+str(stochastic))
# (-H,+H) are the two binary values
# H = "Glorot"
H = 1.
print("H = "+str(H))
# W_LR_scale = 1.    
W_LR_scale = "Glorot" # "Glorot" means we are using the coefficients from Glorot's paper
print("W_LR_scale = "+str(W_LR_scale))

# Decaying LR 
LR_start = .003
print("LR_start = "+str(LR_start))
LR_fin = 0.0000003
print("LR_fin = "+str(LR_fin))
LR_decay = (LR_fin/LR_start)**(1./num_epochs)
print("LR_decay = "+str(LR_decay))
# BTW, LR decay might good for the BN moving average...

save_path = "mnist_parameters.npz"
print("save_path = "+str(save_path))

shuffle_parts = 1
print("shuffle_parts = "+str(shuffle_parts))

batch_size = 100
alpha = 0.1
epsilon = 0.0001
num_units = 4096
n_hidden_layers = 3
num_epochs = 1000
dropout_in = 0.2
dropout_hidden = 0.5
activation = binary_net.binary_tanh_unit
binary = True
stochastic = False
H = 1.0
W_LR_scale = Glorot
LR_start = 0.003
LR_fin = 3e-07
LR_decay = 0.990831944893
save_path = mnist_parameters.npz
shuffle_parts = 1


In [3]:
def inject_batch_size(shape, batch_size):
    shape = list(shape)
    shape[0] = batch_size
    return tuple(shape)

In [4]:
# Prepare Theano variables for inputs and targets
input = T.tensor4('inputs')
target = T.matrix('targets')
LR = T.scalar('LR', dtype=theano.config.floatX)

mlp = lasagne.layers.InputLayer(
        shape=(None, 1, 28, 28),
        input_var=input)
print("[Data Memory (float)]", inject_batch_size(mlp.output_shape, batch_size))

mlp = lasagne.layers.DropoutLayer(
        mlp, 
        p=dropout_in)

[Data Memory (float)] (100, 1, 28, 28)


In [5]:
prev_dim = 28 * 28

for k in range(n_hidden_layers):
    print("### hidden layer %s" % k)
    mlp = binary_net.DenseLayer(
            mlp, 
            binary=binary,
            stochastic=stochastic,
            H=H,
            W_LR_scale=W_LR_scale,
            nonlinearity=lasagne.nonlinearities.identity,
            num_units=num_units)
    print("[Parameter Memory (binary)]", "(%s, %s)" % (prev_dim, num_units))
    print("[Data Memory (binary)]", inject_batch_size(mlp.output_shape, batch_size))
    prev_dim = num_units # first hidden layer different

    mlp = lasagne.layers.BatchNormLayer(
            mlp,
            epsilon=epsilon, 
            alpha=alpha)

    mlp = lasagne.layers.NonlinearityLayer(
            mlp,
            nonlinearity=activation)

    mlp = lasagne.layers.DropoutLayer(
            mlp, 
            p=dropout_hidden)

### hidden layer 0
[Parameter Memory (binary)] (784, 4096)
[Data Memory (binary)] (100, 4096)
### hidden layer 1
[Parameter Memory (binary)] (4096, 4096)
[Data Memory (binary)] (100, 4096)
### hidden layer 2
[Parameter Memory (binary)] (4096, 4096)
[Data Memory (binary)] (100, 4096)


In [6]:
mlp = binary_net.DenseLayer(
            mlp, 
            binary=binary,
            stochastic=stochastic,
            H=H,
            W_LR_scale=W_LR_scale,
            nonlinearity=lasagne.nonlinearities.identity,
            num_units=10)
print("[Parameter Memory (binary)]", "(%s, %s)" % (prev_dim, num_units))
print("[Data Memory (binary)]", inject_batch_size(mlp.output_shape, batch_size))

mlp = lasagne.layers.BatchNormLayer(
        mlp,
        epsilon=epsilon, 
        alpha=alpha)

train_output = lasagne.layers.get_output(mlp, deterministic=False)

# squared hinge loss
loss = T.mean(T.sqr(T.maximum(0.,1.-target*train_output)))

[Parameter Memory (binary)] (4096, 4096)
[Data Memory (binary)] (100, 10)
