In [1]:
# Copyright 2016 Matthieu Courbariaux

# This file is part of BinaryNet.

# BinaryNet is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# BinaryNet is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with BinaryNet.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import print_function

import sys
import os
import time

import numpy as np
np.random.seed(1234) # for reproducibility?

# specifying the gpu to use
# import theano.sandbox.cuda
# theano.sandbox.cuda.use('gpu1') 
import theano
import theano.tensor as T

import lasagne

import cPickle as pickle
import gzip

import binary_net

from pylearn2.datasets.zca_dataset import ZCA_Dataset   
from pylearn2.datasets.cifar10 import CIFAR10 
from pylearn2.utils import serial

from collections import OrderedDict

  "downsample module has been moved to the theano.tensor.signal.pool module.")


In [2]:
# BN parameters
batch_size = 50
print("batch_size = "+str(batch_size))
# alpha is the exponential moving average factor
alpha = .1
print("alpha = "+str(alpha))
epsilon = 1e-4
print("epsilon = "+str(epsilon))

# BinaryOut
activation = binary_net.binary_tanh_unit
print("activation = binary_net.binary_tanh_unit")
# activation = binary_net.binary_sigmoid_unit
# print("activation = binary_net.binary_sigmoid_unit")

# BinaryConnect    
binary = True
print("binary = "+str(binary))
stochastic = False
print("stochastic = "+str(stochastic))
# (-H,+H) are the two binary values
# H = "Glorot"
H = 1.
print("H = "+str(H))
# W_LR_scale = 1.    
W_LR_scale = "Glorot" # "Glorot" means we are using the coefficients from Glorot's paper
print("W_LR_scale = "+str(W_LR_scale))

# Training parameters
num_epochs = 500
print("num_epochs = "+str(num_epochs))

# Decaying LR 
LR_start = 0.001
print("LR_start = "+str(LR_start))
LR_fin = 0.0000003
print("LR_fin = "+str(LR_fin))
LR_decay = (LR_fin/LR_start)**(1./num_epochs)
print("LR_decay = "+str(LR_decay))
# BTW, LR decay might good for the BN moving average...

train_set_size = 45000
print("train_set_size = "+str(train_set_size))
shuffle_parts = 1
print("shuffle_parts = "+str(shuffle_parts))

batch_size = 50
alpha = 0.1
epsilon = 0.0001
activation = binary_net.binary_tanh_unit
binary = True
stochastic = False
H = 1.0
W_LR_scale = Glorot
num_epochs = 500
LR_start = 0.001
LR_fin = 3e-07
LR_decay = 0.983907435305
train_set_size = 45000
shuffle_parts = 1


In [3]:
def inject_batch_size(shape, batch_size):
    shape = list(shape)
    shape[0] = batch_size
    return tuple(shape)

In [4]:
# Prepare Theano variables for inputs and targets
input = T.tensor4('inputs')
target = T.matrix('targets')
LR = T.scalar('LR', dtype=theano.config.floatX)

cnn = lasagne.layers.InputLayer(
        shape=(None, 3, 32, 32),
        input_var=input)
print("[Data Memory (float)]", inject_batch_size(cnn.output_shape, batch_size))

[Data Memory (float)] (50, 3, 32, 32)


In [5]:
# 128C3-128C3-P2             
cnn = binary_net.Conv2DLayer(
        cnn, 
        binary=binary,
        stochastic=stochastic,
        H=H,
        W_LR_scale=W_LR_scale,
        num_filters=128, 
        filter_size=(3, 3),
        pad=1,
        nonlinearity=lasagne.nonlinearities.identity)
print("[Parameter Memory (binary)]", "(128, 3, 3)")
print("[Data Memory (binary)]", inject_batch_size(cnn.output_shape, batch_size))

cnn = lasagne.layers.BatchNormLayer(
        cnn,
        epsilon=epsilon, 
        alpha=alpha)

cnn = lasagne.layers.NonlinearityLayer(
        cnn,
        nonlinearity=activation) 

cnn = binary_net.Conv2DLayer(
        cnn, 
        binary=binary,
        stochastic=stochastic,
        H=H,
        W_LR_scale=W_LR_scale,
        num_filters=128, 
        filter_size=(3, 3),
        pad=1,
        nonlinearity=lasagne.nonlinearities.identity)
print("[Parameter Memory (binary)]", "(128, 3, 3)")
print("[Data Memory (binary)]", inject_batch_size(cnn.output_shape, batch_size))

cnn = lasagne.layers.MaxPool2DLayer(cnn, pool_size=(2, 2))
print("[Data Memory (binary)]", inject_batch_size(cnn.output_shape, batch_size))

cnn = lasagne.layers.BatchNormLayer(
        cnn,
        epsilon=epsilon, 
        alpha=alpha)

cnn = lasagne.layers.NonlinearityLayer(
        cnn,
        nonlinearity=activation)

[Parameter Memory (binary)] (128, 3, 3)
[Data Memory (binary)] (50, 128, 32, 32)
[Parameter Memory (binary)] (128, 3, 3)
[Data Memory (binary)] (50, 128, 32, 32)
[Data Memory (binary)] (50, 128, 16, 16)


In [6]:
# 256C3-256C3-P2             
cnn = binary_net.Conv2DLayer(
        cnn, 
        binary=binary,
        stochastic=stochastic,
        H=H,
        W_LR_scale=W_LR_scale,
        num_filters=256, 
        filter_size=(3, 3),
        pad=1,
        nonlinearity=lasagne.nonlinearities.identity)
print("[Parameter Memory (binary)]", "(256, 3, 3)")
print("[Data Memory (binary)]", inject_batch_size(cnn.output_shape, batch_size))

cnn = lasagne.layers.BatchNormLayer(
        cnn,
        epsilon=epsilon, 
        alpha=alpha)

cnn = lasagne.layers.NonlinearityLayer(
        cnn,
        nonlinearity=activation) 

cnn = binary_net.Conv2DLayer(
        cnn, 
        binary=binary,
        stochastic=stochastic,
        H=H,
        W_LR_scale=W_LR_scale,
        num_filters=256, 
        filter_size=(3, 3),
        pad=1,
        nonlinearity=lasagne.nonlinearities.identity)
print("[Parameter Memory (binary)]", "(256, 3, 3)")
print("[Data Memory (binary)]", inject_batch_size(cnn.output_shape, batch_size))

cnn = lasagne.layers.MaxPool2DLayer(cnn, pool_size=(2, 2))
print("[Data Memory (binary)]", inject_batch_size(cnn.output_shape, batch_size))

cnn = lasagne.layers.BatchNormLayer(
        cnn,
        epsilon=epsilon, 
        alpha=alpha)

cnn = lasagne.layers.NonlinearityLayer(
        cnn,
        nonlinearity=activation)

[Parameter Memory (binary)] (256, 3, 3)
[Data Memory (binary)] (50, 256, 16, 16)
[Parameter Memory (binary)] (256, 3, 3)
[Data Memory (binary)] (50, 256, 16, 16)
[Data Memory (binary)] (50, 256, 8, 8)


In [7]:
# 512C3-512C3-P2              
cnn = binary_net.Conv2DLayer(
        cnn, 
        binary=binary,
        stochastic=stochastic,
        H=H,
        W_LR_scale=W_LR_scale,
        num_filters=512, 
        filter_size=(3, 3),
        pad=1,
        nonlinearity=lasagne.nonlinearities.identity)
print("[Parameter Memory (binary)]", "(512, 3, 3)")
print("[Data Memory (binary)]", inject_batch_size(cnn.output_shape, batch_size))

cnn = lasagne.layers.BatchNormLayer(
        cnn,
        epsilon=epsilon, 
        alpha=alpha)

cnn = lasagne.layers.NonlinearityLayer(
        cnn,
        nonlinearity=activation)

cnn = binary_net.Conv2DLayer(
        cnn, 
        binary=binary,
        stochastic=stochastic,
        H=H,
        W_LR_scale=W_LR_scale,
        num_filters=512, 
        filter_size=(3, 3),
        pad=1,
        nonlinearity=lasagne.nonlinearities.identity)
print("[Parameter Memory (binary)]", "(512, 3, 3)")
print("[Data Memory (binary)]", cnn.output_shape)

cnn = lasagne.layers.MaxPool2DLayer(cnn, pool_size=(2, 2))
print("[Data Memory (binary)]", inject_batch_size(cnn.output_shape, batch_size))

cnn = lasagne.layers.BatchNormLayer(
        cnn,
        epsilon=epsilon, 
        alpha=alpha)

cnn = lasagne.layers.NonlinearityLayer(
        cnn,
        nonlinearity=activation)

[Parameter Memory (binary)] (512, 3, 3)
[Data Memory (binary)] (50, 512, 8, 8)
[Parameter Memory (binary)] (512, 3, 3)
[Data Memory (binary)] (None, 512, 8, 8)
[Data Memory (binary)] (50, 512, 4, 4)


In [8]:
# 1024FP-1024FP-10FP            
cnn = binary_net.DenseLayer(
            cnn, 
            binary=binary,
            stochastic=stochastic,
            H=H,
            W_LR_scale=W_LR_scale,
            nonlinearity=lasagne.nonlinearities.identity,
            num_units=1024)
print("[Parameter Memory (binary)]", "(16, 1024)")
print("[Data Memory (binary)]", inject_batch_size(cnn.output_shape, batch_size))

cnn = lasagne.layers.BatchNormLayer(
        cnn,
        epsilon=epsilon, 
        alpha=alpha)

cnn = lasagne.layers.NonlinearityLayer(
        cnn,
        nonlinearity=activation) 

cnn = binary_net.DenseLayer(
            cnn, 
            binary=binary,
            stochastic=stochastic,
            H=H,
            W_LR_scale=W_LR_scale,
            nonlinearity=lasagne.nonlinearities.identity,
            num_units=1024)
print("[Parameter Memory (binary)]", "(1024, 1024)")
print("[Data Memory (binary)]", inject_batch_size(cnn.output_shape, batch_size))

cnn = lasagne.layers.BatchNormLayer(
        cnn,
        epsilon=epsilon, 
        alpha=alpha)

cnn = lasagne.layers.NonlinearityLayer(
        cnn,
        nonlinearity=activation)

cnn = binary_net.DenseLayer(
            cnn, 
            binary=binary,
            stochastic=stochastic,
            H=H,
            W_LR_scale=W_LR_scale,
            nonlinearity=lasagne.nonlinearities.identity,
            num_units=10)
print("[Parameter Memory (binary)]", "(1024, 10)")
print("[Data Memory (binary)]", inject_batch_size(cnn.output_shape, batch_size))

cnn = lasagne.layers.BatchNormLayer(
        cnn,
        epsilon=epsilon, 
        alpha=alpha)

train_output = lasagne.layers.get_output(cnn, deterministic=False)

# squared hinge loss
loss = T.mean(T.sqr(T.maximum(0.,1.-target*train_output)))

[Parameter Memory (binary)] (16, 1024)
[Data Memory (binary)] (50, 1024)
[Parameter Memory (binary)] (1024, 1024)
[Data Memory (binary)] (50, 1024)
[Parameter Memory (binary)] (1024, 10)
[Data Memory (binary)] (50, 10)
