In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

  from ._conv import register_converters as _register_converters


In [2]:
def conv_layer(input,filter_size=[3,3],num_features=[1],prob=[1.,-1.]):
    
    # Get number of input features from input and add to shape of new layer
    shape=filter_size+[input.get_shape().as_list()[-1],num_features]
    shapeR=shape
    if (prob[1]==-1.):
        shapeR=[1,1]
    R = tf.get_variable('R',shape=shapeR)
    W = tf.get_variable('W',shape=shape) # Default initialization is Glorot (the one explained in the slides)
    
    #b = tf.get_variable('b',shape=[num_features],initializer=tf.zeros_initializer) 
    conv = tf.nn.conv2d(input, W, strides=[1, 1, 1, 1], padding='SAME')
    conv = tf.clip_by_value(conv,-1.,1.)
    
    return(conv)

def grad_conv_layer(below, back_propped, current, W, R):
    w_shape=W.shape
    strides=[1,1,1,1]
    back_prop_shape=[-1]+(current.shape.as_list())[1:]
    out_backprop=tf.reshape(back_propped,back_prop_shape)
    on_zero = K.zeros_like(out_backprop)
    out_backpropF=K.tf.where(tf.equal(tf.abs(current),1.),on_zero,out_backprop)
    gradconvW=tf.nn.conv2d_backprop_filter(input=below,filter_sizes=w_shape,\
                                                     out_backprop=out_backpropF,\
                                                     strides=strides,\
                                           padding='SAME')
    input_shape=[batch_size]+(below.shape.as_list())[1:]
    
    filter=W
    if (len(R.shape.as_list())==4):
        filter=R
    gradconvx=tf.nn.conv2d_backprop_input(input_sizes=input_shape,filter=filter,out_backprop=out_backpropF,strides=strides,padding='SAME')
    
    return gradconvW, gradconvx

In [3]:
def fully_connected_layer(input,num_features,prob=[1.,-1.]):
    # Make sure input is flattened.
    flat_dim=np.int32(np.array(input.get_shape().as_list())[1:].prod())
    input_flattened = tf.reshape(input, shape=[-1,flat_dim])
    shape=[flat_dim,num_features]
    shapeR=shape
    if (prob[1]==-1.):
        shapeR=[1]
    R_fc = tf.get_variable('R',shape=shapeR)
    W_fc = tf.get_variable('W',shape=shape)

    #b_fc = tf.get_variable('b',shape=[num_features],initializer=tf.zeros_initializer)
    fc = tf.matmul(input_flattened, W_fc) # + b_fc
    return(fc)

def grad_fully_connected(below, back_propped, W, R):
    
    belowf=tf.contrib.layers.flatten(below)
    # Gradient of weights of dense layer
    gradfcW=tf.matmul(tf.transpose(belowf),back_propped)
    # Propagated error to conv layer.
    filter=W
    if (len(R.shape.as_list())==2):
        filter=R
    gradfcx=tf.matmul(back_propped,tf.transpose(filter))
    
    return gradfcW, gradfcx


In [4]:
from keras import backend as K
from keras.layers.convolutional import UpSampling2D
 
def MaxPoolingandMask(inputs, pool_size, strides,
                          padding='SAME'):

        pooled = tf.nn.max_pool(inputs, ksize=pool_size, strides=strides, padding=padding)
        upsampled = UpSampling2D(size=strides[1:3])(pooled)
        indexMask = K.tf.equal(inputs, upsampled)
        assert indexMask.get_shape().as_list() == inputs.get_shape().as_list()
        return pooled,indexMask
     
#def get_output_shape_for(self, input_shape):
#        return input_shape
 
 
def unpooling(x,mask,strides):
    '''
    do unpooling with indices, move this to separate layer if it works
    1. do naive upsampling (repeat elements)
    2. keep only values in mask (stored indices) and set the rest to zeros
    '''
    on_success = UpSampling2D(size=strides)(x)
    on_fail = K.zeros_like(on_success)
    return K.tf.where(mask, on_success, on_fail)
 
 


def grad_pool(back_propped,pool,mask,pool_size):
        gradx_pool=tf.reshape(back_propped,[-1]+(pool.shape.as_list())[1:])
    #gradfcx=tf.reshape(gradfcx_pool,[-1]+(conv.shape.as_list())[1:])
        gradx=unpooling(gradx_pool,mask,pool_size)
        return gradx

Using TensorFlow backend.


In [5]:
def find_sibling(l,parent):
      
        for ly in PARS['layers']:
            if ('parent' in ly):
                q=ly['parent']

                if (ly is not l and type(q)==str and q in parent):
                    return q
        return None  

def create_network(PARS):
    TS=[]
    ln=len(PARS['layers'])
    sibs={}
    for i,l in enumerate(PARS['layers']):
        parent=None
        prob=[1.,-1.]
        if ('force_global_prob' in PARS):
            prob=list(PARS['force_global_prob'])
        # Last output layer is fully connected to last hidden layer
        if (i==ln-1):
            prob[0]=1.
        if ('parent' in l):
            if ('input' in l['parent']):
                parent=x
            else:
                # Get list of parents
                if (type(l['parent'])==list):
                    parent=[] 
                    for s in l['parent']:
                        for ts in TS:
                            if s in ts.name and not 'Equal' in ts.name:
                                parent.append(ts)
                # Get single parent
                else:
                    for ts in TS:
                        if l['parent'] in ts.name and not 'Equal' in ts.name:
                            parent=ts
        if ('conv' in l['name']):
            with tf.variable_scope(l['name']):
                TS.append(conv_layer(parent, filter_size=list(l['filter_size']),num_features=l['num_filters'], prob=prob))
        elif ('dens' in l['name']):
            with tf.variable_scope(l['name']):
                num_units=l['num_units']
                if ('final' in l):
                    num_units=n_classes
                TS.append(fully_connected_layer(parent, num_features=num_units,prob=prob))
        elif ('pool' in l['name']):
            with tf.variable_scope(l['name']):
                pool, mask = MaxPoolingandMask(parent, [1]+list(l['pool_size'])+[1],\
                                           strides=[1]+list(l['stride'])+[1])
                TS.append(pool)
                TS.append(mask)
        elif ('drop' in l['name']):
            with tf.variable_scope(l['name']):
                U=tf.random_uniform([batch_size]+(parent.shape.as_list())[1:])<l['drop']
                Z=tf.zeros_like(parent)
                drop = K.tf.where(U,Z,parent)
                TS.append(drop)
        elif ('concatsum' in l['name']):
            with tf.variable_scope(l['name']):
                res_sum=tf.add(parent[0],parent[1])
                TS.append(res_sum)
            # This is a sum layer get its sibling
                joint_parent=find_sibling(l,l['parent'])
                if (joint_parent is not None):
                    sibs[TS[-1].name]=joint_parent
    
    with tf.variable_scope('cross_entropy_loss'):
        cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=TS[-1]),name="LOSS")
    
#     with tf.variable_scope('hinge_loss'):
#         cor=tf.boolean_mask(TS[-1],y_)
#         res=tf.boolean_mask(TS[-1],tf.subtract(tf.ones_like(y_),y_))
#        tf.nn.relu(1.-cor)+PARS['dep_fac']*tf.reduce_sum(tf.nn.relu(1.+res),axis=1)/(n_classes-1)
        
    # Accuracy computation
    with tf.variable_scope('helpers'):
        correct_prediction = tf.equal(tf.argmax(TS[-1], 1), tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),name="ACC")
    print('sibs',sibs)
    return cross_entropy, accuracy, TS, sibs

In [6]:
def update_only_non_zero(V,gra, step):
    up=V-step*gra
    up=K.tf.where(V==0,V,up)
    assign_op = tf.assign(V,up)
    return assign_op

def back_prop(): #(VS,cross_entropy,conv,fc):
    # Get gradient of loss with respect to final output layer using tf gradient
    # The rest will be explicit backprop
    
    gradX=tf.gradients(cross_entropy,TS[0])
    corr = tf.equal(tf.argmax(TS[0], 1), tf.argmax(y_, 1))
    acc = tf.reduce_mean(tf.cast(corr, tf.float32))

    gradx=gradX[0]
    lvs=len(VS)
    lts=len(TS)
    vs=0
    ts=0
    OPLIST=[]
    grad_hold_var={}
    parent=None
    for ts in range(lts):
        T=TS[ts]
        if (ts<lts-1):
                pre=TS[ts+1]
                if ('Equal' in pre.name):
                    pre=TS[ts+2]
        else:
            pre=x
        # You have held a gradx from a higher up layer to be added to current one.
        if (parent is not None and parent in T.name):
            print('grad_hold',grad_hold_var[parent])
            gradx=tf.add(gradx,grad_hold_var[parent])
            parent=None
        if ('conv' in T.name):  
            gradconvW, gradx = grad_conv_layer(below=pre,back_propped=gradx,current=TS[ts],W=VS[vs], R=VS[vs+1])
            assign_op_convW = update_only_non_zero(VS[vs],gradconvW,step_size)
            #assign_op_convW=tf.assign(VS[vs],VS[vs]-step_size*gradconvW)
            OPLIST.append(assign_op_convW)
            if (len(VS[vs+1].shape.as_list())==4):
                assign_op_convR=update_only_non_zero(VS[vs+1],gradconvW, Rstep_size)
                #assign_op_convR=tf.assign(VS[vs+1],VS[vs+1]-Rstep_size*gradconvW)
                OPLIST.append(assign_op_convR)
            ts+=1
            vs+=2
        elif ('Equal' in T.name):
            mask=TS[ts]
            ts+=1
        elif ('Max' in T.name):
            gradx=grad_pool(gradx,TS[ts],mask,[2,2])  
            ts+=1
        elif ('dens' in T.name):
            gradfcW, gradx = grad_fully_connected(W=VS[vs],R=VS[vs+1],back_propped=gradx,below=pre)
            assign_op_fcW = update_only_non_zero(VS[vs],gradfcW,step_size)
            #assign_op_fcW=tf.assign(VS[vs],VS[vs]-step_size*gradfcW)
            OPLIST.append(assign_op_fcW)
            if (len(VS[vs+1].shape.as_list())==2):
                assign_op_fcR = update_only_non_zero(VS[vs+1],gradfcW,Rstep_size)
                #assign_op_fcR=tf.assign(VS[vs+1],VS[vs+1]-Rstep_size*gradfcW)
                OPLIST.append(assign_op_fcR)
            ts+=1
            vs+=2
        if (T.name in sibs):
            grad_hold=gradx
            parent=sibs[T.name]
            grad_hold_var[parent]=grad_hold


    #print('Length of VS',len(VS),'Length of OPLIST',len(OPLIST))
    OPLIST.append(acc)
    OPLIST.append(cross_entropy)
    
    return OPLIST

In [7]:
import h5py

def one_hot(values,n_values=10):
    n_v = np.maximum(n_values,np.max(values) + 1)
    oh=np.eye(n_v)[values]
    return oh

def get_cifar(data_set='cifar10'):
    
    filename = '/project2/cmsc25025/mnist/'+data_set+'_train.hdf5'
    print(filename)
    f = h5py.File(filename, 'r')
    key = list(f.keys())[0]
    # Get the data
    tr = f[key]
    print('tr',tr.shape)
    key = list(f.keys())[1]
    tr_lb=f[key]
    train_data=np.float32(tr[0:45000])/255.
    train_labels=one_hot(np.int32(tr_lb[0:45000]))
    val_data=np.float32(tr[45000:])/255.
    val_labels=one_hot(np.int32(tr_lb[45000:]))
    filename = '/project2/cmsc25025/mnist/'+data_set+'_test.hdf5'
    f = h5py.File(filename, 'r')
    key = list(f.keys())[0]
    # Get the data
    test_data = np.float32(f[key])/255.
    key = list(f.keys())[1]
    test_labels=one_hot(np.int32(f[key]))
    return (train_data, train_labels), (val_data, val_labels), (test_data, test_labels)

def get_data(data_set):
    if ('cifar' in data_set):
        return(get_cifar(data_set=data_set))
    elif (data_set=="mnist"):
        return(get_mnist())
    elif (data_set=="mnist_transform"):
        return(get_mnist_trans())


In [8]:
# Function to get loss and accuracy from only one run of the feature extraction network
from scipy.special import logsumexp

def get_stats(data,labels,fc):
    t1=time.time()
    lo=0.
    acc=0.
    delta=batch_size
    rr=np.arange(0,data.shape[0],delta)
    for i in rr:
        fc_out=fc.eval(feed_dict={x: data[i:i+delta], y_:labels[i:i+delta]})
        log_sf=logsumexp(fc_out,axis=1).reshape((fc_out.shape[0],1))-fc_out
        lo+=np.mean(np.sum(labels[i:i+delta]*log_sf, axis=1))
        acc += np.mean(np.equal(np.argmax(fc_out, axis=1),np.argmax(labels[i:i+delta], axis=1)))
    acc=acc/np.float32(len(rr))
    lo=lo/np.float32(len(rr))
    print('get stats time',time.time()-t1)
    # We return the final functions (they contain all the information about the graph of the network)
    return lo, acc

# Run the iterations of one epoch
def run_epoch(train,val,ii):
        t1=time.time()
        # Randomly shuffle the training data
        np.random.shuffle(ii)
        tr=train[0][ii]
        y=train[1][ii]
        lo=0.
        acc=0.
        ca=0.
        #VS=tf.trainable_variables()
        # Run disjoint batches on shuffled data
        for j in np.arange(0,len(y),batch_size):
            #if (np.mod(j,5000)==0):
            #    print('Batch',j/batch_size)
            batch=(tr[j:j+batch_size],y[j:j+batch_size])
            grad=sess.run(dW_OPs,feed_dict={x: batch[0], y_: batch[1]})
            
            acc+=grad[-2]
            lo+=grad[-1]
            ca+=1
        print('Epoch time',time.time()-t1)
        return acc/ca, lo/ca

In [None]:
def zero_out_weights():
        if (PARS['force_global_prob'][1]>=0 and PARS['force_global_prob'][0]<1.):
            shape=v.get_shape().as_list()
            Z=tf.zeros(shape)
            U=tf.random_uniform(shape)
            zero_op=tf.assign(v,K.tf.where(U<PARS['force_global_prob'][0],v,Z))
            sess.run(zero_op)

# Run the training
import parse_net_pars as pp
import time
PARS={}

net='fncrc_deep_tryR_avg'
pp.parse_text_file(net,PARS,lname='layers', dump=True)
batch_size=PARS['batch_size']
step_size=PARS['eta_init']
num_epochs=PARS['num_epochs']
num_train=PARS['num_train']
data_set=PARS['data_set']
Rstep_size=list(PARS['force_global_prob'])[1]*step_size
print('Rstep_size',Rstep_size)

model_name="model"

train,val,test=get_data(data_set=data_set)
num_train=np.minimum(num_train,train[0].shape[0])
dim=train[0].shape[1]
nchannels=train[0].shape[3]
n_classes=train[1].shape[1]
print('n_classes',n_classes,'dim',dim,'nchannels',nchannels)
    
tf.reset_default_graph()

x = tf.placeholder(tf.float32, shape=[None, dim, dim, nchannels],name="x")
y_ = tf.placeholder(tf.float32, shape=[None,n_classes],name="y")


with tf.Session() as sess:
    
    # Create the network architecture with the above placeholdes as the inputs.
    #cross_entropy, accuracy, conv, convK, fc, pool, mask =create_small_network()
    cross_entropy, accuracy, TS, sibs =create_network(PARS) 
    TS.reverse()
    for t in TS:
        print(t)
    
    # Initialize variables
    sess.run(tf.global_variables_initializer())
    
    # Show trainable variables
    VS=tf.trainable_variables()
    VS.reverse()
    for v in VS:
        print(v.name,v.get_shape().as_list(),np.std(v.eval()))
        zero_out_weights()
    # Differences between W and R
    for t in np.arange(0,len(VS),2):
       print('t',t,'zeros',np.sum(VS[t].eval()==0), np.max(np.abs(VS[t].eval()-VS[t+1].eval())))
    dW_OPs=back_prop() 
    ii=np.arange(0,num_train,1) 
   
    # Run epochs
    AC=[]
    VAC=[]
    for i in range(num_epochs):  # number of epochs
        ac,lo=run_epoch(train,val,ii)
        if (np.mod(i,1)==0):
            #lo,ac = get_stats(train[0][0:num_train],train[1][0:num_train],TS[0])
            AC.append(ac)
            print('Epoch',i,'Train loss, accuracy',lo,ac)
            vlo,vac = get_stats(val[0],val[1],TS[0])
            VAC.append(vac)
            print('EPoch',i,'Validation loss, accuracy',vlo,vac)
            # Test set accuracy
    AC=np.array(AC)
    VAC=np.array(VAC)
    lo,ac = get_stats(test[0],test[1],TS[0])
    print('test accuracy %g' % ac)
    plt.plot(AC)
    plt.plot(VAC)
    plt.show()
    ACC=np.concatenate([np.expand_dims(AC,axis=1),np.expand_dims(VAC,axis=1)],axis=1)
    np.save('ACC',ACC)
    # Save model
    #tf.add_to_collection("optimizer", train_step)
    saver = tf.train.Saver()
    save_path = saver.save(sess, "tmp/"+model_name)
    print("Model saved in path: %s" % save_path)


seed:45239
num_epochs:200
data_set:cifar100
batch_size:500
eta_init:.1
num_train:50000
#eta_schedule:(100.,.01,120,.001)
dep_fac:1.
hinge:1.
force_global_prob:(1.,-1.)
name:input1
name:conv0;parent:input1;filter_size:(3,3);num_filters:16;stride:(1,1)
name:conv0aR;parent:conv0;stride:(1,1);filter_size:(3,3);num_filters:16
name:concatsum0;parent:[conv0,conv0aR]
name:conv1R;parent:concatsum0;filter_size:(3,3);num_filters:32;stride:(1,1)
name:conv1aR;parent:conv1R;stride:(1,1);filter_size:(3,3);num_filters:32
name:concatsum1;parent:[conv1R,conv1aR]
name:pool1;parent:concatsum1;pool_size:(2,2);stride:(2,2);mode:max
name:drop1;drop:.5;parent:pool1
name:conv2R;parent:drop1;filter_size:(3,3);num_filters:64;stride:(1,1)
name:conv2aR;parent:conv2R;stride:(1,1);filter_size:(3,3);num_filters:64
name:concatsum2;parent:[conv2R,conv2aR]
name:pool2;parent:concatsum2;pool_size:(2,2);stride:(2,2)
name:conv3R;parent:pool2;filter_size:(3,3);num_filters:128;stride:(1,1)
name:conv3aR;parent:conv3R;stride:(1

Epoch time 12.640021562576294
Epoch 19 Train loss, accuracy 2.916568634245131 0.28051111300786336
get stats time 0.4323275089263916
EPoch 19 Validation loss, accuracy 3.2410026004791264 0.22400000000000003
Epoch time 12.671654224395752
Epoch 20 Train loss, accuracy 2.9070251756244234 0.2831999998953607
get stats time 0.4328954219818115
EPoch 20 Validation loss, accuracy 3.1070886329650884 0.2524
Epoch time 12.621681690216064
Epoch 21 Train loss, accuracy 2.885306543774075 0.2897777787513203
get stats time 0.4330124855041504
EPoch 21 Validation loss, accuracy 3.03529112739563 0.2598
Epoch time 12.691030263900757
Epoch 22 Train loss, accuracy 2.8184059884813095 0.29846666521496246
get stats time 0.44062018394470215
EPoch 22 Validation loss, accuracy 3.073011648416519 0.25880000000000003
Epoch time 12.676113605499268
Epoch 23 Train loss, accuracy 2.8090037769741483 0.3020000014040205
get stats time 0.43386292457580566
EPoch 23 Validation loss, accuracy 3.0034106044769286 0.2664
Epoch time

get stats time 0.43196749687194824
EPoch 63 Validation loss, accuracy 2.617912422609329 0.3442
Epoch time 12.662503719329834
Epoch 64 Train loss, accuracy 2.300118629137675 0.4036888877550761
get stats time 0.437455415725708
EPoch 64 Validation loss, accuracy 2.7797290401935575 0.324
Epoch time 12.552242517471313
Epoch 65 Train loss, accuracy 2.2812849097781713 0.4080666661262512
get stats time 0.4306468963623047
EPoch 65 Validation loss, accuracy 2.664098189163208 0.3378
Epoch time 12.57861852645874
Epoch 66 Train loss, accuracy 2.2892888095643786 0.4049777779314253
get stats time 0.4300706386566162
EPoch 66 Validation loss, accuracy 2.5935981766700746 0.351
Epoch time 12.581362962722778
Epoch 67 Train loss, accuracy 2.2784031920962864 0.40433333151870304
get stats time 0.43331170082092285
EPoch 67 Validation loss, accuracy 2.5702385149478912 0.3574
Epoch time 12.693990230560303
Epoch 68 Train loss, accuracy 2.2821395635604858 0.4091111093759537
get stats time 0.43766355514526367
EPoc