### Load the dataset and reshape accordingly 

In [1]:
from __future__ import division
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
print train_data.shape #(42000, 784)
print test_data.shape #(28000, 784)

(42000, 785)
(28000, 784)


In [3]:
train = train_data.drop(['label'], axis = 1)
label = train_data.label
target = pd.get_dummies(label, columns=['label'], drop_first=False)

In [4]:
x_train,x_cv,y_train,y_cv = train_test_split(train,target,test_size = 0.25, random_state = 4)
print x_train.shape #(31500, 784)
print x_cv.shape #(10500, 784)

(31500, 784)
(10500, 784)


In [5]:
## Reshape df
x_arr = np.array(x_train)
x_cv_arr = np.array(x_cv)
X = x_arr.reshape(31500,28,28,1)
X_cv = x_cv_arr.reshape(10500,28,28,1)

### Initialize Weights and Biases

In [6]:
## Initializing weights and bias for convolution layer

W1 = 0.1*np.random.rand(3,3,3,1)
b1 = 0.1*np.random.rand(3,1)

## Initializing weights and bias for fully connected layer
theta = 0.1*np.random.rand(2352,10)
bias = 0.1*np.random.rand(1,10)

## Normalizing input data
x_arr -= int(np.mean(x_arr))
x_arr = x_arr.astype(float)
x_arr /= int(np.std(x_arr))

## Stacking features and labels 
train_data = np.hstack((x_arr,np.array(y_train)))
t = train_data[0:400]

## Normalizing cross-validation data
x_cv_arr -= int(np.mean(x_cv_arr))
x_cv_arr = x_cv_arr.astype(float)
x_cv_arr /= int(np.std(x_cv_arr))

## Training the model on 400 images, and cv on 100 due to computation issue
cv_data = np.hstack((x_cv_arr,np.array(y_cv)))
test_data = x_cv_arr[0:100]
Y_cv = np.array(y_cv)[0:100]

np.random.shuffle(train_data)

## Assigning hyperparameter values
learning_rate = 0.01
batch_size = 40
num_epochs = 10
num_images = len(t)   ##Number of the input training examples
w = 28
l = 1
l1 = len(W1)    ## no. of filters in W1
f = len(W1[0])

# print X_cv.shape (10500, 28, 28, 1)

### Define functions 

In [7]:
## PADDING function
def zero_pad(data, pad):
    data_pad = np.pad(data,((0,0),(pad,pad),(pad,pad),(0,0)), 'constant')
    return data_pad

## Function to get the coordinates of maxpool element
def idxargmax(a):
    idx = np.argmax(a, axis=None)
    multi_idx = np.unravel_index(idx, a.shape)
    if np.isnan(a[multi_idx]):
        nan_count = np.sum(np.isnan(a))
        idx = np.argpartition(a, -nan_count-1, axis=None)[-nan_count-1]
        idx = np.argsort(a, axis=None)[-nan_count-1]
        multi_idx = np.unravel_index(idx, a.shape)
    return multi_idx

## Maxpool function
def max_pool(X,f,stride):
    (m, w, w, c) = X.shape
    output_size = int((w-f)/stride+1)
    pool = np.zeros((m,output_size,output_size, c))
    for e in range(0,m):
        for k in range(0,c):
            for i in range(output_size):
                for j in range(output_size):
                    pool[e,i,j,k] = np.max(X[e,i*stride:i*stride+f,j*stride:j*stride+f,k])
    return pool

## Softmax
def softmax_cost(out,y):
    eout = np.exp(out, dtype=np.float)  
    probs = eout/np.sum(eout, axis = 1)[:,None]
    
    p = np.sum(np.multiply(y,probs), axis = 1)
    prob_label = np.argmax(np.array(probs), axis = 1)    # arguments of max values
    cost = -np.log(p)    # -log(y*prob)
    
    return p, cost, probs, prob_label

In [8]:
def conv_net(input_data, Y, W1, b1, theta, bias):
    
## Forward propagation

    ## Input shape
    m, w, w, c = input_data.shape
    
    ## no. of filters in layer_1
    l1 = len(W1)

    ## Shape of the filter used
    (f, f, _) = W1[0].shape
    pad = 1
    ## stride = 1, to make calculations easier
    
    ## Convolution layer1 output dimensions
    nw1 = w+(2*pad)-f + 1

    ## Initializing output image matrices after convolutions
    conv1 = np.zeros((m,nw1,nw1,l1))

    ## Padding the input images
    input_pad = zero_pad(input_data,pad)

    ## Convolution layer
    ## Looping over the no. of examples, no. of filters, height and width (h,w) of image
    for i in range(0,m):
        for j in range(0,l1):
            for k in range(0,nw1): 
                for l in range(0,nw1):
                    conv1[i,k,l,j] = np.sum(input_pad[i,k:k+f,l:l+f]*W1[j])+b1[j]

        conv1[i,:,:,:][conv1[i,:,:,:] <= 0] = 0                           ##relu activation
    #print "conv1",conv1.shape
    '''
    ## Pooling layer after max_pooling filter size of 2x2 and stride 2
    pooled_layer = max_pool(conv1, 2, 2)  
    '''
    ## Fully connected layer of neurons
    fc1 = conv1.reshape(m,int((nw1)*(nw1)*l1))
    #print "fc1", fc1.shape
    
    ## Output layer of mx10 activation units
    out = np.dot(fc1,theta) + bias
        
    ## Using softmax to get the cost    
    p, cost, probs, prob_label = softmax_cost(out, Y)
    
    acc = []
    for i in range(0,len(Y)):
        if prob_label[i]==np.argmax(np.array(Y)[i,:]):
            acc.append(1)
        else:
            acc.append(0)

## Backpropagation to calculate gradients 
    
    #Backpropogation across loss and softmax
    d_out = probs - Y
    #print "d_out", d_out.shape
    #Fully connected layer
    dtheta = np.dot(d_out.T, fc1)
    dbias = np.mean(d_out, axis = 0).reshape(1,10)    

    dfc1 = np.dot(theta,d_out.T)
    #print "dfc1",dfc1.shape
    #Pooling and Convolution layer
    #dpool = dfc1.T.reshape((m, int(nw1/2), int(nw1/2), l1))
    dconv1 = dfc1.T.reshape((m, nw1, nw1, l1)) #initialization 
    #print "dconv1", dconv1.shape
    '''
    for k in range(0,m):
        for c in range(0,l1):
            i=0
            while(i<nw1):
                j=0
                while(j<nw1):
                    (a,b) = idxargmax(conv1[k,i:i+2,j:j+2,c]) ## Getting indexes of maximum value in the array
                    dconv1[k,i+a,j+b,c] = dpool[k,int(i/2),int(j/2),c]
                    j+=2
                i+=2
    '''
    dconv1[conv1<=0]=0 #brelu

    
    dW1_stack = np.zeros((m,l1,f,f,1))
    db1_stack = np.zeros((m,l1,1))

    dW1 = np.zeros((l1,f,f,1))
    db1 = np.zeros((l1,1))

    ## looping through the one batch of 40 examples
    for i in range(0,m):
        for c in range(0,l1):
            for x in range(0,nw1):
                for y in range(0,nw1):
                    dW1_stack[i,:,:,c] += dconv1[i,x,y,c]*input_pad[i,x:x+f,y:y+f,:]
            db1_stack[i,c] = np.sum(dconv1[i,:,:,c])
        dconv1[conv1<=0]=0
        
        dW1 = np.mean(dW1_stack, axis = 0)
        db1 = np.mean(db1_stack, axis = 0)
        
    return dW1, db1, dtheta, dbias, cost, probs, prob_label, acc 

In [9]:
def optimizer(batch,learning_rate,W1,b1,theta,bias):
    
    ## Slicing train data and labels from batch
    X = batch[:,0:-10]
    X = X.reshape(len(batch), w, w, l)
    Y = batch[:,784:794]
    
    
    batch_size = len(batch)
    
    ## Initializing gradient matrices 
    bW1 = {}
    dW1 = np.zeros((l1,f,f,1))
    db1 = np.zeros((l1,1))
    
    dtheta = np.zeros(theta.shape)
    dbias = np.zeros(bias.shape)
    
    grads = conv_net(X,Y,W1,b1,theta,bias)
    [dW1, db1, dtheta, dbias, cost_, probs_, prob_label, acc_] = grads
    
    #Updating weights for convolution layer and biases
    W1 = W1-learning_rate*(dW1) #convolution
    b1 = b1-learning_rate*(db1)
    theta = theta-learning_rate*(dtheta.T) #fully connected layer
    bias = bias-learning_rate*(dbias)
    
    batch_cost = np.mean(cost_) # calculating the cost for each batch
    batch_accuracy = sum(acc_)/len(acc_) #Reporting the accuracy for each batch
    
    return W1, b1, theta, bias, batch_cost, acc_, batch_accuracy

In [10]:
def main_init(train_data,W1,b1,theta,bias):
    
    cost = []
    accuracy = []
    for epoch in range(0, num_epochs):
        batches = [train_data[k:k + batch_size] for k in xrange(0, len(train_data), batch_size)]
        x=0
        i = 1
        for batch in batches:
            
            output = optimizer(batch,learning_rate,W1,b1,theta,bias)
            [W1, b1, theta, bias, batch_cost,acc_,batch_acc] = output
                        
            cost.append(batch_cost)
            accuracy.append(batch_acc)

            print 'ep:%d, Batch_no = %f, Cost = %f, Accuracy = %.2f %%' %(epoch,i,batch_cost,batch_acc*100) 
            i+=1
        print '\nAfter epoch %d, Batch Cost = %f, Batch Accuracy = %.2f %%\n' %(epoch,batch_cost,batch_acc*100)
    return W1,b1,theta,bias,cost,accuracy

In [11]:
W1_t,b1_t,theta_t,bias_t,cost_t,accuracy_t = main_init(t,W1,b1,theta,bias)

ep:0, Batch_no = 1.000000, Cost = 2.541080, Accuracy = 7.50 %
ep:0, Batch_no = 2.000000, Cost = 8.041162, Accuracy = 5.00 %
ep:0, Batch_no = 3.000000, Cost = 2.946771, Accuracy = 42.50 %
ep:0, Batch_no = 4.000000, Cost = 2.259020, Accuracy = 7.50 %
ep:0, Batch_no = 5.000000, Cost = 2.552195, Accuracy = 32.50 %
ep:0, Batch_no = 6.000000, Cost = 2.207576, Accuracy = 30.00 %
ep:0, Batch_no = 7.000000, Cost = 2.168430, Accuracy = 40.00 %
ep:0, Batch_no = 8.000000, Cost = 2.434300, Accuracy = 32.50 %
ep:0, Batch_no = 9.000000, Cost = 2.162686, Accuracy = 32.50 %
ep:0, Batch_no = 10.000000, Cost = 2.205329, Accuracy = 37.50 %

After epoch 0, Batch Cost = 2.205329, Batch Accuracy = 37.50 %

ep:1, Batch_no = 1.000000, Cost = 2.360860, Accuracy = 42.50 %
ep:1, Batch_no = 2.000000, Cost = 1.878623, Accuracy = 62.50 %
ep:1, Batch_no = 3.000000, Cost = 1.936615, Accuracy = 60.00 %
ep:1, Batch_no = 4.000000, Cost = 1.704222, Accuracy = 55.00 %
ep:1, Batch_no = 5.000000, Cost = 2.355322, Accuracy = 