In [193]:
# Following the tutorial below to implement a CNN using TensorFlow.
#https://github.com/Hvass-Labs/TensorFlow-Tutorials/blob/master/02_Convolutional_Neural_Network.ipynb

In [194]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from sklearn.metrics import confusion_matrix
import time
from datetime import timedelta
import math

In [195]:
# Read the blog/GitHub notebook above for a general idea of how a CNN works.
# Watch the first 10 minutes of the Hvass YouTube video for an explanation of how convolution and pooling work.

In [196]:
tf.__version__ # show the installed TensorFlow version; the cells below use the 1.x placeholder/Session API

'1.12.0'

In [197]:
# Hyper-parameters of the network, defined up front.
# Convolution layer 1: 3x3-pixel filters, 16 of them -> 16 output channels.
# Each output channel is the result of sliding one filter (weight matrix) over the input and
# taking the dot product at every position reached with the chosen stride.
# Note: new_conv_layer() pads the edges (padding="SAME"), so the convolution itself keeps the
# image size; only the 2x2 max-pooling that follows it shrinks the image.
filter_size1 = 3
num_filters1 = 16

# Convolution layer 2
filter_size2 = 3 # 3x3 pixels
num_filters2 = 20 # slightly more filters than layer 1 (16)

# Fully-connected part: 980 = 7*7*20, the flattened size of the second convolution layer's
# output for 28x28 inputs (two 2x2 poolings: 28 -> 14 -> 7, with 20 channels).
# NOTE(review): the name suggests a hidden-layer width, but the value matches the flattened input size.
fc_size = 980

In [198]:
# Each image is delivered flattened, as a 1-D array of 784 values (28x28).
# Targets are one-hot encoded, e.g. [0,0,0,0,0,1,0,0,0,0] is the label 5.
from tensorflow.examples.tutorials.mnist import input_data
data = input_data.read_data_sets(train_dir="MNIST_data/",one_hot=True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [199]:
# Number of examples per split, one per line: train, test, validation.
# (The validation split is not used in this notebook.)
print(
    data.train.num_examples,
    data.test.num_examples,
    data.validation.num_examples,
    sep="\n",
)

55000
10000
5000


In [200]:
# Image geometry and label space, kept in one place for convenience.
img_size = 28                          # pixels along each side of a (square) image
img_size_flat = img_size * img_size    # 784 values once an image is flattened
img_shape = (img_size, img_size)       # (height, width)
num_classes = 10                       # one output per digit, 0 through 9
# Grayscale input, so the first convolution layer sees a single channel; the second
# convolution layer instead receives as many channels as the first layer has filters.
num_channels = 1

# Define helper functions for our CNN

In [201]:
def new_weights(shape):
    """Create a trainable weight variable of the given shape.

    Initial values are drawn from a truncated normal distribution with a
    standard deviation of 0.05 (draws far from the mean are discarded).
    """
    initial_values = tf.truncated_normal(shape, stddev=0.05)
    return tf.Variable(initial_values)

In [202]:
#should be y = x*W+b. Each filter will be dot prod. with the input -> scalar value, thus allowing to add a scalar bias.
#therefore the bias should have the same size as our convo output channels
def new_bias(length):
    """Create a trainable 1-D bias variable with `length` entries, each starting at 0.05."""
    initial_values = tf.constant(value=0.05, shape=[length])
    return tf.Variable(initial_values)

Helper function for creating a new convolution layer. Its input is a 4-D tensor with the dimensions:
1. Image number
2. Y-axis size - 28 for the first layer; for the second layer's input it depends on the pooling, strides and padding used
3. X-axis size - 28 for the first layer; for the second layer's input it depends on the pooling, strides and padding used
4. Channel size - 1 for the input to the first convolution layer; for the second layer it equals the number of filters chosen for the first layer.

If the images had 3 colors, for example, the first layer would receive 3 input channels instead.

In [203]:
#as stated above, we have: 1-image number, 2-number of channels on first convo layer, 3-num_filters for the first convo
#4-pooling or not (to reduce image size of the output, and retain significant details - maxpooling)
def new_conv_layer(input, num_input_channels, filter_size, num_filters, use_pooling=True):
    """Build one convolution layer: conv2d -> add bias -> optional 2x2 max-pool -> ReLU.

    Args:
        input: 4-D tensor from the previous layer, laid out as
            [image number, height, width, channels]. (The name shadows the
            builtin, but callers pass it by keyword, so it is kept.)
        num_input_channels: number of channels in `input`.
        filter_size: width and height of each (square) filter, in pixels.
        num_filters: number of filters, i.e. the number of output channels.
        use_pooling: when truthy, down-sample the result with 2x2 max-pooling.

    Returns:
        (layer, weights): the resulting 4-D tensor and the filter-weight
        variable (returned as well so the filters can be plotted).
    """
    # Filter-weight shape in the layout tf.nn.conv2d expects:
    # [filter_height, filter_width, in_channels, out_channels].
    shape = [filter_size, filter_size, num_input_channels, num_filters]

    # One filter bank and one scalar bias per output channel.
    weights = new_weights(shape=shape)
    biases = new_bias(length=num_filters)

    # strides = [image, y, x, channel]: visit every image and channel and move the
    # filter one pixel at a time. padding="SAME" zero-pads the borders so the
    # convolution output keeps the input's height and width.
    layer = tf.nn.conv2d(input=input, filter=weights, strides=[1, 1, 1, 1], padding="SAME")
    layer = layer + biases  # broadcast: one bias per output channel

    if use_pooling:
        # Non-overlapping 2x2 max-pooling (window 2x2, step 2 in y and x).
        layer = tf.nn.max_pool(value=layer, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")

    # ReLU is normally applied before pooling, but relu(max_pool(x)) == max_pool(relu(x)),
    # so applying it afterwards gives the same result on fewer values.
    layer = tf.nn.relu(layer)

    return layer, weights

In [204]:
#helper function for flattening a layer
#our convo layer output will be [img number, ysize,xsize,channel] and we need to compress this to 2d for fully conn. network

def flatten_layer(layer): #input the convo layer
    """Flatten every dimension of `layer` except the first into one feature axis.

    Args:
        layer: tensor whose first dimension is the image (batch) index, e.g. a
            convolution output [num_images, img_height, img_width, num_channels].

    Returns:
        (layer_flat, num_features): the tensor reshaped to
        [num_images, num_features], and the feature count, which is the
        product of all non-batch dimensions.
    """
    layer_shape = layer.get_shape()

    # Multiply together ALL dimensions after the batch axis. For the usual 4-D
    # convolution output this is img_height * img_width * num_channels; slicing
    # with [1:] (rather than [1:4]) keeps the count right for other ranks too.
    num_features = layer_shape[1:].num_elements()

    # -1 lets TensorFlow infer the first dimension (the number of images in the
    # batch) while the second dimension is fixed to num_features.
    layer_flat = tf.reshape(layer, [-1, num_features])

    return layer_flat, num_features

In [205]:
#Finally, a helper function for our fcc, this is just a 1 middle layer neural net from our prev flattened output
def new_fc_layer(input,             # the previous layer
                 num_inputs,        # number of inputs from the previous layer
                 num_outputs,       # number of outputs
                 use_relu=True):    # apply ReLU activation?
    """Build one fully-connected layer: input * weights + biases, optionally followed by ReLU.

    Args:
        input: 2-D tensor [num_images, num_inputs] from the previous layer.
        num_inputs: number of values per image in `input`.
        num_outputs: number of units (outputs per image) in this layer.
        use_relu: when truthy, apply ReLU to the result; leave it off for a
            final layer whose raw outputs are passed on to softmax.

    Returns:
        The layer's output tensor, [num_images, num_outputs].
    """
    # Y = X * weights + b, with one weight column and one bias per output unit.
    weights = new_weights(shape=[num_inputs, num_outputs])
    biases = new_bias(length=num_outputs)

    layer = tf.matmul(input, weights) + biases
    if use_relu:
        layer = tf.nn.relu(layer)
    return layer

# Let's run the model created 

In [206]:
x = tf.placeholder(tf.float32, shape=[None,img_size_flat],name="x") # one flattened image (784 values) per row; None leaves the number of images open

In [207]:
# The convolution layers need a 4-D tensor: [number of images, y pixels, x pixels, number of channels].
# -1 means the number of images is inferred automatically.
x_image = tf.reshape(x,[-1,img_size,img_size,num_channels])

In [208]:
# True labels, one-hot encoded: one row of num_classes (10) entries per image.
y_true = tf.placeholder(tf.float32,shape=[None, num_classes],name="y_true")

In [209]:
y_true_cls = tf.argmax(y_true,axis=1) 
# Index of the largest value in each one-hot row of y_true, i.e. the true class number.
# E.g. [0,0,1,..,0] -> 2, because the 1 sits at index 2.

In [210]:
# Convolution layer 1.
# Its input is x_image: the image batch reshaped to a 4-D tensor of 28x28 single-channel images.
layer_conv1, weights_conv1 = new_conv_layer(input=x_image,num_input_channels=num_channels, filter_size=filter_size1,
                                           num_filters=num_filters1,use_pooling=True)

In [211]:
layer_conv1 # check the shape; "?" means the number of input images has not been fixed yet

<tf.Tensor 'Relu_4:0' shape=(?, 14, 14, 16) dtype=float32>

In [212]:
# Convolution layer 2, fed with the output of convolution layer 1.
# The number of filters in layer 1 determines how many channels layer 1 outputs,
# so num_filters1 is the number of input channels here.
layer_conv2,weights_conv2 = new_conv_layer(input=layer_conv1,num_input_channels=num_filters1,filter_size=filter_size2,
                                          num_filters=num_filters2,use_pooling=True)

In [213]:
layer_conv2 # check the shape after the second convolution + pooling

<tf.Tensor 'Relu_5:0' shape=(?, 7, 7, 20) dtype=float32>

In [214]:
# Flatten the convolution output to 2-D so it can be fed into the fully-connected layers.
layer_flat,num_features = flatten_layer(layer_conv2)

In [215]:
layer_flat # second dimension = product of layer_conv2's non-batch dimensions (7*7*20)

<tf.Tensor 'Reshape_5:0' shape=(?, 980) dtype=float32>

In [216]:
num_features # number of flattened values per image; these are the inputs of the fully-connected part

980

# Fully Connected layers

In [217]:
# The first fully-connected layer must accept exactly as many inputs per image as layer_flat
# provides, so take that count from flatten_layer() (num_features) rather than from a
# separately maintained constant that only matches by coincidence.
layer_fc1 = new_fc_layer(input=layer_flat, num_inputs=num_features, num_outputs=600, use_relu=True) # 600 hidden units (arbitrary choice)
layer_fc1

<tf.Tensor 'Relu_6:0' shape=(?, 600) dtype=float32>

In [218]:
# The number of inputs equals the number of outputs of the previous layer; read it from
# layer_fc1's static shape so it cannot drift from the value chosen when layer_fc1 was built.
# No ReLU here: these raw outputs are passed to softmax LATER.
layer_fc2 = new_fc_layer(input=layer_fc1,num_inputs=layer_fc1.get_shape().as_list()[1],num_outputs=num_classes,use_relu=False)
layer_fc2

<tf.Tensor 'add_19:0' shape=(?, 10) dtype=float32>

In [219]:
# Apply softmax to the last layer's output to turn it into per-class probabilities.
y_pred = tf.nn.softmax(layer_fc2)

In [220]:
y_pred_cls = tf.argmax(y_pred,axis=1) # index of the highest-probability class for each image: the predicted digit

In [221]:
# y_true is the placeholder defined above.
# Cost function: we optimize the cross-entropy to minimize the classification errors.
# The op takes the raw outputs of layer_fc2 (not y_pred) because it applies softmax internally.
# It yields one cross-entropy value per image (the per-class terms are summed within each image);
# it does not combine the images - that happens when the cost is averaged.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=layer_fc2,labels=y_true)

In [222]:
# Average the per-image cross-entropy values into a single scalar cost.
cost = tf.reduce_mean(cross_entropy) 

In [223]:
# Adam, an adaptive form of gradient descent, with a learning rate of 0.0005.
# `optimizer` ends up bound to the training operation that minimizes `cost`
# (not to the AdamOptimizer object itself); running it performs one update step.
optimizer = tf.train.AdamOptimizer(learning_rate=0.0005).minimize(cost)

In [224]:
# Performance measurements
correct_pred = tf.equal(y_pred_cls,y_true_cls) 
# Both operands are class numbers (e.g. 5 vs. 2), not one-hot arrays.
# The result holds one True/False per image.

In [225]:
accuracy = tf.reduce_mean(tf.cast(correct_pred,dtype=tf.float32)) 
# Cast the booleans to 1.0/0.0 and average over all images fed in to get the fraction classified correctly.

# Tensorflow execution

In [226]:
init = tf.global_variables_initializer() # op that initializes the variables created above; it is run once the session exists

In [227]:
# Total number of training iterations performed so far (updated by optimize()).
total_iterations = 0
def optimize(num_iterations, batch_size=50):
    """Run `num_iterations` further training steps and report progress.

    Uses the notebook-level names `data`, `sess`, `x`, `y_true`, `optimizer`
    and `accuracy`, and advances the global `total_iterations` counter so that
    repeated calls continue the iteration numbering.

    Args:
        num_iterations: number of training batches to process in this call.
        batch_size: number of training images per batch (default 50).

    Returns:
        The most recently measured training-batch accuracy, or None if no
        iteration in this call fell on a reporting point (every 100th
        iteration, counted globally).
    """
    global total_iterations
    start_time = time.time()

    # Bound up front: the loop may never reach a reporting iteration (e.g.
    # num_iterations == 0, or a short run starting off a multiple of 100).
    acc = None

    for i in range(total_iterations, total_iterations + num_iterations):
        # The MNIST helper hands out the next training batch on each call.
        x_batch, y_true_batch = data.train.next_batch(batch_size=batch_size)

        # Map the batch onto the graph's placeholders.
        feed_dict_train = {x: x_batch, y_true: y_true_batch}
        sess.run(optimizer, feed_dict=feed_dict_train)

        if i % 100 == 0:
            # Accuracy on the batch just trained on, so it moves in steps of
            # 1/batch_size and is only a rough progress indicator.
            acc = sess.run(accuracy, feed_dict=feed_dict_train)
            msg = "Optimization Iteration: {0: >6}, Training Accuracy: {1:>6.1%}"
            print(msg.format(i + 1, acc))

    total_iterations += num_iterations
    end_time = time.time()

    time_dif = end_time - start_time
    print("time usage: " + str(timedelta(seconds=int(round(time_dif)))))
    return acc

In [228]:
#with tf.Session() as sess:
#    sess.run(init)
#    train_batch_size = 60
    
#    acc = sess.run(optimize(num_iterations=1000))


In [229]:
sess = tf.Session() # session that optimize() uses, through the global name `sess`, to run graph operations

In [230]:
sess.run(init) # initialize the variables (weights and biases) before any training step

In [231]:
train_batch_size = 50 # NOTE(review): optimize() does not read this variable; its own batch size (50) merely matches this value

In [234]:
# Resets only the iteration counter (so the progress messages start from 1 again);
# this cell does not re-initialize the trained weights.
total_iterations = 0 #need to reset value first
optimize(num_iterations=1000)

0.86
Optimization Iteration:      1, Training Accuracy:  86.0%
0.98
Optimization Iteration:    101, Training Accuracy:  98.0%
0.96
Optimization Iteration:    201, Training Accuracy:  96.0%
1.0
Optimization Iteration:    301, Training Accuracy: 100.0%
1.0
Optimization Iteration:    401, Training Accuracy: 100.0%
0.96
Optimization Iteration:    501, Training Accuracy:  96.0%
0.98
Optimization Iteration:    601, Training Accuracy:  98.0%
1.0
Optimization Iteration:    701, Training Accuracy: 100.0%
0.98
Optimization Iteration:    801, Training Accuracy:  98.0%
0.96
Optimization Iteration:    901, Training Accuracy:  96.0%
time usage: 0:00:23


0.96

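#Note on the "rounded" accuracy values: the training accuracy is measured on a single batch of 50 images, so it can only change in steps of 2% (1/50); evaluating on a larger set such as the test data would give a finer-grained figure. Still, we've made a decent start on the theory of CNNs and on implementing a basic model with much higher accuracy than the normal DNN.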