In [1]:
import tensorflow as tf
import xml
import numpy as np
from pylab import *
%matplotlib inline
import xml.etree.ElementTree as ET
import cv2
import os
from matplotlib import patches
from random import shuffle

In [2]:
# PASCAL VOC Data import
# DO NOT EDIT HERE! Modify in its own file

# Dataset locations, each built on top of the current user's home directory.
home_dir = os.path.expanduser('~') + "/"
datasets_dir = home_dir + "external_drive/"
voc_2012_dir = datasets_dir + "VOC/VOC2012/VOCdevkit/VOC2012/"

# Class names listed in index order; the first name gets index 1.
class_names = [
    "aeroplane", "bicycle", "boat", "bottle", "bus",
    "car", "cat", "chair", "cow", "diningtable",
    "dog", "horse", "motorbike", "person", "pottedplant",
    "sheep", "train", "tvmonitor", "sofa", "bird",
]

# Name -> index (1..20).
classes_to_index = {name: position for position, name in enumerate(class_names, 1)}

# Index -> name. Slot 0 has no class and holds the integer placeholder 0.
indexes_to_classes = [0] + class_names

class bounding_box:
    """A labelled rectangle.

    The constructor stores its arguments unchanged as the attributes
    ``class_name``, ``xmin``, ``xmax``, ``ymin`` and ``ymax``.
    """

    def __init__(self, class_name, xmin, xmax, ymin, ymax):
        self.class_name = class_name
        self.xmin, self.xmax = xmin, xmax
        self.ymin, self.ymax = ymin, ymax

# Get list of training images: one entry per line of the split file,
# with leading/trailing whitespace removed.
train_filenames = []
with open(voc_2012_dir + "ImageSets/Main/train.txt", "r") as split_file:
    train_filenames.extend(entry.strip() for entry in split_file)
        
# Read an annotation file, return a list of bounding boxes
def get_bounding_boxes(filename):
    """Parse one annotation file into a list of ``bounding_box`` objects.

    Args:
        filename: name of the annotation without directory or extension; the
            file read is ``voc_2012_dir + "Annotations/<filename>.xml"``.

    Returns:
        One ``bounding_box`` per ``object`` element directly under the XML
        root, in document order, with integer coordinates. The list is empty
        when the file contains no such element.

    Raises:
        ValueError: if a coordinate's text is not numeric.
    """
    xmldoc = ET.parse(voc_2012_dir + "Annotations/{}.xml".format(filename))

    def coordinate(box, tag):
        # int("45.7") raises ValueError, so go through float first; this
        # accepts both "45" and decimal-formatted text (truncated toward 0).
        return int(float(box.find(tag).text))

    bounding_boxes = []
    for thing in xmldoc.findall('object'):
        box = thing.find('bndbox')
        bounding_boxes.append(bounding_box(
            thing.find('name').text,
            coordinate(box, 'xmin'),
            coordinate(box, 'xmax'),
            coordinate(box, 'ymin'),
            coordinate(box, 'ymax'),
        ))
    return bounding_boxes


# In-memory cache of decoded images, keyed by filename. It stops growing once
# it holds num_to_memoize entries; later misses are read but not stored.
memoized_images = {}
num_to_memoize = 5000

# Returns a [height, width, depth] image in RGB pixel order
def read_jpeg_memoized(filename):
    """Load ``JPEGImages/<filename>.jpg`` under ``voc_2012_dir``, using the cache.

    The cache stores the array exactly as ``cv2.imread`` returned it (BGR);
    the BGR -> RGB conversion is applied on every call.

    Args:
        filename: image name without directory or extension.

    Returns:
        The image converted with ``cv2.COLOR_BGR2RGB``.

    Raises:
        IOError: if ``cv2.imread`` returns None for the path. Failed reads
            are never stored in the cache.
    """
    # Cached values are never None, so None unambiguously means "not cached".
    image = memoized_images.get(filename)
    if image is None:
        path = voc_2012_dir + "JPEGImages/{}.jpg".format(filename)
        image = cv2.imread(path)
        if image is None:
            # Fail here with the offending path instead of letting
            # cv2.cvtColor raise an opaque error on a None input.
            raise IOError("Failed to read image: {}".format(path))
        if len(memoized_images) < num_to_memoize:
            memoized_images[filename] = image
    # Convert BGR to RGB.
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Returns a [height, width, depth] image in RGB pixel order
def read_jpeg(filename):
    """Load ``JPEGImages/<filename>.jpg`` under ``voc_2012_dir`` without caching.

    Args:
        filename: image name without directory or extension.

    Returns:
        The image converted with ``cv2.COLOR_BGR2RGB``.

    Raises:
        IOError: if ``cv2.imread`` returns None for the path.
    """
    path = voc_2012_dir + "JPEGImages/{}.jpg".format(filename)
    image = cv2.imread(path)
    if image is None:
        # Fail here with the offending path instead of letting
        # cv2.cvtColor raise an opaque error on a None input.
        raise IOError("Failed to read image: {}".format(path))
    # Convert BGR to RGB.
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Plots an image from either filename or numpy array. Optionally draws bounding boxes with class name labels
def show_image(image, bounding_boxes = []):
    """Draw an image on a new figure, with optional labelled boxes.

    Args:
        image: a ``str`` is loaded through ``read_jpeg``; any other value is
            handed to ``imshow`` unchanged.
        bounding_boxes: iterable of objects exposing ``xmin``, ``xmax``,
            ``ymin``, ``ymax`` and ``class_name``. Each one is outlined in red
            and labelled with its class name.
    """
    pixels = read_jpeg(image) if isinstance(image, str) else image
    ax = plt.figure().add_subplot(111, aspect='equal')
    ax.imshow(pixels)
    for box in bounding_boxes:
        box_width = box.xmax - box.xmin
        box_height = box.ymax - box.ymin
        outline = patches.Rectangle((box.xmin, box.ymin), box_width, box_height,
                                    fill=False, edgecolor="red", linewidth=3)
        ax.add_patch(outline)
        # The label sits slightly inside the box's (xmin, ymin) corner.
        ax.text(box.xmin + 10, box.ymin + 30, box.class_name, fontsize=16, color = "red")
        

# Gets a batch of images from a given list of filenames
def get_batch(filenames, num_images, img_size, num_classes, memoize):
    """Build a random batch of cropped, resized object images and their labels.

    For each visited file one annotated box is picked at random; the file
    contributes to the batch only if ``check_bounding_box`` accepts that box.

    Args:
        filenames: image names to draw from. The caller's list is left
            untouched; a shuffled copy is used internally.
        num_images: number of examples to return.
        img_size: side length passed to ``check_bounding_box`` and
            ``crop_resize_image``.
        num_classes: width of the one-hot label rows.
        memoize: when true images are read with ``read_jpeg_memoized``,
            otherwise with ``read_jpeg``.

    Returns:
        ``(images, labels)``: ``np.array`` of the crops and the result of
        ``dense_to_one_hot`` on their class indexes.

    Raises:
        ValueError: if ``filenames`` is empty, or if a complete pass over the
            files yields no acceptable box.
    """
    candidates = list(filenames)
    if num_images > 0 and not candidates:
        raise ValueError("get_batch needs at least one filename")

    load_image = read_jpeg_memoized if memoize else read_jpeg
    images = []
    classes = []
    while len(images) < num_images:
        # One pass over a fresh permutation. Further passes only happen when
        # a single pass could not fill the batch (files may then repeat).
        shuffle(candidates)
        collected_before = len(images)
        for filename in candidates:
            bounding_boxes = get_bounding_boxes(filename)
            if not bounding_boxes:
                # Annotation without objects: nothing to crop.
                continue
            shuffle(bounding_boxes)
            chosen = bounding_boxes[0]
            if check_bounding_box(chosen, img_size):
                image = crop_resize_image(load_image(filename), chosen, img_size)
                images.append(image)
                classes.append(classes_to_index[chosen.class_name])
                if len(images) == num_images:
                    break
        if len(images) == collected_before:
            # Guard against spinning forever when nothing qualifies.
            raise ValueError("no bounding box passed check_bounding_box "
                             "for img_size={}".format(img_size))
    return np.array(images), dense_to_one_hot(np.array(classes), num_classes)

# Check if a bounding box has a size and shape similar to the desired dimension
def check_bounding_box(bounding_box, desired_dimension):
    """Decide whether a box is large and square enough to be used.

    A box is accepted when both hold:
      * its shorter side falls short of ``desired_dimension`` by less than
        half of ``desired_dimension``;
      * its width and height differ by less than half of the width.

    Args:
        bounding_box: object exposing ``xmin``, ``xmax``, ``ymin``, ``ymax``.
        desired_dimension: target side length.

    Returns:
        True if the box passes both checks, otherwise False.
    """
    box_width = bounding_box.xmax - bounding_box.xmin
    box_height = bounding_box.ymax - bounding_box.ymin
    shortfall = desired_dimension - min(box_width, box_height)
    if not (shortfall / desired_dimension < .5):
        # Too small; returning here also skips the division by the width.
        return False
    return abs(box_width - box_height) / box_width < .5
            
# Gets the des_dim x des_dimension scaled version of the top leftmost square of the given bounding box region of the image
def crop_resize_image(image, bounding_box, desired_dimension):
    """Cut a square out of the box region and scale it to the target size.

    The square starts at the box's ``(xmin, ymin)`` corner and its side is
    the shorter of the box's width and height.

    Args:
        image: array indexed as ``image[row, column]``.
        bounding_box: object exposing ``xmin``, ``xmax``, ``ymin``, ``ymax``.
        desired_dimension: side length of the returned image.

    Returns:
        The result of ``cv2.resize`` on the square crop.
    """
    left, top = bounding_box.xmin, bounding_box.ymin
    side = min(bounding_box.xmax - left, bounding_box.ymax - top)
    square = image[top:top + side, left:left + side]
    return cv2.resize(square, (desired_dimension, desired_dimension))

def dense_to_one_hot(labels_dense, num_classes=10):
    """Turn an array of integer class labels into one-hot rows.

    Args:
        labels_dense: NumPy array of integer labels; its first axis gives the
            number of rows produced.
        num_classes: number of columns in the result.

    Returns:
        A ``(num_labels, num_classes)`` float array of zeros with a single 1
        per row, placed at that row's label.
    """
    row_count = labels_dense.shape[0]
    encoded = np.zeros((row_count, num_classes))
    # Position of each row's 1 in the flattened (row-major) result.
    flat_positions = num_classes * np.arange(row_count) + labels_dense.ravel()
    encoded.flat[flat_positions] = 1
    return encoded

In [7]:
# Spot check: ymax of the first box parsed from the annotation of the second
# entry in train_filenames.
get_bounding_boxes(train_filenames[1])[0].ymax

176

In [10]:
# Write one line per training image that contains at least one "person" box:
#   "<image path>": (xmin.0, ymin.0, xmax.0, ymax.0), (...);
with open("pascal_voc_train_people.idl", "w") as handle:
    lines = []
    for filename in train_filenames:
        people_boxes = [box for box in get_bounding_boxes(filename)
                        if box.class_name == "person"]
        if not people_boxes:
            # Images without people are left out of the file entirely.
            continue
        quoted_path = '"' + voc_2012_dir + "JPEGImages/{}.jpg".format(filename) + '"'
        rectangles = ", ".join(
            "({xmin}.0, {ymin}.0, {xmax}.0, {ymax}.0)".format(xmin=box.xmin,
                                                             ymin=box.ymin,
                                                             xmax=box.xmax,
                                                             ymax=box.ymax)
            for box in people_boxes)
        lines.append(quoted_path + ": " + rectangles + ";" + "\n")
    handle.writelines(lines)

In [6]:
# Spot check: full JPEG path built for the second entry in train_filenames.
voc_2012_dir + "JPEGImages/{}.jpg".format(train_filenames[1])

'/home/ubuntu/external_drive/VOC/VOC2012/VOCdevkit/VOC2012/JPEGImages/2008_000015.jpg'

In [3]:
def weight_variable(shape):
    """Return a ``tf.Variable`` of ``shape`` initialised from a truncated normal with stddev 0.01."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.01))

def bias_variable(shape, value):
    """Return a ``tf.Variable`` of ``shape`` with every element set to ``float(value)``."""
    return tf.Variable(tf.constant(float(value), shape=shape))

def conv2d(x, W, stride):
    """Apply ``tf.nn.conv2d`` with filter ``W`` to ``x`` using SAME padding.

    ``stride`` is used for the two middle entries of the strides list; the
    first and last entries are 1.
    """
    window_step = [1, stride, stride, 1]
    return tf.nn.conv2d(x, W, strides=window_step, padding='SAME')

def max_pool_2x2(x):
    """Apply ``tf.nn.max_pool`` to ``x`` with a [1, 2, 2, 1] window, the same strides, and SAME padding."""
    window = [1, 2, 2, 1]
    step = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')

In [4]:
# Need to implement image resizing (and maybe also various augmentations).
# Bring pixel values from [0, 255] to [0, 1].
# Subtract the mean.
# The bounding boxes will then need to be rescaled as well.

In [5]:
# TURN THESE INTO TENSORFLOW VARIABLES SO THEY CAN BE DYNAMICALLY CHANGED

# Input geometry: images are image_size x image_size pixels with image_depth
# channels. image_size must be divisible by the pooling layers.
image_size, image_depth = 160, 3

# Batch sizes: the regular one and the larger test one.
batch_size, test_batch_size = 128, 256

# Number of classes (width of the label vectors).
num_classes = 21


In [6]:
# Something like Alexnet
#
# Five convolutional layers followed by three fully connected layers. With
# SAME padding the spatial size shrinks by 4 in conv1 (stride 4) and by 2 in
# each of the three 2x2 pools, i.e. by 32 overall, which is where the
# image_size // 32 in the flatten step comes from.
sess = tf.InteractiveSession()

# Batched input
x = tf.placeholder(tf.float32, shape=[None, image_size, image_size, image_depth]) # batch size, image size, image size, image depth
y_ = tf.placeholder(tf.float32, shape=[None, num_classes]) # batch size, num_classes

# First Convolutional Layer
# Variables
W_conv1 = weight_variable([11, 11, image_depth, 96]) # filter size, filter size, input channels (image depth), output channels
b_conv1 = bias_variable([96], 0)
# Layers
h_conv1 = tf.nn.relu(conv2d(x, W_conv1, stride=4) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

# Second Convolutional Layer
# Variables
W_conv2 = weight_variable([5, 5, 96, 256])
b_conv2 = bias_variable([256], 1)
# Layers
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2, stride=1) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

# Third Convolutional Layer
# Variables
W_conv3 = weight_variable([3, 3, 256, 384])
b_conv3 = bias_variable([384], 0)
# Layers
h_conv3 = tf.nn.relu(conv2d(h_pool2, W_conv3, stride=1) + b_conv3)

# Fourth Convolutional Layer
# Variables
W_conv4 = weight_variable([3, 3, 384, 384])
b_conv4 = bias_variable([384], 1)
# Layers
h_conv4 = tf.nn.relu(conv2d(h_conv3, W_conv4, stride=1) + b_conv4)

# Fifth Convolutional Layer
# Variables
W_conv5 = weight_variable([3, 3, 384, 256])
b_conv5 = bias_variable([256], 1)
# Layers
h_conv5 = tf.nn.relu(conv2d(h_conv4, W_conv5, stride=1) + b_conv5)
h_pool5 = max_pool_2x2(h_conv5)

# Dropout keep probability, fed at run time (1.0 disables dropout).
keep_prob = tf.placeholder(tf.float32)

# Fully Connected 1
# Number of features per example after the last pooling layer.
fc_input_size = (image_size // 32) * (image_size // 32) * 256
# Weights
W_fc1 = weight_variable([fc_input_size, 4096])
b_fc1 = bias_variable([4096], 1)
# Layers
h_pool5_flat = tf.reshape(h_pool5, [-1, fc_input_size])
h_fc1 = tf.nn.relu(tf.matmul(h_pool5_flat, W_fc1) + b_fc1)
# Previously keep_prob was fed but never used, so dropout had no effect.
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

# Fully Connected 2
# Weights
W_fc2 = weight_variable([4096, 4096])
b_fc2 = bias_variable([4096], 1)
# Layers
h_fc2 = tf.nn.relu(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)
h_fc2_drop = tf.nn.dropout(h_fc2, keep_prob)

# Softmax
# Weights
W_fc3 = weight_variable([4096, num_classes])
b_fc3 = bias_variable([num_classes], 0)
# Layers
y_conv=tf.nn.softmax(tf.matmul(h_fc2_drop, W_fc3) + b_fc3)


In [7]:
  
# Cross entropy summed over the batch. The predictions are clipped to
# [1e-10, 1.0] before the log so that tf.log never receives 0.
clipped_predictions = tf.clip_by_value(y_conv, 1e-10, 1.0)
cross_entropy = -tf.reduce_sum(y_ * tf.log(clipped_predictions))

# Momentum optimizer (momentum .9); the learning rate is fed at run time.
learning_rate = tf.placeholder(tf.float32)
optimizer = tf.train.MomentumOptimizer(learning_rate, .9)
train_step = optimizer.minimize(cross_entropy)

# Fraction of examples whose highest-scoring class equals the labelled class.
predicted_class = tf.argmax(y_conv, 1)
labelled_class = tf.argmax(y_, 1)
correct_prediction = tf.equal(predicted_class, labelled_class)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [8]:
# Scalar summary of the cross entropy loss
ce_summ = tf.scalar_summary("cross entropy", cross_entropy)

# Scalar summary of the accuracy tensor. The tensor itself is defined
# together with the loss, so it is not rebuilt here.
accuracy_summary = tf.scalar_summary("accuracy", accuracy)

# Scalar summary of the learning rate placeholder
learning_rate_summary = tf.scalar_summary("learning rate", learning_rate)

# Merge all the summaries into a single op. Running it needs every input the
# summaries depend on, including learning_rate.
merged = tf.merge_all_summaries()


In [None]:
# Run name, used as the last component of the summary log directory.
save_name = "alexnet_4"
log_dir = home_dir + "projects/deep_learning/tensorflow/tmp/mnist_logs/" + save_name

# Step counter read and advanced by the training loop.
i = 0

sess.run(tf.initialize_all_variables())
writer = tf.train.SummaryWriter(log_dir, sess.graph_def, flush_secs=10)

In [None]:
learning_rate_value = .001
dropout = .5

# Runs until interrupted. `i` is the module-level step counter, so running
# this cell again continues counting from the last step.
while True:
    i += 1
    if i % 10:
        # Regular step: one optimizer update on a batch_size batch.
        images, labels = get_batch(train_filenames, batch_size, image_size, num_classes, True)
        train_step.run(feed_dict={x: images, y_: labels, keep_prob: dropout, learning_rate: learning_rate_value})
        continue

    # Every tenth step: no update; evaluate on a test_batch_size batch drawn
    # from the training files with keep_prob 1, and log the summaries.
    images, labels = get_batch(train_filenames, test_batch_size, image_size, num_classes, True)
    summary, accuracy_value = sess.run(
        [merged, accuracy],
        feed_dict={x: images, y_: labels, keep_prob: 1., learning_rate: learning_rate_value})
    writer.add_summary(summary, i)
    print("Accuracy at step %s: %s" % (i, accuracy_value))

Accuracy at step 10: 0.117188
Accuracy at step 20: 0.136719
Accuracy at step 30: 0.113281
Accuracy at step 40: 0.128906
Accuracy at step 50: 0.101562

In [None]:
def one_hot_to_class(one_hot):
    """Map a one-hot (or score) vector to its ``indexes_to_classes`` entry.

    The entry at the position of the largest value is returned. For a true
    one-hot vector that is the position of the 1. For a vector of class
    scores such as a softmax output, where every entry is positive, it is
    the highest-scoring class rather than always index 0.

    Args:
        one_hot: sequence or array of numbers, one per class index.

    Returns:
        ``indexes_to_classes[k]`` for the position ``k`` of the maximum, or
        None when the vector is empty or has no positive entry.
    """
    scores = np.asarray(one_hot)
    if scores.size == 0 or not (scores > 0).any():
        return None
    return indexes_to_classes[int(np.argmax(scores))]

In [None]:
# Draw one batch and evaluate the network output on it with keep_prob 1.
x_1, y_1 = get_batch(train_filenames, batch_size, image_size, num_classes, True)
prediction_feed = {x: x_1, y_: y_1, keep_prob: 1.}
results = y_conv.eval(feed_dict=prediction_feed)

In [None]:
# Show up to 20 images of the batch with their labelled and predicted classes.
for a in range(min(20, len(x_1))):
    show_image(x_1[a,:,:,:])
    # format() instead of "+": one_hot_to_class can return a non-string
    # (None, or the integer stored at index 0 of indexes_to_classes), which
    # would make string concatenation raise TypeError.
    print("Actual: {}".format(one_hot_to_class(y_1[a])))
    print("Guess: {}".format(one_hot_to_class(results[a])))

In [None]:
# Draw a fresh random batch; x_1 receives the images and y_1 the one-hot labels.
x_1, y_1 = get_batch(train_filenames, batch_size, image_size, num_classes, True)

In [None]:
# Ideas to try: weight decay, batch normalization, a bigger network,
# different filter sizes, inception filters.
# SGD with momentum seems to be the standard optimizer.

In [None]:
# Simple network definition (intended for CIFAR-10?)
#
# Two 5x5 convolution + 2x2 max-pool stages, one fully connected layer with
# dropout, and a softmax output. Running this cell rebinds sess, x, y_,
# keep_prob, y_conv and the layer variables also used by the Alexnet-like
# cell.
#
# The helpers in this notebook require explicit arguments:
# bias_variable(shape, value) and conv2d(x, W, stride). The calls below used
# to omit them and raised TypeError.
#   * stride=1: the flatten step assumes the spatial size shrinks by 4 in
#     total, which the two 2x2 pools already account for.
#   * bias 0.1: NOTE(review) assumed starting value, the small positive
#     constant commonly paired with ReLU; confirm.
sess = tf.InteractiveSession()

# Batched input
x = tf.placeholder(tf.float32, shape=[None, image_size, image_size, image_depth]) # batch size, image size, image size, image depth
y_ = tf.placeholder(tf.float32, shape=[None, num_classes]) # batch size, num_classes

W_conv1 = weight_variable([5, 5, image_depth, 32]) # filter size, filter size, input channels (image depth), output channels
b_conv1 = bias_variable([32], 0.1)

h_conv1 = tf.nn.relu(conv2d(x, W_conv1, stride=1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64], 0.1)

h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2, stride=1) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

W_fc1 = weight_variable([int(image_size/4) * int(image_size/4) * 64, 1024])
b_fc1 = bias_variable([1024], 0.1)

h_pool2_flat = tf.reshape(h_pool2, [-1, int(image_size/4) * int(image_size/4)*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

# Dropout
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

# Softmax
W_fc2 = weight_variable([1024, num_classes])
b_fc2 = bias_variable([num_classes], 0.1)

y_conv=tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)
