This is an implementation of the neural style-transfer network described in the paper [A Neural Algorithm of Artistic Style](https://arxiv.org/abs/1508.06576). The pretrained VGG model weights can be downloaded from [here](http://www.vlfeat.org/matconvnet/models/); use `imagenet-vgg-verydeep-19.mat` and `imagenet-vgg-verydeep-16.mat`.

In [3]:
import os
import sys
import numpy as np
import scipy.io
import scipy.misc
import tensorflow as tf

In [134]:
class VGG(object):
    """Base class that rebuilds the convolutional part of a pretrained VGG net.

    Subclasses are expected to provide:
      * ``self.path``          -- path of the pretrained ``.mat`` model file
      * ``self.content_layer`` -- name of the layer used for the content loss
      * ``self.style_layer``   -- dict mapping layer name -> weight, used for
                                  the style loss
    """

    def __init__(self):
        # Maps layer name -> tensor; filled in by load_model().
        self.graph = None
        # Shape of the input variable: [batch, height, width, channels].
        self.input_shape = [1, 600, 800, 3]
        self.content_layer = None
        self.style_layer = None

    @staticmethod
    def factory(name, model_path):
        """
        The factory to create the corresponding model we will use.
        Available names include "VGG16" and "VGG19".

        name -- which model to build
        model_path -- path of the pretrained .mat file, passed to the model

        Raises ValueError for any other name instead of silently returning
        None.
        """
        if name == "VGG16":
            return VGG16(model_path)
        if name == "VGG19":
            return VGG19(model_path)
        raise ValueError(
            "Unknown model name {!r}; expected 'VGG16' or 'VGG19'".format(name))

    def load_model(self):
        """
        Build the graph (layer name -> tensor) from the pretrained model file
        at self.path, store it in self.graph and return it.

        The pretrained model contains the layer name and layer type
        (i.e. pool, conv etc.). To access those information, we can do the
        index access:
        vgg_layers[0]       [0]      [0]      [0]      [2]                                    [0]      [0] ## weight
        vgg_layers[0]       [0]      [0]      [0]      [2]                                    [0]      [1] ## bias
                #  always 0 |layer idx|always 0|always 0|0:layer name; 1:layer type; 2: weights|always 0|0:weight; 1:bias
        vgg_layers[0][30][0][0][0][0] # to access layer name
        vgg_layers[0][30][0][0][1][0] # to access layer type

        Note that the fully connected layers and the softmax are not required
        for this task, therefore we skip them. The fully connected layers
        have name fc* (their type is conv though).

        Raises ValueError if self.path has not been set.
        """
        model_path = getattr(self, "path", None)
        if model_path is None:
            raise ValueError("self.path must be set before calling load_model()")

        vgg_layers = scipy.io.loadmat(model_path)['layers']

        graph = {}
        graph["input"] = tf.Variable(np.zeros(self.input_shape), dtype=tf.float32)
        prev = "input"

        for layer in vgg_layers[0]:
            layer_name = layer[0][0][0][0]
            layer_type = layer[0][0][1][0]

            if layer_name[:2] == "fc":
                break        # stop before adding the first fc layer

            if layer_type == "conv":
                # Constants, because we don't want to update the network.
                W = tf.constant(layer[0][0][2][0][0])
                # The bias is stored as a 2-D column/row; flatten it so that
                # it broadcasts over the channel axis of the conv output.
                b = tf.constant(np.reshape(layer[0][0][2][0][1], [-1]))
                # The filter is passed positionally because its keyword name
                # is not the same in every TensorFlow release.
                graph[layer_name] = tf.nn.conv2d(
                    graph[prev], W, strides=[1, 1, 1, 1], padding="SAME") + b
            elif layer_type == "relu":
                graph[layer_name] = tf.nn.relu(graph[prev])
            elif layer_type == "pool":
                # according to the paper, average pooling behaves better
                graph[layer_name] = tf.nn.avg_pool(
                    graph[prev], ksize=[1, 2, 2, 1],
                    strides=[1, 2, 2, 1], padding="SAME")
            else:
                # Unknown layer type: nothing was added to the graph, so
                # `prev` must keep pointing at the last real layer.
                continue

            prev = layer_name

        self.graph = graph
        return graph

    @staticmethod
    def gram_matrix(F, N, M):
        """
        The gram matrix G.
        F -- the features
        N -- number of filters
        M -- height x width of one feature map
        Names as per paper

        Returns an N x N tensor.
        """
        Ft = tf.reshape(F, (M, N))
        return tf.matmul(tf.transpose(Ft), Ft)

    def content_loss(self, sess, content_img):
        """
        Compute the content loss as described in the paper. We only need to
        do the forward pass once on the content image.

        sess -- the current session
        content_img -- the content image. Should be a numpy array with
                       dimension [1, height, width, 3]
        Note: the dimension of the image should match self.input_shape (see
        resize_input). Also, the image should be centered. The mean should be
        the training set mean of the VGG network.

        The input variable is left holding content_img afterwards.

        Returns a scalar tensor. Raises ValueError when the image shape does
        not match and RuntimeError when no content layer has been configured.
        """
        # numpy shapes are tuples while input_shape is a list, so both sides
        # are normalised before comparing.
        if tuple(content_img.shape) != tuple(self.input_shape):
            raise ValueError("Dimension doesn't match")
        if self.content_layer is None:
            raise RuntimeError("Call from the child class")

        sess.run(self.graph["input"].assign(content_img))
        P = sess.run(self.graph[self.content_layer])   # fixed content features
        F = self.graph[self.content_layer]             # features of the input
        return 0.5 * tf.reduce_sum(tf.pow(F - P, 2))

    def style_loss(self, sess, style_img):
        """
        Compute the style loss as described in the paper. Again, only do the
        forward pass once for the style image.

        sess -- the current session
        style_img -- the style image. Should be a numpy array
        Note: the style image should also have the same dimension as the
        content image, either by cropping or some other methods.

        self.style_layer contains multiple layers. It should be a dictionary
        whose keys are the layer names and whose values are the associated
        weights.

        The input variable is left holding style_img afterwards.

        Returns a scalar tensor. Raises ValueError when the image shape does
        not match and RuntimeError when no style layers have been configured.
        """
        if tuple(style_img.shape) != tuple(self.input_shape):
            raise ValueError("Dimension doesn't match")
        if self.style_layer is None:
            raise RuntimeError("Call from child class")

        sess.run(self.graph["input"].assign(style_img))

        loss = 0
        # Sorted so that the ops are always created in the same order.
        for layer_name, weight in sorted(self.style_layer.items()):
            features = self.graph[layer_name]
            style_features = sess.run(features)   # numpy, [1, height, width, N]
            N = style_features.shape[3]           # number of filters
            # height x width of one feature map
            M = style_features.shape[1] * style_features.shape[2]

            # Gram matrix of the style image: a fixed numpy target.
            flat = style_features.reshape((M, N))
            A = flat.T.dot(flat)
            # Gram matrix of whatever the input variable holds.
            G = VGG.gram_matrix(features, N, M)

            layer_loss = tf.reduce_sum(tf.pow(G - A, 2)) / (4.0 * N ** 2 * M ** 2)
            loss = loss + weight * layer_loss
        return loss

    def emit(self):
        """Return the graph built by load_model() (None before it is built)."""
        return self.graph

    def resize_input(self, new_shape):
        """
        Change the shape of the network input.

        new_shape -- the new [1, height, width, 3] shape

        The input variable is created with a fixed shape, so a graph that
        has already been built is rebuilt with the new shape.
        """
        self.input_shape = list(new_shape)
        if self.graph is not None:
            self.load_model()

In [147]:
class VGG19(VGG):
    """VGG model built from the pretrained 19-layer weights file."""

    def __init__(self, model_path):
        """
        model_path -- path of the pretrained imagenet-vgg-verydeep-19.mat file

        Builds the graph right away; it is available as self.graph and,
        under the name this class has always used, as self.model.
        """
        # The base initializer sets input_shape, which load_model() needs.
        super(VGG19, self).__init__()
        self.path = model_path

        # Layer choice as in the paper: content is matched on conv4_2 and
        # style on the first convolution of each block, equally weighted.
        self.content_layer = "conv4_2"
        self.style_layer = {
            "conv1_1": 0.2,
            "conv2_1": 0.2,
            "conv3_1": 0.2,
            "conv4_1": 0.2,
            "conv5_1": 0.2,
        }

        self.load_model()
        self.model = self.graph

    # content_loss and style_loss are deliberately not overridden: the
    # versions inherited from VGG read self.content_layer / self.style_layer,
    # which are configured in __init__ above.
        

In [136]:
# Load the pretrained VGG-16 file and keep only its 'layers' entry, which is
# what the cells below index into.
vgg_layers = scipy.io.loadmat(
    "models/imagenet-vgg-verydeep-16.mat"
)["layers"]

In [145]:
# Pull the parameters of the first layer out of the nested structure.
# Index chain: [0] [layer idx] [0] [0] [field] [0] [param]
#   field: 0 = layer name, 1 = layer type, 2 = weights
#   param: 0 = weight, 1 = bias
#   every other position is always 0
w = vgg_layers[0][0][0][0][2][0][0]  # weight
b = vgg_layers[0][0][0][0][2][0][1]  # bias

W = tf.constant(w)
b = tf.constant(b)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(W)
print(b)

Tensor("Const_7:0", shape=(3, 3, 3, 64), dtype=float32)
Tensor("Const_8:0", shape=(64, 1), dtype=float32)


In [143]:
# Number of layers stored in the model file.
print(len(vgg_layers[0]))
# print(vgg_layers[0][30][0][0][0][0])
# print(vgg_layers[0][30][0][0][1][0])

# One line per layer: "<layer name> <layer type>".
# The line is formatted first so that print() gets a single argument and
# produces the same output on Python 2 and Python 3.
for layer in vgg_layers[0]:
    print("{} {}".format(layer[0][0][0][0], layer[0][0][1][0]))

37
conv1_1 conv
relu1_1 relu
conv1_2 conv
relu1_2 relu
pool1 pool
conv2_1 conv
relu2_1 relu
conv2_2 conv
relu2_2 relu
pool2 pool
conv3_1 conv
relu3_1 relu
conv3_2 conv
relu3_2 relu
conv3_3 conv
relu3_3 relu
pool3 pool
conv4_1 conv
relu4_1 relu
conv4_2 conv
relu4_2 relu
conv4_3 conv
relu4_3 relu
pool4 pool
conv5_1 conv
relu5_1 relu
conv5_2 conv
relu5_2 relu
conv5_3 conv
relu5_3 relu
pool5 pool
fc6 conv
relu6 relu
fc7 conv
relu7 relu
fc8 conv
prob softmax
