# Video GAN

This is a replication exercise for learning, an experiment in novel artistic outputs, and (hopefully, eventually) a research contribution to GANs as applied to video synthesis. The goal is to generate abstracted videos evoking the subjective affect of certain objects, actions, and scenes in motion, through generative adversarial networks trained on input videos of the desired subjects.

The generative models are based on ["Generating Videos with Scene Dynamics" (2016)](http://www.cs.columbia.edu/~vondrick/tinyvideo/paper.pdf) and ["Improving Video Generation for Multi-functional Applications" (2017)](https://arxiv.org/pdf/1711.11453.pdf). The code is based on the latter's [GitHub repo](https://github.com/bernhard2202/improved-video-gan/). Kratzwald's implementation appears to be a better fit for the desired use case due to its ability to handle inputs without static backgrounds.

The creative part of this project is more nebulous for now, but it will require manipulating the generated videos so that they can be projected in a live setting paired with musical compositions. At minimum, this will require interpolating the outputs, which will be pretty small, or figuring out a way to generate larger outputs without significant runtime cost.

This notebook uses Google Colaboratory for TPU access. It will be refactored into a Python module once validated.

In [0]:
import glob
import os
import time

import cv2
import numpy as np
import tensorflow as tf

## Settings

In [0]:
# Video settings
# Directory that is globbed for '*.avi' inputs and '*.jpg' extracted frames.
video_dir = ''
# Target size handed to tf.image.resize_images in _parse_function.
# NOTE(review): left empty here; it has to be filled in before the dataset is parsed.
video_size = []
# Step added to the frame position after each captured frame.
frame_int = 2
# Maximum number of frames captured per video.
frame_cap = 32

# Training parameters
# Passed to Dataset.repeat when the dataset is built.
epochs = 50
# Length of the latent vector fed to the generator.
z_dim = 100
# NOTE(review): not referenced by the visible module-level code.
read_threads = 16
# Passed to Dataset.batch when the dataset is built.
batch_size = 64

# Adam optimizer
# NOTE(review): learning_rate, alpha1 and beta1 are not referenced by the
# visible module-level code; presumably learning_rate and beta1 are meant to
# be passed to VideoGAN, which has parameters of the same names.
learning_rate = 0.0001
alpha1 = 0.1
beta1 = 0.5

# Output frequency
# NOTE(review): not referenced by the visible module-level code.
sample_rate = 100

# Use eager execution
# NOTE(review): the model cell builds its graph with tf.placeholder and runs it
# through session.run, which are graph-mode APIs — confirm eager mode is intended.
tf.enable_eager_execution()

## Video Processing

### Extract frames

In [0]:
# glob returns paths that already include video_dir, so they are used as-is
# below rather than being joined with video_dir a second time.
videos = glob.glob(os.path.join(video_dir, '*.avi'))

# For each video in the directory, capture every frame_int-th frame (at most
# frame_cap of them) into a 4D array and save it as a single JPEG per video.
for vnum, video in enumerate(videos):
    description = os.path.splitext(os.path.basename(video))[0]
    vidcap = cv2.VideoCapture(video)
    success, image = vidcap.read()
    if not success:
        # Unreadable or empty video: there is no first frame to size the buffer.
        vidcap.release()
        continue

    # (frames, height, width, channels); uint8 so the result is a valid image.
    # Videos shorter than frame_cap sampled frames keep zero-filled tail frames.
    output = np.zeros((frame_cap,) + image.shape, dtype=np.uint8)
    loc, frames = 0, 0
    while success and frames < frame_cap:
        output[frames] = image
        loc += frame_int
        frames += 1
        # Seek by frame index so that the stride is measured in frames.
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, loc)
        success, image = vidcap.read()
    vidcap.release()

    # A JPEG stores one 2D image, so the frames are stacked vertically into a
    # (frame_cap * height, width, channels) strip before writing.
    stacked = output.reshape((-1,) + output.shape[2:])
    cv2.imwrite(os.path.join(video_dir, description + str(vnum) + '.jpg'), stacked)

### Read frames into tf data object

In [0]:
# Reads video image, decodes into a dense tensor, resized to desired shape.
def _parse_function(filename, label):
    """Load one JPEG file and return it resized, together with its label.

    Args:
        filename: Path of the JPEG file to read.
        label: Value returned unchanged next to the image.

    Returns:
        Tuple ``(image, label)`` where ``image`` is the decoded JPEG resized
        to the module-level ``video_size``.
    """
    raw_bytes = tf.read_file(filename)
    pixels = tf.image.decode_jpeg(raw_bytes)
    return tf.image.resize_images(pixels, video_size), label

# Paths of every frame image found in video_dir, as a string tensor.
video_files = glob.glob(os.path.join(video_dir, '*.jpg'))
filenames = tf.constant(video_files)

# One label per file: its path with the extension stripped.
labels = tf.constant([os.path.splitext(path)[0] for path in video_files])

# Input pipeline, one stage per line: slice -> shuffle -> repeat -> parse -> batch.
dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
dataset = dataset.shuffle(buffer_size=10000)
dataset = dataset.repeat(epochs)
dataset = dataset.map(_parse_function)
dataset = dataset.batch(batch_size)

## Utilities

### Model Utilities

In [0]:
# Model summaries

def add_activation_summary(var):
    """Record a histogram and a zero-fraction scalar summary for ``var``.

    Both summaries are tagged with the name of the op that produced ``var``.
    """
    tag_prefix = var.op.name
    tf.summary.histogram(tag_prefix + "/activation", var)
    zero_share = tf.nn.zero_fraction(var)
    tf.summary.scalar(tag_prefix + "/sparsity", zero_share)

def add_gradient_summary(grad, var):
    """Record a histogram summary of ``grad``, tagged with ``var``'s op name.

    Does nothing when ``grad`` is None.
    """
    if grad is None:
        return
    tf.summary.histogram(var.op.name + '/gradient', grad)
        
# Layer utilities

def uniform(std_dev, size):
    """Draw float32 samples uniformly from ``[-std_dev*sqrt(3), std_dev*sqrt(3))``.

    Args:
        std_dev: Scale of the distribution; the bounds are ``std_dev * sqrt(3)``
            on either side of zero.
        size: Shape of the returned array.

    Returns:
        ``numpy.ndarray`` of dtype float32 with shape ``size``.
    """
    bound = std_dev * np.sqrt(3)
    samples = np.random.uniform(low=-bound, high=bound, size=size)
    return samples.astype(np.float32)
    
def conv2d(input_, input_dim, output_dim,
           k_h=4, k_w=4, d_h=2, d_w=2, name="conv2d", padding="SAME"):
    """Strided 2D convolution followed by a bias add (NHWC layout).

    Filters are drawn with ``uniform`` using a standard deviation of
    ``sqrt(4 / (fan_in + fan_out))``. The original note attributes the
    weight initialisation to "Delving Deep into Rectifiers: Surpassing
    Human-Level Performance on ImageNet Classification".

    Args:
        input_: Input tensor in NHWC layout.
        input_dim: Number of input channels.
        output_dim: Number of output channels.
        k_h, k_w: Kernel height and width.
        d_h, d_w: Stride along height and width.
        name: Variable scope that holds the filters and biases.
        padding: Padding mode forwarded to ``tf.nn.conv2d``.

    Returns:
        The convolved tensor with the bias added.
    """
    kernel_shape = (k_h, k_w, input_dim, output_dim)
    stride_spec = [1, d_h, d_w, 1]

    with tf.variable_scope(name):
        n_in = input_dim * k_h * k_w
        n_out = (output_dim * k_h * k_w) / (d_h * d_w)
        init_std = np.sqrt(4. / (n_in + n_out))

        # The sampled values live in a helper variable whose initialized value
        # seeds the 'filters' variable.
        seed_var = tf.Variable(uniform(init_std, kernel_shape), name='filters_init')
        kernel = tf.get_variable('filters', initializer=seed_var.initialized_value())
        offset = tf.get_variable('biases', [output_dim], initializer=tf.constant_initializer(0.0))

        convolved = tf.nn.conv2d(
            input=input_,
            filter=kernel,
            strides=stride_spec,
            padding=padding,
            data_format='NHWC'
        )
        return tf.nn.bias_add(convolved, offset)


def conv3d(input_, input_dim, output_dim,
           k_t=4, k_h=4, k_w=4, d_t=2, d_h=2, d_w=2, name="conv3d", padding="SAME"):
    """Strided 3D convolution followed by a bias add (NDHWC layout).

    Filters are drawn with ``uniform`` using a standard deviation of
    ``sqrt(4 / (fan_in + fan_out))``. The original note attributes the
    weight initialisation to "Delving Deep into Rectifiers: Surpassing
    Human-Level Performance on ImageNet Classification".

    Args:
        input_: Input tensor in NDHWC layout.
        input_dim: Number of input channels.
        output_dim: Number of output channels.
        k_t, k_h, k_w: Kernel extent along time, height and width.
        d_t, d_h, d_w: Stride along time, height and width.
        name: Variable scope that holds the filters and biases.
        padding: Padding mode forwarded to ``tf.nn.conv3d``.

    Returns:
        The convolved tensor with the bias added.
    """
    kernel_shape = (k_t, k_h, k_w, input_dim, output_dim)
    stride_spec = [1, d_t, d_h, d_w, 1]

    with tf.variable_scope(name):
        n_in = input_dim * k_t * k_h * k_w
        n_out = (output_dim * k_t * k_h * k_w) / (d_t * d_h * d_w)
        init_std = np.sqrt(4. / (n_in + n_out))

        # The sampled values live in a helper variable whose initialized value
        # seeds the 'filters' variable.
        seed_var = tf.Variable(uniform(init_std, kernel_shape), name='filters_init')
        kernel = tf.get_variable('filters', initializer=seed_var.initialized_value())
        offset = tf.get_variable('biases', [output_dim], initializer=tf.constant_initializer(0.0))

        convolved = tf.nn.conv3d(
            input=input_,
            filter=kernel,
            strides=stride_spec,
            padding=padding,
            data_format='NDHWC'
        )
        return tf.nn.bias_add(convolved, offset)


def conv3d_transpose(input_, input_dim, output_shape,
                     k_h=4, k_w=4, k_d=4, d_h=2, d_w=2, d_d=2,
                     name="deconv3d"):
    """Strided 3D transposed convolution followed by a bias add.

    Filters are drawn with ``uniform`` using a standard deviation of
    ``sqrt(4 / (fan_in + fan_out))``. The original note attributes the
    weight initialisation to "Delving Deep into Rectifiers: Surpassing
    Human-Level Performance on ImageNet Classification".

    Args:
        input_: Input tensor.
        input_dim: Number of input channels.
        output_shape: Full shape of the result; its last entry is the number
            of output channels.
        k_h, k_w, k_d: Kernel extent along height, width and depth.
        d_h, d_w, d_d: Stride along height, width and depth.
        name: Variable scope for the filters and biases; also used as the
            name of the transposed-convolution op.

    Returns:
        The transposed-convolution output with the bias added.
    """
    out_channels = output_shape[-1]
    # The transposed-convolution filter lists output channels before input channels.
    kernel_shape = (k_d, k_h, k_w, out_channels, input_dim)
    stride_spec = [1, d_d, d_h, d_w, 1]

    with tf.variable_scope(name):
        n_in = input_dim * k_d * k_h * k_w
        n_out = (out_channels * k_d * k_h * k_w) / (d_d * d_h * d_w)
        init_std = np.sqrt(4. / (n_in + n_out))

        seed_var = tf.Variable(uniform(init_std, kernel_shape), name='filter_init')
        kernel = tf.get_variable('filters', initializer=seed_var.initialized_value())
        offset = tf.get_variable('biases', [out_channels], initializer=tf.constant_initializer(0.0))

        upsampled = tf.nn.conv3d_transpose(
            value=input_,
            filter=kernel,
            output_shape=output_shape,
            strides=stride_spec,
            name=name,
        )
        return tf.nn.bias_add(upsampled, offset)

    
def dis_block(input, input_dim, output_dim, name, reuse=False, normalize=True):
    """One discriminator block: 3D convolution, optional layer norm, leaky ReLU.

    Args:
        input: Tensor fed to ``conv3d``.
        input_dim: Number of input channels.
        output_dim: Number of output channels.
        name: Variable scope of the block.
        reuse: Whether variables in the scope are reused.
        normalize: When true, layer normalisation is applied after the
            convolution.

    Returns:
        The activated output tensor of the block.
    """
    with tf.variable_scope(name, reuse=reuse) as block_scope:
        features = conv3d(input, input_dim, output_dim, name='conv3d')
        if normalize:
            features = tf.contrib.layers.layer_norm(features, reuse=reuse, scope=block_scope)
        # Leaky ReLU with slope 0.2, written as max(x, 0.2 * x).
        activated = tf.maximum(features, features * 0.2)
    return activated


def linear(input_, output_size, scope=None, stddev=0.01, bias_start=0.0, with_w=False):
    """Fully connected layer: ``input_ @ Matrix + bias``.

    Code from https://github.com/wxh1996/VideoGAN-tensorflow

    Args:
        input_: 2D tensor; its second dimension sets the weight matrix height.
        output_size: Number of output units.
        scope: Variable scope name; "Linear" is used when it is falsy.
        stddev: Standard deviation of the normal weight initialiser.
        bias_start: Constant initial value of the bias.
        with_w: When true, the weight matrix and bias are returned as well.

    Returns:
        The projected tensor, or ``(projected, matrix, bias)`` when
        ``with_w`` is true.
    """
    in_shape = input_.get_shape().as_list()
    with tf.variable_scope(scope or "Linear"):
        weights = tf.get_variable(
            "Matrix",
            [in_shape[1], output_size],
            tf.float32,
            tf.random_normal_initializer(stddev=stddev),
        )
        offset = tf.get_variable(
            "bias",
            [output_size],
            initializer=tf.constant_initializer(bias_start),
        )
        projected = tf.matmul(input_, weights) + offset
        if not with_w:
            return projected
        return projected, weights, offset

### Output Utilities

In [0]:
def write_avi(batch, directory, name='', frate=25):
    """Write a sequence of frames to ``<directory>/<name>.avi`` with XVID.

    Args:
        batch: Iterable of frames; each frame is converted with ``np.uint8``
            and its first two dimensions are taken as (height, width).
            # assumes frames are (height, width, channels) arrays — TODO confirm
        directory: Directory the video file is written into.
        name: File name without extension.
        frate: Frames per second handed to the video writer.

    Nothing is written when ``batch`` contains no frames.
    """
    frames = [np.uint8(frame) for frame in batch]
    if not frames:
        return

    # The writer expects (width, height), the reverse of the array layout.
    height, width = frames[0].shape[:2]
    out = os.path.join(directory, name + '.avi')
    writer = cv2.VideoWriter(out, cv2.VideoWriter_fourcc('X', 'V', 'I', 'D'),
                             frate, (width, height))
    try:
        for frame in frames:
            writer.write(frame)
    finally:
        # Release even if a write fails so the file handle is not leaked.
        writer.release()

    
def convert_image(images, batch_size, col=5, row=5):
    """Tile a batch of images into a ``row`` x ``col`` grid and JPEG-encode it.

    Args:
        images: Batch of images; values are shifted by +1 and halved before
            the uint8 conversion.
            # presumably in [-1, 1] (e.g. a tanh output) — verify against caller
        batch_size: Number of pieces the batch is split into along axis 0.
        col: Images per grid row (joined along axis 2).
        row: Number of grid rows (joined along axis 1).

    Returns:
        The JPEG-encoded grid image.
    """
    rescaled = tf.div(tf.add(images, 1.0), 2.0)
    as_uint8 = tf.image.convert_image_dtype(rescaled, tf.uint8)
    tiles = list(tf.split(as_uint8, batch_size, axis=0))
    grid_rows = [tf.concat(tiles[col * r:col * r + col], 2) for r in range(row)]
    grid = tf.concat(grid_rows, 1)
    return tf.image.encode_jpeg(tf.squeeze(grid, [0]))


def sampleBatch(samples, batch_size, col=5, row=5, frames=32):
    """Encode each of the first ``frames`` time steps of ``samples`` as a grid.

    Args:
        samples: 5D tensor indexed along axis 1 by time step.
        batch_size: Forwarded to ``convert_image``.
        col: Forwarded to ``convert_image``.
        row: Forwarded to ``convert_image``.
        frames: Number of time steps to encode.

    Returns:
        List with one ``convert_image`` result per time step.
    """
    encoded = []
    for step in range(frames):
        encoded.append(convert_image(samples[:, step, :, :, :], batch_size, col, row))
    return encoded

## Model

In [0]:
class VideoGAN():
    """Video GAN trained with a Wasserstein loss and gradient penalty.

    The generator maps a latent vector to a clip of shape
    (batch_size, 32, 64, 64, 3) through a linear layer and four 3D transposed
    convolutions. The discriminator (critic) scores clips through five
    ``dis_block`` stages and a final linear layer.

    NOTE(review): the graph is built with tf.placeholder and driven through
    Session.run, which are graph-mode APIs — confirm the model is not built
    with eager execution enabled.
    """

    def __init__(self,
                 input,
                 batch_size,
                 frame_size,
                 crop_size,
                 learning_rate,
                 beta1,
                 critical_iterations):
        """Store the hyper-parameters and build the training graph.

        Args:
            input: Batch of real videos fed to the discriminator. It is
                reshaped to (batch_size, frame_size * crop_size * crop_size * 3).
            batch_size: Number of clips per batch.
            frame_size: Number of frames per clip. The generator output is
                fixed at 32 frames, so this must be 32 for the reshapes to match.
            crop_size: Frame height and width. The generator output is fixed
                at 64x64, so this must be 64 for the reshapes to match.
            learning_rate: Learning rate of both Adam optimizers.
            beta1: ``beta1`` of both Adam optimizers.
            critical_iterations: Number of discriminator updates performed per
                generator update in ``train``.
        """
        self.critic_iterations = critical_iterations
        self.crop_size = crop_size
        self.beta1 = beta1
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        # The latent size is read from the notebook-level `z_dim` setting.
        self.z_dim = z_dim
        self.frame_size = frame_size
        self.videos = input
        self.build_model()

    def build_model(self):
        """Create the generator, discriminator, losses, optimizers and summaries."""
        print("Setting up model...")
        self.z_vec = tf.placeholder(tf.float32, [self.batch_size, self.z_dim], name="z")

        tf.summary.histogram("z", self.z_vec)
        self.videos_fake, self.generator_variables = self.generator(self.z_vec)

        self.d_real, self.discriminator_variables = self.discriminator(self.videos, reuse=False)
        self.d_fake, _ = self.discriminator(self.videos_fake, reuse=True)

        # Wasserstein losses: the critic separates real from fake scores,
        # the generator raises the score of its samples.
        self.g_cost = -tf.reduce_mean(self.d_fake)
        self.d_cost = tf.reduce_mean(self.d_fake) - tf.reduce_mean(self.d_real)

        tf.summary.scalar("g_cost", self.g_cost)
        tf.summary.scalar("d_cost", self.d_cost)

        # Gradient penalty: score random interpolations between real and fake
        # clips and penalise gradient norms that deviate from 1.
        alpha = tf.random_uniform(
            shape=[self.batch_size, 1],
            minval=0.,
            maxval=1.
        )

        dim = self.frame_size * self.crop_size * self.crop_size * 3

        vid = tf.reshape(self.videos, [self.batch_size, dim])
        fake = tf.reshape(self.videos_fake, [self.batch_size, dim])
        differences = fake - vid
        interpolates = vid + (alpha * differences)
        d_hat, _ = self.discriminator(
            tf.reshape(interpolates, [self.batch_size, self.frame_size, self.crop_size, self.crop_size, 3]), reuse=True)
        gradients = tf.gradients(d_hat, [interpolates])[0]
        slopes = tf.sqrt(tf.reduce_sum(tf.square(gradients), reduction_indices=[1]))
        gradient_penalty = tf.reduce_mean((slopes - 1.) ** 2)

        self.d_cost_final = self.d_cost + 10 * gradient_penalty

        tf.summary.scalar("d_cost_penalized", self.d_cost_final)

        self.d_adam, self.g_adam = None, None
        # Run pending update ops (e.g. batch-norm statistics) with each step.
        with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            self.d_adam = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=self.beta1, beta2=0.999) \
                .minimize(self.d_cost_final, var_list=self.discriminator_variables)
            self.g_adam = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=self.beta1, beta2=0.999) \
                .minimize(self.g_cost, var_list=self.generator_variables)

        print("\nTrainable variables for generator:")
        for var in self.generator_variables:
            print(var.name)
        print("\nTrainable variables for discriminator:")
        for var in self.discriminator_variables:
            print(var.name)

        self.sample = sampleBatch(self.videos_fake, self.batch_size)
        self.summary_op = tf.summary.merge_all()

    def train(self,
              session,
              step,
              summary_writer=None,
              log_summary=False,
              sample_dir=None,
              generate_sample=False):
        """Run one training step: several critic updates, then one generator update.

        Args:
            session: Session used to run the graph.
            step: Current step number, used for summaries and sample names.
            summary_writer: Writer that receives the merged summary; required
                when ``log_summary`` is true.
            log_summary: When true, losses are evaluated, logged and printed.
            sample_dir: Directory for generated samples.
            generate_sample: When true, a sample batch is generated and saved.
        """
        if log_summary:
            start_time = time.time()

        # Each critic update draws a fresh latent batch.
        for _ in range(self.critic_iterations):
            session.run(self.d_adam, feed_dict=self.get_feed_dict())

        feed_dict = self.get_feed_dict()
        session.run(self.g_adam, feed_dict=feed_dict)

        if log_summary:
            g_loss_val, d_loss_val, summary = session.run([self.g_cost, self.d_cost_final, self.summary_op],
                                                          feed_dict=feed_dict)
            summary_writer.add_summary(summary, step)
            print("Time: %g/itr, Step: %d, generator loss: %g, discriminator_loss: %g" % (
                time.time() - start_time, step, g_loss_val, d_loss_val))

        if generate_sample:
            vid_sample = session.run(self.sample, feed_dict=feed_dict)
            # NOTE(review): saveGIFBatch is not defined in the visible cells;
            # confirm it exists elsewhere in the notebook.
            saveGIFBatch(vid_sample, sample_dir, 'vid_%d' % step)

    def generator(self, z):
        """Map latent vectors ``z`` to clips of shape (batch_size, 32, 64, 64, 3).

        Args:
            z: Latent batch of shape (batch_size, z_dim).

        Returns:
            Tuple of the tanh-activated clip tensor and the variables created
            under the 'g_' scope.
        """
        with tf.variable_scope('g_') as vs:
            # LINEAR BLOCK
            self.z_, _, _ = linear(z, 512 * 4 * 4 * 2, 'g_f_h0_lin', with_w=True)
            self.fg_h0 = tf.reshape(self.z_, [-1, 2, 4, 4, 512])
            self.fg_h0 = tf.nn.relu(tf.contrib.layers.batch_norm(self.fg_h0, scope='g_f_bn0'), name='g_f_relu0')
            add_activation_summary(self.fg_h0)

            # CONV BLOCK 1
            self.fg_h1 = conv3d_transpose(self.fg_h0, 512, [self.batch_size, 4, 8, 8, 256], name='g_f_h1')
            self.fg_h1 = tf.nn.relu(tf.contrib.layers.batch_norm(self.fg_h1, scope='g_f_bn1'), name='g_f_relu1')
            add_activation_summary(self.fg_h1)

            # CONV BLOCK 2
            self.fg_h2 = conv3d_transpose(self.fg_h1, 256, [self.batch_size, 8, 16, 16, 128], name='g_f_h2')
            self.fg_h2 = tf.nn.relu(tf.contrib.layers.batch_norm(self.fg_h2, scope='g_f_bn2'), name='g_f_relu2')
            add_activation_summary(self.fg_h2)

            # CONV BLOCK 3
            self.fg_h3 = conv3d_transpose(self.fg_h2, 128, [self.batch_size, 16, 32, 32, 64], name='g_f_h3')
            self.fg_h3 = tf.nn.relu(tf.contrib.layers.batch_norm(self.fg_h3, scope='g_f_bn3'), name='g_f_relu3')
            add_activation_summary(self.fg_h3)

            # CONV BLOCK 4
            self.fg_h4 = conv3d_transpose(self.fg_h3, 64, [self.batch_size, 32, 64, 64, 3], name='g_f_h4')
            self.fg_fg = tf.nn.tanh(self.fg_h4, name='g_f_actvcation')

        variables = tf.contrib.framework.get_variables(vs)
        return self.fg_fg, variables

    def discriminator(self, video, reuse=False):
        """Score a batch of clips with the critic.

        Args:
            video: Clip batch with 3 channels in the last dimension.
            reuse: Whether to reuse the variables of the 'd_' scope; pass True
                for every call after the first.

        Returns:
            Tuple of the (batch_size, 1) score tensor and the variables found
            under the 'd_' scope.
        """
        with tf.variable_scope('d_', reuse=reuse) as vs:
            initial_dim = 64
            # CONV BLOCK 1
            d_h0 = dis_block(video, 3, initial_dim, 'block1', reuse=reuse)
            # CONV BLOCK 2
            d_h1 = dis_block(d_h0, initial_dim, initial_dim * 2, 'block2', reuse=reuse)
            # CONV BLOCK 3
            d_h2 = dis_block(d_h1, initial_dim * 2, initial_dim * 4, 'block3', reuse=reuse)
            # CONV BLOCK 4
            d_h3 = dis_block(d_h2, initial_dim * 4, initial_dim * 8, 'block4', reuse=reuse)
            # CONV BLOCK 5
            d_h4 = dis_block(d_h3, initial_dim * 8, 1, 'block5', reuse=reuse, normalize=False)
            # LINEAR BLOCK
            d_h5 = linear(tf.reshape(d_h4, [self.batch_size, -1]), 1)
        variables = tf.contrib.framework.get_variables(vs)
        return d_h5, variables

    def get_feed_dict(self):
        """Return a feed dict with a fresh standard-normal latent batch for ``z_vec``."""
        batch_z = np.random.normal(0, 1.0, size=[self.batch_size, self.z_dim]).astype(np.float32)
        feed_dict = {self.z_vec: batch_z}
        return feed_dict