In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import glob
tf.logging.set_verbosity(tf.logging.ERROR)
import numpy as np

In [2]:
# from youtube-8m utils.py
def Dequantize(feat_vector, max_quantized_value=2, min_quantized_value=-2):
  """Dequantize the feature from the byte format to the float format.

  Args:
    feat_vector: the input 1-d vector.
    max_quantized_value: the maximum of the quantized value.
    min_quantized_value: the minimum of the quantized value.

  Returns:
    A float vector which has the same shape as feat_vector.
  """
  assert max_quantized_value > min_quantized_value
  quantized_range = max_quantized_value - min_quantized_value
  scalar = quantized_range / 255.0
  bias = (quantized_range / 512.0) + min_quantized_value
  return feat_vector * scalar + bias

def decode(feat_vector, feature_size):
    return tf.reshape(tf.cast(tf.decode_raw(feat_vector, 
                                            tf.uint8), 
                              tf.float32),
                      [-1, feature_size])

In [3]:
# filepath is path to tfrecord
# datatype is audio or video
# output_features and output_labels are empty lists or existing lists
def load_data(filepath, data_type, output_labels, output_features):
    if data_type == 'audio':
        context = {
            'labels': tf.VarLenFeature(dtype=tf.int64)
        }

        feature_list = {
            'audio_embedding': tf.FixedLenSequenceFeature([], dtype=tf.string)
        }
        feature_name = 'audio_embedding'
        feature_len = 128

    elif data_type == 'video':
        context = {
            'id': tf.FixedLenFeature([], dtype=tf.string),
            'labels': tf.VarLenFeature(dtype=tf.int64)
        }

        feature_list = {
            'rgb': tf.FixedLenSequenceFeature([], dtype=tf.string),
        }
        feature_name = 'rgb'
        feature_len = 128
        
        
    tf.reset_default_graph()    
    sess = tf.InteractiveSession()

    # Read TFRecord file
    reader = tf.TFRecordReader()
    filename_queue = tf.train.string_input_producer([filepath])


    # Extract features from serialized data

    _, serialized_example = reader.read(filename_queue)
    context, features = tf.io.parse_single_sequence_example(serialized_example,
                                                    context_features=context,
                                                    sequence_features=feature_list,
                                                    example_name=None,
                                                    name=None
    )

    # Many tf.train functions use tf.train.QueueRunner,
    # so we need to start it before we read
    tf.train.start_queue_runners(sess)
    
    
    num_in_file = sum(1 for _ in tf.python_io.tf_record_iterator(filepath))

    for i in range(num_in_file):
        labels = context['labels'].eval()
        label = labels.values[0]
        data = Dequantize(decode(features[feature_name], feature_len)).eval()
        output_labels.append(label)
        output_features.append(data)

    tf.InteractiveSession().close()
    
    return output_labels, output_features

In [4]:
audio_output_labels = []
audio_output_features = []
audio_path = "audio_1556745450.370243.tfrecord"
audio_output_labels, audio_output_features = load_data(audio_path,
                                           'audio', audio_output_labels, audio_output_features)

video_output_labels = []
video_output_features = []
video_path = "video_1556754626.438139.tfrecord"
video_output_labels, video_output_features = load_data(video_path,
                                           'video', video_output_labels, video_output_features)



In [5]:
audio_x = np.array(audio_output_features)
audio_y = np.array(audio_output_labels)
video_x = np.array(video_output_features)
video_y = np.array(video_output_labels)
print('audio_x shape: ', audio_x.shape)
print('audio_y shape: ', audio_y.shape)
print('video_x shape: ', video_x.shape)
print('video_y shape: ', video_y.shape)


audio_x shape:  (450, 10, 128)
audio_y shape:  (450,)
video_x shape:  (450, 10, 128)
video_y shape:  (450,)


In [7]:
# --- At this point the desired data should be loaded --- 

In [30]:
def build_batch(dataset_x, dataset_y, batch_size):
    indices = list(np.random.randint(0, len(dataset_x), size=batch_size))
    
    # Recover what the entries for the batch are
    batch_x = np.array([dataset_x[i] for i in indices])
    batch_y = np.array([dataset_y[i] for i in indices])
    
    return batch_x, batch_y

In [114]:
def contrastive_loss(embedded_batch_a, embedded_batch_b, a_labels, b_labels, margin):
    y = tf.cast(tf.equal(a_labels, b_labels), tf.float32)
    dist = tf.norm(embedded_batch_a - embedded_batch_b)
    loss = y * .5 * tf.square(dist) + (1 - y) * .5 * tf.square(tf.maximum(0., margin - dist))
    return tf.reduce_sum(loss)

In [108]:
def build_graph(input_shape, labels_shape, embed_size=128, learning_rate=0.0001, l1_reg=0.0001, l2_reg=0.0001, margin=.05):
    audio_inputs = tf.placeholder(shape=[None, input_shape[0], input_shape[1]], name="audio_inputs", dtype=tf.float32)
    video_inputs = tf.placeholder(shape=[None, input_shape[0], input_shape[1]], name="video_inputs", dtype=tf.float32)
    audio_labels = tf.placeholder(shape=[None], name="audio_labels", dtype=tf.float32)
    video_labels = tf.placeholder(shape=[None], name="video_labels", dtype=tf.float32)

    flattened_audio = tf.layers.flatten(audio_inputs)
    flattened_video = tf.layers.flatten(video_inputs)
    flattened_audio_labels = tf.layers.flatten(audio_labels)
    flattened_video_labels = tf.layers.flatten(video_labels)

    
    #subnetwork = build_fc_net(inputs, embed_size, np.prod(video_input_shape), l1_reg, l2_reg)
    weights = tf.trainable_variables()
    if l1_reg != 0:
        reg_1 = tf.contrib.layers.l1_regularizer(scale=l1_reg)
    else:
        reg_1 = 0
    if l2_reg != 0:
        reg_2 = tf.contrib.layers.l2_regularizer(scale=l2_reg)
    else:
        reg_2 = 0
    #loss = contrastiveloss(audio_embed, video_embed)
    # TODO: This really should be a contrastive loss i think?????
    audio_embed = build_fc_net(flattened_audio, embed_size, l1_reg, l2_reg)
    video_embed = build_fc_net(flattened_video, embed_size, l1_reg, l2_reg)
    weights = tf.trainable_variables()

    #mse = tf.losses.mean_squared_error(audio_embed, video_embed) # only use loss if labels don't match?
    error = contrastive_loss(audio_embed, video_embed, flattened_audio_labels, flattened_video_labels, margin)
    reg_penalty = tf.contrib.layers.apply_regularization(reg_1, weights) + tf.contrib.layers.apply_regularization(reg_2, weights)
    loss = error + reg_penalty
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return audio_inputs, audio_labels, video_inputs, video_labels, loss, optimizer

In [109]:
#audio input is a tf placeholder for the input audio features
#video labels is a tf placeholder for the input video features
#encode size is the desired size of the encoded vector (should be the same size as the video features)
#l1 and l2 reg are the amount of weight to put on l1 and l2 regularizers for the loss

def build_fc_net(input_data, embed_size, l1_reg=0, l2_reg=0):
    net = build_fc_layers(input_data, embed_size)
    return net

In [110]:
# input layer should be the flattened inputs
def build_fc_layers(input_layer, output_size):
    with tf.name_scope("model"):
        with tf.variable_scope("dense0", reuse=tf.AUTO_REUSE) as scope:
            h1 = tf.layers.dense(inputs=input_layer, units=1024, activation=tf.nn.tanh)
            d1 = tf.layers.dropout(inputs=h1, rate=0.4)
        with tf.variable_scope("dense1", reuse=tf.AUTO_REUSE) as scope:
            h2 = tf.layers.dense(inputs=d1, units=1024, activation=tf.nn.tanh)
            d2 = tf.layers.dropout(inputs=h2, rate=0.4)            
        with tf.variable_scope("dense2", reuse=tf.AUTO_REUSE) as scope:
            h3 = tf.layers.dense(inputs=d2, units=1024, activation=tf.nn.tanh)
            d3 = tf.layers.dropout(inputs=h3, rate=0.4)
        with tf.variable_scope("dense3", reuse=tf.AUTO_REUSE) as scope:
            raw_encode = tf.layers.dense(inputs=d3, units=output_size, activation=tf.nn.tanh)

    return raw_encode

In [116]:
def train(loss, optimizer, audio_x, audio_y, video_x, video_y, batch_size, num_iters=5000):
    audio_inputs, audio_labels, video_inputs, video_labels, loss, optimizer = build_graph(audio_x[0].shape, audio_y.shape[0])

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for i in range(num_iters):
            audio_batch_input, audio_batch_label = build_batch(audio_x, audio_y, batch_size)
            video_batch_input, video_batch_label = build_batch(video_x, video_y, batch_size)
            _, loss_val = sess.run([optimizer, loss], feed_dict={audio_inputs: audio_batch_input,
                                                             video_inputs: video_batch_input,
                                                             audio_labels: audio_batch_label,
                                                             video_labels: video_batch_label})
            if i % 100 == 0:
                print("Loss at batch " + str(i))
                print(loss_val)

In [117]:
tf.reset_default_graph()
train(loss, optimizer, audio_x, audio_y, video_x, video_y, 16)

Loss at batch 0
9.665541
Loss at batch 100
8.809877
Loss at batch 200
8.807909
Loss at batch 300
8.80874
Loss at batch 400
8.808024
Loss at batch 500
8.807798
Loss at batch 600
8.80401
Loss at batch 700
8.798994
Loss at batch 800
8.79342
Loss at batch 900
8.787271
Loss at batch 1000
8.780493
Loss at batch 1100
8.77808
Loss at batch 1200
8.779077
Loss at batch 1300
8.771797
Loss at batch 1400
8.768798
Loss at batch 1500
8.760667
Loss at batch 1600
8.7520075
Loss at batch 1700
8.972441
Loss at batch 1800
8.733269
Loss at batch 1900
8.723491
Loss at batch 2000
8.713846
Loss at batch 2100
8.702354
Loss at batch 2200
8.690161
Loss at batch 2300
8.677269
Loss at batch 2400
8.663697
Loss at batch 2500
8.649381
Loss at batch 2600
8.634296
Loss at batch 2700
8.625087
Loss at batch 2800
8.608751
Loss at batch 2900
8.59144
Loss at batch 3000
8.573368
Loss at batch 3100
8.554292
Loss at batch 3200
8.53428
Loss at batch 3300
8.5132885
Loss at batch 3400
8.491317
Loss at batch 3500
8.468199
Loss at 

In [38]:
#input layer is a tf placeholder that is the input data
#output size is the desired size of the encoded vector (should be the same size as the video vector?)
#scope is a string to keep track of all trainable variables in this stack
def build_fc_layers(input_layer, output_size):
    h1 = tf.layers.dense(inputs=input_layer, units=1024, activation=tf.nn.tanh)
    d1 = tf.layers.dropout(inputs=h1, rate=0.4)
    h2 = tf.layers.dense(inputs=d1, units=1024, activation=tf.nn.tanh)
    d2 = tf.layers.dropout(inputs=h2, rate=0.4)
    h3 = tf.layers.dense(inputs=d2, units=1024, activation=tf.nn.tanh)
    d3 = tf.layers.dropout(inputs=h3, rate=0.4)
    raw_encode = tf.layers.dense(inputs=d3, units=output_size, activation=tf.nn.tanh)
    return raw_encode

In [39]:
#audio input is a tf placeholder for the input audio features
#video labels is a tf placeholder for the input video features
#encode size is the desired size of the encoded vector (should be the same size as the video features)
#l1 and l2 reg are the amount of weight to put on l1 and l2 regularizers for the loss
def build_fc_net(audio_input, video_labels, encode_size, l1_reg=0, l2_reg=0):
    audio_net = build_fc_layers(audio_input, encode_size)
    weights = tf.trainable_variables()
    if l1_reg != 0:
        reg_1 = tf.contrib.layers.l1_regularizer(scale=l1_reg)
    else:
        reg_1 = 0
    if l2_reg != 0:
        reg_2 = tf.contrib.layers.l2_regularizer(scale=l2_reg)
    else:
        reg_2 = 0
    mse = tf.losses.mean_squared_error(audio_net, video_labels)
    reg_penalty = tf.contrib.layers.apply_regularization(reg_1, weights) + tf.contrib.layers.apply_regularization(reg_2, weights)
    loss = mse + reg_penalty
    return loss

In [40]:
def build_graph(audio_input_shape, video_input_shape, learning_rate=0.0001, l1_reg=0.0001, l2_reg=0):
    inputs = tf.placeholder(shape=audio_input_shape, name="inputs", dtype=tf.float32)
    labels = tf.placeholder(shape=video_input_shape, name="labels", dtype=tf.float32)
    flattened_labels = tf.layers.Flatten()(labels)
    loss = build_fc_net(inputs, flattened_labels, np.prod(video_input_shape), l1_reg, l2_reg)
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return inputs, labels, loss, optimizer

In [41]:
def train(inputs, labels, loss, optimizer, dataset, batch_size, num_iters=1000):
    with tf.Session() as sess:
        for i in range(num_iters):
            audio_batch_input, audio_batch_label = build_batch(dataset, batch_size)
            _, loss = sess.run([optimizer, loss], feed_dict={inputs:batch_input, labels:batch_label})
            if i % 100 == 0:
                print("Loss at batch " + str(i))
                print(loss)

ValueError: Shapes (450, 1) and (450, 10, 450) are incompatible