In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import glob
tf.logging.set_verbosity(tf.logging.ERROR)
import numpy as np

In [2]:
# from youtube-8m utils.py
def Dequantize(feat_vector, max_quantized_value=2, min_quantized_value=-2):
  """Map byte-quantized feature values back onto a float range.

  The result is computed as `feat_vector * step + offset`, where `step` is
  the float range divided by 255 and `offset` is the range minimum plus the
  range divided by 512.

  Args:
    feat_vector: quantized values (anything that supports `* float` and
      `+ float`, e.g. a tensor or array).
    max_quantized_value: upper end of the float range.
    min_quantized_value: lower end of the float range.

  Returns:
    The dequantized values, with the same shape as feat_vector.

  Raises:
    AssertionError: if max_quantized_value is not greater than
      min_quantized_value.
  """
  assert max_quantized_value > min_quantized_value
  span = max_quantized_value - min_quantized_value
  step = span / 255.0
  offset = min_quantized_value + (span / 512.0)
  return feat_vector * step + offset

def decode(feat_vector, feature_size):
    """Decode raw byte strings into a float32 matrix.

    Args:
        feat_vector: string tensor whose bytes are read as uint8 values.
        feature_size: number of columns in the reshaped result.

    Returns:
        A float32 tensor of shape [-1, feature_size].
    """
    raw_bytes = tf.decode_raw(feat_vector, tf.uint8)
    as_float = tf.cast(raw_bytes, tf.float32)
    return tf.reshape(as_float, [-1, feature_size])

In [3]:
# filepath is path to tfrecord
# datatype is audio or video
# output_features and output_labels are empty lists or existing lists
def load_data(filepath, data_type, output_labels, output_features):
    """Read every record of a TFRecord file into label / feature lists.

    Each record is parsed as a SequenceExample. The first entry of its
    'labels' context feature is appended to `output_labels`, and its
    dequantized embedding (reshaped to [-1, 128]) is appended to
    `output_features`.

    Args:
        filepath: path to the TFRecord file.
        data_type: 'audio' (reads the 'audio_embedding' sequence feature) or
            'video' (reads the 'rgb' sequence feature).
        output_labels: list that receives one label per record.
        output_features: list that receives one feature array per record.

    Returns:
        The tuple (output_labels, output_features); both lists are mutated
        in place.

    Raises:
        ValueError: if data_type is neither 'audio' nor 'video'.
    """
    if data_type == 'audio':
        context_spec = {
            'labels': tf.VarLenFeature(dtype=tf.int64)
        }
        sequence_spec = {
            'audio_embedding': tf.FixedLenSequenceFeature([], dtype=tf.string)
        }
        feature_name = 'audio_embedding'
        feature_len = 128
    elif data_type == 'video':
        context_spec = {
            'id': tf.FixedLenFeature([], dtype=tf.string),
            'labels': tf.VarLenFeature(dtype=tf.int64)
        }
        sequence_spec = {
            'rgb': tf.FixedLenSequenceFeature([], dtype=tf.string),
        }
        feature_name = 'rgb'
        feature_len = 128
    else:
        # Previously an unknown type fell through to a NameError further down.
        raise ValueError(
            "data_type must be 'audio' or 'video', got %r" % (data_type,))

    tf.reset_default_graph()

    # Read TFRecord file
    reader = tf.TFRecordReader()
    filename_queue = tf.train.string_input_producer([filepath])
    _, serialized_example = reader.read(filename_queue)

    # Extract features from serialized data
    context, features = tf.io.parse_single_sequence_example(
        serialized_example,
        context_features=context_spec,
        sequence_features=sequence_spec)

    # Build the fetch ops once, outside the loop, so the graph does not grow
    # with every record.
    label_values = context['labels'].values
    dequantized = Dequantize(decode(features[feature_name], feature_len))

    num_in_file = sum(1 for _ in tf.python_io.tf_record_iterator(filepath))

    with tf.Session() as sess:
        # Many tf.train functions use tf.train.QueueRunner,
        # so we need to start it before we read
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            for _ in range(num_in_file):
                # Fetch labels and features in ONE run call: every separate
                # run/eval advances the reader, so fetching them separately
                # pairs the labels of one record with the features of the
                # next one.
                labels, data = sess.run([label_values, dequantized])
                # Only the first label of each record is kept.
                output_labels.append(labels[0])
                output_features.append(data)
        finally:
            # Stop the queue-runner threads before the session is closed.
            coord.request_stop()
            coord.join(threads)

    return output_labels, output_features

In [5]:
# Load the audio embeddings and their labels into fresh lists.
audio_path = "audio_1556745450.370243.tfrecord"
audio_output_labels, audio_output_features = [], []
audio_output_labels, audio_output_features = load_data(
    audio_path, 'audio', audio_output_labels, audio_output_features)

# Load the video embeddings and their labels the same way.
video_path = "video_1556754626.438139.tfrecord"
video_output_labels, video_output_features = [], []
video_output_labels, video_output_features = load_data(
    video_path, 'video', video_output_labels, video_output_features)



In [6]:
# Stack the per-record lists into arrays: *_x holds features, *_y holds labels.
audio_x = np.array(audio_output_features)
video_x = np.array(video_output_features)
audio_y = np.array(audio_output_labels)
video_y = np.array(video_output_labels)

# Report each array's shape as a quick sanity check.
for array_name, array in (('audio_x', audio_x), ('audio_y', audio_y),
                          ('video_x', video_x), ('video_y', video_y)):
    print(array_name + ' shape: ', array.shape)


audio_x shape:  (450, 10, 128)
audio_y shape:  (450,)
video_x shape:  (450, 10, 128)
video_y shape:  (450,)


In [7]:
# --- At this point the desired data should be loaded --- 

In [None]:
def build_batch(dataset, batch_size):
    """Draw a random mini-batch from a paired dataset.

    NOTE(review): the original cell had no body. This implementation assumes
    `dataset` is an `(inputs, labels)` pair of equal-length array-likes,
    because `train()` unpacks the result into a batch input and a batch
    label -- confirm against the intended dataset format.

    Args:
        dataset: pair `(inputs, labels)` indexed along their first axis.
        batch_size: number of examples to draw (positive integer).

    Returns:
        Tuple `(batch_input, batch_label)` of numpy arrays whose first axis
        has length `batch_size`; row i of both comes from the same example.

    Raises:
        ValueError: if the dataset is empty, the two parts differ in length,
            or batch_size is not positive.
    """
    inputs, labels = dataset
    inputs = np.asarray(inputs)
    labels = np.asarray(labels)

    num_examples = len(inputs)
    if len(labels) != num_examples:
        raise ValueError(
            "inputs and labels differ in length: %d vs %d"
            % (num_examples, len(labels)))
    if num_examples == 0:
        raise ValueError("cannot build a batch from an empty dataset")
    if batch_size <= 0:
        raise ValueError("batch_size must be positive, got %r" % (batch_size,))

    # Sample without replacement when possible; fall back to sampling with
    # replacement only when the batch is larger than the dataset.
    indices = np.random.choice(
        num_examples, size=batch_size, replace=batch_size > num_examples)
    return inputs[indices], labels[indices]


In [None]:
def network_model_1(features, labels, mode):
    """Estimator model function: one dense hidden layer and a 10-way output.

    NOTE(review): the original cell fed an undefined `pool2_flat` into the
    dense layer; no convolution/pooling layers exist in this notebook. The
    reshaped input is flattened and used in its place -- confirm this is the
    intended architecture.

    Args:
        features: dict whose "x" entry is the input tensor; it must be
            reshapeable to [-1, 28, 28, 1].
        labels: integer class ids, consumed by sparse softmax cross-entropy.
        mode: a `tf.estimator.ModeKeys` value.

    Returns:
        A `tf.estimator.EstimatorSpec` for the given mode: predictions for
        PREDICT, loss and train op for TRAIN, loss and accuracy otherwise.
    """
    # Input Layer
    input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])
    flat_input = tf.reshape(input_layer, [-1, 28 * 28])

    dense = tf.layers.dense(inputs=flat_input, units=1024, activation=tf.nn.relu)
    # Dropout is only active while training.
    dropout = tf.layers.dropout(
        inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Logits Layer
    logits = tf.layers.dense(inputs=dropout, units=10)

    predictions = {
        # Generate predictions (for PREDICT and EVAL mode)
        "classes": tf.argmax(input=logits, axis=1),
        # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
        # `logging_hook`.
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate Loss (for both TRAIN and EVAL modes)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(
            labels=labels, predictions=predictions["classes"])
    }
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

In [8]:
#input layer is a tf placeholder that is the input data
#output size is the desired size of the encoded vector (should be the same size as the video vector?)
#scope is a string to keep track of all trainable variables in this stack
def build_fc_layers(input_layer, output_size, scope, training=False):
    """Build a stack of three tanh hidden layers plus a tanh output layer.

    All variables are created inside `tf.variable_scope(scope)`, so they can
    be retrieved with `tf.trainable_variables(scope=scope)`.

    Args:
        input_layer: input tensor; the dense layers act on its last axis.
        output_size: number of units in the final (encoding) layer.
        scope: variable-scope name that groups this stack's variables.
        training: bool or boolean tensor; dropout is applied only when it is
            true. Defaults to False, which matches the original calls (they
            passed no `training` flag, so dropout was inert).

    Returns:
        The encoding tensor. Its values are tanh outputs, so they lie in
        (-1, 1).
    """
    # tf.layers.dense takes no `scope` argument; a variable scope provides the
    # grouping instead.
    with tf.variable_scope(scope):
        hidden = input_layer
        for _ in range(3):
            hidden = tf.layers.dense(
                inputs=hidden, units=1024, activation=tf.nn.tanh)
            hidden = tf.layers.dropout(
                inputs=hidden, rate=0.4, training=training)
        raw_encode = tf.layers.dense(
            inputs=hidden, units=output_size, activation=tf.nn.tanh)
    return raw_encode

In [9]:
#audio input is a tf placeholder for the input audio features
#video labels is a tf placeholder for the input video features
#encode size is the desired size of the encoded vector (should be the same size as the video features)
#l1 and l2 reg are the amount of weight to put on l1 and l2 regularizers for the loss
def build_fc_net(audio_input, video_labels, encode_size, l1_reg=0, l2_reg=0):
    """Build the audio encoder and its regularized MSE loss.

    Args:
        audio_input: tensor of audio features fed to the encoder.
        video_labels: tensor of target video features; must have the same
            shape as the encoder output.
        encode_size: number of units in the encoder's output layer.
        l1_reg: weight of the L1 penalty; 0 disables it.
        l2_reg: weight of the L2 penalty; 0 disables it.

    Returns:
        Scalar loss tensor: mean squared error plus any enabled penalties.
    """
    audio_net = build_fc_layers(audio_input, encode_size, scope="fc_audio")
    # Every trainable variable of the encoder (kernels and biases) is
    # penalized.
    weights = tf.trainable_variables(scope="fc_audio")

    loss = tf.losses.mean_squared_error(
        labels=video_labels, predictions=audio_net)

    # Only add a penalty term when its coefficient is non-zero: passing the
    # integer 0 as a regularizer makes apply_regularization try to call it.
    # The scale is cast to float because the contrib regularizers reject
    # integer scales.
    if l1_reg != 0:
        reg_1 = tf.contrib.layers.l1_regularizer(scale=float(l1_reg))
        loss = loss + tf.contrib.layers.apply_regularization(reg_1, weights)
    if l2_reg != 0:
        reg_2 = tf.contrib.layers.l2_regularizer(scale=float(l2_reg))
        loss = loss + tf.contrib.layers.apply_regularization(reg_2, weights)
    return loss

In [10]:
def build_graph(audio_input_shape, video_input_shape, learning_rate=0.0001, l1_reg=0, l2_reg=0):
    """Create placeholders, the encoder loss and an Adam training op.

    Both shapes are treated as `[batch, ...]`: the first axis is the batch
    axis (it may be None) and the remaining axes are flattened per example.

    Args:
        audio_input_shape: shape of the audio input placeholder.
        video_input_shape: shape of the video label placeholder; every axis
            after the first must be a known integer.
        learning_rate: learning rate passed to the Adam optimizer.
        l1_reg: weight of the L1 penalty on the encoder variables.
        l2_reg: weight of the L2 penalty on the encoder variables.

    Returns:
        Tuple `(inputs, labels, loss, optimizer)`: the two placeholders, the
        loss tensor and the op that performs one optimization step.
    """
    inputs = tf.placeholder(shape=audio_input_shape, name="inputs", dtype=tf.float32)
    labels = tf.placeholder(shape=video_input_shape, name="labels", dtype=tf.float32)

    # Flatten inputs as well as labels so that the encoder output
    # [batch, encode_size] lines up with the flattened labels.
    flattened_inputs = tf.layers.Flatten()(inputs)
    flattened_labels = tf.layers.Flatten()(labels)

    # The encoding size is the per-example label size, so the batch axis is
    # excluded from the product.
    encode_size = int(np.prod(video_input_shape[1:]))

    loss = build_fc_net(flattened_inputs, flattened_labels, encode_size, l1_reg, l2_reg)
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return inputs, labels, loss, optimizer

In [13]:
def train(inputs, labels, loss, optimizer, dataset, batch_size, num_iters=1000):
    """Run the training loop and print the loss every 100 batches.

    Args:
        inputs: placeholder that receives each batch of inputs.
        labels: placeholder that receives each batch of labels.
        loss: loss tensor to evaluate at every step.
        optimizer: op that performs one optimization step.
        dataset: data source passed through to `build_batch`.
        batch_size: number of examples per batch, passed to `build_batch`.
        num_iters: number of training steps to run.
    """
    with tf.Session() as sess:
        # Variables must be initialized before the first training step.
        sess.run(tf.global_variables_initializer())
        for i in range(num_iters):
            batch_input, batch_label = build_batch(dataset, batch_size)
            # Store the fetched value under its own name: rebinding `loss`
            # would replace the tensor with a number and break the next step.
            _, batch_loss = sess.run(
                [optimizer, loss],
                feed_dict={inputs: batch_input, labels: batch_label})
            if i % 100 == 0:
                print("Loss at batch " + str(i))
                print(batch_loss)