In [38]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import glob
tf.logging.set_verbosity(tf.logging.ERROR)
import numpy as np

In [31]:
# from youtube-8m utils.py
def Dequantize(feat_vector, max_quantized_value=2, min_quantized_value=-2):
  """Map byte-quantized feature values back onto a float interval.

  The input is scaled by ``(max - min) / 255`` and then shifted by
  ``min + (max - min) / 512``.

  Args:
    feat_vector: quantized values; anything supporting ``* float`` and
      ``+ float`` (a number, array or tensor).
    max_quantized_value: upper end of the dequantized interval.
    min_quantized_value: lower end of the dequantized interval.

  Returns:
    ``feat_vector`` rescaled to floats, with the same shape as the input.
  """
  assert max_quantized_value > min_quantized_value
  span = max_quantized_value - min_quantized_value
  # One quantization step, and the offset that re-centres the interval.
  step = span / 255.0
  offset = min_quantized_value + (span / 512.0)
  return feat_vector * step + offset

def decode(feat_vector, feature_size):
    """Decode raw byte strings into a float32 tensor with `feature_size` columns.

    Args:
      feat_vector: string tensor passed to ``tf.decode_raw`` and read as uint8.
      feature_size: size of the last dimension of the result.

    Returns:
      A float32 tensor reshaped to ``[-1, feature_size]``.
    """
    raw_bytes = tf.decode_raw(feat_vector, tf.uint8)
    as_float = tf.cast(raw_bytes, tf.float32)
    return tf.reshape(as_float, [-1, feature_size])

In [34]:
# filepath is the path to a TFRecord file
# data_type is 'audio' or 'video'
# output_labels and output_features are lists (empty or already populated)
# that the loaded values are appended to
def load_data(filepath, data_type, output_labels, output_features):
    """Read every record of a TFRecord file into label and feature lists.

    Each record is parsed as a sequence example. The first entry of its
    'labels' context feature is appended to ``output_labels``, and its
    embedding sequence (decoded to float32, reshaped to ``[-1, 128]`` and
    dequantized) is appended to ``output_features``. Label and features of a
    record are fetched in a single ``sess.run`` so they always belong to the
    same record.

    Args:
      filepath: path to the TFRecord file.
      data_type: 'audio' (reads the 'audio_embedding' feature list) or
        'video' (reads the 'rgb' feature list).
      output_labels: list that one label per record is appended to, in place.
      output_features: list that one array per record is appended to, in place.

    Returns:
      The same ``(output_labels, output_features)`` lists that were passed in.

    Raises:
      ValueError: if ``data_type`` is neither 'audio' nor 'video'.
      IndexError: if a record has an empty 'labels' feature.
    """
    if data_type == 'audio':
        context = {
            'labels': tf.VarLenFeature(dtype=tf.int64)
        }
        feature_list = {
            'audio_embedding': tf.FixedLenSequenceFeature([], dtype=tf.string)
        }
        feature_name = 'audio_embedding'
        feature_len = 128
    elif data_type == 'video':
        context = {
            'id': tf.FixedLenFeature([], dtype=tf.string),
            'labels': tf.VarLenFeature(dtype=tf.int64)
        }
        feature_list = {
            'rgb': tf.FixedLenSequenceFeature([], dtype=tf.string),
        }
        feature_name = 'rgb'
        feature_len = 128
    else:
        raise ValueError(
            "data_type must be 'audio' or 'video', got %r" % (data_type,))

    # Build the parsing ops once, in a private graph, so repeated calls neither
    # grow nor reset the caller's default graph.
    graph = tf.Graph()
    with graph.as_default():
        serialized_example = tf.placeholder(tf.string, shape=[])
        parsed_context, parsed_features = tf.io.parse_single_sequence_example(
            serialized_example,
            context_features=context,
            sequence_features=feature_list)
        label_tensor = parsed_context['labels']
        data_tensor = Dequantize(
            decode(parsed_features[feature_name], feature_len))

    # The context manager closes the session even if parsing fails part-way.
    with tf.Session(graph=graph) as sess:
        # Feed each serialized record straight from the file: one pass, no
        # queue-runner threads to start or stop.
        for record in tf.python_io.tf_record_iterator(filepath):
            labels, data = sess.run(
                [label_tensor, data_tensor],
                feed_dict={serialized_example: record})
            # Only the first label of each record is kept.
            output_labels.append(labels.values[0])
            output_features.append(data)

    return output_labels, output_features

In [90]:
# Audio embeddings: load_data appends to, and returns, the lists it is given,
# so fresh empty lists are passed in directly.
audio_path = "audio_1556754078.010573.tfrecord"
audio_output_labels, audio_output_features = load_data(
    audio_path, 'audio', [], [])

# Video (rgb) embeddings, loaded the same way.
video_path = "video_1556754626.438139.tfrecord"
video_output_labels, video_output_features = load_data(
    video_path, 'video', [], [])

In [92]:
# Stack the per-record results into arrays (features first, then labels).
audio_x, audio_y, video_x, video_y = (
    np.array(values)
    for values in (audio_output_features, audio_output_labels,
                   video_output_features, video_output_labels)
)

# Sanity check: features and labels should agree on the number of records.
print('audio_x shape: ', audio_x.shape)
print('audio_y shape: ', audio_y.shape)
print('video_x shape: ', video_x.shape)
print('video_y shape: ', video_y.shape)


audio_x shape:  (450, 10, 128)
audio_y shape:  (450,)
video_x shape:  (450, 10, 128)
video_y shape:  (450,)


In [None]:
# --- At this point the desired data should be loaded --- 

In [None]:
def build_batch(dataset, batch_size):
    """Placeholder for batching `dataset` into groups of `batch_size`.

    TODO: implement. The header originally had no body, which is a
    SyntaxError and stops the notebook from running top to bottom. The
    intended batching scheme is not specified, so this stub fails loudly
    instead of guessing.

    Raises:
      NotImplementedError: always, until the function is implemented.
    """
    raise NotImplementedError("build_batch is not implemented yet")


In [None]:
def network_model_1(features, labels, mode, params=None):
    """Estimator model function for a one-hidden-layer dense classifier.

    The graph is: flatten -> dense(1024, ReLU) -> dropout(0.4, training only)
    -> dense(num_classes) logits.

    Args:
      features: dict whose "x" entry is the input tensor. Every non-batch
        dimension is flattened (assumes the batch dimension comes first --
        TODO confirm against the input_fn).
      labels: integer class ids passed to the sparse softmax cross-entropy
        loss; not used in PREDICT mode.
      mode: one of the ``tf.estimator.ModeKeys`` values.
      params: optional mapping of hyper-parameters. ``params["num_classes"]``
        sets the width of the logits layer and defaults to 10.

    Returns:
      A ``tf.estimator.EstimatorSpec`` carrying predictions (PREDICT), loss
      and train op (TRAIN), or loss and accuracy metric (EVAL).
    """
    num_classes = (params or {}).get("num_classes", 10)

    # Input layer: collapse each example to a vector. This replaces a
    # hard-coded 28x28x1 image reshape whose result was never used.
    input_layer = tf.layers.flatten(features["x"])

    dense = tf.layers.dense(
        inputs=input_layer, units=1024, activation=tf.nn.relu)
    # Dropout is only active while training.
    dropout = tf.layers.dropout(
        inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Logits layer
    logits = tf.layers.dense(inputs=dropout, units=num_classes)

    predictions = {
        # Generate predictions (for PREDICT and EVAL mode)
        "classes": tf.argmax(input=logits, axis=1),
        # Named so the softmax output can be looked up in the graph, e.g. by
        # a logging hook.
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate loss (for both TRAIN and EVAL modes)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    # Configure the training op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(
            labels=labels, predictions=predictions["classes"])
    }
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)