In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import glob
tf.logging.set_verbosity(tf.logging.ERROR)
import numpy as np
from tensorboardX import SummaryWriter

In [2]:
# from youtube-8m utils.py
def Dequantize(feat_vector, max_quantized_value=2, min_quantized_value=-2):
    """Map byte-quantized feature values back onto a float range.

    Args:
      feat_vector: the quantized values; anything that supports
        `value * float + float` (scalar, array, tensor).
      max_quantized_value: upper end of the float range.
      min_quantized_value: lower end of the float range.

    Returns:
      `feat_vector * (range / 255) + (range / 512 + min_quantized_value)`,
      where `range` is `max_quantized_value - min_quantized_value`.
    """
    assert max_quantized_value > min_quantized_value
    value_span = max_quantized_value - min_quantized_value
    step = value_span / 255.0
    offset = (value_span / 512.0) + min_quantized_value
    return feat_vector * step + offset

def decode(feat_vector, feature_size):
    """Decode raw byte strings into a float32 tensor with `feature_size` columns.

    The bytes are read as uint8, cast to float32 and reshaped to
    [-1, feature_size].
    """
    as_bytes = tf.decode_raw(feat_vector, tf.uint8)
    as_floats = tf.cast(as_bytes, tf.float32)
    return tf.reshape(as_floats, [-1, feature_size])

In [3]:
# filepath is the path to a TFRecord file
# data_type is 'audio' or 'video'
# output_labels and output_features are lists (empty or already populated)
# that this function appends to
def load_data(filepath, data_type, output_labels, output_features):
    """Append the label and dequantized features of every record in a TFRecord file.

    Args:
        filepath: path to the TFRecord file to read.
        data_type: 'audio' (reads the 'audio_embedding' sequence feature) or
            'video' (reads the 'rgb' sequence feature).
        output_labels: list extended in place with the first label of each
            record.
        output_features: list extended in place with one feature array per
            record, reshaped to rows of 128 values.

    Returns:
        The (output_labels, output_features) lists that were passed in.

    Raises:
        ValueError: if data_type is neither 'audio' nor 'video'.
    """
    if data_type == 'audio':
        context_spec = {
            'labels': tf.VarLenFeature(dtype=tf.int64)
        }
        feature_name = 'audio_embedding'
    elif data_type == 'video':
        context_spec = {
            'id': tf.FixedLenFeature([], dtype=tf.string),
            'labels': tf.VarLenFeature(dtype=tf.int64)
        }
        feature_name = 'rgb'
    else:
        # Fail before any graph is built instead of hitting an unbound local
        # further down.
        raise ValueError(
            "data_type must be 'audio' or 'video', got %r" % (data_type,))

    sequence_spec = {
        feature_name: tf.FixedLenSequenceFeature([], dtype=tf.string)
    }
    # Both feature types are decoded into rows of 128 values.
    feature_len = 128

    tf.reset_default_graph()

    # Read TFRecord file
    reader = tf.TFRecordReader()
    filename_queue = tf.train.string_input_producer([filepath])

    # Extract features from serialized data
    _, serialized_example = reader.read(filename_queue)
    parsed_context, parsed_sequences = tf.io.parse_single_sequence_example(
        serialized_example,
        context_features=context_spec,
        sequence_features=sequence_spec)
    # Only the first label of each record is kept.
    label = parsed_context['labels'].values[0]
    data = Dequantize(decode(parsed_sequences[feature_name], feature_len))

    # Many tf.train functions use tf.train.QueueRunner,
    # so we need to start it before we read
    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            # The filename queue is created without an epoch limit, so the
            # record count is what bounds the read loop.
            num_in_file = sum(1 for _ in tf.python_io.tf_record_iterator(filepath))

            for _ in range(num_in_file):
                d, l = sess.run([data, label])
                output_labels.append(l)
                output_features.append(d)
        except tf.errors.OutOfRangeError:
            print('Finished extracting from tfrecord data.')
        finally:
            coord.request_stop()
            coord.join(threads)

    return output_labels, output_features

In [4]:
# Each modality is loaded into fresh lists, so re-running this cell replaces
# the results instead of appending to them.
audio_path = "audio_1556954623.538376.tfrecord"
audio_output_labels, audio_output_features = load_data(audio_path, 'audio', [], [])
# To append a second audio file, call load_data again with the lists above,
# e.g. for "audio_1556932956.096088.tfrecord".

video_path = "video_1556949434.26188.tfrecord"
video_output_labels, video_output_features = load_data(video_path, 'video', [], [])
# To append a second video file, call load_data again with the lists above,
# e.g. for "video_1556931967.658074.tfrecord".

In [5]:
# Stack the per-record lists into arrays: *_x holds the features, *_y the labels.
audio_x, audio_y = np.array(audio_output_features), np.array(audio_output_labels)
video_x, video_y = np.array(video_output_features), np.array(video_output_labels)

for _name, _array in (('audio_x', audio_x), ('audio_y', audio_y),
                      ('video_x', video_x), ('video_y', video_y)):
    print(_name + ' shape: ', _array.shape)


audio_x shape:  (1939, 10, 128)
audio_y shape:  (1939,)
video_x shape:  (1939, 10, 128)
video_y shape:  (1939,)


In [None]:
# Show the label arrays of both modalities, one per line.
print(audio_y, video_y, sep='\n')

In [35]:
# Hold out a random 20% of the rows for validation. The same row indices are
# applied to the audio and the video arrays.
VALIDATION_FRACTION = 0.2
SPLIT_SEED = 0  # fixed so that re-running the cell gives the same split

# Number of held-out rows; later cells also pass this as the evaluation batch size.
validation_split = int(audio_y.size * VALIDATION_FRACTION)

# A dedicated generator keeps the split reproducible without touching the
# global NumPy random state used for batch sampling.
_split_rng = np.random.RandomState(SPLIT_SEED)
validation_indices = _split_rng.choice(audio_y.size, validation_split, replace=False)

validation_audio_x = audio_x[validation_indices]
validation_audio_y = audio_y[validation_indices]
validation_video_x = video_x[validation_indices]
validation_video_y = video_y[validation_indices]

# Everything that was not drawn for validation is training data.
training_audio_x = np.delete(audio_x, validation_indices, axis=0)
training_audio_y = np.delete(audio_y, validation_indices, axis=0)
training_video_x = np.delete(video_x, validation_indices, axis=0)
training_video_y = np.delete(video_y, validation_indices, axis=0)

assert len(training_audio_y) + len(validation_audio_y) == audio_y.size

print(training_audio_x.shape)
print(training_audio_y.shape)
print(training_video_x.shape)
print(training_video_y.shape)

(1552, 10, 128)
(1552,)
(1552, 10, 128)
(1552,)


array([ 0,  0,  0, ..., 99, 99, 99])

In [7]:
# --- At this point the desired data should be loaded --- 

In [36]:
def build_batch(dataset_x, dataset_y, batch_size, labels=None):
    """Assemble a batch either at random or by looking up given labels.

    Args:
        dataset_x: indexable collection of examples.
        dataset_y: array of labels aligned with dataset_x.
        batch_size: number of rows to draw when `labels` is None; not used
            when `labels` is given.
        labels: optional iterable of labels. For each one, the first row of
            dataset_y equal to it is selected.

    Returns:
        (batch_x, batch_y, indices): the selected examples and labels as
        arrays, plus the list of row indices they came from.
    """
    if labels is not None:
        # One row per requested label: the first occurrence in dataset_y.
        indices = [np.where(dataset_y == wanted)[0][0] for wanted in labels]
    else:
        # Uniform draw with replacement over all rows.
        indices = list(np.random.randint(0, len(dataset_x), size=batch_size))

    batch_x = np.array([dataset_x[row] for row in indices])
    batch_y = np.array([dataset_y[row] for row in indices])
    return batch_x, batch_y, indices

In [45]:
def build_balanced_batch(dataset1_x, dataset1_y, dataset2_x, dataset2_y, batch_size):
    """Build paired batches: first half label-matched, second half random.

    Half of `batch_size` labels are sampled from dataset1_y. The first half of
    each returned batch is looked up by those labels in the respective
    dataset; the second half of each batch is drawn at random, independently
    for the two datasets.

    Returns:
        (x1, y1, x2, y2): examples and labels for dataset1 and dataset2.
    """
    half = int(batch_size / 2)
    anchor_rows = list(np.random.randint(0, len(dataset1_x), size=half))
    shared_labels = dataset1_y[anchor_rows]

    # Label-matched halves (same labels, same order, in both datasets).
    paired1_x, paired1_y, _ = build_batch(dataset1_x, dataset1_y, half, shared_labels)
    paired2_x, paired2_y, _ = build_batch(dataset2_x, dataset2_y, half, shared_labels)

    # Independently sampled halves.
    free1_x, free1_y, _ = build_batch(dataset1_x, dataset1_y, half)
    free2_x, free2_y, _ = build_batch(dataset2_x, dataset2_y, half)

    return (np.concatenate([paired1_x, free1_x]),
            np.concatenate([paired1_y, free1_y]),
            np.concatenate([paired2_x, free2_x]),
            np.concatenate([paired2_y, free2_y]))

In [46]:
def contrastive_loss(embedded_batch_a, embedded_batch_b, a_labels, b_labels, margin):
    """Contrastive loss between two row-aligned batches of embeddings.

    Row i of both batches forms a pair, which counts as a match when its two
    labels are equal. Matching pairs are penalised by half their squared
    distance; non-matching pairs by half the squared amount by which their
    distance falls short of `margin`.

    Args:
        embedded_batch_a: embeddings of the first batch; the distance is taken
            over axis 1.
        embedded_batch_b: embeddings of the second batch, same shape.
        a_labels: labels of the first batch, one per row; shape (batch,) or
            (batch, 1).
        b_labels: labels of the second batch, same shape as a_labels.
        margin: distance below which non-matching pairs are penalised.

    Returns:
        (y, loss): `y` is the float32 match indicator with the shape of the
        label comparison; `loss` is the scalar sum of the per-pair losses.
    """
    y = tf.cast(tf.equal(a_labels, b_labels), tf.float32)
    # The distances below have shape (batch,). If the labels arrive as
    # (batch, 1), multiplying the indicator by them broadcasts to a
    # (batch, batch) matrix and pairs every label with every distance, so use
    # a rank-1 view of the indicator inside the loss.
    y_per_pair = tf.reshape(y, [-1])
    dist = tf.norm(embedded_batch_a - embedded_batch_b, axis=1)
    match_term = y_per_pair * .5 * tf.square(dist)
    mismatch_term = (1 - y_per_pair) * .5 * tf.square(tf.maximum(0., margin - dist))
    loss = match_term + mismatch_term
    return y, tf.reduce_sum(loss)

In [47]:
def build_graph(input_shape, labels_shape,
                embed_size=128, learning_rate=0.001,
                l1_reg=0.001, l2_reg=0.001, margin=50):
    """Build the two-branch embedding graph and its training op.

    Audio and video inputs are flattened and sent through the same
    fully-connected network (the second call reuses the variables of the
    first). The loss is the contrastive loss between the two embeddings plus
    L1 and L2 penalties over all trainable variables.

    Args:
        input_shape: shape of one example; its first two entries size the
            input placeholders.
        labels_shape: not used.
        embed_size: width of the embedding.
        learning_rate: learning rate of the Adam optimizer.
        l1_reg: scale of the L1 penalty.
        l2_reg: scale of the L2 penalty.
        margin: margin passed to the contrastive loss.

    Returns:
        (audio_inputs, audio_labels, video_inputs, video_labels,
         loss, optimizer, matches, audio_embed, video_embed), where `matches`
        is the per-pair match indicator returned by the contrastive loss.
    """
    audio_inputs = tf.placeholder(shape=[None, input_shape[0], input_shape[1]], name="audio_inputs", dtype=tf.float32)
    video_inputs = tf.placeholder(shape=[None, input_shape[0], input_shape[1]], name="video_inputs", dtype=tf.float32)
    audio_labels = tf.placeholder(shape=[None], name="audio_labels", dtype=tf.float32)
    video_labels = tf.placeholder(shape=[None], name="video_labels", dtype=tf.float32)

    flattened_audio = tf.layers.flatten(audio_inputs)
    flattened_video = tf.layers.flatten(video_inputs)

    audio_embed = build_fc_net(flattened_audio, embed_size, l1_reg, l2_reg)
    video_embed = build_fc_net(flattened_video, embed_size, l1_reg, l2_reg, reuse=True)

    # The labels go in as the rank-1 placeholders. Running them through
    # tf.layers.flatten would turn them into (batch, 1), which broadcasts
    # against the (batch,) distances in the loss into a (batch, batch) matrix.
    matches, error = contrastive_loss(audio_embed, video_embed, audio_labels, video_labels, margin)

    # Collected after the networks are built so the list actually contains
    # their variables.
    weights = tf.trainable_variables()
    reg_1 = tf.contrib.layers.l1_regularizer(scale=l1_reg)
    reg_2 = tf.contrib.layers.l2_regularizer(scale=l2_reg)
    reg_penalty = tf.contrib.layers.apply_regularization(reg_1, weights) + tf.contrib.layers.apply_regularization(reg_2, weights)

    loss = error + reg_penalty
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return (audio_inputs, audio_labels,
            video_inputs, video_labels,
            loss, optimizer, matches,
            audio_embed, video_embed)

In [48]:
# input_data: the (flattened) input features to embed
# embed_size: width of the embedding produced by the network
# l1_reg, l2_reg: accepted but not used inside this function
# reuse: forwarded to build_fc_layers to reuse the variables of an earlier call

def build_fc_net(input_data, embed_size, l1_reg=0.001, l2_reg=0.001, reuse=False):
    """Return the embedding of `input_data` produced by build_fc_layers."""
    return build_fc_layers(input_data, embed_size, reuse)

In [49]:
# input layer should be the flattened inputs
def build_fc_layers(input_layer, output_size, reuse=False, training=False):
    """Stack six 1024-unit tanh layers followed by a tanh output layer.

    Every layer consumes the output of the layer before it, so all seven
    dense layers (variable scopes "dense0" .. "dense6") contribute to the
    result.

    Args:
        input_layer: the flattened input tensor.
        output_size: number of units of the final layer.
        reuse: passed to every variable scope; set True to share the
            variables created by an earlier call.
        training: passed to tf.layers.dropout. With the default (False) the
            dropout layers pass their input through unchanged.

    Returns:
        The output tensor of the final dense layer.
    """
    hidden = input_layer
    with tf.name_scope("model"):
        for layer_idx in range(6):
            with tf.variable_scope("dense%d" % layer_idx, reuse=reuse):
                hidden = tf.layers.dense(inputs=hidden, units=1024, activation=tf.nn.tanh)
                hidden = tf.layers.dropout(inputs=hidden, rate=.3, training=training)
        with tf.variable_scope("dense6", reuse=reuse):
            raw_encode = tf.layers.dense(inputs=hidden, units=output_size, activation=tf.nn.tanh)

    return raw_encode

In [50]:
def train(sess, audio_inputs, audio_labels,
          video_inputs, video_labels,
          audio_x, audio_y,
          video_x, video_y,
          loss, optimizer,
          audio_embed, video_embed,
          batch_size, cur_epoch, num_iters=1000):
    """Run `num_iters` optimisation steps on freshly sampled balanced batches.

    Args:
        sess: session in which the graph is run.
        audio_inputs, audio_labels, video_inputs, video_labels: placeholders
            fed with each sampled batch.
        audio_x, audio_y, video_x, video_y: data the batches are sampled from.
        loss: loss tensor, evaluated for logging.
        optimizer: training op run on every step.
        audio_embed, video_embed: not used.
        batch_size: passed to build_balanced_batch.
        cur_epoch: index of the current epoch; offsets the logging step.
        num_iters: number of steps to run.

    Every 100th step the loss is written as 'train_loss' to the module-level
    `train_summary_writer`.
    """
    for i in range(num_iters):
        (audio_batch_input, audio_batch_label,
         video_batch_input, video_batch_label) = build_balanced_batch(
            audio_x, audio_y, video_x, video_y, batch_size)

        # Only the training op and the loss are fetched; nothing else is
        # needed from the step.
        _, loss_val = sess.run([optimizer, loss],
                               feed_dict={audio_inputs: audio_batch_input,
                                          video_inputs: video_batch_input,
                                          audio_labels: audio_batch_label,
                                          video_labels: video_batch_label})
        if i % 100 == 0:
            train_summary_writer.add_scalar('train_loss', loss_val, cur_epoch * num_iters + i)

In [51]:
def k_nearest_neighbors(embedding, search_space, k):
    """Return the row indices of the `k` rows of `search_space` closest to `embedding`.

    Closeness is the Euclidean distance between `embedding` and each row;
    indices are ordered from nearest to farthest.
    """
    # Broadcasting the query against every row gives the same differences as
    # stacking len(search_space) copies of it.
    offsets = embedding[np.newaxis, :] - search_space
    distances = np.linalg.norm(offsets, axis=1)
    nearest_first = np.argsort(distances)
    return nearest_first[:k]

def validate(sess, audio_inputs, audio_labels,
             video_inputs, video_labels,
             audio_x, audio_y,
             video_x, video_y,
             loss, optimizer,
             audio_embed, video_embed,
             cur_epoch, batch_size=100, k=50, tfboard_var='val_found_matches'):
    """Log the fraction of audio queries whose label appears among their k nearest videos.

    A random audio batch is drawn, and a video batch is built by looking up
    the label of every sampled audio example. Both batches are embedded; for
    each audio embedding the `k` closest video embeddings are retrieved, and
    the query counts as a hit when its label is among theirs. The hit count
    divided by `batch_size` is written under `tfboard_var` at step
    `cur_epoch` to the module-level `train_summary_writer`.

    `loss` and `optimizer` are not used.
    """
    audio_batch_input, audio_batch_label, _ = build_batch(audio_x, audio_y, batch_size)
    video_batch_input, video_batch_label, _ = build_batch(
        video_x, video_y, batch_size, audio_batch_label)

    audio_feed = {audio_inputs: audio_batch_input, audio_labels: audio_batch_label}
    video_feed = {video_inputs: video_batch_input, video_labels: video_batch_label}
    audio_embeddings = sess.run(audio_embed, feed_dict=audio_feed)
    video_embeddings = sess.run(video_embed, feed_dict=video_feed)

    hits = sum(
        1
        for query, query_label in zip(audio_embeddings, audio_batch_label)
        if query_label in video_batch_label[k_nearest_neighbors(query, video_embeddings, k)]
    )
    train_summary_writer.add_scalar(tfboard_var, hits / batch_size, cur_epoch)

In [52]:
# TensorBoard output; train() and validate() write to this module-level writer.
train_log_dir = 'logs/tensorboard/train/log9'
train_summary_writer = SummaryWriter(train_log_dir)

# Hyperparameters
num_epochs, batch, embed_size = 500, 64, 128
learning_rate = 0.0001
l1_reg = l2_reg = 0.001
margin = 2

tf.reset_default_graph()

(audio_inputs, audio_labels,
 video_inputs, video_labels,
 loss, optimizer, matches,
 audio_embed, video_embed) = build_graph(
    input_shape=audio_x[0].shape,
    labels_shape=audio_y.shape[0],
    embed_size=embed_size,
    learning_rate=learning_rate,
    l1_reg=l1_reg,
    l2_reg=l2_reg,
    margin=margin)

# The session is left open: the evaluation cell below reuses it.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

for epoch in range(num_epochs):
    train(sess, audio_inputs, audio_labels, video_inputs, video_labels,
          audio_x=training_audio_x, audio_y=training_audio_y,
          video_x=training_video_x, video_y=training_video_y,
          loss=loss, optimizer=optimizer,
          audio_embed=audio_embed, video_embed=video_embed,
          batch_size=batch, cur_epoch=epoch)

    # Retrieval score on the training split.
    validate(sess, audio_inputs, audio_labels, video_inputs, video_labels,
             audio_x=training_audio_x, audio_y=training_audio_y,
             video_x=training_video_x, video_y=training_video_y,
             loss=loss, optimizer=optimizer,
             audio_embed=audio_embed, video_embed=video_embed,
             cur_epoch=epoch, batch_size=validation_split,
             tfboard_var='train_found_matches')

    # Retrieval score on the held-out split (logged under validate()'s default tag).
    validate(sess, audio_inputs, audio_labels, video_inputs, video_labels,
             audio_x=validation_audio_x, audio_y=validation_audio_y,
             video_x=validation_video_x, video_y=validation_video_y,
             loss=loss, optimizer=optimizer,
             audio_embed=audio_embed, video_embed=video_embed,
             cur_epoch=epoch, batch_size=validation_split)


In [None]:
# Retrieval check over the full (unsplit) arrays, using validate()'s default
# batch_size, k and tag, logged at the last value of `epoch`.
validate(sess, audio_inputs, audio_labels, video_inputs, video_labels,
         audio_x=audio_x, audio_y=audio_y,
         video_x=video_x, video_y=video_y,
         loss=loss, optimizer=optimizer,
         audio_embed=audio_embed, video_embed=video_embed,
         cur_epoch=epoch)