In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import glob
tf.logging.set_verbosity(tf.logging.ERROR)
import numpy as np
#from tensorboardX import SummaryWriter

In [2]:
# from youtube-8m utils.py
def Dequantize(feat_vector, max_quantized_value=2, min_quantized_value=-2):
  """Dequantize the feature from the byte format to the float format.

  Args:
    feat_vector: the input 1-d vector.
    max_quantized_value: the maximum of the quantized value.
    min_quantized_value: the minimum of the quantized value.

  Returns:
    A float vector which has the same shape as feat_vector.
  """
  assert max_quantized_value > min_quantized_value
  quantized_range = max_quantized_value - min_quantized_value
  scalar = quantized_range / 255.0
  bias = (quantized_range / 512.0) + min_quantized_value
  return feat_vector * scalar + bias

def decode(feat_vector, feature_size):
    return tf.reshape(tf.cast(tf.decode_raw(feat_vector, 
                                            tf.uint8), 
                              tf.float32),
                      [-1, feature_size])

In [3]:
# filepath is path to tfrecord
# datatype is audio or video
# output_features and output_labels are empty lists or existing lists
def load_data(filepath, data_type, output_labels, output_features):
    if data_type == 'audio':
        context = {
            'labels': tf.VarLenFeature(dtype=tf.int64)
        }

        feature_list = {
            'audio_embedding': tf.FixedLenSequenceFeature([], dtype=tf.string)
        }
        feature_name = 'audio_embedding'
        feature_len = 128

    elif data_type == 'video':
        context = {
            'id': tf.FixedLenFeature([], dtype=tf.string),
            'labels': tf.VarLenFeature(dtype=tf.int64)
        }

        feature_list = {
            'rgb': tf.FixedLenSequenceFeature([], dtype=tf.string),
        }
        feature_name = 'rgb'
        feature_len = 128
        
        
    tf.reset_default_graph()    


    # Read TFRecord file
    reader = tf.TFRecordReader()
    filename_queue = tf.train.string_input_producer([filepath])


    # Extract features from serialized data

    _, serialized_example = reader.read(filename_queue)
    context, features = tf.io.parse_single_sequence_example(serialized_example,
                                                    context_features=context,
                                                    sequence_features=feature_list,
                                                    example_name=None,
                                                    name=None
    )
    labels = context['labels']
    label = labels.values[0]
    data = Dequantize(decode(features[feature_name], feature_len))

    # Many tf.train functions use tf.train.QueueRunner,
    # so we need to start it before we read
    
    
    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        # f = codecs.open(outpath, "w", encoding='utf-8')
        try:
            counter = 0
            recordlist = []

            num_in_file = sum(1 for _ in tf.python_io.tf_record_iterator(filepath))

            for i in range(num_in_file):
                d, l = sess.run([data, label])
                output_labels.append(l)
                output_features.append(d)
        except tf.errors.OutOfRangeError:
            print('Finished extracting from tfrecord data.')
        finally:
            coord.request_stop()
            coord.join(threads)


    
    return output_labels, output_features

In [4]:
audio_output_labels = []
audio_output_features = []
audio_path = "audio_1556878148.412379.tfrecord"
audio_output_labels, audio_output_features = load_data(audio_path,
                                           'audio', audio_output_labels, audio_output_features)

video_output_labels = []
video_output_features = []
video_path = "video_1556875795.817071.tfrecord"
video_output_labels, video_output_features = load_data(video_path,
                                           'video', video_output_labels, video_output_features)

In [5]:
audio_x = np.array(audio_output_features)
audio_y = np.array(audio_output_labels)
video_x = np.array(video_output_features)
video_y = np.array(video_output_labels)
print('audio_x shape: ', audio_x.shape)
print('audio_y shape: ', audio_y.shape)
print('video_x shape: ', video_x.shape)
print('video_y shape: ', video_y.shape)


audio_x shape:  (2243, 10, 128)
audio_y shape:  (2243,)
video_x shape:  (2243, 10, 128)
video_y shape:  (2243,)


In [6]:
print(audio_y)
print(video_y)

[1340 1205 2186 ... 2088 1070 1735]
[ 970  347  819 ... 2049 1702 1913]


In [7]:
validation_split = int(audio_y.size * 0.2)
audio_indices_val = np.where(audio_y < validation_split)
video_indices_val = np.where(video_y < validation_split)
audio_indices_train = np.where(audio_y >= validation_split)
video_indices_train = np.where(video_y >= validation_split)
validation_audio_x = audio_x[audio_indices_val]
validation_audio_y = audio_y[audio_indices_val]
validation_video_x = video_x[video_indices_val]
validation_video_y = video_y[video_indices_val]
training_audio_x = audio_x[audio_indices_train]
training_audio_y = audio_y[audio_indices_train]
training_video_x = video_x[video_indices_train]
training_video_y = video_y[video_indices_train]

In [8]:
# --- At this point the desired data should be loaded --- 

In [9]:
def build_batch(dataset_x, dataset_y, batch_size, labels=None):
    if labels is None:
        indices = list(np.random.randint(0, len(dataset_x), size=batch_size))
    else:
        indices=[]
        for i in labels:
            #print(i)
            indices.append(np.where(dataset_y == i)[0][0])
    # Recover what the entries for the batch are
    batch_x = np.array([dataset_x[i] for i in indices])
    batch_y = np.array([dataset_y[i] for i in indices])
    
    return batch_x, batch_y, indices

In [10]:
def build_balanced_batch(dataset1_x, dataset1_y, dataset2_x, dataset2_y, batch_size):
    b = int(batch_size/2)
    indices = list(np.random.randint(0, len(dataset1_x), size=b))
    labels = dataset1_y[indices]
    matched1_x, matched1_y, _ = build_batch(dataset1_x, dataset1_y, b, labels)
    matched2_x, matched2_y, _ = build_batch(dataset2_x, dataset2_y, b, labels)
    random1_x, random1_y, _ = build_batch(dataset1_x, dataset1_y, b)    
    random2_x, random2_y, _ = build_batch(dataset2_x, dataset2_y, b)   
    x1 = np.concatenate([matched1_x, random1_x])   
    y1 = np.concatenate([matched1_y, random1_y])  
    x2 = np.concatenate([matched2_x, random2_x])   
    y2 = np.concatenate([matched2_y, random2_y])  
    return x1, y1, x2, y2

In [11]:
def contrastive_loss(embedded_batch_a, embedded_batch_b, a_labels, b_labels, margin):
    y = tf.cast(tf.equal(a_labels, b_labels), tf.float32)
    dist = tf.norm(embedded_batch_a - embedded_batch_b, axis=1)
    #print(dist.shape)
    loss = (y) * .5 * tf.square(dist) + (1-y) * .5 * tf.square(tf.maximum(0., margin - dist))
    return y, tf.reduce_sum(loss)

In [12]:
def build_graph(input_shape, labels_shape, 
                embed_size=128, learning_rate=0.001, 
                l1_reg=0.001, l2_reg=0.001, margin=50):
    audio_inputs = tf.placeholder(shape=[None, input_shape[0], input_shape[1]], name="audio_inputs", dtype=tf.float32)
    video_inputs = tf.placeholder(shape=[None, input_shape[0], input_shape[1]], name="video_inputs", dtype=tf.float32)
    audio_labels = tf.placeholder(shape=[None], name="audio_labels", dtype=tf.float32)
    video_labels = tf.placeholder(shape=[None], name="video_labels", dtype=tf.float32)

    flattened_audio = tf.layers.flatten(audio_inputs)
    flattened_video = tf.layers.flatten(video_inputs)
    flattened_audio_labels = tf.layers.flatten(audio_labels)
    flattened_video_labels = tf.layers.flatten(video_labels)

    
    #subnetwork = build_fc_net(inputs, embed_size, np.prod(video_input_shape), l1_reg, l2_reg)
    weights = tf.trainable_variables()
    reg_1 = tf.contrib.layers.l1_regularizer(scale=l1_reg)
    reg_2 = tf.contrib.layers.l2_regularizer(scale=l2_reg)
    audio_embed = build_fc_net(flattened_audio, embed_size, l1_reg, l2_reg)
    video_embed = build_fc_net(flattened_video, embed_size, l1_reg, l2_reg, reuse=True)
    weights = tf.trainable_variables()

    #mse = tf.losses.mean_squared_error(audio_embed, video_embed) # only use loss if labels don't match?
    matches, error = contrastive_loss(audio_embed, video_embed, flattened_audio_labels, flattened_video_labels, margin)
    reg_penalty = tf.contrib.layers.apply_regularization(reg_1, weights) + tf.contrib.layers.apply_regularization(reg_2, weights)
    loss = error + reg_penalty
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return (audio_inputs, audio_labels,
            video_inputs, video_labels,
            loss, optimizer, matches,
            audio_embed, video_embed)

In [81]:
#audio input is a tf placeholder for the input audio features
#video labels is a tf placeholder for the input video features
#encode size is the desired size of the encoded vector (should be the same size as the video features)
#l1 and l2 reg are the amount of weight to put on l1 and l2 regularizers for the loss

def build_fc_net(input_data, embed_size, l1_reg=0.001, l2_reg=0.001, reuse=False, hidden_sizes=None):
    net = build_fc_layers(input_data, embed_size, reuse, hidden_sizes)
    return net

In [82]:
# input layer should be the flattened inputs
def build_fc_layers(input_layer, output_size, reuse=False, hidden_sizes=None):
    if hidden_sizes is None:
        hidden_sizes = [1024, 512, 256]
    with tf.name_scope("model"):
        with tf.variable_scope("dense0", reuse=reuse) as scope:
            h1 = tf.layers.dense(inputs=input_layer, units=hidden_sizes[0], activation=tf.nn.tanh)
            d1 = tf.layers.dropout(inputs=h1, rate=.3)
        with tf.variable_scope("dense1", reuse=reuse) as scope:
            h2 = tf.layers.dense(inputs=d1, units=hidden_sizes[1], activation=tf.nn.tanh)
            d2 = tf.layers.dropout(inputs=h2, rate=.3)            
        with tf.variable_scope("dense2", reuse=reuse) as scope:
            h3 = tf.layers.dense(inputs=d2, units=hidden_sizes[2], activation=tf.nn.tanh)
            d3 = tf.layers.dropout(inputs=h3, rate=.3)
        with tf.variable_scope("dense3", reuse=reuse) as scope:
            raw_encode = tf.layers.dense(inputs=d3, units=output_size, activation=tf.nn.tanh)

    return raw_encode

In [145]:
def train(sess, audio_inputs, audio_labels, 
          video_inputs, video_labels, 
          audio_x, audio_y,
          video_x, video_y,
          loss, optimizer, 
          audio_embed, video_embed, 
          batch_size, cur_epoch, num_iters=1000):

        for i in range(num_iters):
            #audio_batch_input, audio_batch_label, video_batch_input, video_batch_label = build_balanced_batch(audio_x, audio_y, video_x, video_y, batch_size)
            if i % 2 == 0:
                audio_batch_input, audio_batch_label, _ = build_batch(audio_x, audio_y, batch_size)
                video_batch_input, video_batch_label, _ = build_batch(video_x, video_y, batch_size)
            else:
                audio_batch_input, audio_batch_label, indices = build_batch(audio_x, audio_y, batch_size)
                video_batch_input, video_batch_label, _ = build_batch(video_x, video_y, batch_size, audio_batch_label)
            _, loss_val,ys = sess.run([optimizer, loss, matches], feed_dict={audio_inputs: audio_batch_input,
                                                             video_inputs: video_batch_input,
                                                             audio_labels: audio_batch_label,
                                                             video_labels: video_batch_label})
            if i % 100 == 0:
                #train_summary_writer.add_scalar('train_loss', loss_val, cur_epoch * num_iters + i)
                #print(ys)
                print("Loss at iter " + str(i) + ": " + str(loss_val))
                print("")

In [78]:
def k_nearest_neighbors(embedding, search_space, k):
    embedding_stacked = np.repeat(embedding[np.newaxis,:], len(search_space), axis=0)
    similarity = np.linalg.norm(embedding_stacked - search_space, axis=1)
    #print(similarity.shape)
    indices = np.argsort(similarity)[:k]
    #print(indices)
    return indices

def validate(sess, audio_inputs, audio_labels, 
             video_inputs, video_labels, 
             audio_x, audio_y,
             video_x, video_y,
             loss, optimizer, 
             audio_embed, video_embed, 
             cur_epoch, batch_size=100, k=5):

    audio_batch_input, audio_batch_label, indices = build_batch(audio_x, audio_y, batch_size)
    #print(audio_batch_label)
    video_batch_input, video_batch_label, _ = build_batch(video_x, video_y, batch_size, audio_batch_label)
    #print(video_batch_label)
    audio_embeddings = sess.run(audio_embed, feed_dict={audio_inputs: audio_batch_input,
                                                          audio_labels: audio_batch_label})
    video_embeddings = sess.run(video_embed, feed_dict={video_inputs: video_batch_input,
                                                          video_labels: video_batch_label})
    matches = 0
    for i in range(len(audio_embeddings)):
        a = audio_embeddings[i]
        #if i%10 == 0:
        #    print(a)
        a_y = audio_batch_label[i]
        ids = k_nearest_neighbors(a, video_embeddings, k)
        #print(a_y)
        #print(ids)
        if a_y in video_batch_label[ids]:
            matches += 1
    random_val = k / len(audio_embeddings)
    print("")
    print("Percent of matches found: " + str(matches / len(audio_embeddings)))
    print("Random Chance would be: " + str(random_val))
    #train_summary_writer.add_scalar('found_matches', matches/batch_size, cur_epoch)

In [42]:
train_log_dir = 'logs/tensorboard/train/log4'
#train_summary_writer = SummaryWriter(train_log_dir)


num_epochs = 100
batch = 128
embed_size = 64
learning_rate = 0.005
l1_reg = 0.0
l2_reg = 0.001
margin=5

tf.reset_default_graph()

(audio_inputs, audio_labels, 
 video_inputs, video_labels, 
 loss, optimizer, matches,
 audio_embed, video_embed) = build_graph(audio_x[0].shape, audio_y.shape[0], embed_size, learning_rate, l1_reg, l2_reg, margin)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

for epoch in range(num_epochs):
    print("")
    print("Epoch " + str(epoch))
    print("=============")
    print("")

    train(sess, audio_inputs, audio_labels,
          video_inputs, video_labels,
          audio_x, audio_y, 
          video_x, video_y, 
          loss, optimizer, 
          audio_embed, video_embed, batch, epoch)
    
    validate(sess, audio_inputs, audio_labels,
          video_inputs, video_labels,
          audio_x, audio_y, 
          video_x, video_y, 
          loss, optimizer, 
          audio_embed, video_embed, epoch, batch_size=validation_split)



Epoch 0

Loss at iter 0: 119639.79



KeyboardInterrupt: 

In [None]:
validate(sess, audio_inputs, audio_labels,
      video_inputs, video_labels,
      audio_x, audio_y, 
      video_x, video_y, 
      loss, optimizer, 
      audio_embed, video_embed, epoch)

In [83]:
def build_to_video_graph(input_shape, labels_shape, learning_rate=0.001, 
                l1_reg=0.001, l2_reg=0.001, margin=50):
    audio_inputs = tf.placeholder(shape=[None, input_shape[0], input_shape[1]], name="audio_inputs", dtype=tf.float32)
    video_inputs = tf.placeholder(shape=[None, input_shape[0], input_shape[1]], name="video_inputs", dtype=tf.float32)

    flattened_audio = tf.layers.flatten(audio_inputs)
    flattened_video = tf.layers.flatten(video_inputs)
    
    #subnetwork = build_fc_net(inputs, embed_size, np.prod(video_input_shape), l1_reg, l2_reg)
    weights = tf.trainable_variables()
    reg_1 = tf.contrib.layers.l1_regularizer(scale=l1_reg)
    reg_2 = tf.contrib.layers.l2_regularizer(scale=l2_reg)
    net = build_fc_net(flattened_audio, np.prod(input_shape), l1_reg, l2_reg, hidden_sizes=[1280, 1280, 1280])
    weights = tf.trainable_variables()

    error = tf.losses.mean_squared_error(net, flattened_video) # only use loss if labels don't match?
    #matches, error = contrastive_loss(audio_embed, video_embed, flattened_audio_labels, flattened_video_labels, margin)
    reg_penalty = tf.contrib.layers.apply_regularization(reg_1, weights) + tf.contrib.layers.apply_regularization(reg_2, weights)
    loss = error + reg_penalty
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return (audio_inputs,
            video_inputs,
            loss, optimizer,
            net)

In [84]:
def train_a_to_v(sess, audio_inputs, 
          video_inputs, 
          audio_x, audio_y,
          video_x, video_y,
          loss, optimizer, 
          net, batch_size,
          cur_epoch, num_iters=1000):

        for i in range(num_iters):
            audio_batch_input, audio_batch_label, indices = build_batch(audio_x, audio_y, batch_size)
            video_batch_input, video_batch_label, _ = build_batch(video_x, video_y, batch_size, audio_batch_label)
            _, loss_val = sess.run([optimizer, loss], feed_dict={audio_inputs: audio_batch_input,
                                                             video_inputs: video_batch_input})
            if i % 100 == 0:
                #train_summary_writer.add_scalar('train_loss', loss_val, cur_epoch * num_iters + i)
                #print(ys)
                print("Loss at iter " + str(i) + ": " + str(loss_val))
                print("")

In [85]:
def validate_video_direct(sess, audio_inputs,
             audio_x, audio_y,
             video_x, video_y,
             loss, optimizer,
             net, cur_epoch, 
             batch_size=100, k=5):
    
    audio_embeddings = sess.run(net, feed_dict={audio_inputs: audio_x})
    matches = 0
    for i in range(len(audio_embeddings)):
        a = audio_embeddings[i]
        #if i%10 == 0:
        #    print(a)
        a_y = audio_y[i]
        ids = k_nearest_neighbors(a, np.reshape(video_x, [video_x.shape[0], -1]), k)
        #print(a_y)
        #print(ids)
        if a_y in video_y[ids]:
            matches += 1
    random_val = k / len(audio_embeddings)
    print("")
    print("Percent of matches found: " + str(matches / len(audio_embeddings)))
    print("Random Chance would be: " + str(random_val))
    #train_summary_writer.add_scalar('found_matches', matches/batch_size, cur_epoch)

In [86]:
train_log_dir = 'logs/tensorboard/train/log4'
#train_summary_writer = SummaryWriter(train_log_dir)


num_epochs = 200
batch = 64
learning_rate = 0.001
l1_reg = 0.0
l2_reg = 0.001

tf.reset_default_graph()

(audio_inputs, video_inputs, 
 loss, optimizer,
 net) = build_to_video_graph(audio_x[0].shape, audio_y.shape[0], learning_rate, l1_reg, l2_reg, margin)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

for epoch in range(num_epochs):
    print("")
    print("Epoch " + str(epoch))
    print("=============")
    print("")

    train_a_to_v(sess, audio_inputs,
          video_inputs,
          audio_x, audio_y, 
          video_x, video_y, 
          loss, optimizer, 
          net, batch, epoch)

    validate_video_direct(sess, audio_inputs,
                         audio_x, audio_y,
                         video_x, video_y, 
                         loss, optimizer,
                         net, epoch)


Epoch 0

Loss at iter 0: 3.6048768

Loss at iter 100: 1.2104247

Loss at iter 200: 0.92626446

Loss at iter 300: 0.9222817

Loss at iter 400: 0.9243196

Loss at iter 500: 0.87257695

Loss at iter 600: 0.90413153

Loss at iter 700: 0.9001312

Loss at iter 800: 0.90901756

Loss at iter 900: 0.90472096


Percent of matches found: 0.0026749888542131075
Random Chance would be: 0.002229157378510923

Epoch 1

Loss at iter 0: 0.8975496

Loss at iter 100: 0.8946828

Loss at iter 200: 0.9204103

Loss at iter 300: 0.90690285

Loss at iter 400: 0.9165665

Loss at iter 500: 0.90016055

Loss at iter 600: 0.9207752

Loss at iter 700: 0.9423184

Loss at iter 800: 0.9048807

Loss at iter 900: 0.9138212


Percent of matches found: 0.0026749888542131075
Random Chance would be: 0.002229157378510923

Epoch 2

Loss at iter 0: 0.9120704

Loss at iter 100: 0.90556836

Loss at iter 200: 0.87932324

Loss at iter 300: 0.87622803

Loss at iter 400: 0.91434586

Loss at iter 500: 0.9000758

Loss at iter 600: 0.892

KeyboardInterrupt: 

In [146]:
def build_lstm_graph(input_shape, labels_shape, 
                embed_size=128, learning_rate=0.001, 
                l1_reg=0.001, l2_reg=0.001, margin=50):
    audio_inputs = tf.placeholder(shape=[None, input_shape[0], input_shape[1]], name="audio_inputs", dtype=tf.float32)
    video_inputs = tf.placeholder(shape=[None, input_shape[0], input_shape[1]], name="video_inputs", dtype=tf.float32)
    audio_labels = tf.placeholder(shape=[None], name="audio_labels", dtype=tf.float32)
    video_labels = tf.placeholder(shape=[None], name="video_labels", dtype=tf.float32)
    
    split_audio = tf.unstack(audio_inputs, axis=1)
    split_video = tf.unstack(video_inputs, axis=1)
    flattened_audio_labels = tf.layers.flatten(audio_labels)
    flattened_video_labels = tf.layers.flatten(video_labels)

    #subnetwork = build_fc_net(inputs, embed_size, np.prod(video_input_shape), l1_reg, l2_reg)
    weights = tf.trainable_variables()
    reg_1 = tf.contrib.layers.l1_regularizer(scale=l1_reg)
    reg_2 = tf.contrib.layers.l2_regularizer(scale=l2_reg)
    lstm = tf.nn.rnn_cell.BasicLSTMCell(512)
    lstm_audio, audio_states = tf.nn.static_rnn(lstm, split_audio, dtype=tf.float32)
    lstm_video, video_states = tf.nn.static_rnn(lstm, split_video, dtype=tf.float32)
    audio_embed = build_fc_net(lstm_audio[-1], embed_size, l1_reg, l2_reg)
    video_embed = build_fc_net(lstm_video[-1], embed_size, l1_reg, l2_reg, reuse=True)
    weights = tf.trainable_variables()

    #mse = tf.losses.mean_squared_error(audio_embed, video_embed) # only use loss if labels don't match?
    matches, error = contrastive_loss(audio_embed, video_embed, flattened_audio_labels, flattened_video_labels, margin)
    reg_penalty = tf.contrib.layers.apply_regularization(reg_1, weights) + tf.contrib.layers.apply_regularization(reg_2, weights)
    loss = error + reg_penalty
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return (audio_inputs, audio_labels,
            video_inputs, video_labels,
            loss, optimizer, matches,
            audio_embed, video_embed)

In [147]:
train_log_dir = 'logs/tensorboard/train/log4'
#train_summary_writer = SummaryWriter(train_log_dir)


num_epochs = 100
batch = 64
embed_size = 128
learning_rate = 0.005
l1_reg = 0.0
l2_reg = 0.001
margin=5

tf.reset_default_graph()

(audio_inputs, audio_labels, 
 video_inputs, video_labels, 
 loss, optimizer, matches,
 audio_embed, video_embed) = build_lstm_graph(audio_x[0].shape, audio_y.shape[0], embed_size, learning_rate, l1_reg, l2_reg, margin)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

for epoch in range(num_epochs):
    print("")
    print("Epoch " + str(epoch))
    print("=============")
    print("")

    train(sess, audio_inputs, audio_labels,
          video_inputs, video_labels,
          audio_x, audio_y, 
          video_x, video_y, 
          loss, optimizer, 
          audio_embed, video_embed, batch, epoch)
    
    validate(sess, audio_inputs, audio_labels,
          video_inputs, video_labels,
          audio_x, audio_y, 
          video_x, video_y, 
          loss, optimizer, 
          audio_embed, video_embed, epoch, batch_size=validation_split)

KeyboardInterrupt: 