# License

In [1]:
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Todos

* Separate out prediction, loading the model
* Confirm audio samples are being loaded in a way that is extensible
* Train with all samples

In [2]:
def installDeps():
    """Install the Python packages and the ffmpeg binary used by this notebook.

    Every line is an IPython `!` shell escape, so this function only works
    when run inside an IPython/Jupyter kernel. It is only defined here; no
    visible cell calls it.
    """
    !pip install numpy scipy
    !pip install resampy tensorflow six
    !pip install youtube_dl
    !pip install ipywidgets
    !pip install pydub
    !pip install tqdm
    !pip install ffmpeg-python
    # NOTE(review): no `-y` flag is passed, so apt-get may stop at its
    # confirmation prompt -- confirm this works in the target environment.
    !apt-get install ffmpeg
#!python vggish_train_demo.py --num_batches 50 --train_vggish=False --checkpoint './vggish_model.ckpt'

from __future__ import print_function

from random import shuffle

import numpy as np
import tensorflow as tf
import os
import vggish_input
import vggish_params
import vggish_slim
from pydub import AudioSegment
from audioUtils import readFolder

slim = tf.contrib.slim

In [45]:

def getNoise(shuf = True, number_of_samples = 1):
    """Builds a toy batch of synthetic audio examples for three classes.

    One waveform is synthesised per class and converted to examples with
    vggish_input.waveform_to_examples:
      * class index 0: a sine wave with a random frequency in [100, 1000),
      * class index 1: a random magnitude in [-1, 1) multiplied by the time axis,
      * class index 2: Gaussian noise drawn with np.random.normal(-1, 1).

    Args:
      shuf: when true, the (example, label) pairs are shuffled before being
        split back into two lists.
      number_of_samples: duration, in seconds, of each synthetic waveform.

    Returns:
      A tuple (features, labels) of two equally long lists. Each feature is one
      example produced by vggish_input.waveform_to_examples and each label is
      the matching one-hot row of length 3.
    """
    sample_rate = 44100
    duration = number_of_samples
    time_axis = np.linspace(0, duration, int(duration * sample_rate))

    # The random draws stay in this order (frequency, magnitude, noise) so a
    # seeded generator yields the same waveforms as before.
    sine_wave = np.sin(2 * np.pi * np.random.uniform(100, 1000) * time_axis)
    scaled_time = np.random.uniform(-1, 1) * time_axis
    white_noise = np.random.normal(-1, 1, size=time_axis.shape)

    # Pair every waveform with the one-hot row of its class.
    class_waveforms = (
        (sine_wave, [1, 0, 0]),
        (scaled_time, [0, 1, 0]),
        (white_noise, [0, 0, 1]),
    )
    example_batches = []
    label_batches = []
    for waveform, one_hot in class_waveforms:
        batch = vggish_input.waveform_to_examples(waveform, sample_rate)
        example_batches.append(batch)
        label_batches.append(np.array([one_hot] * batch.shape[0]))

    # Flatten the per-class batches into a single list of (example, label).
    pairs = list(zip(np.concatenate(example_batches),
                     np.concatenate(label_batches)))
    if shuf:
        shuffle(pairs)

    return ([pair[0] for pair in pairs], [pair[1] for pair in pairs])

def getFilePathsForClass(c):
    """Lists the file paths found under samples/<c>/<dir>/out for class `c`.

    Only the first entry that readFolder returns for the class folder is
    scanned (the `[:1]` slice), so at most one sub-directory contributes files.

    Args:
      c: class name, used as the folder name below `samples/`.

    Returns:
      A list of path strings of the form samples/<c>/<dir>/out/<file>.
    """
    paths = []
    for sub_dir in readFolder('samples/%s' % (c))[:1]:
        names = readFolder('samples/%s/%s/out' % (c, sub_dir))
        paths.extend(
            'samples/%s/%s/out/%s' % (c, sub_dir, name) for name in names)
    return paths
            
def getSampleForFile(file):
    """Decodes one audio file with pydub and returns its array of samples.

    NOTE(review): presumably these are raw integer PCM values (interleaved
    when the file has several channels), while the VGGish input helper is
    usually fed mono floats in [-1, 1] -- confirm against the pydub and
    vggish_input documentation.
    """
    segment = AudioSegment.from_file(file)
    return segment.get_array_of_samples()

# Accepts a numpy array representing a single audio file, or multiple files
# concatenated together.
def getFileAsVggishInput(sample, sample_rate=44100):
    """Converts a waveform into VGGish examples.

    Args:
      sample: array of audio samples for one file, or several files
        concatenated end to end.
      sample_rate: sampling rate of `sample` in Hz. Defaults to 44100, the
        value that used to be hard-coded here.

    Returns:
      Whatever vggish_input.waveform_to_examples returns for the waveform.
    """
    return vggish_input.waveform_to_examples(sample, sample_rate)

# Join every audio file into one long waveform, then cut it into examples.
def getSamplesForFiles(files, number_of_samples):
    """Loads `files`, concatenates their samples and returns VGGish examples.

    Args:
      files: iterable of audio file paths, read with getSampleForFile.
      number_of_samples: maximum number of examples to keep; None keeps all.

    Returns:
      The first `number_of_samples` examples that getFileAsVggishInput
      produces for the concatenated waveform.
    """
    # Start from an empty float array so the result is float64 and is still
    # defined when `files` is empty, exactly as with the previous np.append.
    chunks = [np.array([])]
    chunks.extend(np.ravel(getSampleForFile(path)) for path in files)

    # One concatenate at the end avoids recopying the growing buffer for
    # every file, which is what np.append inside the loop did.
    waveform = np.concatenate(chunks)

    return getFileAsVggishInput(waveform)[0:number_of_samples]

def getData(files, number_of_samples, arr):
    """Returns the examples for `files` with one copy of `arr` per example.

    Args:
      files: audio file paths passed on to getSamplesForFiles.
      number_of_samples: maximum number of examples to keep (None keeps all).
      arr: label vector attached to every example.

    Returns:
      A tuple (examples, labels) where labels has one row per example.
    """
    examples = getSamplesForFiles(files, number_of_samples)

    # np.repeat keeps the label width even when there are zero examples
    # (shape (0, len(arr))). The previous np.array([arr] * 0) collapsed to
    # shape (0,), which cannot be concatenated with other classes' labels.
    label_row = np.asarray(arr)
    labels = np.repeat(label_row[np.newaxis, ...], examples.shape[0], axis=0)

    return (examples, labels)

def getOneHot(class_num, idx):
    """Returns a float vector of length `class_num` with a 1 at `idx`.

    Args:
      class_num: length of the vector (number of classes).
      idx: position to set to 1; every other entry stays 0.
    """
    encoded = np.zeros(class_num, dtype=float)
    encoded[idx] = 1.0
    return encoded

def getSamples(classes, shuf = True, number_of_samples = None):
    """Collects labelled examples for every class name in `classes`.

    Each class gets a one-hot label whose hot index is the class's position
    in `classes`.

    Args:
      classes: sequence of class names (folder names below `samples/`).
      shuf: when true, the (example, label) pairs are shuffled.
      number_of_samples: per-class cap on the number of examples, passed on to
        getData; None keeps everything.

    Returns:
      A tuple (features, labels) of two equally long lists.
    """
    class_count = len(classes)
    per_class = [
        getData(getFilePathsForClass(name), number_of_samples,
                getOneHot(class_count, position))
        for position, name in enumerate(classes)
    ]

    all_examples = np.concatenate([examples for (examples, _) in per_class])
    all_labels = np.concatenate([labels for (_, labels) in per_class])

    pairs = list(zip(all_examples, all_labels))
    if shuf:
        shuffle(pairs)

    # Split the pairs back into parallel feature and label lists.
    return ([pair[0] for pair in pairs], [pair[1] for pair in pairs])

def loadVGGish(sess, number_of_classes):
    """Builds VGGish plus a small classifier head and loads VGGish weights.

    The head lives in the 'mymodel' variable scope: a 100-unit fully connected
    layer on the VGGish embeddings, a linear 'logits' layer with
    `number_of_classes` outputs, a sigmoid 'prediction' op, a 'labels'
    placeholder, a sigmoid cross-entropy 'loss_op' and an Adam 'train_op'.
    Other functions in this notebook fetch these by name (for example
    'mymodel/labels:0' and 'mymodel/train_op'), so the names and scopes must
    not change.

    Args:
      sess: session used to run the variable initializer and to load the
        pre-trained checkpoint. NOTE(review): the ops are added to the current
        default graph, which presumably has to be `sess.graph` -- confirm.
      number_of_classes: width of the logits layer and of the labels
        placeholder.

    Returns:
      A tuple (logits, pred): the raw logits tensor and its sigmoid.
    """
    embeddings = vggish_slim.define_vggish_slim(True) # Do we train VGG-ish?

    # Define a shallow classification model and associated training ops on top
    # of VGGish.
    with tf.variable_scope('mymodel'):
        # Add a fully connected layer with 100 units.
        num_units = 100
        fc = slim.fully_connected(embeddings, num_units)

        # Add a classifier layer at the end, consisting of parallel logistic
        # classifiers, one per class. This allows for multi-class tasks.
        logits = slim.fully_connected(
          fc, number_of_classes, activation_fn=None, scope='logits')
        pred = tf.sigmoid(logits, name='prediction')

        # Add training ops.
        with tf.variable_scope('train'):
            # Non-trainable step counter, registered in both the global
            # variables and the global step collections.
            global_step = tf.Variable(
                0, name='global_step', trainable=False,
                collections=[tf.GraphKeys.GLOBAL_VARIABLES,
                             tf.GraphKeys.GLOBAL_STEP])

        # Labels are assumed to be fed as a batch multi-hot vectors, with
        # a 1 in the position of each positive class label, and 0 elsewhere.
        labels = tf.placeholder(
            tf.float32, shape=(None, number_of_classes), name='labels')

        # Cross-entropy label loss.
        xent = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits, labels=labels, name='xent')
        loss = tf.reduce_mean(xent, name='loss_op')
        tf.summary.scalar('loss', loss)

        # We use the same optimizer and hyperparameters as used to train VGGish.
        optimizer = tf.train.AdamOptimizer(
            learning_rate=vggish_params.LEARNING_RATE,
            epsilon=vggish_params.ADAM_EPSILON)
        # The returned op is not kept; callers look it up as 'mymodel/train_op'.
        optimizer.minimize(loss, global_step=global_step, name='train_op')

    # Initialize all variables in the model, and then load the pre-trained
    # VGGish checkpoint.
    sess.run(tf.global_variables_initializer())
    vggish_slim.load_vggish_slim_checkpoint(sess, './vggish_model.ckpt')
    # Debug output: shows the tensor names and static shapes of the head.
    print(logits, pred)    
    return logits, pred
    
    
def train(get_examples, number_of_classes, model_name = 'model', epochs = 50):
    """Trains VGGish plus the 'mymodel' head and checkpoints after every step.

    Args:
      get_examples: callable accepting `shuf=` and returning a
        (features, labels) batch; it is called once per training step.
      number_of_classes: number of output classes of the classifier head.
      model_name: checkpoint name; the model is saved to ./model/<model_name>.
      epochs: number of training steps (one batch each).

    Returns:
      None.
    """
    model_name_to_save = './model/%s' % (model_name)

    # Make sure the checkpoint directory exists before the first save, so a
    # fresh checkout does not fail on a missing ./model folder.
    save_dir = os.path.dirname(model_name_to_save)
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define VGGish and the classifier head; the tensors needed below are
        # looked up by name, so the returned handles are not kept.
        loadVGGish(sess, number_of_classes)

        # Locate all the tensors and ops we need for the training loop.
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        labels_tensor = sess.graph.get_tensor_by_name('mymodel/labels:0')
        global_step_tensor = sess.graph.get_tensor_by_name(
            'mymodel/train/global_step:0')
        loss_tensor = sess.graph.get_tensor_by_name('mymodel/loss_op:0')
        train_op = sess.graph.get_operation_by_name('mymodel/train_op')

        # Build the Saver once. Constructing it inside the loop added a new
        # set of save/restore ops to the graph on every step.
        saver = tf.train.Saver()

        # The training loop.
        for _ in range(epochs):
            (features, labels) = get_examples(shuf=True)
            [num_steps, loss, _] = sess.run(
                [global_step_tensor, loss_tensor, train_op],
                feed_dict={features_tensor: features, labels_tensor: labels})
            print('Step %d: loss %g' % (num_steps, loss))
            saver.save(sess, model_name_to_save)

def predict(model_name, number_of_classes, features, get_examples):
    """Restores ./model/<model_name> and returns sigmoid predictions.

    Args:
      model_name: checkpoint name previously written by train.
      number_of_classes: number of output classes of the classifier head; must
        match the value used for training.
      features: batch of examples to score. When None, a batch is fetched with
        get_examples(shuf=False) instead.
      get_examples: callable accepting `shuf=` and returning a
        (features, labels) batch; only used when `features` is None.

    Returns:
      The result of running the 'prediction' op on the batch: one row of
      per-class sigmoid scores per example.
    """
    model_name_to_load = './model/%s' % (model_name)

    # Honour the caller's batch. It used to be silently replaced by a fresh
    # get_examples() call, so the scores did not correspond to the features
    # that were passed in.
    if features is None:
        (features, _) = get_examples(shuf=False)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Rebuild the same graph as in training, then overwrite its freshly
        # initialised variables with the fine-tuned checkpoint.
        _, pred = loadVGGish(sess, number_of_classes)
        saver = tf.train.Saver()
        saver.restore(sess, model_name_to_load)

        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        print(np.array(features).shape)
        return sess.run(pred, feed_dict={features_tensor: features})

def getLaughTracks(number_of_samples = 1, shuf = True, use_cache = True):
    """Returns laughter / not-laughter examples, cached on disk as .npy files.

    Args:
      number_of_samples: per-class cap on the number of examples; also part of
        the cache file names.
      shuf: when true, the (example, label) pairs are shuffled.
      use_cache: when true and both cache files exist they are loaded instead
        of recomputing. The cache files are (re)written whenever the samples
        are recomputed.

    Returns:
      A tuple (features, labels) of two equally long lists.
    """
    features_name = 'checkpoints/features_%s.npy' % (number_of_samples)
    labels_name = 'checkpoints/labels_%s.npy' % (number_of_samples)

    if use_cache and os.path.isfile(features_name) and os.path.isfile(labels_name):
        features = np.load(features_name)
        labels = np.load(labels_name)
    else:
        (features, labels) = getSamples(
            ['laughter', 'notlaughter'], shuf = False,
            number_of_samples = number_of_samples)

        # np.save does not create missing directories, so make sure the cache
        # folder exists before writing.
        cache_dir = os.path.dirname(features_name)
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir)
        np.save(features_name, features)
        np.save(labels_name, labels)

    labeled_examples = list(zip(features, labels))
    if shuf:
        shuffle(labeled_examples)

    # Separate and return the features and labels.
    features = [example for (example, _) in labeled_examples]
    labels = [label for (_, label) in labeled_examples]
    return (features, labels)




def trainAndSaveAndPredict(number_of_classes, number_of_samples = 1, epochs = 5, getData = getLaughTracks):
    """Trains a model, reloads it and prints predictions for an unshuffled batch.

    Args:
      number_of_classes: number of output classes of the classifier head.
      number_of_samples: forwarded to `getData` for both training and
        evaluation; also used in the checkpoint name model_<number_of_samples>.
      epochs: number of training steps.
      getData: callable accepting `number_of_samples=` and `shuf=` and
        returning a (features, labels) batch.

    Returns:
      None. The raw prediction scores and their per-row argmax are printed.
    """
    def curriedGetSamples(shuf):
        return getData(number_of_samples = number_of_samples, shuf = shuf)

    model_name = 'model_%s' % (number_of_samples)
    train(curriedGetSamples, number_of_classes, model_name = model_name, epochs = epochs)

    # Evaluate with the same number_of_samples as training. Calling getData
    # directly fell back to its default and ignored the requested value.
    (features, labels) = curriedGetSamples(shuf=False)
    print(np.array(features).shape)
    preds = predict(model_name, number_of_classes, features, curriedGetSamples)

    # preds is already a NumPy array, so no TensorFlow session is needed to
    # take the argmax.
    print(preds)
    print(np.argmax(preds, axis=1))


In [44]:
# Smoke test: a single training step on the synthetic three-class data from
# getNoise, followed by prediction with the reloaded checkpoint.
number_of_samples = 1
epochs = 1
print('training on noise, sin, and constant waves')
trainAndSaveAndPredict(number_of_samples = number_of_samples, epochs = epochs, number_of_classes = 3, getData = getNoise)
# Alternative run on the two-class laughter data (currently disabled):
#print('training on laughter and not laughter')
#trainAndSaveAndPredict(number_of_samples = number_of_samples, epochs = epochs, number_of_classes = 2, getData = getLaughTracks)

training on noise, sin, and constant waves
INFO:tensorflow:Restoring parameters from ./vggish_model.ckpt
Tensor("mymodel/logits/BiasAdd:0", shape=(?, 3), dtype=float32) Tensor("mymodel/prediction:0", shape=(?, 3), dtype=float32)
Step 1: loss 0.779227
(2, 3)
INFO:tensorflow:Restoring parameters from ./vggish_model.ckpt
Tensor("mymodel/logits/BiasAdd:0", shape=(?, 3), dtype=float32) Tensor("mymodel/prediction:0", shape=(?, 3), dtype=float32)
INFO:tensorflow:Restoring parameters from ./model/model_1
(3, 96, 64)
[[0.48206213 0.5950609  0.54132783]
 [0.55020595 0.45138192 0.5996106 ]
 [0.5323726  0.41639867 0.45353687]]
[1 2 0]


In [5]:

# Sanity check: compare the shapes of the synthetic batch with the batch
# built from the laughter / notlaughter sample folders.
#trainAndSaveAndPredict2(number_of_samples = 3, epochs = 5)
(features, labels) = getNoise(shuf = False, number_of_samples = 1)
noise_f = features
noise_l = labels
# `features` and `labels` are rebound here, so after this cell they refer to
# the sample-folder batch, not the synthetic one.
(features, labels) = getSamples(['laughter', 'notlaughter'], shuf = False, number_of_samples = 1)
yt_f = features
yt_l = labels

# Print the shape of each feature and label batch.
print(np.array(noise_f).shape)
print(np.array(noise_l).shape)
print(np.array(yt_f).shape)
print(np.array(yt_l).shape)



(3, 96, 64)
(3, 3)
(2, 96, 64)
(2, 2)


In [None]:
## WORKING IMPLEMENTATION OF TRAIN
# NOTE(review): this redefines `train` from an earlier cell. Once this cell is
# run it shadows that definition and, unlike it, returns predictions.
def train(get_examples, number_of_classes, model_name = 'model', epochs = 50):
    """Trains VGGish plus a classifier head, then predicts in the same session.

    Args:
      get_examples: callable accepting `shuf=` and returning a
        (features, labels) batch; called once per training step and once more
        (unshuffled) for the final prediction.
      number_of_classes: number of output classes of the classifier head.
      model_name: checkpoint name; the model is saved to ./model/<model_name>.
      epochs: number of training steps (one batch each).

    Returns:
      The result of running the sigmoid 'prediction' op on an unshuffled
      batch, using the weights still held by the training session.
    """
    model_name_to_save = './model/%s' % (model_name)

    # Make sure the checkpoint directory exists before the first save, so a
    # fresh checkout does not fail on a missing ./model folder.
    save_dir = os.path.dirname(model_name_to_save)
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define VGGish.
        embeddings = vggish_slim.define_vggish_slim(True) # Do we train VGG-ish?

        # Define a shallow classification model and associated training ops on top
        # of VGGish.
        with tf.variable_scope('mymodel'):
            # Add a fully connected layer with 100 units.
            num_units = 100
            fc = slim.fully_connected(embeddings, num_units)

            # Add a classifier layer at the end, consisting of parallel logistic
            # classifiers, one per class. This allows for multi-class tasks.
            logits = slim.fully_connected(
              fc, number_of_classes, activation_fn=None, scope='logits')
            pred = tf.sigmoid(logits, name='prediction')

            # Add training ops.
            with tf.variable_scope('train'):
                global_step = tf.Variable(
                    0, name='global_step', trainable=False,
                    collections=[tf.GraphKeys.GLOBAL_VARIABLES,
                                 tf.GraphKeys.GLOBAL_STEP])

            # Labels are assumed to be fed as a batch multi-hot vectors, with
            # a 1 in the position of each positive class label, and 0 elsewhere.
            labels = tf.placeholder(
                tf.float32, shape=(None, number_of_classes), name='labels')

            # Cross-entropy label loss.
            xent = tf.nn.sigmoid_cross_entropy_with_logits(
                logits=logits, labels=labels, name='xent')
            loss = tf.reduce_mean(xent, name='loss_op')
            tf.summary.scalar('loss', loss)

            # We use the same optimizer and hyperparameters as used to train VGGish.
            optimizer = tf.train.AdamOptimizer(
                learning_rate=vggish_params.LEARNING_RATE,
                epsilon=vggish_params.ADAM_EPSILON)
            optimizer.minimize(loss, global_step=global_step, name='train_op')

        # Initialize all variables in the model, and then load the pre-trained
        # VGGish checkpoint.
        sess.run(tf.global_variables_initializer())
        vggish_slim.load_vggish_slim_checkpoint(sess, './vggish_model.ckpt')

        # Locate all the tensors and ops we need for the training loop.
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        labels_tensor = sess.graph.get_tensor_by_name('mymodel/labels:0')
        global_step_tensor = sess.graph.get_tensor_by_name(
            'mymodel/train/global_step:0')
        loss_tensor = sess.graph.get_tensor_by_name('mymodel/loss_op:0')
        train_op = sess.graph.get_operation_by_name('mymodel/train_op')

        # Build the Saver once. Constructing it inside the loop added a new
        # set of save/restore ops to the graph on every step.
        saver = tf.train.Saver()

        # The training loop.
        for _ in range(epochs):
            (features, labels) = get_examples(shuf=True)
            [num_steps, loss, _] = sess.run(
                [global_step_tensor, loss_tensor, train_op],
                feed_dict={features_tensor: features, labels_tensor: labels})
            print('Step %d: loss %g' % (num_steps, loss))
            saver.save(sess, model_name_to_save)

        # TODO: FIGURE OUT HOW TO LOAD THE SAVED MODEL HERE. For now the
        # prediction reuses the live session instead of the checkpoint.
        (features, labels) = get_examples(shuf=False)
        return sess.run(pred, feed_dict={features_tensor: features})