According to the AudioSet users forum, the released VGGish model code is not the original code that was used to generate the released embeddings. This notebook compares the output of the VGGish model against a released embedding.

See https://groups.google.com/forum/#!topic/audioset-users/EITl3rcNDI8

In [None]:
import os
import sys

import pandas as pd
import numpy as np
from scipy.io import wavfile
from scipy.spatial.distance import euclidean, cosine

from sklearn.externals import joblib

import matplotlib.pyplot as plt
import six
import tensorflow as tf

from youtube_audioset import get_data, get_recursive_sound_names, get_all_sound_names
from youtube_audioset import explosion_sounds, motor_sounds, wood_sounds, human_sounds, nature_sounds

from youtube_audioset import download_clip

In [None]:
# Make the AudioSet research directory importable. This must run before the
# vggish_* imports below, which are resolved through sys.path
# (presumably that directory is where the vggish_* modules live -- confirm
# against the checkout under ./externals).
sys.path.append(os.path.abspath('./externals/tensorflow_models/research/audioset/'))

from vggish_input import wavfile_to_examples, waveform_to_examples
import vggish_input
import vggish_params
import vggish_postprocess
import vggish_slim

In [None]:
# The selected clip is silent for its whole duration.
# Identify it by YouTube id plus start/end offsets.
ytid, yt_start, yt_end = 'RhSLUvQ_LuM', 30, 40

# Path of the wav file read by the following cells (presumably the location
# download_clip writes to -- confirm against youtube_audioset).
audio_file_path = 'sounds/audioset/{}-{}-{}.wav'.format(ytid, yt_start, yt_end)

download_clip(ytid, yt_start, yt_end)

In [None]:
# Turn the wav file into the batch of examples that is later fed to the
# VGGish input tensor.
examples_batch = vggish_input.wavfile_to_examples(audio_file_path)

In [None]:
# Read the raw samples back to verify that the clip really is silent.
sr, wav_data = wavfile.read(audio_file_path)

# Promote to float64 before squaring: np.square keeps the input dtype, so for
# integer PCM data (e.g. int16) the squares would wrap around on overflow and
# a non-silent clip could report a wrong energy.
# The formatted single-argument print behaves the same on Python 2 and 3.
print("Energy of signal: {}".format(
    np.square(wav_data.astype(np.float64)).sum()))

It is confirmed that the audio signal only contains zero samples.

In [None]:
# Flag definitions taken from the upstream VGGish inference demo:
# https://github.com/tensorflow/models/blob/master/research/audioset/vggish_inference_demo.py

flags = tf.app.flags

# One (name, default, help) row per string flag, registered in this order.
# Arguments are passed positionally to DEFINE_string, exactly as upstream does.
for _flag_name, _flag_default, _flag_help in (
        ('wav_file', None,
         'Path to a wav file. Should contain signed 16-bit PCM samples. '
         'If none is provided, a synthetic sound is used.'),
        ('checkpoint',
         './externals/tensorflow_models/research/audioset/vggish_model.ckpt',
         'Path to the VGGish checkpoint file.'),
        ('pca_params',
         './externals/tensorflow_models/research/audioset/vggish_pca_params.npz',
         'Path to the VGGish PCA parameters file.'),
        ('tfrecord_file', None,
         'Path to a TFRecord file where embeddings will be written.'),
):
    flags.DEFINE_string(_flag_name, _flag_default, _flag_help)

# Handle through which the cells below read the flag values.
FLAGS = flags.FLAGS

In [None]:
# Adapted from https://github.com/tensorflow/models/blob/master/research/audioset/vggish_inference_demo.py
#
# Runs the VGGish model on `examples_batch`, postprocesses the raw embeddings
# and leaves the results in `embedding_batch`, `postprocessed_batch` and
# `seq_example` for the cells below. When --tfrecord_file is set, the
# SequenceExample is also written to that file.

# Prepare a postprocessor to munge the model embeddings.
pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

# If needed, prepare a record writer to store the postprocessed embeddings.
writer = (tf.python_io.TFRecordWriter(FLAGS.tfrecord_file)
          if FLAGS.tfrecord_file else None)

try:
    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})
        print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)

        # Write the postprocessed embeddings as a SequenceExample, in a
        # similar format as the features released in AudioSet. Each row of
        # the batch of embeddings corresponds to roughly a second of audio
        # (96 10ms frames), and the rows are written as a sequence of
        # bytes-valued features, where each feature value contains the 128
        # bytes of the whitened quantized embedding.
        seq_example = tf.train.SequenceExample(
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                        tf.train.FeatureList(
                            feature=[
                                tf.train.Feature(
                                    bytes_list=tf.train.BytesList(
                                        value=[embedding.tobytes()]))
                                for embedding in postprocessed_batch
                            ]
                        )
                }
            )
        )
        print(seq_example)

        # Persist the record when an output file was requested. Without this
        # the writer would be opened but the file would stay empty.
        if writer:
            writer.write(seq_example.SerializeToString())
finally:
    # Always release the writer, even if inference fails part-way through.
    if writer:
        writer.close()

Because the clip is silent throughout, the embedding for every 1-second interval should be identical, so let's just look at the first one.

In [None]:
# Keep only the first row of the postprocessed batch.
processed_embedding = postprocessed_batch[0]

# Bare expression so the notebook displays the selected embedding.
processed_embedding

In [None]:
# Load the stored reference embedding to compare against (presumably the
# released AudioSet embedding for a silent clip -- confirm how
# parameter/silence_embedding.pkl was produced).
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
silence_embedding = joblib.load('parameter/silence_embedding.pkl')

# Bare expression so the notebook displays the rounded reference values.
silence_embedding.round()

In [None]:
# Cosine distance between the stored reference embedding and the embedding
# computed by the model. scipy's cosine() returns 1 - cosine similarity, so 0
# means the two vectors point in the same direction.
cosine(silence_embedding, processed_embedding)

In [None]:
# Euclidean (L2) distance between the same two vectors; unlike the cosine
# distance it is sensitive to the magnitude of the embeddings.
euclidean(silence_embedding, processed_embedding)

The cosine distance is low, but the Euclidean distance is very high.