## This file can be used to understand how the VGGish model builds its mel spectrogram.

In this file I feed in a WAV file and exercise the functions of `vggish_input` and `mel_features`, in order to understand how the model converts the audio data into a mel spectrogram. A complete understanding of this code is needed before making changes to the VGGish model.

In [1]:
from vggish_input import waveform_to_examples, wavfile_to_examples

In [2]:
# Constants defined by the VGGish model. Collecting them in one place is what
# keeps the rest of the VGGish code general and reusable.
# NOTE: the cells further down read these values from the vggish_params
# module, not from the copies defined in this cell.

# ---- Model architecture ----
NUM_FRAMES = 96       # time frames in one input mel-spectrogram patch
NUM_BANDS = 64        # frequency bands in one input mel-spectrogram patch
EMBEDDING_SIZE = 128  # width of the embedding layer

# ---- Feature and example generation ----
SAMPLE_RATE = 16000
STFT_WINDOW_LENGTH_SECONDS = 0.025
STFT_HOP_LENGTH_SECONDS = 0.010
NUM_MEL_BINS = NUM_BANDS
MEL_MIN_HZ = 125
MEL_MAX_HZ = 7500
LOG_OFFSET = 0.01  # added to the mel-spectrogram before the log to stabilize it
EXAMPLE_WINDOW_SECONDS = 0.96  # one example = 96 frames of 10 ms each
EXAMPLE_HOP_SECONDS = 0.96     # hop equals the window, so examples never overlap

# ---- Embedding post-processing ----
PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors'
PCA_MEANS_NAME = 'pca_means'
QUANTIZE_MIN_VAL = -2.0
QUANTIZE_MAX_VAL = 2.0

# ---- Training ----
INIT_STDDEV = 0.01    # standard deviation for weight initialization
LEARNING_RATE = 1e-4  # Adam learning rate
ADAM_EPSILON = 1e-8   # Adam epsilon

# ---- Op, tensor and feature names ----
# A tensor name is the op name followed by ':0'.
INPUT_OP_NAME = 'vggish/input_features'
INPUT_TENSOR_NAME = '{}:0'.format(INPUT_OP_NAME)
OUTPUT_OP_NAME = 'vggish/embedding'
OUTPUT_TENSOR_NAME = '{}:0'.format(OUTPUT_OP_NAME)
AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding'


In [3]:
import mel_features
import mel_features
import vggish_params
import numpy as np

In [4]:
from scipy.io import wavfile

# Load the test recording; wavfile.read returns the sample rate and the samples.
fs, data = wavfile.read('tester.wav')
# Shape of the sample array that was read.
data.shape

(14598982,)

In [5]:
print(fs)

250000


In [7]:
# Reference path: let wavfile_to_examples (from vggish_input) do the whole
# conversion in one call.
# The main problem when comparing this with the hand-written steps below is
# that, when the sampling frequency of the WAV file differs from the one the
# VGGish model specifies, the model resamples the data and some of the data
# may be lost in that process.
# NOTE(review): the debug output of this cell reports the same num_samples as
# data.shape above, so confirm whether resampling actually ran for this file.

vggish_spectrogram_data = wavfile_to_examples('tester.wav')
vggish_spectrogram_data.shape

250000
fft_length2048
num_samples14598982
num_frames23356
shape(23356, 1562)
strides(5000, 8)
(23356, 1562)
(23356, 1562)
sfft_mangnitute return thing shape (23356, 1025)
Num mel bins64
Hi 3
Hi 4
num_samples23356
num_frames243
shape(243, 96, 64)
strides(49152, 512, 8)


(243, 96, 64)

In [8]:
import matplotlib.pyplot as plt

# Show example 1: keep its first 90 frames and transpose the patch so the mel
# bands run along the image rows.
example_patch = vggish_spectrogram_data[1]
plt.imshow(example_patch[:90, :].T)

<matplotlib.image.AxesImage at 0x1c2c434e80>

In [9]:
vggish_spectrogram_data.shape

(243, 96, 64)

In [10]:
# Build the mel weight matrix with every argument left at the function's
# defaults. The output of this cell shows those defaults give 20 mel bins and a
# (129, 20) matrix, i.e. not the NUM_MEL_BINS used by the VGGish parameters.
vggish_mel_matrix = mel_features.spectrogram_to_mel_matrix()
vggish_mel_matrix.shape

Num mel bins20


(129, 20)

In [11]:
# Do the calculations and the computation on our own, calling
# log_mel_spectrogram directly with the vggish_params values instead of going
# through wavfile_to_examples.
# NOTE(review): `data` is passed exactly as read from the WAV file. The debug
# strides printed by this cell show 2-byte elements, while the
# wavfile_to_examples run shows 8-byte elements, so the two paths presumably
# operate on differently typed/scaled samples -- confirm before comparing
# their values.
log_mel = mel_features.log_mel_spectrogram(
      data,
      audio_sample_rate=vggish_params.SAMPLE_RATE,
      log_offset=vggish_params.LOG_OFFSET,
      window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
      hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
      num_mel_bins=vggish_params.NUM_MEL_BINS,
      lower_edge_hertz=vggish_params.MEL_MIN_HZ,
      upper_edge_hertz=vggish_params.MEL_MAX_HZ)

fft_length2048
num_samples14598982
num_frames23356
shape(23356, 1562)
strides(1250, 2)
(23356, 1562)
(23356, 1562)
sfft_mangnitute return thing shape (23356, 1025)
Num mel bins64


In [12]:
# The log-mel features advance by one frame per STFT hop, so their frame rate
# is the reciprocal of the hop length in seconds.
features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS

# Convert the example window and hop from seconds into whole feature frames.
example_window_length, example_hop_length = (
    int(round(duration_secs * features_sample_rate))
    for duration_secs in (vggish_params.EXAMPLE_WINDOW_SECONDS,
                          vggish_params.EXAMPLE_HOP_SECONDS))

# Cut the log-mel frames into fixed-length examples.
log_mel_examples = mel_features.frame(
    log_mel,
    window_length=example_window_length,
    hop_length=example_hop_length)

num_samples23356
num_frames243
shape(243, 96, 64)
strides(49152, 512, 8)


In [13]:
# Each example in log_mel_examples is a 2-D array, so a single example can be
# plotted directly in its original form (see the commented-out line below).
log_mel_examples.shape
# plt.imshow(log_mel_examples[50][:90,:].T)

(243, 96, 64)

In [14]:
vggish_params.MEL_MIN_HZ

125

In [15]:
# This is the output of the function spectrogram_to_mel_matrix.
# It is the weight matrix that, when the spectrogram rows are multiplied by it,
# returns the mel-frequency values.
vggish_mel_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
vggish_mel_matrix.shape

(129, 20)

In [17]:
# Same log_mel_spectrogram call as in In [11], re-run so that log_mel can be
# inspected in the following cells.
log_mel = mel_features.log_mel_spectrogram(
      data,
      audio_sample_rate=vggish_params.SAMPLE_RATE,
      log_offset=vggish_params.LOG_OFFSET,
      window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
      hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
      num_mel_bins=vggish_params.NUM_MEL_BINS,
      lower_edge_hertz=vggish_params.MEL_MIN_HZ,
      upper_edge_hertz=vggish_params.MEL_MAX_HZ)

fft_length2048
num_samples14598982
num_frames23356
shape(23356, 1562)
strides(1250, 2)
(23356, 1562)
(23356, 1562)
sfft_mangnitute return thing shape (23356, 1025)
Num mel bins64


In [18]:
# The waveform converted to a log-magnitude mel-frequency spectrogram.
# This is the output of log_mel_spectrogram when every argument is taken from
# the values set in vggish_params.
log_mel

array([[-4.60517019e+00,  4.12257316e-02, -2.60327985e-01, ...,
         2.34600797e+00,  2.91223380e+00,  3.18694840e+00],
       [-4.60517019e+00,  1.81355435e-02, -2.83338519e-01, ...,
         2.35884054e+00,  2.56791689e+00,  3.05307805e+00],
       [-4.60517019e+00,  3.05837822e-01,  3.49094283e-03, ...,
         2.22386920e+00,  2.65608219e+00,  3.10463493e+00],
       ...,
       [-4.60517019e+00,  6.55190780e+00,  6.24694319e+00, ...,
         1.15817707e+01,  1.18374610e+01,  1.18577883e+01],
       [-4.60517019e+00,  6.27677907e+00,  5.97181607e+00, ...,
         1.21840830e+01,  1.18757847e+01,  1.19767618e+01],
       [-4.60517019e+00,  6.11815986e+00,  5.81319802e+00, ...,
         1.16924595e+01,  1.20798157e+01,  1.21290344e+01]])

In [19]:
log_mel.shape

(23356, 64)

In [20]:
# Copy the vggish_params values into plain names matching the keyword
# arguments passed to log_mel_spectrogram earlier, so that its steps can be
# replayed by hand in the next cells.

# STFT framing
audio_sample_rate = vggish_params.SAMPLE_RATE
window_length_secs = vggish_params.STFT_WINDOW_LENGTH_SECONDS
hop_length_secs = vggish_params.STFT_HOP_LENGTH_SECONDS

# Mel filterbank
num_mel_bins = vggish_params.NUM_MEL_BINS
lower_edge_hertz = vggish_params.MEL_MIN_HZ
upper_edge_hertz = vggish_params.MEL_MAX_HZ

# Log stabilization
log_offset = vggish_params.LOG_OFFSET

In [21]:
# Window and hop expressed in samples rather than seconds.
window_length_samples = int(round(audio_sample_rate * window_length_secs))
hop_length_samples = int(round(audio_sample_rate * hop_length_secs))

# FFT size: 2 raised to the ceiling of log2(window length).
log2_window_length = np.log(window_length_samples) / np.log(2.0)
fft_length = 2 ** int(np.ceil(log2_window_length))

# Magnitude spectrogram of the raw samples.
spectrogram = mel_features.stft_magnitude(
    data,
    fft_length=fft_length,
    hop_length=hop_length_samples,
    window_length=window_length_samples)

num_samples14598982
num_frames23356
shape(23356, 1562)
strides(1250, 2)
(23356, 1562)
(23356, 1562)


In [22]:
spectrogram

array([[3.36784690e+00, 2.87676021e+00, 1.79288605e+00, ...,
        1.03619537e+01, 1.19902701e+02, 8.23935787e+01],
       [1.79676958e+00, 2.07057333e+00, 1.75156574e+00, ...,
        1.78019618e+02, 2.15259209e+02, 2.11680873e+02],
       [2.40631295e+00, 2.41765787e+00, 2.34125913e+00, ...,
        1.22992238e+02, 1.00755308e+02, 5.85923589e+01],
       ...,
       [6.72202267e+03, 4.60661114e+03, 1.21699124e+03, ...,
        1.28795190e+03, 2.31108228e+03, 2.74121151e+03],
       [6.65873115e+03, 4.46425450e+03, 9.24269451e+02, ...,
        2.01286327e+03, 1.87433210e+03, 2.31111821e+03],
       [6.89395688e+03, 4.58291163e+03, 7.88696202e+02, ...,
        2.18118862e+03, 1.36512099e+03, 1.82521190e+03]])

In [24]:
# Build the mel weight matrix sized to the spectrogram's number of bins.
# NOTE(review): num_mel_bins, lower_edge_hertz and upper_edge_hertz from
# In [20] are not passed here, so the function's defaults apply (the output of
# this cell reports 20 mel bins) -- confirm this is intended.
vggish_matrix_function_call = mel_features.spectrogram_to_mel_matrix(
  num_spectrogram_bins=spectrogram.shape[1],
  audio_sample_rate=audio_sample_rate)

Num mel bins20


In [25]:
vggish_matrix_function_call

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.33020269, 0.66979731, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [29]:
# Step 1: mel weight matrix sized to the spectrogram's number of bins.
mel_weight_matrix = mel_features.spectrogram_to_mel_matrix(
    num_spectrogram_bins=spectrogram.shape[1],
    audio_sample_rate=audio_sample_rate)

# Step 2: project every spectrogram row onto the mel bands.
mel_spectrogram = np.dot(spectrogram, mel_weight_matrix)

# Step 3: take the log, offset so that it stays finite.
log_spectrogram = np.log(mel_spectrogram + log_offset)

Num mel bins20


In [30]:
# This is the hand-computed counterpart of log_mel_spectrogram's output, built
# in the previous cell.
# Because the mel matrix there was created with 20 bins, each row holds the
# values for those 20 bins.
log_spectrogram

array([[-0.50747158,  0.19133931, -0.06603049, ...,  1.05802784,
         1.42566213,  1.36976538],
       [-0.53039634,  0.16821757,  0.47505442, ...,  0.92719236,
         1.23455277,  0.99082727],
       [-0.24450755,  0.45626535,  0.44533956, ...,  1.92971341,
         1.05336681,  1.79702693],
       ...,
       [ 5.99611319,  6.70336902,  5.89689741, ...,  9.05153488,
         9.23293693,  9.30378973],
       [ 5.72098782,  6.42823965,  6.0977991 , ...,  9.604684  ,
         9.60613647,  9.27792826],
       [ 5.56237101,  6.26961999,  6.53442616, ...,  9.0614865 ,
         8.93144538,  9.43334736]])

In [31]:
log_spectrogram.shape

(23356, 20)

In [32]:
# hertz_to_mel converts a given set of frequencies in hertz to mel values
# according to its formula (presumably the HTK formula -- confirm against
# mel_features).
# Here the value converted is fs, the sample rate read from the WAV file.
frequencies_to_mel = mel_features.hertz_to_mel(fs)

In [33]:
frequencies_to_mel

6627.8103066390595

In [34]:
# It is difficult to find the exact frequencies that are used to plot the mel
# spectrogram, so instead take 64 evenly spaced frequencies between the minimum
# and the maximum frequency and convert those to mel values.
# NOTE(review): the upper bound 125001 is one above fs / 2 = 125000 -- confirm
# whether the extra 1 is intended.

final_freqs = np.linspace(0,125001,64)

final_frequencies = mel_features.hertz_to_mel(final_freqs)


In [35]:
final_frequencies

array([   0.        , 1514.72900826, 2138.44515516, 2537.61736084,
       2831.79106167, 3064.85533047, 3257.88333188, 3422.63449506,
       3566.34464939, 3693.78466869, 3808.26677121, 3912.18453245,
       4007.32403388, 4095.05350033, 4176.44426732, 4252.35091293,
       4323.46605541, 4390.35885877, 4453.50273441, 4513.29568377,
       4570.07551017, 4624.13137832, 4675.71272593, 4725.03622334,
       4772.29127286, 4817.64440076, 4861.24279904, 4903.21720735,
       4943.68427704, 4982.74852542, 5020.50396236, 5057.03545323,
       5092.41986774, 5126.72705388, 5160.02066787, 5192.35888507,
       5223.79501168, 5254.37801339, 5284.15297421, 5313.16149621,
       5341.44204909, 5369.03027696, 5395.95926849, 5422.25979544,
       5447.96052408, 5473.08820302, 5497.66783048, 5521.7228038 ,
       5545.27505326, 5568.34516222, 5590.95247525, 5613.11519553,
       5634.85047296, 5656.17448385, 5677.10250327, 5697.64897078,
       5717.82755027, 5737.65118452, 5757.13214511, 5776.28207

In [36]:
final_freqs

array([     0.        ,   1984.14285714,   3968.28571429,   5952.42857143,
         7936.57142857,   9920.71428571,  11904.85714286,  13889.        ,
        15873.14285714,  17857.28571429,  19841.42857143,  21825.57142857,
        23809.71428571,  25793.85714286,  27778.        ,  29762.14285714,
        31746.28571429,  33730.42857143,  35714.57142857,  37698.71428571,
        39682.85714286,  41667.        ,  43651.14285714,  45635.28571429,
        47619.42857143,  49603.57142857,  51587.71428571,  53571.85714286,
        55556.        ,  57540.14285714,  59524.28571429,  61508.42857143,
        63492.57142857,  65476.71428571,  67460.85714286,  69445.        ,
        71429.14285714,  73413.28571429,  75397.42857143,  77381.57142857,
        79365.71428571,  81349.85714286,  83334.        ,  85318.14285714,
        87302.28571429,  89286.42857143,  91270.57142857,  93254.71428571,
        95238.85714286,  97223.        ,  99207.14285714, 101191.28571429,
       103175.42857143, 1

In [37]:
# Try a different frequency range for the plot: 64 evenly spaced frequencies
# from 0 Hz to 8000 Hz.
final_freqs = np.linspace(0,8000,64)
final_freqs

array([   0.        ,  126.98412698,  253.96825397,  380.95238095,
        507.93650794,  634.92063492,  761.9047619 ,  888.88888889,
       1015.87301587, 1142.85714286, 1269.84126984, 1396.82539683,
       1523.80952381, 1650.79365079, 1777.77777778, 1904.76190476,
       2031.74603175, 2158.73015873, 2285.71428571, 2412.6984127 ,
       2539.68253968, 2666.66666667, 2793.65079365, 2920.63492063,
       3047.61904762, 3174.6031746 , 3301.58730159, 3428.57142857,
       3555.55555556, 3682.53968254, 3809.52380952, 3936.50793651,
       4063.49206349, 4190.47619048, 4317.46031746, 4444.44444444,
       4571.42857143, 4698.41269841, 4825.3968254 , 4952.38095238,
       5079.36507937, 5206.34920635, 5333.33333333, 5460.31746032,
       5587.3015873 , 5714.28571429, 5841.26984127, 5968.25396825,
       6095.23809524, 6222.22222222, 6349.20634921, 6476.19047619,
       6603.17460317, 6730.15873016, 6857.14285714, 6984.12698413,
       7111.11111111, 7238.0952381 , 7365.07936508, 7492.06349

In [38]:
final_frequencies = mel_features.hertz_to_mel(final_freqs)

In [39]:
final_frequencies

array([   0.        ,  187.87672243,  348.86291659,  489.7011444 ,
        614.87821968,  727.53122616,  829.93988623,  923.81306166,
       1010.46475385, 1090.92710374, 1166.0256543 , 1236.43105657,
       1302.69554809, 1365.27928705, 1424.56974979, 1480.89627503,
       1534.5411427 , 1585.74813338, 1634.72922599, 1681.66989949,
       1726.73337386, 1770.06403517, 1811.79022625, 1852.02653889,
       1890.87571067, 1928.4302057 , 1964.77354031, 1999.98140153,
       2034.12259616, 2067.25986022, 2099.45055272, 2130.74725316,
       2161.19827818, 2190.84813029, 2219.73788904, 2247.90555331,
       2275.38634186, 2302.21295807, 2328.41582393, 2354.02328743,
       2379.06180687, 2403.55611522, 2427.52936692, 2451.00326942,
       2473.99820129, 2496.53331851, 2518.62665039, 2540.29518618,
       2561.5549536 , 2582.42109003, 2602.90790729, 2623.02895055,
       2642.7970522 , 2662.22438092, 2681.32248673, 2700.10234223,
       2718.57438043, 2736.74852956, 2754.63424516, 2772.24053

In [40]:
print(mel_features._MEL_HIGH_FREQUENCY_Q)

1127.0


In [41]:
print(mel_features._MEL_BREAK_FREQUENCY_HERTZ)

700.0


In [42]:
# Evaluate Q * ln(1 + f / break_frequency) by hand for f = 8000 Hz, using the
# two mel_features constants printed in the previous cells.
mel_features._MEL_HIGH_FREQUENCY_Q * np.log(
      1.0 + (8000 / mel_features._MEL_BREAK_FREQUENCY_HERTZ))

2840.0377117383778