## This notebook is used to understand how the VGGish model builds the mel spectrogram.

In this notebook I feed in a WAV file and test all the functions of the modules `vggish_input` and `mel_features`. This is done in an attempt to understand how the model converts the raw audio data into the mel spectrogram.

In [None]:
from vggish_input import waveform_to_examples, wavfile_to_examples

In [2]:
# ---- Model architecture ----
NUM_FRAMES = 96       # Time frames in one input mel-spectrogram patch.
NUM_BANDS = 64        # Frequency bands in one input mel-spectrogram patch.
EMBEDDING_SIZE = 128  # Size of the embedding layer.

# ---- Feature and example generation ----
SAMPLE_RATE = 16000
STFT_WINDOW_LENGTH_SECONDS = 0.025
STFT_HOP_LENGTH_SECONDS = 0.01
NUM_MEL_BINS = NUM_BANDS
MEL_MIN_HZ, MEL_MAX_HZ = 125, 7500
LOG_OFFSET = 0.01  # Offset that stabilizes the log of the input mel-spectrogram.
# Each example contains 96 frames of 10 ms; consecutive examples do not overlap.
EXAMPLE_WINDOW_SECONDS = 0.96
EXAMPLE_HOP_SECONDS = 0.96

# ---- Embedding postprocessing ----
PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors'
PCA_MEANS_NAME = 'pca_means'
QUANTIZE_MIN_VAL, QUANTIZE_MAX_VAL = -2.0, 2.0

# ---- Training ----
INIT_STDDEV = 0.01      # Standard deviation used to initialize weights.
LEARNING_RATE = 0.0001  # Learning rate for the Adam optimizer.
ADAM_EPSILON = 1e-08    # Epsilon for the Adam optimizer.

# ---- Names of ops, tensors, and features ----
INPUT_OP_NAME = 'vggish/input_features'
OUTPUT_OP_NAME = 'vggish/embedding'
# A tensor name is the op name followed by the index of the op's output.
INPUT_TENSOR_NAME = '{}:0'.format(INPUT_OP_NAME)
OUTPUT_TENSOR_NAME = '{}:0'.format(OUTPUT_OP_NAME)
AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding'


In [3]:
import mel_features

In [4]:
import mel_features
import vggish_params
import numpy as np

In [5]:
from scipy.io import wavfile
# Manual first step: load the WAV file ourselves instead of going through
# wavfile_to_examples. wavfile.read returns the sample rate (fs) and the samples (data).
fs, data = wavfile.read('tester.wav')
data.shape

(14598982,)

In [6]:
print(fs)

250000


In [7]:
# This is the normal, conventional way of turning the file into examples.
# The main difference between using this library function and doing the steps by hand is that when
# the sampling frequency of the WAV file is not equal to the sampling frequency that the VGGish
# model specifies, the VGGish code resamples the data, and we might lose some of the data in that process.
out = wavfile_to_examples('tester.wav')
out.shape

250000


(950, 96, 64)

In [None]:
import matplotlib.pyplot as plt
# Display the example at index 1: its first 90 rows, transposed for plotting.
plt.imshow(out[1][:90,:].T)

In [None]:
out.shape

In [None]:
# Build the mel weight matrix with every argument left at the function's default.
out2 = mel_features.spectrogram_to_mel_matrix()
out2.shape

In [None]:
# Doing the calculations and the computation on our own.
# NOTE(review): `data` was read from 'tester.wav' with sample rate fs (printed as 250000 in an
# earlier cell), but it is passed here with audio_sample_rate=vggish_params.SAMPLE_RATE and
# without any resampling or scaling in this cell -- confirm this is intended.
log_mel = mel_features.log_mel_spectrogram(
      data,
      audio_sample_rate=vggish_params.SAMPLE_RATE,
      log_offset=vggish_params.LOG_OFFSET,
      window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
      hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
      num_mel_bins=vggish_params.NUM_MEL_BINS,
      lower_edge_hertz=vggish_params.MEL_MIN_HZ,
      upper_edge_hertz=vggish_params.MEL_MAX_HZ)

In [None]:
# Rate of the log-mel feature frames: one frame per STFT hop.
features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS

# Express the example window and hop in whole feature frames instead of seconds.
example_window_length, example_hop_length = (
    int(round(seconds * features_sample_rate))
    for seconds in (vggish_params.EXAMPLE_WINDOW_SECONDS,
                    vggish_params.EXAMPLE_HOP_SECONDS))

# Cut the log-mel frames into examples of that window length, stepping by the hop.
log_mel_examples = mel_features.frame(
    log_mel,
    hop_length=example_hop_length,
    window_length=example_window_length)

In [None]:
# Each example is already a 2-D array, so it is easy to plot it in its original form.
log_mel_examples.shape
# plt.imshow(log_mel_examples[50][:90,:].T)

In [None]:
vggish_params.MEL_MIN_HZ

In [None]:
# This is the output of the function spectrogram_to_mel_matrix.
# It returns the matrix that, when multiplied with the spectrogram rows, yields the mel frequencies.
out2

In [None]:
out2.shape

In [None]:
# Compute the log mel spectrogram of `data`, taking every setting from vggish_params.
log_mel = mel_features.log_mel_spectrogram(
      data,
      audio_sample_rate=vggish_params.SAMPLE_RATE,
      log_offset=vggish_params.LOG_OFFSET,
      window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
      hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
      num_mel_bins=vggish_params.NUM_MEL_BINS,
      lower_edge_hertz=vggish_params.MEL_MIN_HZ,
      upper_edge_hertz=vggish_params.MEL_MAX_HZ)

In [None]:
# Convert waveform to a log magnitude mel-frequency spectrogram.
# This is the output of the function log_mel_spectrogram with all the values taken from
# vggish_params.
log_mel

In [None]:
log_mel.shape

In [None]:
# Bind the vggish_params values to the keyword names that were passed to
# mel_features.log_mel_spectrogram, so its steps can be reproduced by hand in the cells below.
audio_sample_rate = vggish_params.SAMPLE_RATE
log_offset = vggish_params.LOG_OFFSET
window_length_secs = vggish_params.STFT_WINDOW_LENGTH_SECONDS
hop_length_secs = vggish_params.STFT_HOP_LENGTH_SECONDS
num_mel_bins = vggish_params.NUM_MEL_BINS
lower_edge_hertz = vggish_params.MEL_MIN_HZ
upper_edge_hertz = vggish_params.MEL_MAX_HZ

In [None]:
# Convert the window and hop durations from seconds to whole numbers of samples.
window_length_samples = int(round(audio_sample_rate * window_length_secs))
hop_length_samples = int(round(audio_sample_rate * hop_length_secs))
# FFT size: the smallest power of two that is >= window_length_samples.
fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0)))
# STFT magnitude of the raw samples, computed with the sizes derived above.
spectrogram = mel_features.stft_magnitude(
  data,
  fft_length=fft_length,
  hop_length=hop_length_samples,
  window_length=window_length_samples)

In [None]:
spectrogram

In [None]:
# Mel weight matrix sized to the spectrogram's second dimension (its number of bins);
# all arguments not given here are left at the function's defaults.
out3 = mel_features.spectrogram_to_mel_matrix(
  num_spectrogram_bins=spectrogram.shape[1],
  audio_sample_rate=audio_sample_rate)

In [None]:
out3

In [None]:
# Project the spectrogram onto the mel weight matrix built for its number of bins.
mel_spectrogram = np.dot(
    spectrogram,
    mel_features.spectrogram_to_mel_matrix(
        audio_sample_rate=audio_sample_rate,
        num_spectrogram_bins=spectrogram.shape[1]))
# Add the offset before taking the log.
out4 = np.log(log_offset + mel_spectrogram)

In [None]:
# This is the output of the function log_mel_spectrogram, which gives me the final result.
# This function basically gives me the frequency sets for the 20 bins or windows specified to it.
out4

In [None]:
out4.shape

In [None]:
# So this function basically converts a given set of frequencies to mel values according to the HTK
# formula. Here it is applied to the sample rate fs read from the WAV file.
out5 = mel_features.hertz_to_mel(fs)

In [None]:
out5

In [None]:
# So basically it is difficult to find the frequencies that are used to plot the mel spectrogram,
# so a better thing to do in this case is to take evenly spaced values between the min and the max
# of the frequencies and then convert them to mel frequencies.

final_freqs = np.linspace(0,125001,64)

final_frequencies = mel_features.hertz_to_mel(final_freqs)


In [None]:
final_frequencies

In [None]:
final_freqs

In [None]:
# Trying to plot the graph with different frequencies: 64 evenly spaced values from 0 to 8000.
final_freqs = np.linspace(0,8000,64)
final_freqs

In [None]:
final_frequencies = mel_features.hertz_to_mel(final_freqs)

In [None]:
final_frequencies

In [None]:
print(mel_features._MEL_HIGH_FREQUENCY_Q)

In [None]:
print(mel_features._MEL_BREAK_FREQUENCY_HERTZ)

In [None]:
# Evaluate Q * ln(1 + f / f_break) by hand at f = 8000, using the module's own constants
# (presumably the same formula hertz_to_mel applies -- verify against mel_features).
mel_features._MEL_HIGH_FREQUENCY_Q * np.log(
      1.0 + (8000 / mel_features._MEL_BREAK_FREQUENCY_HERTZ))