In [12]:
import numpy as np

## First-order high-pass (pre-emphasis) filter
def high_pass(audio_data, alpha = 0.95):
    """Apply the difference filter y[n] = x[n] - alpha * x[n-1].

    The first sample has no predecessor and is passed through unchanged.

    audio_data : array of samples (must contain at least one sample)
    alpha      : weight applied to the previous sample
    """
    first_sample = audio_data[0]
    previous = audio_data[:-1]
    current = audio_data[1:]
    # Subtract the scaled previous sample from every sample after the first.
    differenced = current - alpha * previous
    return np.append(first_sample, differenced)

## Get hamming windowed frames from audio data
def get_frames(audio_data, samplerate, window_ms, shift_ms):
    """Cut audio into overlapping frames and apply a Hamming window to each.

    audio_data : 1-D sequence of samples (a NumPy array in this notebook)
    samplerate : sample rate in Hz
    window_ms  : frame length in milliseconds
    shift_ms   : hop between the starts of consecutive frames in milliseconds

    Returns a float array of shape (num_frames, window_samples). Only complete
    frames are produced, so trailing samples that do not fill a whole window
    are dropped. Audio too short for a single window yields an array with
    zero rows. Frames are not mean-normalized.
    """

    window_samples = int(samplerate * window_ms / 1000) # samples
    shift_samples = int(samplerate * shift_ms / 1000) # samples

    # Number of complete windows that fit: floor((len - window) / shift) + 1.
    # Clamp at zero: for very short audio the expression goes negative, and
    # np.zeros would then fail with "negative dimensions are not allowed".
    num_frames = max(0, (len(audio_data) - (window_samples - shift_samples)) // shift_samples)

    frames = np.zeros((num_frames, window_samples))

    hamming = np.hamming(window_samples)

    for i in range(num_frames):
        start = i * shift_samples
        end = start + window_samples
        frames[i] = audio_data[start:end] * hamming

    return frames

def _mel_filterbank(num_filters, fft_len, nyquist_freq):
    """Build triangular mel filters over `fft_len` one-sided FFT bins (0 Hz to Nyquist).

    Returns an array of shape (num_filters, fft_len).
    """
    # Filter edges are evenly spaced on the mel scale, mel = 1127 * ln(1 + f / 700),
    # between 0 Hz and the Nyquist frequency, then mapped back to FFT bin indices.
    low_freq_mel = 0
    high_freq_mel = 1127 * np.log(1 + nyquist_freq / 700)
    mel_points = np.linspace(low_freq_mel, high_freq_mel, num_filters + 2)
    hz_points = 700 * (np.exp(mel_points / 1127) - 1)
    fft_bins = np.floor((hz_points / nyquist_freq) * fft_len)

    filterbank = np.zeros((num_filters, fft_len))
    for m in range(1, num_filters + 1):
        f_m = int(fft_bins[m])
        f_m_left = int(fft_bins[m - 1])
        f_m_right = int(fft_bins[m + 1])

        # Peak of the triangle.
        filterbank[m-1, f_m] = 1
        # Rising edge: 0 at the left neighbour's centre, 1 at this centre.
        for k in range(f_m_left, f_m):
            filterbank[m-1, k] = (1 / (f_m - f_m_left)) * (k - f_m) + 1
        # Falling edge: back down towards 0 at the right neighbour's centre.
        # The right edge is exclusive, so an edge index equal to fft_len is safe.
        for k in range(f_m + 1, f_m_right):
            filterbank[m-1, k] = (1 / (f_m - f_m_right)) * (k - f_m) + 1

    return filterbank


## Returns log mel spectrums of each frame of input audio
# INPUTS
# audio_data : numpy array of samples
# samplerate : audio sample rate
# frame_len : length of frames to extract from audio in ms
# frame_shift : frame shift in ms
# num_mel_filters : number of filters to use when computing mel spectrum
# nfft : minimum FFT length; raised automatically when frames are longer
def get_log_mel_spectrums(audio_data, samplerate, frame_len = 25, frame_shift = 10, num_mel_filters = 40, nfft = 512):
    """Compute the log mel spectrum of every frame of the input audio.

    Pipeline: pre-emphasis (high_pass with its default coefficient) ->
    Hamming-windowed frames (get_frames) -> power spectrum -> triangular
    mel filterbank -> log(1 + energy).

    nfft is the minimum FFT length (default 512, the previously fixed value).
    When a frame holds more samples than nfft, the FFT length is raised to
    the next power of two, because np.fft.rfft crops its input to the
    requested length and would otherwise discard the tail of every frame.

    Returns an array of shape (num_frames, num_mel_filters).
    """

    nyquist_freq = samplerate / 2

    # Pre-emphasis
    audio_data = high_pass(audio_data)

    # Get audio frames
    frames = get_frames(audio_data, samplerate, frame_len, frame_shift)

    # FFT: make sure the transform covers the whole frame.
    frame_samples = frames.shape[1]
    if frame_samples > nfft:
        nfft = 1 << (frame_samples - 1).bit_length()

    frames_fft_mag = np.abs(np.fft.rfft(frames, nfft))
    frames_fft_pow = (frames_fft_mag ** 2) / nfft

    # Mel spectrum: every filter's weighted sum over every frame, as one
    # (num_frames, fft_len) x (fft_len, num_filters) matrix product.
    mel_filterbank = _mel_filterbank(num_mel_filters, frames_fft_pow.shape[1], nyquist_freq)
    mel_spectrums = np.dot(frames_fft_pow, mel_filterbank.T)

    # log(1 + x) maps zero energy to 0 instead of -inf.
    return np.log1p(mel_spectrums)

In [13]:
from scipy.io import wavfile

# Read one utterance from disk; wavfile.read returns (sample rate, sample array).
samplerate, data = wavfile.read("DARPA Audio Data/data/TRAIN/DR1/FCJF0/SA1.WAV.wav")

# Log mel spectra for this utterance, using the function's default frame
# length, frame shift and number of mel filters.
log_mel_spectrum = get_log_mel_spectrums(data, samplerate)
