In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import scipy
import numpy as np

import librosa
from librosa import display
from IPython.display import Audio
import matplotlib.pyplot as plt

#### Load audio into Python

In [None]:
# Absolute location of the example WAV clip analysed throughout this notebook.
file_path = "/root/code/male/tests/example_data/audio1.wav"

In [None]:
# Decode the whole clip (from offset 0 with no duration limit) as mono.
# sr=None asks librosa not to resample, so `sampling_rate` is the file's own.
samples, sampling_rate = librosa.load(
    file_path,
    sr=None,
    mono=True,
    offset=0.0,
    duration=None,
)

In [None]:
# Report the sampling rate and the number of samples, one value per line.
print(sampling_rate, len(samples), sep='\n')

In [None]:
# Clip length in seconds: number of samples divided by samples per second.
duration = samples.shape[0] / sampling_rate
print(duration)

In [None]:
# Build an IPython audio widget for the clip; as the last expression of the
# cell it is rendered inline so the clip can be listened to.
Audio(file_path)

#### Visualize

In [None]:
# Plot the time-domain waveform of the loaded clip.
plt.figure(figsize=(15, 4))
# `waveplot` was deprecated in librosa 0.8.1 and removed in 0.10 in favour of
# `waveshow`; use whichever the installed version provides so the cell runs
# on both old and new releases.
if hasattr(librosa.display, 'waveshow'):
    librosa.display.waveshow(y=samples, sr=sampling_rate)
else:
    librosa.display.waveplot(y=samples, sr=sampling_rate)
plt.xlabel('Time (seconds) --->')
plt.ylabel('Amplitude')

## Fast Fourier Transform (FFT)

#### Simple Sine Wave to Understand FFT

To understand the output of the FFT, let’s create a simple sine wave. The following code creates a sine wave with sampling rate = 100, amplitude = 1 and frequency = 3. Amplitude values are computed every 1/100th of a second (the sampling interval) and stored in an array called y1. We will pass these discrete amplitude values to the FFT algorithm to compute the DFT of this signal.

In [None]:
# First test tone: a 3 Hz sine evaluated on a 100-point grid.
# NOTE: `samples` is rebound here to the number of points (it is later passed
# to fft_plot as the sampling rate); any earlier value of `samples` is replaced.
samples, f = 100, 3
x = np.arange(samples)  # sample indices 0 .. samples - 1
y1 = np.sin(
    2 * np.pi * f * (x / samples)
)
# Draw the discrete samples as red stems with the connecting curve on top.
plt.figure()
plt.stem(x, y1, linefmt='r')
plt.plot(x, y1)
plt.xlabel('Time --->')
plt.ylabel('<-- Amplitude -->')

Now we have a sequence of amplitudes stored in the array y1. We will pass this sequence to the FFT algorithm implemented by scipy. It returns an array yf of complex-valued amplitudes of the frequencies found in the signal. The first half of this array holds the positive-frequency terms and the second half holds the negative-frequency terms, which for a real-valued signal mirror the positive ones. You can therefore keep just one half and take absolute values to represent the frequencies present in the signal. The following function takes the samples and the sampling rate as input and plots the frequency spectrum.

In [None]:
def fft_plot(audio, sampling_rate):
    """Plot the one-sided amplitude spectrum of a real-valued signal.

    Args:
        audio: 1-D sequence of real samples.
        sampling_rate: Number of samples per second; it fixes the scale of
            the frequency axis.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Raises:
        ValueError: If ``audio`` is empty.
    """
    # Imported locally so the submodule is loaded even on SciPy versions
    # where a bare ``import scipy`` does not expose ``scipy.fft``.
    from scipy.fft import fft, fftfreq

    n = len(audio)
    if n == 0:
        raise ValueError('audio must contain at least one sample')
    half = n // 2

    yf = fft(audio)
    # Exact centre frequency of each positive-frequency bin: k * rate / n.
    # (np.linspace(0, rate / 2, n // 2) would stretch the axis so that the
    # last plotted bin lands on Nyquist, mislabelling every bin but bin 0.)
    xf = fftfreq(n, d=1.0 / sampling_rate)[:half]

    # Doubling accounts for the discarded negative-frequency half ...
    magnitude = 2.0 / n * np.abs(yf[:half])
    # ... except for the DC bin, which has no negative-frequency twin.
    magnitude[:1] /= 2.0

    fig, ax = plt.subplots()
    ax.plot(xf, magnitude)
    ax.grid()
    ax.set_xlabel('Frequency --->')
    ax.set_ylabel('Magnitude')
    plt.show()

In [None]:
# Spectrum of the single tone y1; `samples` (100) is passed as the sampling rate.
fft_plot(y1, samples)

To check the output of the FFT for a signal containing more than one frequency, let’s create another sine wave. This time we will keep sampling rate = 100 and use amplitude = 2 and frequency = 11. The following code generates this signal and plots the sine wave.

In [None]:
# Second test tone: 11 Hz with amplitude 2, on the same 100-point grid as the
# first tone so the two signals can later be added sample by sample.
samples = 100
f = 11
x = np.arange(samples)
# The factor 2 sets the amplitude described in the accompanying text; without
# it this tone would have the same amplitude (1) as the first one.
y2 = 2 * np.sin(2 * np.pi * f * (x / samples))
plt.figure()
plt.stem(x, y2, 'r')
plt.plot(x, y2)
plt.xlabel('Time --->')
plt.ylabel('<-- Amplitude -->')

We have kept the sampling rate = 100 because we are going to add this signal to our first sine wave.<br>
As expected, the FFT of this wave on its own would show a single spike at frequency = 11. But we want to see what happens when we add these two signals, which share a sampling rate but differ in frequency and amplitude. The sequence y3 represents the resulting signal.

In [None]:
# Superimpose the two tones element-wise and plot the combined signal.
y3 = np.add(y1, y2)
plt.figure()
plt.stem(x, y3, linefmt='r')  # discrete samples as red stems
plt.plot(x, y3)               # connecting curve
plt.xlabel('Time --->')
plt.ylabel('<-- Amplitude -->')

In [None]:
# Spectrum of the summed signal; `samples` (100) is passed as the sampling rate.
fft_plot(y3, samples)

#### FFT on our Audio signal

Now that we have seen how the FFT gives us all the frequencies in a given signal, let’s pass our original audio signal to this function. We are using the same audio clip we loaded earlier, which has a sampling rate of 16000.

In [None]:
# Load the clip again: the sine-wave cells rebound `samples` to the integer
# 100, so the audio array has to be restored before analysing it.
samples, sampling_rate = librosa.load(
    file_path,
    sr=None,
    mono=True,
    offset=0.0,
    duration=None,
)

In [None]:
# Spectrum of the audio clip, using the sampling rate returned by the loader.
fft_plot(samples, sampling_rate)

## Spectrogram

In [None]:
def spectrogram(samples, sample_rate, stride_ms=10.0, window_ms=20.0, max_freq=None, eps=1e-14):
    """Compute a log-power spectrogram of a 1-D signal.

    The signal is cut into (possibly overlapping) frames, each frame is
    weighted with a Hann window, and the squared magnitude of its real FFT
    is scaled by ``sum(window ** 2) * sample_rate`` and log-compressed.

    Args:
        samples: 1-D array-like of signal values.
        sample_rate: Number of samples per second.
        stride_ms: Distance between the starts of consecutive frames, in ms.
        window_ms: Length of each frame, in ms.
        max_freq: Highest frequency (Hz) to keep. ``None`` (the default)
            keeps every frequency bin.
        eps: Small constant added before the logarithm to avoid ``log(0)``.

    Returns:
        2-D array of shape ``(n_freq_bins, n_frames)`` holding the natural
        log of the scaled power in each bin of each frame.

    Raises:
        ValueError: If ``samples`` is not 1-D, if the window or stride is
            shorter than one sample, if the signal is shorter than one
            window, or if ``max_freq`` is negative.
    """
    samples = np.asarray(samples)  # accept lists as well as arrays
    if samples.ndim != 1:
        raise ValueError('samples must be one-dimensional')

    stride_size = int(0.001 * sample_rate * stride_ms)
    window_size = int(0.001 * sample_rate * window_ms)
    if stride_size < 1 or window_size < 1:
        raise ValueError('stride_ms and window_ms must each span at least one sample')
    if len(samples) < window_size:
        raise ValueError('samples must contain at least one full window')
    if max_freq is not None and max_freq < 0:
        raise ValueError('max_freq must be non-negative')

    # Extract strided windows: column j is a zero-copy view of
    # samples[j * stride_size : j * stride_size + window_size]. Trailing
    # samples that do not fill a whole frame are simply never referenced.
    n_frames = (len(samples) - window_size) // stride_size + 1
    step = samples.strides[0]
    windows = np.lib.stride_tricks.as_strided(
        samples,
        shape=(window_size, n_frames),
        strides=(step, step * stride_size),
        writeable=False,  # the view aliases the caller's data; never write to it
    )

    # Window weighting, squared Fast Fourier Transform (fft), scaling
    weighting = np.hanning(window_size)[:, None]
    power = np.absolute(np.fft.rfft(windows * weighting, axis=0)) ** 2
    power /= np.sum(weighting ** 2) * sample_rate

    # One-sided spectrum: double every bin that has a negative-frequency
    # twin. DC never has one; the last bin lacks one only when the window
    # length is even (it is then the Nyquist bin).
    if window_size % 2:
        power[1:, :] *= 2.0
    else:
        power[1:-1, :] *= 2.0

    # Keep only the bins whose centre frequency does not exceed max_freq.
    if max_freq is None:
        n_keep = power.shape[0]
    else:
        freqs = float(sample_rate) / window_size * np.arange(power.shape[0])
        n_keep = int(np.count_nonzero(freqs <= max_freq))

    return np.log(power[:n_keep, :] + eps)

In [None]:
# Log-power spectrogram of the clip, using the function's default
# 20 ms window and 10 ms stride.
spectrogram(samples, sampling_rate)