# Load Data

O Free Spoken Digit Dataset é uma coleção de gravações de áudio de declarações de dígitos (“zero” a “nove”) de diferentes pessoas.
O objetivo desta competição é identificar corretamente o dígito que está sendo pronunciado em cada gravação. 

In [None]:
from src.audioProcessor import AudioProcessor

import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from os import listdir
from os.path import join
from scipy.io import wavfile

import IPython.display as ipd


In [None]:
# Instantiate the project's audio helper and keep the samples it exposes for
# a quick inspection (X1 is only printed in this cell).
audio = AudioProcessor()
X1 = [audio_file.sample for audio_file in audio.audio_files]
print(X1[0])

# Read the raw recordings from disk. Only .wav files are considered so that
# stray entries in the folder cannot break the label parsing, and the listing
# is sorted because os.listdir returns entries in arbitrary order.
files = 'data/recordings/'
ds_files = sorted(name for name in listdir(files) if name.endswith('.wav'))

# Signals and labels are collected in the SAME loop so X[i] and y[i] always
# describe the same recording. (Previously y came from audio.audio_files while
# X came from listdir, so the pairing relied on both having the same order,
# and the label parsed from the file name was thrown away.)
X = []
y = []
for file in ds_files:
    # The text before the first "_" in the file name is the digit label.
    y.append(int(file.split("_")[0]))
    rate, data = wavfile.read(join(files, file))
    X.append(data.astype(np.float32))

len(X), len(y)

In [None]:
# Class balance check: the distinct labels in y and how many samples each has.
np.unique(y, return_counts = True)





O problema está bem equilibrado: para cada uma das classes temos 300 amostras no conjunto de dados.
Todas as gravações são amostradas a uma taxa de 8 kHz.

Os sinais de áudio têm comprimentos diferentes.
Alguns deles têm intervalos de silêncio no início e no final.


In [None]:
# Inspect the lengths of the raw recordings. `show_length_distribution` is a
# project helper; presumably it plots the distribution and returns the
# lengths -- TODO confirm against AudioProcessor.
lengths = audio.show_length_distribution(X)

Casos extremos:

In [None]:
# Index (into X) of the recording with the most samples.
longest_audio = np.argmax([len(x) for x in X])
print(audio.get_longest_audio())

# Duration in seconds = number of samples / sampling rate. The previous
# version divided the sample *values* by 8000, which printed a rescaled
# waveform instead of a duration.
print(len(X[longest_audio]) / audio.sample_rate)

plt.plot(X[longest_audio])
plt.title("Longest audio signal")
plt.xlabel("Time")
plt.ylabel("Amplitude")
plt.show()

# Playback widget for the same recording.
display(ipd.Audio(X[longest_audio], rate=audio.sample_rate))

In [None]:
# Index (into X) of the recording with the fewest samples; plot it and
# offer it for playback.
shortest_audio = np.argmin(list(map(len, X)))

plt.plot(X[shortest_audio])
plt.title("Shortest audio signal")
plt.show()

display(ipd.Audio(X[shortest_audio], rate=audio.sample_rate))

## Remoção de recursos do domínio do tempo:
Vamos remover o silêncio inicial e final dos sinais para ver se obtemos uma distribuição diferente de comprimento. 

In [None]:
# Longest signal (in samples) before trimming; stays 0 when X is empty.
max_size = max((x.shape[0] for x in X), default=0)
print('Max sizing before:', max_size)

# Trim silence from every signal using the project helper.
X = list(map(audio.remove_silence, X))

# Longest signal after trimming; later cells pad up to this length.
max_size = max((x.shape[0] for x in X), default=0)
print('Max sizing after:', max_size)

In [None]:
# Same length inspection as before, now on the silence-trimmed signals.
# `show_length_distribution` is a project helper; presumably it plots the
# distribution and returns the lengths -- TODO confirm.
lengths = audio.show_length_distribution(X)

In [None]:
# Right-pad every trimmed signal with zeros so all of them have max_size
# samples.
# WARNING: this is NOT the same padding as the one done by AudioProcessor.
X = [
    np.pad(sig, pad_width=(0, max_size - sig.shape[0]), mode="constant")
    for sig in X
]

# Show and play the recording that was the longest before trimming.
plt.plot(X[longest_audio])
plt.title("Longest audio signal after trimming")
plt.xlabel("Time")
plt.ylabel("Amplitude")
plt.show()

display(ipd.Audio(X[longest_audio], rate=audio.sample_rate))

In [None]:
# The recording that was the shortest before trimming, as it looks now.
shortest_signal = X[shortest_audio]

plt.plot(shortest_signal)
plt.title("Shortest audio signal after trimming")
plt.show()

display(ipd.Audio(shortest_signal, rate=audio.sample_rate))

# Spectrograms

In [None]:
import librosa
import librosa.display
import tensorflow as tf
import tensorflow_io as tfio

def freq_mask(spec):
    """Apply ``tfio.audio.freq_mask`` to `spec` and return a NumPy array.

    The mask is requested with ``param=2`` and the resulting tensor is
    converted with ``.numpy()``.

    NOTE(review): the tfio masking helpers appear to assume a particular
    (time, frequency) axis layout -- confirm it matches the arrays passed in.
    """
    masked = tfio.audio.freq_mask(spec, param=2)
    return masked.numpy()

def time_mask(spec):
    """Apply ``tfio.audio.time_mask`` to `spec` and return a NumPy array.

    The mask is requested with ``param=2`` and the resulting tensor is
    converted with ``.numpy()``.

    NOTE(review): the tfio masking helpers appear to assume a particular
    (time, frequency) axis layout -- confirm it matches the arrays passed in.
    """
    masked = tfio.audio.time_mask(spec, param=2)
    return masked.numpy()


def mel_spectrogram(waveform, sr=8000):
    """Return the mel spectrogram of `waveform` in decibels.

    Args:
        waveform: 1-D audio signal passed to librosa as ``y``.
        sr: Sampling rate of `waveform` in Hz. Defaults to 8000, the value
            that was previously hard-coded, so existing calls are unchanged.

    Returns:
        The output of ``librosa.feature.melspectrogram`` converted with
        ``librosa.power_to_db``, using the spectrogram's maximum as the
        reference (so the peak sits at 0 dB).
    """
    spec = librosa.feature.melspectrogram(y=waveform, sr=sr)
    return librosa.power_to_db(spec, ref=np.max)


def mfcc_spectrogram(waveform, sr=8000):
    """Return the MFCC matrix of `waveform` as computed by librosa.

    Args:
        waveform: 1-D audio signal passed to librosa as ``y``.
        sr: Sampling rate of `waveform` in Hz. Defaults to 8000, the value
            that was previously hard-coded, so existing calls are unchanged.

    Returns:
        The result of ``librosa.feature.mfcc`` with its default settings.
    """
    return librosa.feature.mfcc(y=waveform, sr=sr)


def plot_spectrogram(spectrogram, ax):
    """Draw a log-magnitude spectrogram on `ax` with time on the x-axis.

    Args:
        spectrogram: 2-D array of non-negative magnitudes whose first axis
            is time (frames) and second axis is frequency (bins).
        ax: Object exposing ``pcolormesh(x, y, values)``, e.g. a matplotlib
            Axes.
    """
    # Transpose so time runs along the columns (x-axis). A tiny epsilon is
    # added before the log because all-zero frames (e.g. from zero padding)
    # would otherwise produce -inf and break the colour scaling.
    log_spec = np.log(spectrogram.T + np.finfo(float).eps)
    height, width = log_spec.shape
    # One x coordinate per time frame, spread over the total element count.
    x_coords = np.linspace(0, np.size(spectrogram), num=width, dtype=int)
    y_coords = range(height)
    ax.pcolormesh(x_coords, y_coords, log_spec)



def stft_spectrogram(waveform):
    """Return the magnitude STFT of `waveform` after padding it to max_size.

    The clip is right-padded with zeros up to the module-level ``max_size``
    (which must already be defined) so that every clip has the same length,
    then transformed with ``tf.signal.stft`` (frame_length=255,
    frame_step=128). The absolute value of the complex result is returned as
    a TensorFlow tensor.
    """
    # Number of zeros needed to reach max_size samples.
    pad_shape = [max_size] - tf.shape(waveform)
    padding = tf.zeros(pad_shape, dtype=tf.float32)

    # Cast first so the clip and its padding share the same dtype.
    samples = tf.cast(waveform, tf.float32)
    padded = tf.concat([samples, padding], 0)

    stft = tf.signal.stft(padded, frame_length=255, frame_step=128)
    return tf.abs(stft)



In [None]:
# Mel spectrogram (in dB) of the recording indexed by longest_audio.
wave = X[longest_audio]
S = mel_spectrogram(wave)
print(S.shape)

fig, ax = plt.subplots()
img = librosa.display.specshow(
    S, sr=8000, x_axis='time', y_axis='mel', ax=ax
)
fig.colorbar(img, ax=ax, format='%+2.0f dB')
ax.set(title='Mel-frequency Spectrogram')



# MFCCs of the same recording, shown next to a masked copy.
S = mfcc_spectrogram(wave)
print(S.shape)

fig, ax = plt.subplots(1, 2)

# The rows of an MFCC matrix are cepstral coefficient indices, not
# mel-scaled frequencies, and the values are not decibels. The plots are
# therefore drawn without y_axis='mel' and without a dB colorbar format,
# both of which mislabelled the figure.
img = librosa.display.specshow(S, x_axis='time', sr=8000, ax=ax[0])
fig.colorbar(img, ax=ax[0])
ax[0].set(title='MFCC Spectrogram', ylabel='MFCC coefficient')

# Apply the time mask first, then the frequency mask.
S2 = freq_mask(time_mask(S))

img = librosa.display.specshow(S2, x_axis='time', sr=8000, ax=ax[1])
fig.colorbar(img, ax=ax[1])
fig.set_size_inches(15, 5)
ax[1].set(title='MFCC Spectrogram with Freq. and Time Mask',
          ylabel='MFCC coefficient')



In [None]:
# Waveform and STFT magnitude spectrogram of the recording indexed by
# longest_audio, stacked in one figure.
waveform = X[longest_audio]
spectrogram = stft_spectrogram(waveform).numpy()
print(spectrogram.shape)

timescale = np.arange(waveform.shape[0])

fig, axes = plt.subplots(2, figsize=(12, 8))
# The figure ends up 10x10 inches; this overrides the figsize given above.
fig.set_size_inches(10, 10)

# Top panel: raw samples against their index.
wave_ax = axes[0]
wave_ax.plot(timescale, waveform)
wave_ax.set_title('Waveform')
wave_ax.set_xlim([0, max_size])

# Bottom panel: log-magnitude spectrogram.
spec_ax = axes[1]
spec_ax.set_xlabel('Time')
spec_ax.set_ylabel('Frequency')
spec_ax.set_title('STFT Spectrogram')

plot_spectrogram(spectrogram, spec_ax)