In [1]:
import os
from glob import glob
import librosa
import numpy as np
from ipywidgets import IntProgress
import json

In [2]:
# Root folder of the filtered dataset; the extraction cells below read
# ./data/filtered/<language>/*.wav, i.e. one sub-folder per language.
DATASET_PATH = "./data/filtered/"

In [8]:
# # note .DS_store needs to be removed
# languages = os.listdir("./data/filtered")
# print(languages) 
# languages = languages[1:]

['.DS_Store', 'german', 'mandarin', 'russian', 'english']


In [9]:
# Language name -> integer class label, used by every extraction cell below.
# The ids are pinned to the values recorded when this cell was first run: they
# originally came from os.listdir() order, which is not stable across machines,
# and leaving the definition commented out made the later cells fail with a
# NameError on a fresh kernel.
lang_mapping = {"german": 0, "mandarin": 1, "russian": 2, "english": 3}
lang_mapping

{'german': 0, 'mandarin': 1, 'russian': 2, 'english': 3}

In [11]:
# audio_files_by_lang = {}

# for lang in languages:
#     audio_files_by_lang[lang] = glob(f"./data/filtered/{lang}/*.wav")

# Notes

## trimming
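Trimming with top_db=20 to remove leading and trailing silence. Note that librosa's top_db is relative: anything more than 20 dB below the signal's peak is treated as silence. For a rough sense of scale, here is what absolute sound levels mean:<br>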
10 dB: Normal breathing<br>
20 dB: Whispering from five feet away<br>
30 dB: Whispering nearby<br>
40 dB: Quiet library sounds

to play the trimmed audio:<br>
signal_trimmed, _ =  librosa.effects.trim(signal, top_db=20)<br>
ipd.Audio(data=signal_trimmed, rate=sample_rate)<br>

## amplitude_to_db
https://stackoverflow.com/questions/63347977/what-is-the-conceptual-purpose-of-librosa-amplitude-to-db

The range of perceivable sound pressure is very wide, from around 20 μPa (micro Pascal) to 20 Pa, a ratio of 1 million. Furthermore the human perception of sound levels is not linear, but better approximated by a logarithm.

By converting to decibels (dB) the scale becomes logarithmic. This limits the numerical range, to something like 0-120 dB instead. The intensity of colors when this is plotted corresponds more closely to what we hear than if one used a linear scale.

Note that the reference (0 dB) point in decibels can be chosen freely. librosa.amplitude_to_db defaults to ref=1.0; passing ref=np.max (as in the librosa examples) maps the max value of the input to 0 dB, so all other values will then be negative. The function also applies a threshold on the range of sounds, by default top_db=80 dB below the peak. So with ref=np.max anything lower than -80 dB will be clipped to -80 dB.

### another parameter to try in amplitude_to_db -> add ref=np.max (also in librosa library ex): 
from Rob Mulla (2022): "Audio Data Processing in Python" (YouTube Tutorial)<br>
log_spectogram = librosa.amplitude_to_db(spectogram, ref=np.max)


## STFT window size
https://librosa.org/doc/latest/generated/librosa.stft.html#librosa.stft

n_fft = length of the windowed signal after padding with zeros. The number of rows in the STFT matrix D is (1 + n_fft/2). **The default value, n_fft=2048 samples**, corresponds to a physical duration of 93 milliseconds at a sample rate of 22050 Hz, i.e. the default sample rate in librosa. This value is well adapted for **music signals**. However, in **speech processing, the recommended value is 512**, corresponding to 23 milliseconds at a sample rate of 22050 Hz. In any case, we recommend setting n_fft to a power of two for optimizing the speed of the fast Fourier transform (FFT) algorithm.

## Preprocessing decisions:

1. Trim leading and trailing silence from an audio signal (anything more than 20 dB below the signal's peak is considered silence)
2. Cut signal to specified duration - 16 sec
3. STFT window size = 512 


In [75]:
# file = audio_files[-10]
# signal, sample_rate = librosa.load(file, mono=True)
# print(signal.shape)
# signal_trimmed, _ =  librosa.effects.trim(signal, top_db=20)
# print(signal_trimmed.shape)
# signal_cut = signal_trimmed[:SIGNAL_LEN]
# print(signal_cut.shape)
# signal_padded = librosa.util.fix_length(signal_trimmed, size=363262)
# print(signal_padded.shape)
# ipd.Audio(signal_padded, rate=SAMPLE_RATE)

In [82]:
N_FFT = 512

SAMPLE_RATE = 22050
# every clip is cut or zero-padded to this duration in sec
DURATION = 16
SIGNAL_LEN = SAMPLE_RATE * DURATION

# Parallel per-file lists: "mapping" holds the language name of each file,
# "labels" its integer id, "stft" its log-magnitude spectrogram.
data = {
    "mapping": [],
    "labels": [],
    "stft": []
}

# sorted() keeps the sample order reproducible; glob order depends on the filesystem
audio_files = sorted(glob(os.path.join(DATASET_PATH, "*", "*.wav")))

# instantiate the progress bar
progress = IntProgress(min=0, max=len(audio_files))
display(progress)  # display the bar

for file in audio_files:
    # the language is the name of the folder containing the file; os.path keeps
    # this independent of the path separator and of how deep DATASET_PATH is
    lang = os.path.basename(os.path.dirname(file))
    label = lang_mapping[lang]

    # load at an explicit sample rate so SIGNAL_LEN really corresponds to DURATION
    signal, _ = librosa.load(file, sr=SAMPLE_RATE, mono=True)

    # Trim leading and trailing silence from an audio signal
    signal_trimmed, _ = librosa.effects.trim(signal, top_db=20)

    # fix_length truncates longer signals and zero-pads shorter ones, so every
    # clip ends up with exactly SIGNAL_LEN samples without a separate branch
    signal_fin = librosa.util.fix_length(signal_trimmed, size=SIGNAL_LEN)

    # 1. STFT
    stft = librosa.stft(signal_fin, n_fft=N_FFT)
    spectogram = np.abs(stft)
    log_spectogram = librosa.amplitude_to_db(spectogram)
    # alternative to try: librosa.amplitude_to_db(spectogram, ref=np.max)

    # append all three entries together, only after the file was processed, so
    # the lists cannot get out of step if loading a file fails.
    # convert to list to serialize with json (json can't work with ndarray)
    data["mapping"].append(lang)
    data["labels"].append(label)
    data["stft"].append(log_spectogram.tolist())

    progress.value += 1

IntProgress(value=0, max=140)

In [83]:
JSON_PATH = "./data/stft_data_16sec.json"

# Mode "w" truncates an existing file, so it does not need to be deleted first.
# Written without indentation: pretty-printing puts every number of the nested
# spectrogram lists on its own indented line and inflates the file; json.load
# reads both forms identically.
with open(JSON_PATH, "w") as fp:
    json.dump(data, fp)

## MFCC v1: n_fft = 512

In [85]:
N_FFT = 512

SAMPLE_RATE = 22050
# every clip is cut or zero-padded to this duration in sec
DURATION = 16
SIGNAL_LEN = SAMPLE_RATE * DURATION
NUM_MFCC = 13

# Parallel per-file lists: "mapping" holds the language name of each file,
# "labels" its integer id, "mfcc" its MFCC matrix (transposed, see below).
data_mfcc = {
    "mapping": [],
    "labels": [],
    "mfcc": []
}

# sorted() keeps the sample order reproducible; glob order depends on the filesystem
audio_files = sorted(glob(os.path.join(DATASET_PATH, "*", "*.wav")))

# instantiate the progress bar
progress = IntProgress(min=0, max=len(audio_files))
display(progress)  # display the bar

for file in audio_files:
    # the language is the name of the folder containing the file; os.path keeps
    # this independent of the path separator and of how deep DATASET_PATH is
    lang = os.path.basename(os.path.dirname(file))
    label = lang_mapping[lang]

    # load at an explicit sample rate so SIGNAL_LEN really corresponds to DURATION
    signal, _ = librosa.load(file, sr=SAMPLE_RATE, mono=True)

    # Trim leading and trailing silence from an audio signal
    signal_trimmed, _ = librosa.effects.trim(signal, top_db=20)

    # fix_length truncates longer signals and zero-pads shorter ones, so every
    # clip ends up with exactly SIGNAL_LEN samples without a separate branch
    signal_fin = librosa.util.fix_length(signal_trimmed, size=SIGNAL_LEN)

    # 2. MFCCs
    mfcc = librosa.feature.mfcc(y=signal_fin, sr=SAMPLE_RATE, n_mfcc=NUM_MFCC, n_fft=N_FFT)
    # transpose before storing, as the original cell did
    mfcc = mfcc.T

    # append all three entries together, only after the file was processed, so
    # the lists cannot get out of step if loading a file fails
    data_mfcc["mapping"].append(lang)
    data_mfcc["labels"].append(label)
    data_mfcc["mfcc"].append(mfcc.tolist())

    progress.value += 1

IntProgress(value=0, max=140)

In [86]:
MFCC_JSON_PATH = "./data/mfcc_data_16sec.json"

# Mode "w" truncates an existing file, so it does not need to be deleted first.
# Written without indentation: pretty-printing puts every number of the nested
# MFCC lists on its own indented line and inflates the file; json.load reads
# both forms identically.
with open(MFCC_JSON_PATH, "w") as fp:
    json.dump(data_mfcc, fp)

## MFCC v2: n_fft = 255; hop_length = 128

Tensorflow tutorial on keywords classification uses: <br>
frame_length=255, frame_step=128 (aka n_fft and hop_length?)

In [90]:
N_FFT = 255
HOP_LENGTH = 128
# NOTE(review): n_fft=255 together with librosa's default mel filterbank may be
# what triggers the "Empty filters detected in mel frequency basis" warning
# recorded at the end of this notebook — confirm.

SAMPLE_RATE = 22050
# every clip is cut or zero-padded to this duration in sec
DURATION = 16
SIGNAL_LEN = SAMPLE_RATE * DURATION
NUM_MFCC = 11

# Parallel per-file lists: "mapping" holds the language name of each file,
# "labels" its integer id, "mfcc" its MFCC matrix (transposed, see below).
data_mfcc_v2 = {
    "mapping": [],
    "labels": [],
    "mfcc": []
}

# sorted() keeps the sample order reproducible; glob order depends on the filesystem
audio_files = sorted(glob(os.path.join(DATASET_PATH, "*", "*.wav")))

# instantiate the progress bar
progress = IntProgress(min=0, max=len(audio_files))
display(progress)  # display the bar

for file in audio_files:
    # the language is the name of the folder containing the file; os.path keeps
    # this independent of the path separator and of how deep DATASET_PATH is
    lang = os.path.basename(os.path.dirname(file))
    label = lang_mapping[lang]

    # load at an explicit sample rate so SIGNAL_LEN really corresponds to DURATION
    signal, _ = librosa.load(file, sr=SAMPLE_RATE, mono=True)

    # Trim leading and trailing silence from an audio signal
    signal_trimmed, _ = librosa.effects.trim(signal, top_db=20)

    # fix_length truncates longer signals and zero-pads shorter ones, so every
    # clip ends up with exactly SIGNAL_LEN samples without a separate branch
    signal_fin = librosa.util.fix_length(signal_trimmed, size=SIGNAL_LEN)

    # 2. MFCCs
    mfcc = librosa.feature.mfcc(
        y=signal_fin,
        sr=SAMPLE_RATE,
        n_mfcc=NUM_MFCC,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
    )
    # transpose before storing, as the original cell did
    mfcc = mfcc.T

    # append all three entries together, only after the file was processed, so
    # the lists cannot get out of step if loading a file fails
    data_mfcc_v2["mapping"].append(lang)
    data_mfcc_v2["labels"].append(label)
    data_mfcc_v2["mfcc"].append(mfcc.tolist())

    progress.value += 1

IntProgress(value=0, max=140)

In [91]:
MFCC_JSON_PATH = "./data/mfcc_data_16sec_nfft255_nmfcc11.json"

# Mode "w" truncates an existing file, so it does not need to be deleted first.
# Written without indentation: pretty-printing puts every number of the nested
# MFCC lists on its own indented line and inflates the file; json.load reads
# both forms identically.
with open(MFCC_JSON_PATH, "w") as fp:
    json.dump(data_mfcc_v2, fp)

# Learning from errors:
Had a problem with uneven array sizes and np.VisibleDeprecationWarning: after trimming, one audio file was shorter than 16 sec, which resulted in a ragged array that couldn't be loaded properly as an np.array. <br>
**Solution**: pad with 0s using librosa.util.fix_length<br>
Problem desc: <br>
"I faced the np.VisibleDeprecationWarning while stacking lists containing audio data from WAV files. This problem occurred because audio files had different lengths. So, the lists that I needed to stack together into one numpy array also had varying lengths. Ignoring or suppressing this warning did not give the desired stacked np array so, I made all the audio files of the same length by using pydub.AudioSegment as mentioned in this answer."
https://stackoverflow.com/questions/63097829/debugging-numpy-visibledeprecationwarning-ndarray-from-ragged-nested-sequences


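UserWarning: Empty filters detected in mel frequency basis. Some channels will produce empty responses. Try increasing your sampling rate (and fmax) or reducing n_mels.<br>
Likely cause here (to confirm): the MFCC v2 cell uses n_fft=255, which leaves only 128 frequency bins for librosa's default 128 mel bands.<br>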
https://stackoverflow.com/questions/56929874/what-is-the-warning-empty-filters-detected-in-mel-frequency-basis-about
