In [53]:
import datetime
import json
import os
from glob import glob

import librosa
import numpy as np
import pywt
from ipywidgets import IntProgress

# Notes

## trimming
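Trimming at 20 dB to remove leading and trailing silence. Note that librosa's `top_db` is measured relative to the loudest part of the clip (anything more than 20 dB below the peak counts as silence), so the absolute levels below are only a rough intuition for how quiet 20 dB is:<br>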
10 dB: Normal breathing<br>
20 dB: Whispering from five feet away<br>
30 dB: Whispering nearby<br>
40 dB: Quiet library sounds

to play the trimmed audio:<br>
signal_trimmed, _ =  librosa.effects.trim(signal, top_db=20)<br>
ipd.Audio(data=signal_trimmed, rate=sample_rate)<br>

## amplitude_to_db
https://stackoverflow.com/questions/63347977/what-is-the-conceptual-purpose-of-librosa-amplitude-to-db

The range of perceivable sound pressure is very wide, from around 20 μPa (micro Pascal) to 20 Pa, a ratio of 1 million. Furthermore the human perception of sound levels is not linear, but better approximated by a logarithm.

By converting to decibels (dB) the scale becomes logarithmic. This limits the numerical range, to something like 0-120 dB instead. The intensity of colors when this is plotted corresponds more closely to what we hear than if one used a linear scale.

Note that the reference (0 dB) point in decibels can be chosen freely. librosa.amplitude_to_db uses ref=1.0 by default; when it is called with ref=np.max, the max value of the input is mapped to 0 dB and all other values are negative. The function also applies a threshold on the range of sounds, by default 80 dB (top_db=80.0), so anything more than 80 dB below the peak is clipped to that floor (i.e. to -80 dB when ref=np.max).

### Another parameter to try in amplitude_to_db: add ref=np.max (also used in the librosa library examples):
from Rob Mulla (2022): "Audio Data Processing in Python" (YouTube Tutorial)<br>
log_spectogram = librosa.amplitude_to_db(spectogram, ref=np.max)


## STFT window size
https://librosa.org/doc/latest/generated/librosa.stft.html#librosa.stft

n_fft = length of the windowed signal after padding with zeros. The number of rows in the STFT matrix D is (1 + n_fft/2). **The default value, n_fft=2048 samples**, corresponds to a physical duration of 93 milliseconds at a sample rate of 22050 Hz, i.e. the default sample rate in librosa. This value is well adapted for **music signals**. However, in **speech processing, the recommended value is 512**, corresponding to 23 milliseconds at a sample rate of 22050 Hz. In any case, we recommend setting n_fft to a power of two for optimizing the speed of the fast Fourier transform (FFT) algorithm.

## Preprocessing decisions:

1. Trim leading and trailing silence from an audio signal (anything more than 20 dB below the signal's peak is considered silence)
2. Cut signal to a specified duration or zero-pad if shorter
3. STFT/MFCC window size = 512 

In [72]:
def extract_features(method, process_info, dataset_path, duration, sample_rate=22050, n_fft=512, num_mfcc=13):
    """Build a feature dataset from every ``<dataset_path>/<language>/*.wav`` file.

    Each clip is loaded as mono at ``sample_rate``, trimmed of leading/trailing
    silence (``top_db=20``), then cut or zero-padded to exactly ``duration``
    seconds so that all feature matrices share one shape.

    Args:
        method: ``"stft"`` stores a dB-scaled magnitude spectrogram per clip;
            ``"mfcc"`` stores the transposed MFCC matrix per clip. Any other
            value (including ``"cwt"``, which is not implemented yet) still
            fills ``mapping`` and ``labels`` but stores no features.
        process_info: Metadata stored unchanged under ``"process_info"``.
        dataset_path: Root directory containing one sub-directory per language.
        duration: Target clip length in seconds.
        sample_rate: Sampling rate the audio is resampled to on load.
        n_fft: FFT window size used for both STFT and MFCC.
        num_mfcc: Number of MFCC coefficients (``method="mfcc"`` only).

    Returns:
        dict with keys ``process_info``, ``mapping`` (language name per clip),
        ``labels`` (integer id per clip) and the feature lists ``stft``,
        ``mfcc`` and ``cwt``. Features are nested Python lists so the result
        can be passed straight to ``json.dump``.
    """
    data = {
        "process_info": process_info,
        "mapping": [],
        "labels": [],
        "stft": [],
        "mfcc": [],
        "cwt": []
    }

    # Only sub-directories are language classes. Filtering on isdir skips stray
    # files such as .DS_Store without deleting anything, and sorting makes the
    # language -> label assignment reproducible (os.listdir order is arbitrary).
    languages = sorted(
        entry for entry in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, entry))
    )
    lang_mapping = {lang: idx for idx, lang in enumerate(languages)}

    audio_files = sorted(glob(os.path.join(dataset_path, "*", "*.wav")))

    # Number of samples every clip is forced to; int() keeps this usable as a
    # length even when `duration` is given as a float.
    signal_len = int(sample_rate * duration)

    # instantiate and display the progress bar
    progress = IntProgress(min=0, max=len(audio_files))
    display(progress)

    for file in audio_files:
        progress.value += 1

        # The language is the name of the directory that holds the file.
        lang = os.path.basename(os.path.dirname(file))
        data["mapping"].append(lang)
        data["labels"].append(lang_mapping[lang])

        # Pass sr explicitly: without it librosa resamples to its own default
        # (22050 Hz) and the requested sample_rate would be silently ignored.
        signal, _ = librosa.load(file, sr=sample_rate, mono=True)

        # Trim leading and trailing silence from the audio signal
        signal_trimmed, _ = librosa.effects.trim(signal, top_db=20)

        # fix_length truncates longer signals and zero-pads shorter ones, so a
        # single call covers both cases.
        signal_fin = librosa.util.fix_length(signal_trimmed, size=signal_len)

        if method == "stft":
            stft = librosa.stft(signal_fin, n_fft=n_fft)
            spectogram = np.abs(stft)
            log_spectogram = librosa.amplitude_to_db(spectogram)

            # convert to list to serialize with json (json can't work with ndarray)
            data["stft"].append(log_spectogram.tolist())
        elif method == "mfcc":
            mfcc = librosa.feature.mfcc(y=signal_fin, sr=sample_rate, n_mfcc=num_mfcc, n_fft=n_fft)
            data["mfcc"].append(mfcc.T.tolist())

    return data

# 1. STFT

In [74]:
# --- STFT run configuration ---
METHOD = "stft"
DATASET_PATH = "./data/filtered"

SAMPLE_RATE = 16000
DURATION = 23  # seconds
N_FFT = 512

# Stored with the extracted features so the output file documents
# how it was produced.
PROCESS_INFO = {
    "method": METHOD,
    "duration": DURATION,
    "n_fft": N_FFT,
    "sample_rate": SAMPLE_RATE,
    "trim": "True",
    "zero-pad": "True",
}

data = extract_features(
    method=METHOD,
    process_info=PROCESS_INFO,
    dataset_path=DATASET_PATH,
    duration=DURATION,
    sample_rate=SAMPLE_RATE,
    n_fft=N_FFT,
)

IntProgress(value=0, max=140)

In [75]:
# The notebook does `import datetime` (the module), so the class has to be
# reached as datetime.datetime; a bare datetime.now() raises AttributeError.
date = datetime.datetime.now().strftime('%Y-%m-%d')
JSON_PATH = f"./data/{METHOD}_{date}.json"

# Write the extracted features to a date-stamped JSON file.
with open(JSON_PATH, "w") as fp:
    json.dump(data, fp, indent=4)

# 2. MFCC

In [76]:
# --- MFCC run configuration ---
METHOD = "mfcc"
DATASET_PATH = "./data/filtered"

SAMPLE_RATE = 22050
DURATION = 23  # seconds
N_FFT = 512
NUM_MFCC = 13

# Stored with the extracted features so the output file documents
# how it was produced.
PROCESS_INFO = {
    "method": METHOD,
    "duration": DURATION,
    "n_fft": N_FFT,
    "sample_rate": SAMPLE_RATE,
    "trim": "True",
    "zero-pad": "True",
}

data = extract_features(
    method=METHOD,
    process_info=PROCESS_INFO,
    dataset_path=DATASET_PATH,
    duration=DURATION,
    sample_rate=SAMPLE_RATE,
    n_fft=N_FFT,
    num_mfcc=NUM_MFCC,
)

IntProgress(value=0, max=140)

In [77]:
# The notebook does `import datetime` (the module), so the class has to be
# reached as datetime.datetime; a bare datetime.now() raises AttributeError.
date = datetime.datetime.now().strftime('%Y-%m-%d')
JSON_PATH = f"./data/{METHOD}_{date}.json"

# Write the extracted features to a date-stamped JSON file.
with open(JSON_PATH, "w") as fp:
    json.dump(data, fp, indent=4)

# 3. CWT

In [None]:
# --- CWT run configuration ---
METHOD = "cwt"
DATASET_PATH = "./data/filtered"

SAMPLE_RATE = 22050
DURATION = 23  # seconds

# NOTE: N_FFT is not assigned in this cell; the value used below is whatever
# an earlier cell left in the kernel.
PROCESS_INFO = {
    "method": METHOD,
    "duration": DURATION,
    "n_fft": N_FFT,
    "sample_rate": SAMPLE_RATE,
    "trim": "True",
    "zero-pad": "True",
}

# NOTE: the "cwt" branch of extract_features is still commented out, so this
# call fills the mapping/labels lists but leaves data["cwt"] empty.
data = extract_features(
    method=METHOD,
    process_info=PROCESS_INFO,
    dataset_path=DATASET_PATH,
    duration=DURATION,
    sample_rate=SAMPLE_RATE,
)


In [82]:
DATASET_PATH = "./data/filtered"
N_FFT = 512

SAMPLE_RATE = 22050
# will trim data to this duration in sec
DURATION = 16
SIGNAL_LEN = SAMPLE_RATE * DURATION
# Python booleans are capitalised (`true` is a NameError); json.dump still
# writes them out as JSON true/false.
TRAIN_INFO = {"duration": DURATION, "n_fft": N_FFT, "sample_rate": SAMPLE_RATE, "trim": True, "zero-pad": True}

# Language name -> integer label. Built here because no earlier cell defines
# lang_mapping at notebook scope. Only directories count as languages, and
# sorting keeps the label ids stable between runs.
languages = sorted(
    entry for entry in os.listdir(DATASET_PATH)
    if os.path.isdir(os.path.join(DATASET_PATH, entry))
)
lang_mapping = {lang: idx for idx, lang in enumerate(languages)}

# dictionary to store mapping, labels, and spectograms
data = {
    "mapping": [],
    "labels": [],
    "stft": [],
    "train_info": TRAIN_INFO
}

audio_files = sorted(glob(f"{DATASET_PATH}/*/*.wav"))

# instantiate the progress bar
f = IntProgress(min=0, max=len(audio_files))
display(f) # display the bar

# CWT settings do not change between files, so define them once.
scales = np.arange(1, 200, 5)
waveletname = 'morl'

for file in audio_files:
    f.value += 1
    # The language is the name of the directory that holds the file.
    lang = os.path.basename(os.path.dirname(file))
    data["mapping"].append(lang)
    data["labels"].append(lang_mapping[lang])

    # Load at the configured rate so SIGNAL_LEN really corresponds to DURATION seconds.
    signal, sample_rate = librosa.load(file, sr=SAMPLE_RATE, mono=True)

    # Trim leading and trailing silence from an audio signal
    signal_trimmed, _ = librosa.effects.trim(signal, top_db=20)

    # fix_length truncates longer signals and zero-pads shorter ones.
    signal_fin = librosa.util.fix_length(signal_trimmed, size=SIGNAL_LEN)

    # 3 CWT
    # NOTE(review): coeff/freq are overwritten on every iteration and never
    # stored in `data`, so data["stft"] stays empty -- confirm what this cell
    # is meant to persist before saving its output.
    coeff, freq = pywt.cwt(signal_fin, scales, waveletname, 1)
    

IntProgress(value=0, max=140)

In [None]:
def apply_cwt(dataset_size, signal_size, variables_num, dataset):
    """Turn each signal in ``dataset`` into a square CWT scaleogram.

    ``dataset`` is indexed as ``dataset[sample, :, variable]``. Every such
    1-D signal is transformed with the Morlet wavelet over the scales
    ``1 .. signal_size - 1``, and the first ``signal_size - 1`` columns of the
    coefficient matrix are kept, giving one square image per variable.

    Args:
        dataset_size: Number of samples to process (first axis of ``dataset``).
        signal_size: Upper bound of the scale range; the images have
            ``signal_size - 1`` rows and columns.
        variables_num: Number of variables, i.e. channels in the output image.
        dataset: Array holding the input signals.

    Returns:
        Array of shape
        ``(dataset_size, signal_size - 1, signal_size - 1, variables_num)``.
    """
    wavelet = 'morl'
    cwt_scales = range(1, signal_size)
    side = signal_size - 1

    # Every cell is written in the loops below, so no initial fill is needed.
    scaleograms = np.empty((dataset_size, side, side, variables_num))

    # instantiate and display the progress bar (one tick per sample)
    progress = IntProgress(min=0, max=dataset_size)
    display(progress)

    for sample_idx in range(dataset_size):
        progress.value += 1
        for var_idx in range(variables_num):
            # One row of coefficients per scale, e.g. (19, 20) for signal_size=20.
            coefficients, _ = pywt.cwt(dataset[sample_idx, :, var_idx], cwt_scales, wavelet, 1)

            # Drop the trailing column(s) so the image is square, then store it
            # as this variable's channel of the sample's multi-channel image.
            scaleograms[sample_idx, :, :, var_idx] = coefficients[:, :side]

    return scaleograms

In [83]:
JSON_PATH = "./data/stft_data_16sec.json"

# Remove any output left over from a previous run before writing a new file.
stale_output_exists = os.path.exists(JSON_PATH)
if stale_output_exists:
    os.remove(JSON_PATH)

# Serialise the current `data` dictionary as indented JSON.
with open(JSON_PATH, "w") as json_file:
    json.dump(data, json_file, indent=4)

# Learning from errors:
Had a problem with uneven array sizes and np.VisibleDeprecationWarning -> trimming the audio left one file shorter than 16 sec, which produced a ragged array that couldn't be properly converted to an np.array <br>
**Solution**: pad with 0s using librosa.util.fix_length<br>
Problem desc: <br>
"I faced the np.VisibleDeprecationWarning while stacking lists containing audio data from WAV files. This problem occurred because audio files had different lengths. So, the lists that I needed to stack together into one numpy array also had varying lengths. Ignoring or suppressing this warning did not give the desired stacked np array so, I made all the audio files of the same length by using pydub.AudioSegment as mentioned in this answer."
https://stackoverflow.com/questions/63097829/debugging-numpy-visibledeprecationwarning-ndarray-from-ragged-nested-sequences


UserWarning: Empty filters detected in mel frequency basis. Some channels will produce empty responses. Try increasing your sampling rate (and fmax) or reducing n_mels.
https://stackoverflow.com/questions/56929874/what-is-the-warning-empty-filters-detected-in-mel-frequency-basis-about
