In [2]:
import librosa
import numpy as np


class AudioPreprocessor:
    '''Convert raw audio into normalized mel / linear magnitude spectrograms in decibels.

    Every signal-processing parameter is read from ``config``, which must expose
    the attributes used below: SAMPLE_RATE, N_FFT, HOP_LENGTH, WIN_LENGTH,
    N_MELS, FMIN, FMAX, PRE_EMPHASIS, REF_LEVEL_DB, MIN_LEVEL_DB and
    ZERO_THRESHOLD.
    '''

    def __init__(self, config):
        '''Store ``config``; any object (or class) exposing the attributes above works.'''
        self.config = config

    def normalize(self, spectrogram_in_db):
        '''Rescale a decibel spectrogram and clip it to ``[ZERO_THRESHOLD, 1]``.

        The rescaling is ``(S - REF_LEVEL_DB - MIN_LEVEL_DB) / -MIN_LEVEL_DB``,
        so ``REF_LEVEL_DB + MIN_LEVEL_DB`` maps to 0 and ``REF_LEVEL_DB`` maps
        to 1 before clipping. Note that the lower clipping bound is
        ``ZERO_THRESHOLD``, not exactly 0.
        '''
        normalized_spectrogram_in_db = (
            spectrogram_in_db - self.config.REF_LEVEL_DB - self.config.MIN_LEVEL_DB
        ) / -self.config.MIN_LEVEL_DB

        return np.clip(normalized_spectrogram_in_db, self.config.ZERO_THRESHOLD, 1)

    def magnitude_to_mel(self, magnitude):
        '''Convert a magnitude spectrogram to a mel spectrogram.

        The magnitude is handed to ``librosa.feature.melspectrogram`` as the
        precomputed spectrogram ``S`` together with the mel settings from the config.
        '''
        return librosa.feature.melspectrogram(
            S=magnitude,
            sr=self.config.SAMPLE_RATE,
            n_fft=self.config.N_FFT,
            n_mels=self.config.N_MELS,
            fmin=self.config.FMIN,
            fmax=self.config.FMAX,
        )

    def trim_audio(self, y):
        '''Return ``y`` trimmed by ``librosa.effects.trim`` with its default settings.'''
        # librosa returns (trimmed_signal, index_interval); only the signal is needed.
        y_trimmed, _ = librosa.effects.trim(y)
        return y_trimmed

    def amp_to_db(self, mel_spectrogram):
        '''Convert an amplitude spectrogram to decibels: ``20 * log10(amplitude)``.

        Amplitudes below ``ZERO_THRESHOLD`` are raised to it first so the
        logarithm never sees zero.
        '''
        return 20.0 * np.log10(
            np.maximum(self.config.ZERO_THRESHOLD, mel_spectrogram)
        )

    def audio_to_stft(self, audio):
        '''Generate the Short-Time Fourier Transform (STFT) of the audio time series.'''
        return librosa.stft(
            y=audio,
            n_fft=self.config.N_FFT,
            hop_length=self.config.HOP_LENGTH,
            win_length=self.config.WIN_LENGTH,
        )

    def apply_pre_emphasis(self, y):
        '''Apply the pre-emphasis filter ``out[n] = y[n] - PRE_EMPHASIS * y[n - 1]``.

        The first sample is passed through unchanged. An empty signal yields an
        empty array instead of raising ``IndexError``.
        '''
        y = np.asarray(y)
        if y.size == 0:
            # Nothing to filter; indexing y[0] below would fail.
            return y.copy()
        return np.append(y[0], y[1:] - self.config.PRE_EMPHASIS * y[:-1])

    def stft_to_magnitude(self, linear):
        '''Compute the magnitude spectrogram (element-wise absolute value) of an STFT.'''
        return np.abs(linear)

    def audio_to_mel_db(self, audio, trim=False):
        '''Convert an audio time series to a normalized log-mel spectrogram.

        Pipeline: (optional trim) -> STFT -> magnitude -> mel -> dB -> normalize.

        ``trim`` defaults to False, which matches the previous behaviour of this
        method. ``audio_to_magnitude_db`` trims by default, so pass the same
        ``trim`` value to both methods when their outputs must be computed from
        the same signal.
        '''
        if trim:
            audio = self.trim_audio(audio)
        stft = self.audio_to_stft(audio)
        magnitude = self.stft_to_magnitude(stft)
        mel = self.magnitude_to_mel(magnitude)
        mel_in_db = self.amp_to_db(mel)
        return self.normalize(mel_in_db)

    def audio_to_magnitude_db(self, audio, trim=True):
        '''Convert an audio time series to a normalized linear magnitude spectrogram in dB.

        Pipeline: (optional trim) -> STFT -> magnitude -> dB -> normalize.

        ``trim`` defaults to True, which matches the previous behaviour of this
        method. ``audio_to_mel_db`` does not trim by default, so pass the same
        ``trim`` value to both methods when their outputs must be computed from
        the same signal.
        '''
        if trim:
            audio = self.trim_audio(audio)
        stft = self.audio_to_stft(audio)
        magnitude_in_amp = self.stft_to_magnitude(stft)
        magnitude_in_db = self.amp_to_db(magnitude_in_amp)
        return self.normalize(magnitude_in_db)

In [3]:
class Text2SpeechAudioConfig:
    '''Constants consumed by AudioPreprocessor for spectrogram extraction.'''

    # Sampling
    SAMPLE_RATE = 16000

    # STFT framing: shift/length are multiplied by the sample rate and
    # truncated to whole samples (presumably durations in seconds).
    FRAME_SHIFT = 0.0125
    FRAME_LENGTH = 0.05
    HOP_LENGTH = int(FRAME_SHIFT * SAMPLE_RATE)
    WIN_LENGTH = int(FRAME_LENGTH * SAMPLE_RATE)
    N_FFT = 800

    # Mel filter bank
    N_MELS = 80
    FMIN = 55
    FMAX = 7600

    # Decibel conversion and normalization
    ZERO_THRESHOLD = 1e-5
    REF_LEVEL_DB = 20
    MIN_LEVEL_DB = -100

    # Signal shaping
    PRE_EMPHASIS = 0.97
    POWER = 1.2

In [4]:
# Output directory: the generation loop below writes one .npy file per audio clip here.
SAVE_DIR = "mel2mag/mels"

In [5]:
# Root directory that is walked recursively for .flac input files.
DATA_ROOT = "LibriSpeech/train-clean-100"

In [6]:
import os

# Gather the path of every .flac file found anywhere below DATA_ROOT.
audios = [
    os.path.join(folder, filename)
    for folder, _subdirs, filenames in os.walk(DATA_ROOT)
    for filename in filenames
    if filename.endswith(".flac")
]

In [7]:
# Sanity check: number of .flac files discovered under DATA_ROOT.
len(audios)

28539

In [8]:
# The config class itself (not an instance) is passed in; the preprocessor only
# reads its class-level attributes.
AUDIO_PROCESSOR = AudioPreprocessor(Text2SpeechAudioConfig)

In [9]:
from tqdm import tqdm

# np.save does not create missing directories, so make sure the target exists.
os.makedirs(SAVE_DIR, exist_ok=True)

for file in tqdm(audios, desc="Generate mel"):
    # Load at the sample rate the preprocessor is configured for, rather than
    # repeating the number here.
    audio, sr = librosa.load(file, sr=AUDIO_PROCESSOR.config.SAMPLE_RATE)
    mel = AUDIO_PROCESSOR.audio_to_mel_db(audio)
    # Name the output after the audio file's stem, e.g.
    # ".../103-1240-0000.flac" -> "103-1240-0000.npy"; os.path handles the
    # platform's path separator and the extension length.
    stem = os.path.splitext(os.path.basename(file))[0]
    np.save(os.path.join(SAVE_DIR, stem + ".npy"), mel)

Generate mel:   2%|▏         | 701/28539 [02:46<1:50:23,  4.20it/s]


KeyboardInterrupt: 

In [29]:
# List the feature files written to SAVE_DIR so far.
# NOTE(review): this displays every entry in the directory; consider showing a
# count or a short slice instead. The execution count of this cell is also out
# of sequence with the cells above, so confirm it still works on a fresh
# Restart & Run All.
os.listdir(SAVE_DIR)

['103-1240-0000.npy',
 '103-1240-0001.npy',
 '103-1240-0002.npy',
 '103-1240-0003.npy',
 '103-1240-0004.npy',
 '103-1240-0005.npy',
 '103-1240-0006.npy',
 '103-1240-0007.npy',
 '103-1240-0008.npy',
 '103-1240-0009.npy',
 '103-1240-0010.npy',
 '103-1240-0011.npy',
 '103-1240-0012.npy',
 '103-1240-0013.npy',
 '103-1240-0014.npy',
 '103-1240-0015.npy',
 '103-1240-0016.npy',
 '103-1240-0017.npy',
 '103-1240-0018.npy',
 '103-1240-0019.npy',
 '103-1240-0020.npy',
 '103-1240-0021.npy',
 '103-1240-0022.npy',
 '103-1240-0023.npy',
 '103-1240-0024.npy',
 '103-1240-0025.npy',
 '103-1240-0026.npy',
 '103-1240-0027.npy',
 '103-1240-0028.npy',
 '103-1240-0029.npy',
 '103-1240-0030.npy',
 '103-1240-0031.npy',
 '103-1240-0032.npy',
 '103-1240-0033.npy',
 '103-1240-0034.npy',
 '103-1240-0035.npy',
 '103-1240-0036.npy',
 '103-1240-0037.npy',
 '103-1240-0038.npy',
 '103-1240-0039.npy',
 '103-1240-0040.npy',
 '103-1240-0041.npy',
 '103-1240-0042.npy',
 '103-1240-0043.npy',
 '103-1240-0044.npy',
 '103-1240