In [1]:
#export
import librosa
import matplotlib.pyplot as plt
import torchaudio
import torch
import numpy as np
import librosa.display
import glob
import pydub
from pydub import AudioSegment

There are many preprocessing methods, some of which may have different relationships to network type.

In [5]:
#will

'''
Preprocessing Pipeline
1 - Load data
2 - Convert to correct sample rate using
3a - Detect silence on front
3b - Detect silence on back 
4a - Trim silence on front
4b - Trim silence on back
5 - Normalize mel spectrogram
6a - Pad silence to front
6b - Pad silence to back
'''


class Loader:
    """ 
    Load audio file and resample to desired sample rate
        sample_rate (int): desired sample rate
    """

    def __init__(self, sample_rate=22050):
        self.sample_rate = sample_rate

    def load(self, file_path):
        signal = AudioSegment.from_file(file_path)
        signal = signal.set_frame_rate(self.sample_rate)
        return signal


class Padder:
    """
    Apply padding to array
        mode (str): argument for numpy padding 
        pad_time (int): numebr of ms to pad to left and right
    """

    def __init__(self, sample_rate, mode="constant", pad_ms=50):
        self.pad_time = pad_ms / 1000  # Convert to seconds
        self.sample_rate = sample_rate
        self.mode = mode

    def pad(self, arr):
        '''
        Pads zeros (silence) to front and back of audio signal
            ms: number of millseconds to bad for (ms)
            sr: sample_rate (Hz)
        '''

        n_padding = self.get_pad_length()
        padded_arr = self.left_pad(arr, n_padding)
        padded_arr = self.right_pad(padded_arr, n_padding)

        return padded_arr

    def get_pad_length(self):
        '''
        Calculates number of elements to pad to array
            sr (int): sample rate
            t (int): Number of seconds to pad for
        '''
        return int(self.sample_rate * self.pad_time)

    def left_pad(self, arr, n_missing_items):
        # [1,2,3] -> 2 -> [0,0,1,2,3]
        padded_array = np.pad(arr,
                              (n_missing_items, 0),
                              mode=self.mode)

        return padded_array

    def right_pad(self, arr, n_missing_items):
        # [1,2,3] -> 2 -> [1,2,3,0,0]
        padded_array = np.pad(arr,
                              (0, n_missing_items),
                              mode=self.mode)
        return padded_array


class TrimSilence():
    def __init__(self, mode="constant", leading_silence_threshold=-33, trailing_silence_threshold=-40):
        self.mode = mode
        self.leading_silence_threshold = leading_silence_threshold
        self.trailing_silence_threshold = trailing_silence_threshold

    def detect_silence(self, signal):
        '''
        Calls function to detect silence. 
        Could be with pydub, librosa, or pytorch VAD.
        '''
        start_trim, end_trim = self.pydub_detect_silence(signal)
        return start_trim, end_trim

    def crop_silence(self, signal, start_trim, end_trim):
        duration = len(signal)
        return signal[start_trim:duration-end_trim]

    # Calculate and return indices to trim for
    # leading and trailing silences
    def pydub_detect_silence(self, signal):
        start_trim = pydub.silence.detect_leading_silence(
            signal, silence_threshold=self.leading_silence_threshold)
        end_trim = pydub.silence.detect_leading_silence(
            signal.reverse(), silence_threshold=self.trailing_silence_threshold)
        return start_trim, end_trim

    # Main for trimming class
    def trim(self, signal):
        start_trim, end_trim = self.detect_silence(signal)
        signal_trimmed = self.crop_silence(signal, start_trim, end_trim)
        return signal_trimmed

# class LogSpectrogramExtractor:
#     """
#     Extracts log spectrograms (in dB) from a time series signal
#     frame_size
#     hop_length
#     """

#     def __init__(self, frame_size, hop_length):
#         self.frame_size = frame_size
#         self.hop_length = hop_length

#     def extract(self, signal):
#         # Returns (1+frame_size / 2, num_frames)
#         # remove last index to keep even
#         # 1024 -> 513 -> 512
#         stft = librosa.stft(signal,
#                             n_fft=self.frame_size,
#                             hop_length=self.hop_length)[:-1]
#         spectrogram = np.abs(stft)
#         log_spectrogram = librosa.amplitude_to_db(spectrogram)
#         return log_spectrogram


class Normalizer:
    """ 
    Apply min max normalization to an array. 
    Min value of array set to self.min, max value to self.max
    """

    def __init__(self, min_val, max_val):
        self.min = min_val
        self.max = max_val

    def librosa_normalize(self, signal):
        return librosa.util.normalize(signal)

    def min_max_normalize(self, arr):
        # Convert to range [0,1]
        norm_arr = (arr - arr.min() / (arr.max() - arr.min()))

        # Convert range to [self.min, self.max]
        norm_arr = norm_arr * (self.max - self.min) + self.min
        return norm_arr

    def min_max_denormalize(self, norm_arr, original_min, original_max):
        arr = (norm_arr - self.min) / (self.max - self.min)
        arr = arr * (original_max - original_min) + original_min

    def normalize(self, signal):
        return self.librosa_normalize(signal)


class PreprocessingPipeline:
    """ 
    Processes audio files in directory, applying the 
    following steps to each file:
    1 - load a file
    2 - pad the signal
    3 - extract log spectrogram fro msignal
    4 - normalize spectrogram
    5 - save normalized spectrogram
    Store min/max values for all log spectrograms
    """

    def __init__(self, loader=None, trimmer=None, normalizer=None, padder=None):
        self.loader = loader
        self.trimmer = trimmer
        self.normalizer = normalizer
        self.padder = padder

    def pydub_to_np(self, pydub_signal):
        return np.array(pydub_signal.get_array_of_samples(), dtype=np.float64)

    def save(self, signal, path, sr):
        librosa.output.write_wav(path, signal, sr=sr)

    # Main preprocessing function
    def preprocess(self, file_path):
        # Loads in pydub signal and converts to specified sample rate
        original_signal = self.loader.load(file_path)

        # Detects and trims leading and trailing silence
        processed_signal = self.trimmer.trim(original_signal)
        # print(trimmed_signal)

        # Convert pydub to numpy to use librosa norm
        processed_signal = self.pydub_to_np(processed_signal)
        # print(trimmed_signal_np)

        if self.normalizer:
            # Perform min/max normalization
            processed_signal = self.normalizer.normalize(processed_signal)
            # print(normalized_signal)

        # Pad signal with silence
        processed_signal = self.padder.pad(processed_signal)
        # print(padded_signal)
        return self.pydub_to_np(original_signal), processed_signal

In [None]:
#mellotron
import torch
import numpy as np
from scipy.signal import get_window
import librosa.util as librosa_util


def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
                     n_fft=800, dtype=np.float32, norm=None):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.
    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.
    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`
    n_frames : int > 0
        The number of analysis frames
    hop_length : int > 0
        The number of samples to advance between frames
    win_length : [optional]
        The length of the window function.  By default, this matches `n_fft`.
    n_fft : int > 0
        The length of each analysis frame.
    dtype : np.dtype
        The data type of the output
    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm)**2
    win_sq = librosa_util.pad_center(win_sq, n_fft)

    # Fill the envelope
    for i in range(n_frames):
        sample = i * hop_length
        x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
    return x


def griffin_lim(magnitudes, stft_fn, n_iters=30):
    """
    PARAMS
    ------
    magnitudes: spectrogram magnitudes
    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
    """

    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
    angles = angles.astype(np.float32)
    angles = torch.autograd.Variable(torch.from_numpy(angles))
    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)

    for i in range(n_iters):
        _, angles = stft_fn.transform(signal)
        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    return signal


def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """
    PARAMS
    ------
    C: compression factor
    """
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression(x, C=1):
    """
    PARAMS
    ------
    C: compression factor used to compress
    """
    return torch.exp(x) / C

In [3]:
#cookie

import os
import numpy as np
import librosa
from glob import glob
from multiprocessing import Pool
from tqdm import tqdm

def even_split(a, n):
    """split array `a` into `n` seperate evenly sized chunks"""
    n = min(n, len(a)) # if less elements in array than chunks to output, change chunks to array length
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))


def multiprocess_directory(function, directory, regex="**/*.flac", threads=16):
    """
    Calls `function` on all files in directory that match 'regex'
    
    PARAMS:
        function: the function to be called by each thread
        directory: the directory to search for files
        regex: query for files. '**' indicates recursive search, '*' is a standard wildcard.
        threads: number of threads to spawn.
    RETURNS:
        None
    
    Note - Despite being called multiprocess, this is actually using multithreading.
    """
    from random import shuffle
    p = Pool(threads)
    file_paths = glob(os.path.join(directory, regex), recursive=True)
    shuffle(file_paths)
    split_file_paths = list(even_split(file_paths,threads))
    #with concurrent.futures.ProcessPoolExecutor() as executor:
    #    print(executor.map(function, split_file_paths))
    return p.map(function, split_file_paths)


def multiprocess_filearray(function, file_paths, threads=16):
    """
    Splits given filepaths into 'threads' number of even sized chunks, and calls function in a seperate thread on each one.
    
    PARAMS:
        function: the function to be called by each thread
        file_path: the files paths that will be split into chunks and fed into each thread
        threads: number of threads to spawn.
    RETURNS:
        None
    
    Note - Despite being called multiprocess, this is actually using multithreading.
    """
    p = Pool(threads)
    split_file_paths = list(even_split(file_paths,threads))
    #with concurrent.futures.ProcessPoolExecutor() as executor:
    #    print(executor.map(function, split_file_paths))
    return p.map(function, split_file_paths)


def normalize_volumes_mixmode(directory, amplitude=0.08, ext='.wav'):
    """
    Calls 'normalize-audio' for every subdirectory in directory.
    
    Will calcuate the mean RMS amplitude of all files in each subdirectory, average them, get difference from amplitude param, and apply a constant amplitude modifier to every audio file in the folder.
    This means audio files in the same folder will all get louder or quieter by the same around, but different folders will be adjusted by different amounts so each folder has roughly the same volume.
    
    PARAMS:
        directory: the root directory, where every directory under this root will be updated.
        amplitude: the target average amplitude for every subdirectory
        ext: the extension of the audio files to be analysed and updated.
    RETURNS:
        None
    """
    subdirectories = [x[0] for x in os.walk(directory)]
    for subdirectory in subdirectories:
        os.system(f"normalize-audio -w 16 -a {amplitude} -b '{subdirectory}/'*{ext}")


def process_audio_multiprocess(file_paths_arr,
        filt_type, filt_cutoff_freq, filt_order,
        out_path, 
        trim_margin_left, trim_margin_right, trim_top_db, trim_window_length, trim_hop_length, trim_ref, trim_preemphasis_strength,
        SAMPLE_RATE=48000, MIN_SAMPLE_RATE=15999, BIT_DEPTH=2,
        ignore_dirs=["Noise samples","_Noisy_","_Very Noisy_"], skip_existing=False,
        in_ext_=None, out_ext=".wav", use_tqdm=True, dump_sample_rates=True
    ):
    """
    Take an array of audio file paths. Apply processing and trimming and save the output.
    PARAMS:
        file_paths_arr: Array of audio paths, FLAC's or WAV's recommended.
        
        filt_type: options of 'hp','lp' which is a 'high-pass' and 'low-pass' filter respectively.
        filt_cutoff_freq: threshold frequency for the filter.
        filt_order: similar to the strength of the filter, also effects processing time.
        
        trim_margin_left: save samples to the left of silence.
        trim_margin_right: save samples to the right of silence.
        trim_top_db: decibelles under reference db that is considered silence.
        trim_window_length: number of samples to average over.
        trim_hop_length: number of samples to shift the window each time.
        trim_ref: reference db, typical functions are np.amax and np.mean.
        trim_preemphasis_strength: empthasis filter which can be used to make trimming more sensitive to higher frequencies.
                                   The empthasised audio is only used to identify trimming locations, the original audio
                                   will still be output.
        
        SAMPLE_RATE: the output sample rate of the processed audio files.
        MIN_SAMPLE_RATE: minimum sample rate for an audio file to be processed.
        BIT_DEPTH: doesn't do anything right now. At some point will be used to pick the bit-depth of output audio files.
        
        ignore_dirs: skip audio files where a str from ignore_dirs is found in the filepath.
        skip_existing: skip files that would overwrite an existing file.
        
        in_ext_: ...
        out_ext: the output extension of the audio files. Anything supported by soundfile should work however only FLAC and WAV have been tested by me.
        use_tqdm: add progress bar
        dump_sample_rates: return samples_rates.
    RETURNS:
        samples_rates: an dict of output files and their sample rates before being processed.
                       e.g: {
                                path 0: sample_rate 0,
                                path 1: sample_rate 1,
                                path 2: sample_rate 2, ...
                            }
    
    Note - filt params are zipped together so must be lists of the same length.
    Note - trim params are zipped together so must be lists of the same length.
    Note - This uses file_paths_arr as input because it is intended to be used in a multiprocessing environment where
                                    a host will split a directories audio files into chunks before calling this func.
    """
    import soundfile as sf
    import scipy
    from scipy import signal
    
    if dump_sample_rates:
        sample_rates = {} # array of dicts. e.g: [{path 0: sample_rate 0}, {path 1: sample_rate 1}, {path 2: sample_rate 2}, ...]
    
    skip = 0
    prev_sr = 0
    iterator = tqdm(file_paths_arr, smoothing=0.0) if use_tqdm else file_paths_arr
    for file_path in iterator: # recursive directory search
        in_ext = in_ext_ if (in_ext_ is not None) else os.path.splitext(os.path.split(file_path)[-1])[-1] # get ext from file_path or use override.
        #out_path = file_path.replace(in_ext,out_ext)
        if skip_existing and os.path.exists(out_path):
            continue
        if any([filter_dir in file_path for filter_dir in ignore_dirs]):
            continue
        
        # VCTK cleanup
        #if file_path.endswith(f"_mic1{in_ext}"):
        #    os.rename(file_path, file_path.replace(f"_mic1{in_ext}",in_ext))
        #if file_path.endswith(f"_mic2{in_ext}"):
        #    continue
        try:
            native_sound, native_SR = sf.read(file_path, always_2d=True)
        except RuntimeError as ex:
            print(f'"{os.path.split(file_path)[-1]}" failed to load and has been deleted.\nDELETED PATH: "{file_path}"')
            os.unlink(file_path)
            #raise RuntimeError(ex)
        native_sound = native_sound[:,0]# take first channel (either mono or left audio channel)
        native_sound = np.asfortranarray(native_sound).astype('float64') # and ensure the audio is contiguous
        
        if native_SR < MIN_SAMPLE_RATE: # skip any files with native_SR below the minimum
            continue
        if native_SR != SAMPLE_RATE: # ensure all audio is same Sample Rate
            try:
                sound = librosa.core.resample(native_sound, native_SR, SAMPLE_RATE)
            except ValueError as ex:
                print(ex, file_path, native_SR, len(native_sound), sep="\n")
                raise ValueError(ex)
        else:
            sound = native_sound
        
        if dump_sample_rates:
            sample_rates[os.path.abspath(out_path)] = native_SR
        
        # 24 bit -> 16 bit, 32 bit -> 16 bit
        if max(np.amax(native_sound), -np.amin(native_sound)) > (2**23): # if samples exceed values possible at 24 bit
            sound = (sound / 2**(31-15))#.astype('int16') # change bit depth from 32 bit to 16 bit
        elif max(np.amax(native_sound), -np.amin(native_sound)) > (2**15): # if samples exceed values possible at 16 bit
            sound = (sound / 2**(23-15))#.astype('int16') # change bit depth from 24 bit to 16 bit
        
        # apply audio filters
        for type_, freq_, order_ in zip(filt_type, filt_cutoff_freq, filt_order): # eg[ ['lp'], [40], [10] ] # i.e [type, freq, strength]
            sos = signal.butter(order_, freq_, type_, fs=SAMPLE_RATE, output='sos') # calcuate filter somethings
            sound = signal.sosfilt(sos, sound) # apply filter
        
        # apply audio trimming
        for i, (margin_left_, margin_right_, top_db_, window_length_, hop_length_, ref_, preemphasis_strength_) in enumerate(zip(trim_margin_left, trim_margin_right, trim_top_db, trim_window_length, trim_hop_length, trim_ref, trim_preemphasis_strength)):
            if preemphasis_strength_:
                sound_filt = librosa.effects.preemphasis(sound, coef=preemphasis_strength_)
                _, index = librosa.effects.trim(sound_filt, top_db=top_db_, frame_length=window_length_, hop_length=hop_length_, ref=ref_) # gonna be a little messed up for different sampling rates
            else:
                _, index = librosa.effects.trim(sound, top_db=top_db_, frame_length=window_length_, hop_length=hop_length_, ref=ref_) # gonna be a little messed up for different sampling rates
            try:
                sound = sound[int(max(index[0]-margin_left_, 0)):int(index[1]+margin_right_)]
            except TypeError:
                print(f'Slice Left:\n{max(index[0]-margin_left_, 0)}\nSlice Right:\n{index[1]+margin_right_}')
            assert len(sound), f"Audio trimmed to 0 length by pass {i+1}\nconfig = {[margin_left_, margin_right_, top_db_, window_length_, hop_length_, ref_]}\nFile_Path = '{file_path}'"
        
        # write updated audio to file
        if os.path.exists(out_path):
            os.unlink(out_path) # using unlink incase the out_path object is a symlink
        sf.write(out_path, sound, SAMPLE_RATE)
    
    if dump_sample_rates:
        return sample_rates

In [7]:
#tests
import IPython
import soundfile as sf
audio_file = "/mnt/disks/uberduck-experiments-v0/data/uberduck/eminem/wav/01.wav"
IPython.display.Audio(audio_file)

In [12]:
process_audio_multiprocess(["/mnt/disks/uberduck-experiments-v0/data/uberduck/eminem/wav/01.wav"],
        'hp', [20], [1],
        trim_margin_left = [10],
                           trim_margin_right = [10],
                           trim_top_db = [10],
                           trim_window_length = [10],
                           trim_hop_length = [10],
                           trim_ref= [10],
                           trim_preemphasis_strength= [10],
        SAMPLE_RATE=48000, 
                           MIN_SAMPLE_RATE=15999,
                           BIT_DEPTH=2,
        ignore_dirs=["Noise samples","_Noisy_","_Very Noisy_"],
                           skip_existing=False,
        in_ext_=None, 
                           out_ext=".wav",
                           use_tqdm=True, 
                           dump_sample_rates=True,
                           out_path = "/mnt/disks/uberduck-experiments-v0/fileout"
    )

  0%|                                                     | 0/1 [00:00<?, ?it/s]


TypeError: No format specified and unable to get format from file extension: '/mnt/disks/uberduck-experiments-v0/fileout'

In [50]:
file_paths_arr = ["/mnt/disks/uberduck-experiments-v0/data/uberduck/eminem/wav/01.wav"]
filt_type = 'hp'
filt_cutoff_freq = [1]
filt_order = [1]
trim_margin_left = [0]
trim_margin_right = [0]
trim_top_db = [0]
trim_window_length = [0]
trim_hop_length = [1]
trim_ref= [0]
trim_preemphasis_strength= [0]
SAMPLE_RATE=22050
MIN_SAMPLE_RATE=15999
BIT_DEPTH=16
ignore_dirs=["Noise samples","_Noisy_","_Very Noisy_"]
skip_existing=False
in_ext_=None
out_ext=".wav"
use_tqdm=True
dump_sample_rates=True
out_path = "/mnt/disks/uberduck-experiments-v0/"
use_tqdm = True

In [51]:
import soundfile as sf
import scipy
from scipy import signal

skip = 0
prev_sr = 0
#iterator = tqdm(file_paths_arr, smoothing=0.0) if use_tqdm else file_paths_arr
file_path =  "/mnt/disks/uberduck-experiments-v0/data/uberduck/eminem/wav/01.wav"
native_sound, native_SR = sf.read(file_path, always_2d=True)
native_sound = native_sound[:,0]# take first channel (either mono or left audio channel)
native_sound = np.asfortranarray(native_sound).astype('float64') # and ensure the audio is contiguous


sound = native_sound

# 24 bit -> 16 bit, 32 bit -> 16 bit
if max(np.amax(native_sound), -np.amin(native_sound)) > (2**23): # if samples exceed values possible at 24 bit
    sound = (sound / 2**(31-15))#.astype('int16') # change bit depth from 32 bit to 16 bit
elif max(np.amax(native_sound), -np.amin(native_sound)) > (2**15): # if samples exceed values possible at 16 bit
    sound = (sound / 2**(23-15))#.astype('int16') # change bit depth from 24 bit to 16 bit

# apply audio filters
for type_, freq_, order_ in zip(filt_type, filt_cutoff_freq, filt_order): # eg[ ['lp'], [40], [10] ] # i.e [type, freq, strength]
    sos = signal.butter(order_, freq_, type_, fs=SAMPLE_RATE, output='sos') # calcuate filter somethings
    sound = signal.sosfilt(sos, sound) # apply filter


i = 0
margin_left_ = trim_margin_left[0]
margin_right_ = trim_margin_right[0]
top_db_ = trim_top_db[0]
window_length_ = trim_window_length[0]
hop_length_ = trim_hop_length[0]
ref_ = trim_preemphasis_strength[0]
preemphasis_strength_  = True

sound_filt = librosa.effects.preemphasis(sound, coef=preemphasis_strength_)
_, index = librosa.effects.trim(sound_filt, top_db=top_db_, frame_length=window_length_, hop_length=hop_length_, ref=ref_) # gonna be a little messed up for different sampling rates


In [52]:
sound_filt.shape

(46502,)

In [53]:
#tests
import IPython
import soundfile as sf
IPython.display.Audio(sound_filt, rate= 22050)

In [6]:
SAMPLE_RATE = 22050

loader = Loader(sample_rate=SAMPLE_RATE)
padder = Padder(sample_rate=SAMPLE_RATE, pad_ms=50)
trimmer = TrimSilence(leading_silence_threshold=-33, trailing_silence_threshold=-36)
# normalizer = Normalizer(min_val=0,max_val=1)

pipeline = PreprocessingPipeline(loader=loader, 
                                   padder=padder, 
                                   trimmer=trimmer)
#In [14]:
input_folder = "/mnt/disks/uberduck-experiments-v0/data/uberduck/eminem/wav/*.wav"
output_folder = "/mnt/disks/uberduck-experiments-v0/data/test"

# Preprocess files in input folder and write to output folder


In [10]:

original, processed = pipeline.preprocess(audio_file)
#output_path = file_name + "_processed.wav"

NameError: name 'file_name' is not defined

In [12]:
IPython.display.Audio(processed, rate = 22050)

In [13]:
IPython.display.Audio(original, rate = 22050)

In [9]:
import librosa
import matplotlib.pyplot as plt
#import torchaudio
import torch
import numpy as np
import librosa.display
import glob
import pydub
from pydub import AudioSegment
