In [4]:
import os
import glob
import multiprocessing
from pydub import AudioSegment
import functools
from tqdm import tqdm
import warnings
warnings.filterwarnings(action='ignore') 

In [5]:
def get_all_file_path(input_dir, file_extension):
    """Recursively list every file under ``input_dir`` with the given extension.

    Args:
        input_dir: Root directory to search; all nested sub-directories are
            searched as well.
        file_extension: Extension to match, without the leading dot
            (e.g. ``'wav'``).

    Returns:
        A list of matching path strings, in whatever order ``glob`` yields them.
    """
    # '**' combined with recursive=True matches zero or more directory levels.
    file_pattern = '*.{ext}'.format(ext=file_extension)
    search_pattern = os.path.join(input_dir, '**', file_pattern)
    return glob.glob(search_pattern, recursive=True)

def detect_leading_silence(sound, silence_threshold=-50.0, chunk_size=10):
    '''
    Return the number of milliseconds of silence at the start of ``sound``.

    sound is a pydub.AudioSegment
    silence_threshold in dB
    chunk_size in ms
    iterate over chunks until you find the first one with sound

    The result is a multiple of chunk_size, clamped to len(sound): a segment
    that is silent all the way through reports exactly its own length instead
    of overshooting to the next chunk boundary.
    '''
    assert chunk_size > 0 # to avoid infinite loop
    total_ms = len(sound)
    trim_ms = 0 # ms
    # Test the bound first so we never slice (and measure) a chunk that lies
    # past the end of the audio.
    while trim_ms < total_ms and sound[trim_ms:trim_ms+chunk_size].dBFS < silence_threshold:
        trim_ms += chunk_size
    # If no audible chunk was found, the silence spans the whole segment.
    return min(trim_ms, total_ms)

def preprocess(data, input_dir, dest_dir, silence_threshold=-50.0, chunk_size=10):
    """Trim leading and trailing silence from one WAV file and save the result.

    The output is written under ``dest_dir`` at the same relative location the
    source file has under ``input_dir``; missing directories are created. An
    existing output file is overwritten.

    Args:
        data: Path of the source WAV file; expected to live under ``input_dir``.
        input_dir: Root directory of the source dataset.
        dest_dir: Root directory that receives the trimmed files.
        silence_threshold: Level in dB below which a chunk counts as silence.
        chunk_size: Step in ms used when scanning for silence.

    Returns:
        0 once the trimmed file has been exported.
    """
    # Mirror the source layout relative to input_dir rather than relying on a
    # fixed number of leading path components or on '/' as the separator.
    relative_path = os.path.relpath(data, input_dir)
    output_file_path = os.path.join(dest_dir, relative_path)
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    sound = AudioSegment.from_file(data, format="wav")
    # Forward the caller's settings; scanning the reversed audio measures the
    # silence at the end of the clip.
    start_trim = detect_leading_silence(sound, silence_threshold, chunk_size)
    end_trim = detect_leading_silence(sound.reverse(), silence_threshold, chunk_size)
    duration = len(sound)
    # For an all-silent clip the two trims overlap; never let the end fall
    # before the start (a negative end would be read as a negative index).
    end_ms = max(start_trim, duration - end_trim)
    trimmed_sound = sound[start_trim:end_ms]
    trimmed_sound.export(output_file_path, format="wav")
    return 0
    
def parallel_preprocess(input_dir, dest_dir, silence_threshold=-50.0, chunk_size=10, parallel=None):
    """Run ``preprocess`` over every WAV file under ``input_dir`` in a process pool.

    Args:
        input_dir: Root directory searched recursively for ``.wav`` files.
        dest_dir: Root directory passed through to ``preprocess``.
        silence_threshold: Passed through to ``preprocess``.
        chunk_size: Passed through to ``preprocess``.
        parallel: Number of worker processes handed to ``multiprocessing.Pool``;
            ``None`` lets the pool choose its default.

    Returns:
        None. A tqdm progress bar is shown while files are processed.
    """
    wav_paths = get_all_file_path(input_dir, 'wav')
    # Bind everything except the file path so the pool only has to map paths.
    worker = functools.partial(
        preprocess,
        input_dir=input_dir,
        dest_dir=dest_dir,
        silence_threshold=silence_threshold,
        chunk_size=chunk_size,
    )
    with multiprocessing.Pool(parallel) as pool:
        # imap is lazy: iterate to completion so every file is processed (and
        # any worker exception is re-raised here) before the pool is torn down.
        for _ in tqdm(pool.imap(worker, wav_paths), total=len(wav_paths)):
            pass

In [6]:
# Trim silence from every WAV under ./datasets_resample into ./datasets_trim,
# using one worker process per CPU core.
parallel_preprocess(
    input_dir='./datasets_resample',
    dest_dir='./datasets_trim',
    silence_threshold=-50.0,
    chunk_size=10,
    parallel=multiprocessing.cpu_count(),
)

100%|██████████| 2452/2452 [00:00<00:00, 3403.04it/s]
