In [7]:
import os
from pathlib import Path
from typing import List

from pydub import AudioSegment

In [8]:
# # Change the working directory to the parent folder for imports and enable autoreload
# os.chdir('..')
# %load_ext autoreload
# %autoreload 2

In [9]:
class AudioSplitter:
    """Split mp3 audio files into fixed-length chunks and save them to disk.

    The class is used purely as a namespace: every method is a classmethod
    or staticmethod, so it never needs to be instantiated.
    """

    @classmethod
    def split_audios_from_folder(cls, audio_folder_dir: Path):
        """Split every file directly inside ``audio_folder_dir`` into chunks.

        1. Collect the regular files in the folder (sub-folders are skipped).
        2. Create the sibling output folder (see ``create_split_folder_dir``).
        3. Split each file and save its chunks into the output folder.

        Args:
            audio_folder_dir: Folder holding the audio files (``Path`` or str).

        Raises:
            FileNotFoundError / NotADirectoryError: If ``audio_folder_dir``
                cannot be listed. Nothing is created on disk in that case.
        """
        audio_folder_dir = Path(audio_folder_dir)

        # List the inputs first so an invalid input folder fails before an
        # empty output folder is created next to it. Sorting makes the
        # processing order deterministic across platforms.
        audio_file_dir_list = sorted(
            entry for entry in audio_folder_dir.iterdir() if entry.is_file()
        )

        split_folder_dir = cls.create_split_folder_dir(audio_folder_dir)

        for audio_file_dir in audio_file_dir_list:
            cls.split_and_save(audio_file_dir, split_folder_dir)

    @classmethod
    def create_split_folder_dir(cls, audio_folder_dir: Path, suffix="_split"):
        """Create a folder next to ``audio_folder_dir`` named with ``suffix``.

        For example ``data/audios`` yields ``data/audios_split``. An already
        existing folder is reused.

        Args:
            audio_folder_dir: The input folder (``Path`` or str).
            suffix: Text appended to the input folder's name.

        Returns:
            ``Path`` of the (possibly newly created) output folder.
        """
        # Accept plain strings too; the original required a Path for .parent.
        audio_folder_dir = Path(audio_folder_dir)
        split_folder_dir = audio_folder_dir.parent / (audio_folder_dir.name + suffix)

        split_folder_dir.mkdir(parents=True, exist_ok=True)
        return split_folder_dir

    @classmethod
    def split_and_save(cls, audio_file_dir, split_folder_dir, format="mp3"):
        """Split one audio file and export each chunk into ``split_folder_dir``.

        Chunks are named ``<original stem>_<index><original extension>``; the
        extension is copied from the source file and is not derived from
        ``format``.

        Args:
            audio_file_dir: Path of the audio file to split.
            split_folder_dir: Folder that receives the chunk files; created
                if it does not exist yet.
            format: Encoding passed to the chunk's ``export`` call.
        """
        audio_chunk_list = cls.split_audio(audio_file_dir)

        file_name, extension = os.path.splitext(os.path.basename(audio_file_dir))

        # Make the method usable on its own, not only via
        # split_audios_from_folder (which pre-creates the folder).
        os.makedirs(split_folder_dir, exist_ok=True)

        for i, audio in enumerate(audio_chunk_list):
            # os.path.join returns a str, which is what export receives.
            file_dir = os.path.join(split_folder_dir, f"{file_name}_{i}{extension}")
            audio.export(file_dir, format=format)

    @classmethod
    def split_audio(
        cls, audio_file_dir: Path, interval_s=3.0, overlap_s=0.0
    ) -> List["AudioSegment"]:
        """Split a single mp3 file into chunks of ``interval_s`` seconds.

        Consecutive chunks share ``overlap_s`` seconds, i.e. each chunk starts
        ``interval_s - overlap_s`` seconds after the previous one. A trailing
        piece shorter than ``interval_s`` is dropped.

        Args:
            audio_file_dir: Path of the mp3 file.
            interval_s: Length of every chunk, in seconds.
            overlap_s: Overlap between consecutive chunks, in seconds.

        Returns:
            List of audio chunks, in chronological order.

        Raises:
            ValueError: If ``interval_s`` is not positive or ``overlap_s`` is
                not within ``[0, interval_s)``.
        """
        audio = cls.read_audio(audio_file_dir)
        # len(audio) is treated as the duration in milliseconds.
        bounds = cls._chunk_bounds_ms(len(audio), interval_s, overlap_s)
        return [audio[start_ms:end_ms] for start_ms, end_ms in bounds]

    @staticmethod
    def _chunk_bounds_ms(total_ms: int, interval_s: float, overlap_s: float) -> List[tuple]:
        """Return ``(start_ms, end_ms)`` for every full chunk of the audio."""
        # Work in integer milliseconds: float floor-division can lose a whole
        # chunk (e.g. 0.9 // 0.3 == 2.0).
        interval_ms = int(round(interval_s * 1000))
        overlap_ms = int(round(overlap_s * 1000))

        if interval_ms <= 0:
            raise ValueError(f"interval_s must be positive, got {interval_s}")
        if not 0 <= overlap_ms < interval_ms:
            raise ValueError(
                f"overlap_s must satisfy 0 <= overlap_s < interval_s, got {overlap_s}"
            )

        stride_ms = interval_ms - overlap_ms
        # The last valid start is the one whose chunk still fits entirely.
        last_start_exclusive = total_ms - interval_ms + 1
        return [
            (start_ms, start_ms + interval_ms)
            for start_ms in range(0, last_start_exclusive, stride_ms)
        ]

    @classmethod
    def read_audio(cls, audio_file_dir) -> "AudioSegment":
        """Read a single mp3 file and return it as an ``AudioSegment``.

        The returned object can also be used for playback in Jupyter.
        """
        # Hand the library a plain string path regardless of the input type.
        return AudioSegment.from_mp3(os.fspath(audio_file_dir))

In [10]:
# Input folder handed to AudioSplitter; the path is relative to the current working directory.
test_folder_dir = Path(r"../data/test_audios")

In [11]:
# Split every file in the folder; chunks are written to a sibling "<folder name>_split" directory.
AudioSplitter.split_audios_from_folder(test_folder_dir)