# Split long audio into clips of a specified duration

This module splits long audio files into fixed-length clips. Set
`splice_duration` to the length you want for each clip, and set the input and
output directories to match your setup.


In [1]:
# -*- coding: utf-8 -*-
""" Split long audio into special duration

This module splits long audio files into fixed-length clips. Set splice_duration
to the length you want for each clip, and set the input and output directories
to match your setup.

################################################################################
# Author: Weikun Han <weikunhan@gmail.com>
# Create Date: 02/20/2018
# Update:
# Reference: https://github.com/jhetherly/EnglishSpeechUpsampler
################################################################################
"""

import os
import tqdm
import sox

# Root directory that holds the datasets; change it to point at your files
DATASETS_ROOT_DIR = './datasets'
OUTPUT_DIR = os.path.join(DATASETS_ROOT_DIR, 'TEDLIUM_5S')
NOISE_OUTPUT_DIR = os.path.join(DATASETS_ROOT_DIR, 'TEDLIUM_noise_sample_5S')

# Length of every output clip; change it to the duration you want
splice_duration = 5

# Dataset sub-folders located under DATASETS_ROOT_DIR
input_folder = ['TEDLIUM', 'TEDLIUM_noise_sample']

# Create each output directory that does not exist yet
for _save_dir in (OUTPUT_DIR, NOISE_OUTPUT_DIR):
    if not os.path.exists(_save_dir):
        os.makedirs(_save_dir)

print('Will send spliced audio to {}'.format(OUTPUT_DIR))
print('Will send noise spliced audio to {}'.format(NOISE_OUTPUT_DIR))

# Map every input sub-folder to the directory that receives its clips.
# A lookup table replaces the former `directory is '...'` checks: `is` tests
# object identity, not string equality, so those checks only succeeded when
# the interpreter happened to share the string objects.
output_dirs = {
    'TEDLIUM': OUTPUT_DIR,
    'TEDLIUM_noise_sample': NOISE_OUTPUT_DIR,
}

# Loop over all files within the TEDLIUM directory first and
# loop over all files within the TEDLIUM_noise_sample directory second
for directory in input_folder:

    # Fail loudly instead of writing clips to an unbound or stale output path
    if directory not in output_dirs:
        raise ValueError(
            'No output directory configured for {}'.format(directory))

    input_tmp_path = os.path.join(DATASETS_ROOT_DIR, directory)
    output_tmp_path = output_dirs[directory]

    for filename in os.listdir(input_tmp_path):
        input_filename = os.path.join(input_tmp_path, filename)

        # Skip anything that is not an existing regular file
        if not os.path.isfile(input_filename):
            continue

        filename_base = os.path.splitext(filename)[0]

        # This is the total audio track duration
        duration = sox.file_info.duration(input_filename)

        # Compute the number of whole clips that fit in the track; a shorter
        # remainder at the end of the track is not written out
        n_iterations = int(duration / splice_duration)

        # Compute the number of digits needed to name the new files, so the
        # begin/end offsets are zero-padded to the width of the total duration
        digits_number = len(str(int(duration)))

        # Create the number format, which depends on the number of digits
        format_number = '{{:0{}d}}'.format(digits_number)

        # Create the final file name format: <base>_<begin>-<end>.wav
        filename_template = '{{}}_{}-{}.wav'.format(format_number,
                                                    format_number)

        print('On file {}'.format(filename_base))

        for i in tqdm.trange(n_iterations):

            # Create a new transformer for every clip
            splice = sox.Transformer()
            begin = int(i * splice_duration)
            end = int(begin + splice_duration)
            output_filename = filename_template.format(filename_base,
                                                       begin,
                                                       end)
            output_filename = os.path.join(output_tmp_path, output_filename)

            # Keep only the [begin, end] window of the input and write it out
            splice.trim(begin, end)
            splice.build(input_filename, output_filename)

  5%|▌         | 5/100 [00:00<00:02, 46.11it/s]

Will send spliced audio to ./datasets/TEDLIUM_5S
Will send noise spliced audio to ./datasets/TEDLIUM_noise_sample_5S
On file 911Mothers_2010W


100%|██████████| 100/100 [00:02<00:00, 45.31it/s]
  2%|▏         | 5/204 [00:00<00:04, 45.54it/s]

On file AbrahamVerghese_2011G


100%|██████████| 204/204 [00:04<00:00, 47.63it/s]
  7%|▋         | 5/76 [00:00<00:01, 41.80it/s]

On file AaronOConnell_2011


100%|██████████| 76/76 [00:01<00:00, 48.55it/s]
  3%|▎         | 5/173 [00:00<00:03, 47.13it/s]

On file AaronHuey_2010X


100%|██████████| 173/173 [00:03<00:00, 45.29it/s]
  8%|▊         | 5/60 [00:00<00:01, 46.02it/s]

On file AbigailWashburn_2012U


100%|██████████| 60/60 [00:01<00:00, 49.00it/s]
  2%|▏         | 5/201 [00:00<00:04, 47.63it/s]

On file AaronKoblin_2011


100%|██████████| 201/201 [00:04<00:00, 46.03it/s]
  7%|▋         | 5/76 [00:00<00:01, 45.25it/s]

On file AaronOConnell_2011_noise_sample


100%|██████████| 76/76 [00:02<00:00, 37.91it/s]
  2%|▏         | 5/204 [00:00<00:04, 48.41it/s]

On file AbrahamVerghese_2011G_noise_sample


100%|██████████| 204/204 [00:05<00:00, 37.45it/s]
  7%|▋         | 4/60 [00:00<00:01, 39.96it/s]

On file AbigailWashburn_2012U_noise_sample


100%|██████████| 60/60 [00:01<00:00, 34.30it/s]
  3%|▎         | 5/173 [00:00<00:03, 47.74it/s]

On file AaronHuey_2010X_noise_sample


100%|██████████| 173/173 [00:05<00:00, 30.72it/s]
  0%|          | 1/201 [00:00<00:21,  9.37it/s]

On file AaronKoblin_2011_noise_sample


100%|██████████| 201/201 [00:04<00:00, 43.32it/s]
  5%|▌         | 5/100 [00:00<00:01, 48.03it/s]

On file 911Mothers_2010W_noise_sample


100%|██████████| 100/100 [00:02<00:00, 41.67it/s]
