In [None]:
%matplotlib inline

# Split datasets into train, validation, and test

This notebook splits a dataset into train, validation, and test sets. Modify the train, validation, and test ratios as needed, and set the input and output directories to match your setup.

In [1]:
# -*- coding: utf-8 -*-
""" Split datasets into train, validation, and test

This module splits a dataset into train, validation, and test sets. Modify the
ratios of the train, validation, and test datasets as needed, and set the input
and output directories to match your setup.

################################################################################
# Author: Weikun Han <weikunhan@gmail.com>
# Create Date: 03/6/2018
# Update:
# Reference: https://github.com/jhetherly/EnglishSpeechUpsampler
################################################################################
"""

import os
import csv
import json
import numpy as np

def write_csv(filename, pairs):
    """Write rows to a CSV file, one row per element of ``pairs``.

    The file is created, or truncated if it already exists.

    Args:
        filename (str): Path of the CSV file to write.
        pairs (list): Rows to write; each row is a sequence of fields
            (the caller in this file passes ``[original_path, noise_path]``).

    """
    # newline='' hands line-ending control to the csv module. Without it,
    # text-mode newline translation turns the writer's '\r\n' into '\r\r\n'
    # on Windows, which shows up as a blank row after every record.
    with open(filename, 'w', newline='') as csvfile:
        csv.writer(csvfile).writerows(pairs)

if __name__ == '__main__':
    # Please modify the input path to locate your files
    DATASETS_ROOT_DIR = './datasets'
    OUTPUT_DIR = os.path.join(DATASETS_ROOT_DIR, 'final_dataset')

    # Ratios for the validation and test datasets. The train dataset receives
    # every remaining pair, so train_fraction is informational only: keep it
    # equal to 1 - validation_fraction - test_fraction.
    train_fraction = 0.6
    validation_fraction = 0.2
    test_fraction = 0.2

    # Reset the random generator so the shuffle below is repeatable
    np.random.seed(0)

    # Make sure the location to save the datasets exists
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    original_noise_pairs = []
    input_original_path = os.path.join(DATASETS_ROOT_DIR, 'TEDLIUM_5S')
    input_noise_path = os.path.join(DATASETS_ROOT_DIR,
                                    'TEDLIUM_noise_sample_5S')

    # os.listdir returns entries in arbitrary order; sort them so the seeded
    # shuffle yields the same split on every machine and filesystem.
    for filename in sorted(os.listdir(input_original_path)):
        input_original_filename = os.path.join(input_original_path,
                                               filename)

        # Skip anything that is not a regular file (e.g. subdirectories)
        # before trying to parse its name
        if not os.path.isfile(input_original_filename):
            continue

        # The noise filename is built from the first three '_'-separated
        # components; skip stray files whose names do not have them
        filename_component = filename.split('_')
        if len(filename_component) < 3:
            continue

        # Link the same filename in the noise path
        filename_noise = '_'.join([filename_component[0],
                                   filename_component[1],
                                   'noise_sample',
                                   filename_component[2]])
        input_noise_filename = os.path.join(input_noise_path, filename_noise)

        original_noise_pairs.append(
            [input_original_filename, input_noise_filename])

    # Shuffle the datasets (in place)
    np.random.shuffle(original_noise_pairs)
    datasets_size = len(original_noise_pairs)

    # Create indexes: [validation | test | train (remainder)]
    validation_start_index = 0
    validation_end_index = (validation_start_index +
                            int(datasets_size * validation_fraction))
    test_start_index = validation_end_index
    test_end_index = (test_start_index +
                      int(datasets_size * test_fraction))
    train_start_index = test_end_index

    # Save pairs into .csv
    validation_original_noise_pairs = original_noise_pairs[
        validation_start_index:validation_end_index]
    write_csv(os.path.join(OUTPUT_DIR, 'validation_files.csv'),
              validation_original_noise_pairs)
    test_original_noise_pairs = original_noise_pairs[
        test_start_index:test_end_index]
    write_csv(os.path.join(OUTPUT_DIR, 'test_files.csv'),
              test_original_noise_pairs)
    train_original_noise_pairs = original_noise_pairs[
        train_start_index:]
    # Write only the train slice: writing the full shuffled list here would
    # leak every validation and test pair into the training set.
    write_csv(os.path.join(OUTPUT_DIR, 'train_files.csv'),
              train_original_noise_pairs)