# Convert audio into .wav file

This module converts the audio files in the original dataset into .wav files. Once you have the original audio, you can generate additional noisy audio if you like. The procedure for creating the noisy audio dataset is described on our website, and we provide some example noisy audio files on GitHub.

In [4]:
# -*- coding: utf-8 -*-
""" Convert audio into .wav file

This module converts the audio files in the original dataset into .wav files.
Once you have the original audio, you can generate additional noisy audio if
you like. The procedure for creating the noisy audio dataset is described on
our website, and we provide some example noisy audio files on GitHub.

################################################################################
# Author: Weikun Han <weikunhan@gmail.com>
# Create Date: 02/20/2018
# Update:
# Reference: https://github.com/jhetherly/EnglishSpeechUpsampler
################################################################################
"""

import os
import sox

# Please modify these paths to point at your own copy of the dataset
DATASETS_ROOT_DIR = './datasets'
OUTPUT_DIR = os.path.join(DATASETS_ROOT_DIR, 'TEDLIUM_all')
INPUT_DIR = os.path.join(DATASETS_ROOT_DIR, 'TEDLIUM_release2/{}/sph')

# Please modify these settings to change the splice window (in seconds).
# start_time is measured from the beginning of the track; end_time is a
# negative offset measured back from the end of the track.
start_time = 30
end_time = -30

# List sub-folder for datasets
input_folder = ['dev', 'test', 'train']


def trim_window(total_duration, start, end):
    """Return the absolute (start, stop) timestamps of the clip to keep.

    Args:
        total_duration: Length of the whole track in seconds, or None when
            the length could not be determined.
        start: Seconds to drop from the beginning of the track.
        end: Offset of the clip end relative to the end of the track, in
            seconds (zero or negative, e.g. -30 drops the last 30 seconds).

    Returns:
        A ``(start, stop)`` tuple of timestamps within the track, or None
        when the duration is unknown or the track is too short to leave
        any audio inside the window.
    """
    if total_duration is None:
        return None

    # The clip length is total_duration - (start - end); adding it to
    # ``start`` gives the absolute stop timestamp, i.e. total_duration + end.
    stop = total_duration + end

    if stop <= start:
        return None

    return start, stop


def convert_datasets():
    """Trim every file found in the input sub-folders and save it as .wav.

    For each sub-folder listed in ``input_folder`` the regular files in
    ``INPUT_DIR`` are trimmed to the window described by ``start_time`` and
    ``end_time`` and written to ``OUTPUT_DIR`` as ``<basename>.wav``. Files
    that are too short for the window are skipped with a message instead of
    aborting the whole run.
    """
    # Create final file format
    filename_template = '{}.wav'

    # Check location to save datasets
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    print('Will send .wav audio to {}'.format(OUTPUT_DIR))

    # Loop over all sub-folders within the TEDLIUM directory first
    for directory in input_folder:
        input_tmp_path = INPUT_DIR.format(directory)

        # Loop over all files within the input directory
        for filename in os.listdir(input_tmp_path):
            input_filename = os.path.join(input_tmp_path, filename)

            # Check if path is an existing regular file
            if not os.path.isfile(input_filename):
                continue

            # Extract the filename structure
            filename_base = os.path.splitext(filename)[0]

            print('On file {}'.format(filename_base))

            window = trim_window(sox.file_info.duration(input_filename),
                                 start_time, end_time)

            if window is None:
                print('Skipping {}: too short to trim'.format(filename_base))
                continue

            output_filename = os.path.join(
                OUTPUT_DIR, filename_template.format(filename_base))

            # Create transformer. trim() takes the start and end timestamps
            # of the clip within the file, so the second argument must be
            # the absolute stop position rather than the clip length.
            splice = sox.Transformer()
            splice.trim(window[0], window[1])
            splice.build(input_filename, output_filename)


if __name__ == '__main__':
    convert_datasets()

Will send .wav audio to ./datasets/TEDLIUM
On file CraigVenter_2008
On file BarrySchwartz_2005G
On file BlaiseAguerayArcas_2007
On file BrianCox_2009U
On file DavidMerrill_2009
On file WadeDavis_2003
On file ElizabethGilbert_2009
On file AlGore_2009
On file TomWujec_2010U
On file RobertGupta_2010U
On file GaryFlake_2010
On file AimeeMullins_2009P
On file DanBarber_2010
On file BillGates_2010
On file JamesCameron_2010
On file MichaelSpecter_2010
On file EricMead_2009P
On file DanielKahneman_2010
On file JaneMcGonigal_2010
On file AviRubin_2011X
On file JanineShepherd_2012X
On file RobertWright_2006
On file CorneilleEwango_2007G
On file SebastianThrun_2011
On file DavidPizarro_2012X
On file HaraldHaas_2011G
On file SethPriebatsch_2010X
On file EricBerlow_2010G
On file RachelBotsman_2010X
On file MariaBezaitis_2013S
On file BeatriceCoron_2011
On file EdithWidder_2010Z
On file SakiMafundikwa_2013
On file MarcusduSautoy_2009G
On file ChadeMengTan_2010Z
On file GeorgeSmoot_2008P
On file Andr

On file BillClinton_2007
On file MarcKoska_2009G
On file ReadMontague_2012G
On file AmyLockwood_2011G
On file KarenThompsonWalker_2012G
On file AdamSadowsky_2010X
On file JonRonson_2012
On file AlexisOhanian_2009I
On file MeganKamerick_2011X
On file RodneyBrooks_2003
On file ShimonSchocken_2010X
On file PeterDonnelly_2005G
On file TanLe_2010G
On file BobThurman_2006S
On file EnricSala_2010Z
On file MichaelSandel_2010
On file GolanLevin_2004
On file EmmaTeeling_2012X
On file JosetteSheeran_2011G
On file RogerEbert_2011
On file JuliaSweeney_2006
On file CarolineLavelleFARTHERTHANTHESUN_2005
On file SophalEar_2009U
On file ChristopherPoole_2010
On file JacquelineNovogratz_2009U
On file MajoraCarter_2006
On file NatalieMacMaster_2003
On file LizColeman_2009
On file ThomasSuarez_2011X
On file MinaBissell_2012G
On file RachelArmstrong_2009G
On file MikkoHypponen_2011X
On file WinghamRowan_2012S
On file IsabelAllende_2007
On file DennisvanEngelsdorp_2008P
On file ErikJohansson_2011S
On file S

On file RobertGupta_2012P
On file JanChipchase_2007
On file NinaJablonski_2009
On file DonTapscott_2012G
On file BernieKrause_2013G
On file JohnMaeda_2007
On file BenGoldacre_2012P
On file DanPhillips_2010X
On file HansRosling_2009
On file JamaisCascio_2006
On file PaolaAntonelli_2007P
On file AdoraSvitak_2010
On file KarenArmstrong_2009G
On file YochaiBenkler_2005G
On file ChrisAnderson_2004
On file JuliaSweeney_2010
On file BarbaraBlock_2010Z
On file EllenGustafson_2010X
On file KimGorgens_2010X
On file DavidPogue_2008P
On file CarmenAgraDeedy_2005
On file BeckyBlanton_2009G
On file LarryBurns_2005
On file TeresaCarrenoOrchestra_2009
On file SheenaIyengar_2011S
On file DeborahGordon_2003
On file EmmanuelJal_2009G
On file EricDishman_2013S
On file MarcPachter_2008P
On file ShaffiMather_2009I
On file EricWhitacre_2011
On file YvesBehar_2008
On file DanielPink_2009G
On file MarkhamNolan_2012S
On file DanAriely_2012X
On file EuvinNaidoo_2007G
On file JulianTreasure_2010G
On file JamesFor

On file StevenPinker_2007
On file ChrisAbani_2007G
On file EricXLi_2013G
On file AaronKoblin_2011
On file SethShostak_2012X
On file JayWalker_2009
On file ShimonSteinberg_2010X
On file AmoryLovins_2005
On file RonGutman_2011U
On file JonathanFoley_2010X
On file StefanSagmeister_2009G
On file PeterGabriel_2006
On file EstherPerel_2013S
On file KenGoldberg_2012X
On file KevinStone_2010U
On file ToddHumphreys_2012X
On file MyshkinIngawale_2012U
On file NaomiKlein_2010W
On file EddiReaderWHATYOUDO_2004
On file ChrisAnderson_2002
On file JaySilver_2013S
On file DennisHong_2009X
On file HansRosling_2012S
On file ClayShirky_2009S
On file RobertHammond_2011U
On file MargaretStewart_2010U
On file MayaBeiser_2011
On file LucyMcRae_2012
On file DavidMackay_2012X
On file EllenTHoen_2012X
On file KateHartman_2011
On file PaulZak_2011G
On file BarrySchuler_2008P
On file DavidDeutsch_2005G
On file SethGodin_2003
On file StewBLACKMENSKI_2006
On file ZainabSalbi_2010G
On file PeterWard_2008
On file Ste

On file RaulMidonEVERYBODY_2007
On file LeslieChang_2012G
On file NellieMcKayTHEDOGSONG_2008
On file DanGilbert_2004
On file PatrickAwuah_2007G
On file ShaoLanHsueh_2013
On file ShashiTharoor_2009I
On file KathrynSchulz_2011
On file NoelMerz_2011X
On file JonathanHarris_2007P
On file AlexSteffen_2011G
On file TonyRobbins_2006
On file LZGranderson_2012X
On file AaronHuey_2010X
On file CynthiaBreazeal_2010W
On file JosephLekuton_2007G
On file JamesLyne_2013
On file Rives_IfIControlledtheInternet_2006S
On file NinaTandon_2012G
On file DanDennett_2009U
On file EveEnsler_2005G
On file RobertThurman_2009P
On file SergeyBrin_2004
On file HonorHarger_2011S
On file BillGross_2003
On file JoeSabia_2011S
On file EamesDemetrios_2007
On file MarkusFischer_2011G
On file JaneGoodall_2007G
On file PhilipeStarck_2007
On file DerekSivers_2010G
On file LesleyHazleton_2013G
On file JessiArrington_2011A
On file RozSavage_2010Z
On file HansRosling_2006
On file AlaindeBotton_2009G
On file AditiShankardass_20