In [None]:
'''
note
2020.07.15 sunyhun create
'''

In [None]:
import functools
import glob
import json
import multiprocessing
import os

import librosa
import pandas as pd
import sox
from tqdm import tqdm

In [None]:
def preprocess(data, input_dir, dest_dir, target_sr=None, speed=None,
               overwrite=True):
    """Convert one audio file to WAV, once per requested speed factor.

    Args:
        data: Mapping with the keys ``input_relpath`` (directory of the file
            relative to ``input_dir``), ``input_fname`` (file name) and
            ``transcript``.
        input_dir: Root directory that contains the source audio.
        dest_dir: Root directory for the converted files. The relative
            directory of the input is recreated below it.
        target_sr: Sample rate passed to the sox ``convert`` step. When falsy,
            the sample rate reported by sox for the input file is used.
        speed: Optional iterable of speed factors. Factor ``1`` is always
            included. The object passed by the caller is never modified.
        overwrite: When false, an output file that already exists is not
            rebuilt; it is only inspected.

    Returns:
        dict with ``transcript`` (lower-cased and stripped), ``files`` (one
        sox info dict per speed factor in ascending order, extended with
        ``fname`` and ``speed``), plus ``original_duration`` and
        ``original_num_samples`` taken from the speed-1 output.
    """
    # Build a new, de-duplicated list instead of appending to the caller's
    # list, and sort it so the order of ``files`` is deterministic.
    speeds = sorted(set(speed or ()) | {1})

    rel_dir = data['input_relpath']
    input_fname = os.path.join(input_dir, rel_dir, data['input_fname'])
    input_sr = sox.file_info.sample_rate(input_fname)
    target_sr = target_sr or input_sr

    os.makedirs(os.path.join(dest_dir, rel_dir), exist_ok=True)

    output_dict = {
        'transcript': data['transcript'].lower().strip(),
        'files': [],
    }

    stem = os.path.splitext(data['input_fname'])[0]
    for s in speeds:
        # The unmodified-speed file keeps the plain stem; other speeds get a
        # "-<factor>" suffix.
        suffix = '' if s == 1 else '-{}'.format(s)
        output_fname = stem + '{}.wav'.format(suffix)
        output_fpath = os.path.join(dest_dir, rel_dir, output_fname)

        if overwrite or not os.path.exists(output_fpath):
            cbn = sox.Transformer().speed(factor=s).convert(target_sr)
            cbn.build(input_fname, output_fpath)

        file_info = sox.file_info.info(output_fpath)
        # Stored path starts at the last component of dest_dir, not at its
        # full location.
        file_info['fname'] = os.path.join(os.path.basename(dest_dir),
                                          rel_dir,
                                          output_fname)
        file_info['speed'] = s
        output_dict['files'].append(file_info)

        if s == 1:
            # Reuse the info already fetched for this file.
            output_dict['original_duration'] = file_info['duration']
            output_dict['original_num_samples'] = file_info['num_samples']

    return output_dict


def parallel_preprocess(dataset, input_dir, dest_dir, target_sr, speed, overwrite, parallel):
    """Run ``preprocess`` over every entry of ``dataset`` in a process pool.

    Args:
        dataset: Sized sequence of entries handed one by one to ``preprocess``.
        input_dir, dest_dir, target_sr, speed, overwrite: Forwarded unchanged
            to ``preprocess`` as keyword arguments.
        parallel: Passed to ``multiprocessing.Pool`` as the process count.

    Returns:
        list of the ``preprocess`` results, in the order of ``dataset``.
    """
    worker = functools.partial(
        preprocess,
        input_dir=input_dir,
        dest_dir=dest_dir,
        target_sr=target_sr,
        speed=speed,
        overwrite=overwrite,
    )
    with multiprocessing.Pool(parallel) as pool:
        # The lazy imap iterator must be drained before the pool is left,
        # so the list is materialised inside the with-block.
        pending = pool.imap(worker, dataset)
        progress = tqdm(pending, total=len(dataset))
        return list(progress)


In [None]:
# Layout 1: audio files only, no transcripts
def search_wav(input_dir, audio_type):
    """List every ``*.<audio_type>`` file below ``input_dir`` without a transcript.

    Args:
        input_dir: Directory that is searched recursively.
        audio_type: File extension to look for, without the leading dot.

    Returns:
        list of dicts sorted by file path, each with ``input_relpath``
        (directory relative to ``input_dir``), ``input_fname`` (file name)
        and ``transcript`` set to the placeholder ``'NO_SCRIPT'``.
    """
    pattern = os.path.join(input_dir, '**', '*.{}'.format(audio_type))

    input_data = []
    for audio_path in sorted(glob.glob(pattern, recursive=True)):
        rel_path = os.path.relpath(audio_path, input_dir)
        input_data.append({'input_relpath': os.path.dirname(rel_path),
                           'input_fname': os.path.basename(audio_path),
                           'transcript': 'NO_SCRIPT'})
    return input_data


# Layout 2: transcripts stored in `*.trans.txt` files next to the audio
def build_input_arr(input_dir, audio_type):
    """Build the input list from the ``*.trans.txt`` files below ``input_dir``.

    Each non-blank line is split at its first space: the part before it is
    the audio file stem, the rest is the transcript (kept verbatim, including
    the line terminator).

    Args:
        input_dir: Directory that is searched recursively for transcripts.
        audio_type: Extension appended to every file stem, without the dot.

    Returns:
        list of dicts with ``input_relpath`` (directory of the transcript
        file relative to ``input_dir``), ``input_fname`` and ``transcript``,
        ordered by transcript file path and then by line.
    """
    pattern = os.path.join(input_dir, '**', '*.trans.txt')

    input_data = []
    for txt_file in sorted(glob.glob(pattern, recursive=True)):
        rel_dir = os.path.dirname(os.path.relpath(txt_file, input_dir))
        with open(txt_file) as fp:
            for line in fp:
                if not line.strip():
                    # A blank line has no file stem; keeping it would create
                    # an entry that points at a non-existent audio file.
                    continue
                fname, _, transcript = line.partition(' ')
                input_data.append(dict(input_relpath=rel_dir,
                                       input_fname=fname + '.{}'.format(audio_type),
                                       transcript=transcript))
    return input_data


def search_script_line(input_dir):
    """Read a ``|``-delimited script file and return its fields line by line.

    Args:
        input_dir: Path of the script file (despite the name, this is opened
            as a file, not a directory).

    Returns:
        list with one list per line: the line split on ``'|'`` with the last
        field (which carries the line terminator) dropped.
    """
    # The context manager closes the handle even if reading fails.
    with open(input_dir, 'r') as train_script:
        return [line.split('|')[:-1] for line in train_script]


# Layout 3 (LJSpeech-style): transcripts stored in a single '|'-delimited .csv file
def search_wav_metadata(input_dir, audio_type):
    """Pair every audio file below ``input_dir`` with a line of the ``.csv`` script.

    Audio files are sorted by path and matched to the script lines purely by
    position; the transcript is the second field of the matching line.

    Args:
        input_dir: Directory that is searched recursively for both the audio
            files and the ``.csv`` script (the first one found is used).
        audio_type: Audio file extension, without the leading dot.

    Returns:
        list of dicts with ``input_relpath``, ``input_fname`` and
        ``transcript``.

    Raises:
        IndexError: If no ``.csv`` file is found, or the script has fewer
            usable lines than there are audio files.
    """
    wav_files = glob.glob(os.path.join(input_dir, '**', '*.{}'.format(audio_type)),
                          recursive=True)
    scripts = glob.glob(os.path.join(input_dir, '**', '*.csv'), recursive=True)
    script_lines = search_script_line(scripts[0])

    wav_files.sort()
    input_data = []
    for idx, wav_file in enumerate(wav_files):
        rel_path = os.path.relpath(wav_file, input_dir)
        input_data.append({'input_relpath': os.path.dirname(rel_path),
                           'input_fname': os.path.basename(wav_file),
                           # Same key as the other input builders produce.
                           'transcript': script_lines[idx][1]})
    return input_data


def build_input_arr_all_sh(input_dir,audio_type):
    """Pick the input builder that fits the transcript files found in ``input_dir``.

    * no ``.csv`` and no ``*.trans.txt`` file -> ``search_wav``
    * exactly one ``.csv`` file               -> ``search_wav_metadata``
    * anything else                           -> ``build_input_arr``

    Args:
        input_dir: Directory that is searched recursively.
        audio_type: Audio file extension, forwarded to the chosen builder.

    Returns:
        Whatever the chosen builder returns.
    """
    scripts = glob.glob(os.path.join(input_dir, '**', '*.csv'), recursive=True)
    txt_files = glob.glob(os.path.join(input_dir, '**', '*.trans.txt'),
                          recursive=True)

    if not scripts and not txt_files:
        print("no scripts in here")
        return search_wav(input_dir, audio_type)

    if len(scripts) == 1:
        print("metadata script in here")
        # The result must be handed back; otherwise this branch yields None.
        return search_wav_metadata(input_dir, audio_type)

    print("each scripts in here")
    return build_input_arr(input_dir, audio_type)


    
def convert_audio_wav_16k(input_dir, dest_dir, output_json, target_sr, speed, overwrite, parallel, audio_type):
    """Convert the audio below ``input_dir`` and write a JSON manifest.

    Requires pandas to be imported as ``pd`` at notebook level.

    Args:
        input_dir: Directory scanned for audio and transcript files.
        dest_dir: Directory that receives the converted files.
        output_json: Path of the manifest to write; also used as the tag in
            the progress messages.
        target_sr: Target sample rate forwarded to the conversion step.
        speed: Speed factors forwarded to the conversion step.
        overwrite: Forwarded to the conversion step.
        parallel: Process count forwarded to the worker pool.
        audio_type: Extension of the input audio files, without the dot.

    Returns:
        None. The manifest is written to ``output_json``.
    """
    print("[%s] Scaning input dir..." % output_json)
    inputs = build_input_arr_all_sh(input_dir=input_dir, audio_type=audio_type)

    print("[%s] Converting audio files..." % output_json)
    converted = parallel_preprocess(dataset=inputs,
                                    input_dir=input_dir,
                                    dest_dir=dest_dir,
                                    target_sr=target_sr,
                                    speed=speed,
                                    overwrite=overwrite,
                                    parallel=parallel)

    print("[%s] Generating json..." % output_json)
    df = pd.DataFrame(converted, dtype=object)

    # Save json with python. df.to_json() produces back slashed in file paths
    records = df.to_dict(orient='records')
    with open(output_json, 'w') as fp:
        json.dump(records, fp, indent=2)

In [None]:
# Convert every mp3 below ./downloads to 16 kHz WAV and write the manifest.
convert_audio_wav_16k(
    input_dir='./downloads',
    dest_dir='./downloads-wav',
    output_json='./downloads-wav.json',
    target_sr=16000,
    speed=None,       # no extra speed factors requested
    overwrite=True,
    parallel=None,    # handed to multiprocessing.Pool as-is
    audio_type='mp3',
)