In [1]:
#download dataset and resample

import os
from glob import glob
import torchaudio
from pathlib import Path
from tqdm import tqdm
import csv
import shutil
import torch

     
# Sample rate handed to the resampler and used when writing the output files.
target_rate = 22050

# Root folders: raw downloads and resampled output.
# The trailing "/" matters: paths are built by plain string concatenation.
download_dir = "download_data/"
save_dir = "resample1/"

# One sub-folder per corpus, used under both roots.
zeroth_dir = "zeroth_korean/"
tedxhr_dir = "pansori_tedxkr/"
kss_dir = "kss/"

data_dir_list = [zeroth_dir, tedxhr_dir, kss_dir]

# Make sure <root>/<corpus>/ exists for every root/corpus pair.
for root_dir in (download_dir, save_dir):
    for corpus_dir in data_dir_list:
        os.makedirs(root_dir + corpus_dir, exist_ok=True)


#create resampled mono wav,
#dir          save_dir/working_dir/wavs/*.wav
def saveResampleWav(wavList,save_dir,working_dir,target_rate):
    """Resample every audio file in ``wavList`` and store it as a one-channel WAV.

    Each input is written to ``save_dir/working_dir/wavs/<input stem>.wav``;
    the output folder is created if it does not exist yet. Inputs that share
    the same file stem end up at the same output path.

    Args:
        wavList: iterable of paths to audio files readable by ``torchaudio.load``.
        save_dir: root folder for the converted data.
        working_dir: dataset sub-folder below ``save_dir``.
        target_rate: sample rate to resample to and to save with.
    """
    for src_path in tqdm(wavList):
        out_name = Path(src_path).stem + ".wav"
        out_path = os.path.join(save_dir, working_dir, "wavs", out_name)

        waveform, src_rate = torchaudio.load(src_path)
        waveform = torchaudio.functional.resample(waveform, src_rate, target_rate)
        # Average over dim 0 and keep it as a size-1 leading dim
        # (assumes dim 0 is the channel axis, torchaudio.load's default layout).
        waveform = waveform.mean(dim=0, keepdim=True)

        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        torchaudio.save(out_path, waveform, target_rate)
        

#create csv,
#dir           save_dir/working_dir/metadata.csv
#csv format    file_name|text|text|speaker_name
def saveCsv(textList,save_dir,working_dir,csvReadLine):
    """Build ``save_dir/working_dir/metadata.csv`` from transcript files.

    Every non-blank line of every file in ``textList`` is handed to
    ``csvReadLine(line, textPath)``, which must return
    ``(file_name, text, speaker_name)`` with ``file_name`` relative to
    ``save_dir/working_dir``. A row ``file_name|text|text|speaker_name`` is
    written only when that audio file actually exists, so transcripts without
    converted audio are dropped.

    Args:
        textList: iterable of UTF-8 transcript file paths.
        save_dir: root folder of the converted data.
        working_dir: dataset sub-folder below ``save_dir``.
        csvReadLine: callable ``(line, textPath) -> (file_name, text, speaker_name)``.
    """
    dataset_root = os.path.join(save_dir, working_dir)
    # newline="" is required by the csv module: the writer emits its own
    # "\r\n" terminator and text-mode translation would otherwise turn it
    # into "\r\r\n" on Windows.
    with open(os.path.join(dataset_root, "metadata.csv"), "w", encoding="utf-8", newline="") as file:
        wr = csv.writer(file, delimiter='|')
        for textPath in textList:
            with open(textPath, "r", encoding="utf-8") as f:
                lines = f.readlines()
            for line in lines:
                # Blank lines (e.g. a trailing empty line) carry no record and
                # would make the line parser fail, so skip them.
                if not line.strip():
                    continue
                file_name,text,speaker_name=csvReadLine(line,textPath)
                if not os.path.exists(os.path.join(dataset_root,file_name)):
                    continue
                # The text column is written twice on purpose (file_name|text|text|speaker_name).
                wr.writerow([file_name,text,text,speaker_name])
    

In [None]:
#download kss
import os
os.environ['KAGGLE_USERNAME'] = ""
os.environ['KAGGLE_KEY'] = ""
#!pip install --user kaggle
#!conda install -c conda-forge kaggle -y
!kaggle datasets download -d bryanpark/korean-single-speaker-speech-dataset
!unzip korean-single-speaker-speech-dataset.zip -d $download_dir$kss_dir

In [None]:
#download zeroth and tedxkr
!wget https://www.openslr.org/resources/40/zeroth_korean.tar.gz
!wget https://www.openslr.org/resources/58/pansori-tedxkr-corpus-1.0.tar.gz

!tar -xvf zeroth_korean.tar.gz -C $download_dir$zeroth_dir
!tar -xvf pansori-tedxkr-corpus-1.0.tar.gz -C $download_dir$tedxhr_dir

In [2]:
#preprocess tedxhr
# Dataset sub-folder used for the output of this cell.
working_dir = tedxhr_dir
# Every .txt and .flac file anywhere below the Pansori-TEDxKR download folder.
textList = glob(f"{download_dir}{tedxhr_dir}**/*.txt", recursive=True)
wavList = glob(f"{download_dir}{tedxhr_dir}**/*.flac", recursive=True)


def readLine(line,textPath):
    """Parse one Pansori-TEDxKR transcript line of the form ``<id> <text>``.

    Args:
        line: raw transcript line; everything before the first space is the
            utterance id, the rest is the text.
        textPath: path of the transcript file the line came from; the name of
            its parent folder is used as the speaker id.

    Returns:
        ``[file_name, text, speaker_name]`` where ``file_name`` is
        ``wavs/<id stem>.wav`` and ``speaker_name`` is prefixed with ``tedxhr_``.

    Raises:
        IndexError: if the line contains no space.
    """
    parts = line.split(' ', 1)
    wav_name = Path(parts[0]).stem + ".wav"
    transcript = parts[1].strip()
    speaker_folder = Path(textPath).parent.name
    return [os.path.join("wavs", wav_name), transcript, "tedxhr_" + speaker_folder]



# Convert the audio first: saveCsv only keeps rows whose wav file already exists.
saveResampleWav(
    wavList=wavList,
    save_dir=save_dir,
    working_dir=working_dir,
    target_rate=target_rate,
)
saveCsv(
    textList=textList,
    save_dir=save_dir,
    working_dir=working_dir,
    csvReadLine=readLine,
)

100%|██████████████████████████████████████████████████████████████████| 2989/2989 [01:27<00:00, 34.20it/s]


In [3]:
#preprocess zeroth
# Dataset sub-folder used for the output of this cell.
working_dir = zeroth_dir
# Every .txt and .flac file anywhere below the Zeroth-Korean download folder.
textList = glob(f"{download_dir}{zeroth_dir}**/*.txt", recursive=True)
wavList = glob(f"{download_dir}{zeroth_dir}**/*.flac", recursive=True)


def readLine(line,textPath):
    """Parse one Zeroth-Korean transcript line of the form ``<id> <text>``.

    Args:
        line: raw transcript line; everything before the first space is the
            utterance id, the rest is the text.
        textPath: path of the transcript file the line came from; the name of
            its parent folder is used as the speaker id.

    Returns:
        ``[file_name, text, speaker_name]`` where ``file_name`` is
        ``wavs/<id stem>.wav`` and ``speaker_name`` is prefixed with ``zeroth_``.

    Raises:
        IndexError: if the line contains no space.
    """
    fields = line.split(' ', 1)
    audio_name = Path(fields[0]).stem + ".wav"
    sentence = fields[1].strip()
    parent_folder = Path(textPath).parent.name
    return [os.path.join("wavs", audio_name), sentence, "zeroth_" + parent_folder]


# Convert the audio first: saveCsv only keeps rows whose wav file already exists.
saveResampleWav(
    wavList=wavList,
    save_dir=save_dir,
    working_dir=working_dir,
    target_rate=target_rate,
)
saveCsv(
    textList=textList,
    save_dir=save_dir,
    working_dir=working_dir,
    csvReadLine=readLine,
)


100%|████████████████████████████████████████████████████████████████| 22720/22720 [12:41<00:00, 29.85it/s]


In [8]:
#preprocess kss

# Search the folder the download cell unpacks KSS into (download_dir + kss_dir),
# as the other preprocessing cells do, instead of a "kss/" folder that only
# exists if it was created by hand in the working directory.
textList=glob(download_dir+kss_dir+"**/*.txt", recursive=True)
wavList=glob(download_dir+kss_dir+"**/*.wav", recursive=True)
# Dataset sub-folder used for the output of this cell.
working_dir=kss_dir



def readLine(line,textPath):
    """Parse one ``|``-separated KSS transcript line.

    Args:
        line: raw transcript line; field 0 is the audio path and field 3 is
            taken as the text.
        textPath: path of the transcript file (unused; kept so all line
            parsers share one signature).

    Returns:
        ``[file_name, text, speaker_name]`` where ``file_name`` is
        ``wavs/<audio stem>.wav`` and ``speaker_name`` is always ``"kss"``.

    Raises:
        IndexError: if the line has fewer than four fields.
    """
    columns = line.split('|')
    audio_name = Path(columns[0]).stem + ".wav"
    # NOTE(review): index 3 is one of several text columns in the transcript —
    # confirm it is the intended variant.
    sentence = columns[3].strip()
    return [os.path.join("wavs", audio_name), sentence, "kss"]



# Convert the audio first: saveCsv only keeps rows whose wav file already exists.
saveResampleWav(
    wavList=wavList,
    save_dir=save_dir,
    working_dir=working_dir,
    target_rate=target_rate,
)
saveCsv(
    textList=textList,
    save_dir=save_dir,
    working_dir=working_dir,
    csvReadLine=readLine,
)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12854/12854 [02:53<00:00, 74.11it/s]
