In [1]:
import os
cwd_old = os.getcwd()
dir_path = '/data2/sungjaecho/data_tts/NC/NC'
os.chdir(dir_path)
cwd_new = os.getcwd()
print("The working directory is moved from {} to {}.".format(cwd_old, cwd_new))

The working directory is moved from /data2/sungjaecho/Projects/tacotron2/dev_ipynb to /data7/data/NC/NC.


In [2]:
dir_path

'/data2/sungjaecho/data_tts/NC/NC'

In [12]:
from glob import glob
import pandas as pd
import random
import librosa
import os
from tqdm import tqdm

import contextlib
import wave

def get_duration(fpath):
    with contextlib.closing(wave.open(fpath,'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / float(rate)
        
def convert_sec(seconds):
    hours = int(seconds // 3600)
    seconds %= 3600
    minutes = int(seconds // 60)
    seconds %= 60
    seconds = int(round(seconds, 0))

    return hours, minutes, seconds

In [4]:
wav_paths = sorted(glob(cwd_new + '/*/*.wav', recursive=True))
txt_paths = sorted(glob(cwd_new + '/*/*.txt', recursive=True))

In [5]:
print(len(wav_paths))
print(len(txt_paths))

15406
15406


In [6]:
wav_path_list = list()
durations = list()
text_list = list()
speaker_list = list()
emotion_list = list()

for wav_path, txt_path in tqdm(zip(wav_paths, txt_paths), total=len(wav_paths)):
    with open(txt_path, 'rb') as f_txt:
        sentence = f_txt.readline()
        sentence = sentence.decode('cp949')
        
    y, sr = librosa.load(wav_path)
    duration = librosa.get_duration(y, sr)
    duration = round(duration, 3)
    
    _, wav_name = os.path.split(wav_path)
    speaker, emotion, _ = wav_name.split('_')
    if emotion == 'joyful':
        emotion = 'happy'
    
    # Append data to column lists
    wav_path_list.append(wav_path)
    durations.append(duration)
    text_list.append(sentence)
    speaker_list.append(speaker)
    emotion_list.append(emotion)

100%|██████████| 15406/15406 [1:14:55<00:00,  3.43it/s]


In [37]:
df = pd.DataFrame({
    'database':['kss']*len(wav_path_list),
    'split':['']*len(wav_path_list),
    'wav_path':wav_path_list,
    'duration':durations,
    'text':text_list,
    'speaker':speaker_list,
    'sex':['w']*len(wav_path_list),
    'emotion':emotion_list,
    'lang':['ko']*len(wav_path_list),
})
df

Unnamed: 0,database,split,wav_path,duration,text,speaker,sex,emotion,lang
0,kss,,/data7/data/NC/NC/wav/nc-f_angry_00001.wav,19.833,"너도 나도, 다 멀쩡하게, 아무 일 없었던 것처럼 잘 살고, 사람들은 잠깐 화내봤자...",nc-f,w,angry,ko
1,kss,,/data7/data/NC/NC/wav/nc-f_angry_00002.wav,18.000,"반경 삼십분 거리에 있는 우체국에 소포, 전보, 우편 발송, 택배 이용 내역 전부 ...",nc-f,w,angry,ko
2,kss,,/data7/data/NC/NC/wav/nc-f_angry_00003.wav,16.917,"그래, 아빠를 쏜 것도 맞고, 용서하기 힘든 죄를 지은 것도 맞고, 이렇게 저주같은...",nc-f,w,angry,ko
3,kss,,/data7/data/NC/NC/wav/nc-f_angry_00004.wav,13.896,"당하는 입장에선 남자 손이건, 여자 손이건, 예술가 손이건, 대통령 손이건, 거지 ...",nc-f,w,angry,ko
4,kss,,/data7/data/NC/NC/wav/nc-f_angry_00005.wav,11.750,최고의 실력을 갖춘 본교 출신이면 타교 후보에 비해 남다른 각오를 다질 수 있단 말...,nc-f,w,angry,ko
...,...,...,...,...,...,...,...,...,...
15401,kss,,/data7/data/NC/NC/wav/nc-m_sad_01096.wav,5.591,나는 주상전하 성은을 입은지 십여년이 지났소.\n,nc-m,w,sad,ko
15402,kss,,/data7/data/NC/NC/wav/nc-m_sad_01097.wav,2.432,다 마셔버린 맥주병이다.\n,nc-m,w,sad,ko
15403,kss,,/data7/data/NC/NC/wav/nc-m_sad_01098.wav,4.445,"널 지켜주겠다는 약속, 못 지켰으니까.\n",nc-m,w,sad,ko
15404,kss,,/data7/data/NC/NC/wav/nc-m_sad_01099.wav,6.514,그리고 권력을 이용해 억울한 피해 가족을 더욱 비참하게 만들었던 어른들이...\n,nc-m,w,sad,ko


In [38]:
df[df.duration >= 10].shape

(3069, 9)

In [39]:
convert_sec(df[df.duration < 12].duration.sum())

(26, 18, 37)

In [40]:
convert_sec(df[df.duration < 11].duration.sum())

(24, 18, 32)

In [41]:
convert_sec(df[df.duration < 10].duration.sum())

(21, 45, 58)

In [42]:
df.loc[df[df.duration >= 11].index, 'split'] = 'long'

In [43]:
short_indexes = df[df.duration < 11].index
print(len(short_indexes))

13212


In [44]:
n_val = 128
n_test = 128

split_list = ['val']* n_val + ['n_test'] * n_test + ['train'] * (len(short_indexes) - n_val - n_test)
print(len(split_list) == len(short_indexes))

True


In [45]:
random.seed(1234)
random.shuffle(split_list)

In [46]:
df.loc[df[df.duration < 11].index, 'split'] = split_list

In [47]:
df.loc[df.split == ''].shape

(0, 9)

All split column cells are filled.

In [48]:
df.emotion.unique()

array(['angry', 'fearful', 'happy', 'neutral', 'sad'], dtype=object)

In [49]:
df.to_csv('nc.csv')

In [50]:
os.chdir(cwd_old)
print(cwd_old)

/data2/sungjaecho/Projects/tacotron2/dev_ipynb


In [51]:
df.to_csv('nc.csv')